From b09cfa01baf1edf8d9cfbe90913d3e305036a5c2 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Mon, 12 Aug 2024 03:19:07 +0000
Subject: [PATCH 01/30] Works. Need to figure out equality_fn

---
 cpp/src/io/parquet/chunk_dict.cu   | 132 ++++++++++-------------------
 cpp/src/io/parquet/parquet_gpu.cuh |  72 ++++++++++++++--
 cpp/src/io/parquet/parquet_gpu.hpp |  43 +---------
 cpp/src/io/parquet/writer_impl.cu  |  68 ++++++++++++---
 4 files changed, 165 insertions(+), 150 deletions(-)
diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index a43c6d4cbb6..147b1f3057b 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -26,28 +26,15 @@
 
 namespace cudf::io::parquet::detail {
 
+namespace cg = cooperative_groups;
+
 namespace {
 constexpr int DEFAULT_BLOCK_SIZE = 256;
 }
 
-template <int block_size>
-CUDF_KERNEL void __launch_bounds__(block_size)
-  initialize_chunk_hash_maps_kernel(device_span<EncColumnChunk> chunks)
-{
-  auto const chunk = chunks[blockIdx.x];
-  auto const t     = threadIdx.x;
-  // fut: Now that per-chunk dict is same size as ck.num_values, try to not use one block per chunk
-  for (thread_index_type i = 0; i < chunk.dict_map_size; i += block_size) {
-    if (t + i < chunk.dict_map_size) {
-      new (&chunk.dict_map_slots[t + i].first) map_type::atomic_key_type{KEY_SENTINEL};
-      new (&chunk.dict_map_slots[t + i].second) map_type::atomic_mapped_type{VALUE_SENTINEL};
-    }
-  }
-}
-
-template <typename T>
 struct equality_functor {
   column_device_view const& col;
+  template <typename T>
   __device__ bool operator()(size_type lhs_idx, size_type rhs_idx)
   {
     // We don't call this for nulls so this is fine
@@ -56,50 +43,23 @@ struct equality_functor {
   }
 };
 
-template <typename T>
 struct hash_functor {
   column_device_view const& col;
-  __device__ auto operator()(size_type idx) const
-  {
-    return cudf::hashing::detail::MurmurHash3_x86_32<T>{}(col.element<T>(idx));
-  }
-};
-
-struct map_insert_fn {
-  map_type::device_mutable_view& map;
-
-  template <typename T>
-  __device__ bool operator()(column_device_view const& col, size_type i)
-  {
-    if constexpr (column_device_view::has_element_accessor<T>()) {
-      auto hash_fn     = hash_functor<T>{col};
-      auto equality_fn = equality_functor<T>{col};
-      return map.insert(std::pair(i, i), hash_fn, equality_fn);
-    } else {
-      CUDF_UNREACHABLE("Unsupported type to insert in map");
-    }
-  }
-};
-
-struct map_find_fn {
-  map_type::device_view& map;
-
   template <typename T>
-  __device__ map_type::device_view::iterator operator()(column_device_view const& col, size_type i)
+  __device__ auto operator()(size_type idx) const
   {
     if constexpr (column_device_view::has_element_accessor<T>()) {
-      auto hash_fn     = hash_functor<T>{col};
-      auto equality_fn = equality_functor<T>{col};
-      return map.find(i, hash_fn, equality_fn);
+      return cudf::hashing::detail::MurmurHash3_x86_32<T>{}(col.element<T>(idx));
     } else {
-      CUDF_UNREACHABLE("Unsupported type to find in map");
+      return static_cast<uint32_t>(KEY_SENTINEL);
     }
   }
 };
 
 template <int block_size>
 CUDF_KERNEL void __launch_bounds__(block_size)
-  populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan<PageFragment const> frags)
+  populate_chunk_hash_maps_kernel(map_ref_type* map_refs,
+                                  cudf::detail::device_2dspan<PageFragment const> frags)
 {
   auto col_idx = blockIdx.y;
   auto block_x = blockIdx.x;
@@ -123,10 +83,7 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   column_device_view const& data_col = *col->leaf_column;
 
   // Make a view of the hash map
-  auto hash_map_mutable = map_type::device_mutable_view(chunk->dict_map_slots,
-                                                        chunk->dict_map_size,
-                                                        cuco::empty_key{KEY_SENTINEL},
-                                                        cuco::empty_value{VALUE_SENTINEL});
+  auto hash_map_ref = *(map_refs + chunk->dict_map_idx);
 
   __shared__ size_type total_num_dict_entries;
   thread_index_type val_idx = s_start_value_idx + t;
@@ -138,8 +95,14 @@ CUDF_KERNEL void __launch_bounds__(block_size)
     size_type is_unique      = 0;
     size_type uniq_elem_size = 0;
     if (is_valid) {
-      is_unique =
-        type_dispatcher(data_col.type(), map_insert_fn{hash_map_mutable}, data_col, val_idx);
+      auto const val = type_dispatcher(data_col.type(), hash_functor{data_col}, val_idx);
+      if (val != KEY_SENTINEL) {
+        auto map_insert_ref = hash_map_ref.with_operators(cuco::insert);
+        is_unique           = map_insert_ref.insert(cuco::pair{val, val_idx});
+      } else {
+        is_unique = false;
+      }
+      // if (!t) printf("inserted val = %u, val_idx = %ld\n", val, val_idx);
       uniq_elem_size = [&]() -> size_type {
         if (not is_unique) { return 0; }
         switch (col->physical_type) {
@@ -170,7 +133,6 @@ CUDF_KERNEL void __launch_bounds__(block_size)
         }
       }();
     }
-
     auto num_unique = block_reduce(reduce_storage).Sum(is_unique);
     __syncthreads();
     auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size);
@@ -190,24 +152,20 @@ CUDF_KERNEL void __launch_bounds__(block_size)
 
 template <int block_size>
 CUDF_KERNEL void __launch_bounds__(block_size)
-  collect_map_entries_kernel(device_span<EncColumnChunk> chunks)
+  collect_map_entries_kernel(map_ref_type* map_refs, device_span<EncColumnChunk> chunks)
 {
   auto& chunk = chunks[blockIdx.x];
   if (not chunk.use_dictionary) { return; }
 
   auto t   = threadIdx.x;
-  auto map = map_type::device_view(chunk.dict_map_slots,
-                                   chunk.dict_map_size,
-                                   cuco::empty_key{KEY_SENTINEL},
-                                   cuco::empty_value{VALUE_SENTINEL});
-
+  auto map = *(map_refs + chunk.dict_map_idx);
   __shared__ cuda::atomic<size_type, cuda::thread_scope_block> counter;
   using cuda::std::memory_order_relaxed;
   if (t == 0) { new (&counter) cuda::atomic<size_type, cuda::thread_scope_block>{0}; }
   __syncthreads();
   for (size_type i = 0; i < chunk.dict_map_size; i += block_size) {
     if (t + i < chunk.dict_map_size) {
-      auto* slot = reinterpret_cast<map_type::value_type*>(map.begin_slot() + t + i);
+      auto* slot = reinterpret_cast<map_ref_type::value_type*>(&map + t + i);
       auto key   = slot->first;
       if (key != KEY_SENTINEL) {
         auto loc = counter.fetch_add(1, memory_order_relaxed);
@@ -224,7 +182,8 @@ CUDF_KERNEL void __launch_bounds__(block_size)
 
 template <int block_size>
 CUDF_KERNEL void __launch_bounds__(block_size)
-  get_dictionary_indices_kernel(cudf::detail::device_2dspan<PageFragment const> frags)
+  get_dictionary_indices_kernel(map_ref_type* map_refs,
+                                cudf::detail::device_2dspan<PageFragment const> frags)
 {
   auto col_idx = blockIdx.y;
   auto block_x = blockIdx.x;
@@ -245,21 +204,22 @@ CUDF_KERNEL void __launch_bounds__(block_size)
 
   column_device_view const& data_col = *col->leaf_column;
 
-  auto map = map_type::device_view(chunk->dict_map_slots,
-                                   chunk->dict_map_size,
-                                   cuco::empty_key{KEY_SENTINEL},
-                                   cuco::empty_value{VALUE_SENTINEL});
+  auto map = *(map_refs + chunk->dict_map_idx);
 
   thread_index_type val_idx = s_start_value_idx + t;
   while (val_idx < end_value_idx) {
     if (data_col.is_valid(val_idx)) {
-      auto found_slot = type_dispatcher(data_col.type(), map_find_fn{map}, data_col, val_idx);
-      cudf_assert(found_slot != map.end() &&
-                  "Unable to find value in map in dictionary index construction");
-      if (found_slot != map.end()) {
-        // No need for atomic as this is not going to be modified by any other thread
-        auto* val_ptr = reinterpret_cast<map_type::mapped_type*>(&found_slot->second);
-        chunk->dict_index[val_idx - s_ck_start_val_idx] = *val_ptr;
+      auto val = type_dispatcher(data_col.type(), hash_functor{data_col}, val_idx);
+      if (val != static_cast<uint32_t>(KEY_SENTINEL)) {
+        auto map_find_ref = map.with_operators(cuco::find);
+        auto found_slot   = map_find_ref.find(val);
+        cudf_assert(found_slot != map.end() &&
+                    "Unable to find value in map in dictionary index construction");
+        if (found_slot != map.end()) {
+          // No need for atomic as this is not going to be modified by any other thread
+          auto* val_ptr = reinterpret_cast<map_ref_type::mapped_type*>(&found_slot->second);
+          chunk->dict_index[val_idx - s_ck_start_val_idx] = *val_ptr;
+        }
       }
     }
 
@@ -267,32 +227,30 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   }
 }
 
-void initialize_chunk_hash_maps(device_span<EncColumnChunk> chunks, rmm::cuda_stream_view stream)
-{
-  constexpr int block_size = 1024;
-  initialize_chunk_hash_maps_kernel<block_size>
-    <<<chunks.size(), block_size, 0, stream.value()>>>(chunks);
-}
-
-void populate_chunk_hash_maps(cudf::detail::device_2dspan<PageFragment const> frags,
+void populate_chunk_hash_maps(map_ref_type* map_refs,
+                              cudf::detail::device_2dspan<PageFragment const> frags,
                               rmm::cuda_stream_view stream)
 {
   dim3 const dim_grid(frags.size().second, frags.size().first);
   populate_chunk_hash_maps_kernel<DEFAULT_BLOCK_SIZE>
-    <<<dim_grid, DEFAULT_BLOCK_SIZE, 0, stream.value()>>>(frags);
+    <<<dim_grid, DEFAULT_BLOCK_SIZE, 0, stream.value()>>>(map_refs, frags);
 }
 
-void collect_map_entries(device_span<EncColumnChunk> chunks, rmm::cuda_stream_view stream)
+void collect_map_entries(map_ref_type* map_refs,
+                         device_span<EncColumnChunk> chunks,
+                         rmm::cuda_stream_view stream)
 {
   constexpr int block_size = 1024;
-  collect_map_entries_kernel<block_size><<<chunks.size(), block_size, 0, stream.value()>>>(chunks);
+  collect_map_entries_kernel<block_size>
+    <<<chunks.size(), block_size, 0, stream.value()>>>(map_refs, chunks);
 }
 
-void get_dictionary_indices(cudf::detail::device_2dspan<PageFragment const> frags,
+void get_dictionary_indices(map_ref_type* map_refs,
+                            cudf::detail::device_2dspan<PageFragment const> frags,
                             rmm::cuda_stream_view stream)
 {
   dim3 const dim_grid(frags.size().second, frags.size().first);
   get_dictionary_indices_kernel<DEFAULT_BLOCK_SIZE>
-    <<<dim_grid, DEFAULT_BLOCK_SIZE, 0, stream.value()>>>(frags);
+    <<<dim_grid, DEFAULT_BLOCK_SIZE, 0, stream.value()>>>(map_refs, frags);
 }
 }  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh
index e3c44c78898..e8d8eeb8dbf 100644
--- a/cpp/src/io/parquet/parquet_gpu.cuh
+++ b/cpp/src/io/parquet/parquet_gpu.cuh
@@ -21,22 +21,80 @@
 #include <cudf/lists/lists_column_device_view.cuh>
 #include <cudf/types.hpp>
 
+#include <cooperative_groups.h>
 #include <cuco/static_map.cuh>
+#include <cuco/static_map_ref.cuh>
+#include <cuco/storage.cuh>
+#include <thrust/device_vector.h>
+
+#include <limits>
 
 namespace cudf::io::parquet::detail {
 
-auto constexpr KEY_SENTINEL   = size_type{-1};
-auto constexpr VALUE_SENTINEL = size_type{-1};
+using key_type    = uint32_t;
+using mapped_type = uint32_t;
+
+auto constexpr cg_size     = 1;  ///< A CUDA Cooperative Group of 8 threads to handle each subset
+auto constexpr window_size = 1;  ///< Number of concurrent slots handled by each thread
+
+auto constexpr KEY_SENTINEL   = key_type{std::numeric_limits<uint32_t>::max()};
+auto constexpr VALUE_SENTINEL = mapped_type{std::numeric_limits<uint32_t>::max()};
+auto constexpr SCOPE          = cuda::thread_scope_device;
+
+using slot_type = cuco::pair<key_type, mapped_type>;
+
+using storage_type     = cuco::aow_storage<slot_type, window_size>;
+using storage_ref_type = typename storage_type::ref_type;
 
-using map_type = cuco::legacy::static_map<size_type, size_type>;
+template <typename T>
+struct my_hasher {
+  __device__ auto operator()(T index) const { return index; }
+};
+
+using probing_scheme_type = cuco::linear_probing<cg_size, my_hasher<key_type>>;
+
+using map_ref_type = cuco::static_map_ref<key_type,
+                                          mapped_type,
+                                          SCOPE,
+                                          thrust::equal_to<key_type>,
+                                          probing_scheme_type,
+                                          storage_ref_type>;  ///< Map ref type
 
 /**
- * @brief The alias of `map_type::pair_atomic_type` class.
+ * @brief Insert chunk values into their respective hash maps
+ *
+ * @param frags Column fragments
+ * @param stream CUDA stream to use
+ */
+void populate_chunk_hash_maps(map_ref_type* map_refs,
+                              cudf::detail::device_2dspan<PageFragment const> frags,
+                              rmm::cuda_stream_view stream);
+
+/**
+ * @brief Compact dictionary hash map entries into chunk.dict_data
+ *
+ * @param chunks Flat span of chunks to compact hash maps for
+ * @param stream CUDA stream to use
+ */
+void collect_map_entries(map_ref_type* map_refs,
+                         device_span<EncColumnChunk> chunks,
+                         rmm::cuda_stream_view stream);
+
+/**
+ * @brief Get the Dictionary Indices for each row
+ *
+ * For each row of a chunk, gets the indices into chunk.dict_data which contains the value otherwise
+ * stored in input column [row]. Stores these indices into chunk.dict_index.
+ *
+ * Since dict_data itself contains indices into the original cudf column, this means that
+ * col[row] == col[dict_data[dict_index[row - chunk.start_row]]]
  *
- * Declare this struct by trivial subclassing instead of type aliasing so we can have forward
- * declaration of this struct somewhere else.
+ * @param frags Column fragments
+ * @param stream CUDA stream to use
  */
-struct slot_type : public map_type::pair_atomic_type {};
+void get_dictionary_indices(map_ref_type* map_refs,
+                            cudf::detail::device_2dspan<PageFragment const> frags,
+                            rmm::cuda_stream_view stream);
 
 /**
  * @brief Return the byte length of parquet dtypes that are physically represented by INT32
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index efc1f5ebab1..7044b38f529 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -514,7 +514,6 @@ constexpr unsigned int kDictHashBits = 16;
 constexpr size_t kDictScratchSize    = (1 << kDictHashBits) * sizeof(uint32_t);
 
 struct EncPage;
-struct slot_type;
 
 // convert Encoding to a mask value
 constexpr uint32_t encoding_to_mask(Encoding encoding)
@@ -560,7 +559,7 @@ struct EncColumnChunk {
   uint8_t is_compressed;    //!< Nonzero if the chunk uses compression
   uint32_t dictionary_size;    //!< Size of dictionary page including header
   uint32_t ck_stat_size;       //!< Size of chunk-level statistics (included in 1st page header)
-  slot_type* dict_map_slots;   //!< Hash map storage for calculating dict encoding for this chunk
+  int32_t dict_map_idx;        //!< Hash map storage for calculating dict encoding for this chunk
   size_type dict_map_size;     //!< Size of dict_map_slots
   size_type num_dict_entries;  //!< Total number of entries in dictionary
   size_type
@@ -1001,46 +1000,6 @@ void InitFragmentStatistics(device_span<statistics_group> groups,
                             device_span<PageFragment const> fragments,
                             rmm::cuda_stream_view stream);
 
-/**
- * @brief Initialize per-chunk hash maps used for dictionary with sentinel values
- *
- * @param chunks Flat span of chunks to initialize hash maps for
- * @param stream CUDA stream to use
- */
-void initialize_chunk_hash_maps(device_span<EncColumnChunk> chunks, rmm::cuda_stream_view stream);
-
-/**
- * @brief Insert chunk values into their respective hash maps
- *
- * @param frags Column fragments
- * @param stream CUDA stream to use
- */
-void populate_chunk_hash_maps(cudf::detail::device_2dspan<PageFragment const> frags,
-                              rmm::cuda_stream_view stream);
-
-/**
- * @brief Compact dictionary hash map entries into chunk.dict_data
- *
- * @param chunks Flat span of chunks to compact hash maps for
- * @param stream CUDA stream to use
- */
-void collect_map_entries(device_span<EncColumnChunk> chunks, rmm::cuda_stream_view stream);
-
-/**
- * @brief Get the Dictionary Indices for each row
- *
- * For each row of a chunk, gets the indices into chunk.dict_data which contains the value otherwise
- * stored in input column [row]. Stores these indices into chunk.dict_index.
- *
- * Since dict_data itself contains indices into the original cudf column, this means that
- * col[row] == col[dict_data[dict_index[row - chunk.start_row]]]
- *
- * @param frags Column fragments
- * @param stream CUDA stream to use
- */
-void get_dictionary_indices(cudf::detail::device_2dspan<PageFragment const> frags,
-                            rmm::cuda_stream_view stream);
-
 /**
  * @brief Launches kernel for initializing encoder data pages
  *
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 36a1d8377bf..7aee7f6dc1f 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -1285,10 +1285,10 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
     return std::pair(std::move(dict_data), std::move(dict_index));
   }
 
-  // Allocate slots for each chunk
-  std::vector<rmm::device_uvector<slot_type>> hash_maps_storage;
-  hash_maps_storage.reserve(h_chunks.size());
-  for (auto& chunk : h_chunks) {
+  std::vector<size_t> valid_chunk_sizes;
+  valid_chunk_sizes.reserve(h_chunks.size());
+
+  std::for_each(h_chunks.begin(), h_chunks.end(), [&](auto& chunk) {
     auto const& chunk_col_desc = col_desc[chunk.col_desc_id];
     auto const is_requested_non_dict =
       chunk_col_desc.requested_encoding != column_encoding::USE_DEFAULT &&
@@ -1298,20 +1298,60 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
 
     if (is_type_non_dict || is_requested_non_dict) {
       chunk.use_dictionary = false;
+      valid_chunk_sizes.emplace_back(static_cast<std::size_t>(0));
     } else {
       chunk.use_dictionary = true;
-      // cuCollections suggests using a hash map of size N * (1/0.7) = num_values * 1.43
-      // https://github.com/NVIDIA/cuCollections/blob/3a49fc71/include/cuco/static_map.cuh#L190-L193
-      auto& inserted_map   = hash_maps_storage.emplace_back(chunk.num_values * 1.43, stream);
-      chunk.dict_map_slots = inserted_map.data();
-      chunk.dict_map_size  = inserted_map.size();
+      valid_chunk_sizes.emplace_back(static_cast<std::size_t>(
+        cuco::make_window_extent<cg_size, window_size>(static_cast<size_t>(chunk.num_values))));
+      chunk.dict_map_size = valid_chunk_sizes.back();
     }
-  }
+  });
+
+  std::vector<std::size_t> map_offsets(valid_chunk_sizes.size() + 1, 0);
+  std::exclusive_scan(valid_chunk_sizes.begin(),
+                      valid_chunk_sizes.end(),
+                      map_offsets.begin(),
+                      static_cast<size_t>(0));
+  map_offsets.back() = map_offsets[valid_chunk_sizes.size() - 1] + valid_chunk_sizes.back();
+
+  // Create a single bulk storage used by all subsets
+  auto map_storage = storage_type{map_offsets.back()};
+  // Initializes the storage with the given sentinel
+  map_storage.initialize(cuco::pair{KEY_SENTINEL, VALUE_SENTINEL},
+                         cuda::stream_ref{stream.value()});
+
+  std::vector<map_ref_type> h_map_refs;
+  h_map_refs.reserve(h_chunks.size());
+
+  std::for_each(
+    thrust::make_counting_iterator(static_cast<std::size_t>(0)),
+    thrust::make_counting_iterator(h_chunks.size()),
+    [&](auto const idx) {
+      auto& chunk = h_chunks[idx];
+      if (chunk.use_dictionary) {
+        storage_ref_type storage_ref{valid_chunk_sizes[idx], map_storage.data() + map_offsets[idx]};
+
+        h_map_refs.emplace_back(map_ref_type{cuco::empty_key<key_type>{KEY_SENTINEL},
+                                             cuco::empty_value<mapped_type>{VALUE_SENTINEL},
+                                             {},
+                                             {},
+                                             {},
+                                             storage_ref});
+
+        chunk.dict_map_idx = h_map_refs.size() - 1;
+      }
+    });
+
+  rmm::device_uvector<map_ref_type> d_map_refs(h_map_refs.size(), stream);
+  CUDF_CUDA_TRY(cudaMemcpyAsync(d_map_refs.data(),
+                                h_map_refs.data(),
+                                sizeof(map_ref_type) * h_map_refs.size(),
+                                cudaMemcpyDefault,
+                                stream.value()));
 
   chunks.host_to_device_async(stream);
 
-  initialize_chunk_hash_maps(chunks.device_view().flat_view(), stream);
-  populate_chunk_hash_maps(frags, stream);
+  populate_chunk_hash_maps(d_map_refs.data(), frags, stream);
 
   chunks.device_to_host_sync(stream);
 
@@ -1372,8 +1412,8 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
     chunk.dict_index          = inserted_dict_index.data();
   }
   chunks.host_to_device_async(stream);
-  collect_map_entries(chunks.device_view().flat_view(), stream);
-  get_dictionary_indices(frags, stream);
+  collect_map_entries(d_map_refs.data(), chunks.device_view().flat_view(), stream);
+  get_dictionary_indices(d_map_refs.data(), frags, stream);
 
   return std::pair(std::move(dict_data), std::move(dict_index));
 }

From 089f9095aadaff5771d1c9eb5f3a6a9ec43e0571 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Tue, 13 Aug 2024 01:46:03 +0000
Subject: [PATCH 02/30] Almost works. Need to do cg split work.

---
 cpp/src/io/parquet/chunk_dict.cu        | 202 +++++++++++++++++-------
 cpp/src/io/parquet/parquet_gpu.cuh      |  28 +---
 cpp/src/io/parquet/parquet_gpu.hpp      |   2 +-
 cpp/src/io/parquet/writer_impl.cu       |  40 ++---
 cpp/tests/io/parquet_writer_test.cpp    |  41 ++---
 cpp/tests/utilities/column_utilities.cu |   1 +
 6 files changed, 184 insertions(+), 130 deletions(-)

diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index 147b1f3057b..bd5f27ce9f6 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -22,6 +22,7 @@
 
 #include <rmm/exec_policy.hpp>
 
+#include <cooperative_groups.h>
 #include <cuda/atomic>
 
 namespace cudf::io::parquet::detail {
@@ -32,47 +33,142 @@ namespace {
 constexpr int DEFAULT_BLOCK_SIZE = 256;
 }
 
+template <typename T>
 struct equality_functor {
   column_device_view const& col;
-  template <typename T>
-  __device__ bool operator()(size_type lhs_idx, size_type rhs_idx)
+  __device__ bool operator()(key_type const lhs_idx, key_type const rhs_idx) const
   {
-    // We don't call this for nulls so this is fine
-    auto const equal = cudf::experimental::row::equality::nan_equal_physical_equality_comparator{};
-    return equal(col.element<T>(lhs_idx), col.element<T>(rhs_idx));
+    //  We don't call this for nulls so this is fine
+    auto const equal  = cudf::experimental::row::equality::nan_equal_physical_equality_comparator{};
+    auto const result = equal(col.element<T>(lhs_idx), col.element<T>(rhs_idx));
+    printf("col_type_id:%d, equality idx1:%d, idx2:%d, eq:%d\n",
+           col.type().id(),
+           lhs_idx,
+           rhs_idx,
+           result);
+    return result;
   }
 };
 
+template <typename T>
 struct hash_functor {
   column_device_view const& col;
+  __device__ auto operator()(key_type idx) const
+  {
+    auto const hashed = cudf::hashing::detail::MurmurHash3_x86_32<T>{}(col.element<T>(idx));
+    printf("hashing idx: %d = %d\n", idx, hashed);
+    return hashed;  // cudf::hashing::detail::MurmurHash3_x86_32<T>{}(col.element<T>(idx));
+  }
+};
+
+struct map_insert_fn {
+  storage_ref_type const& storage_ref;
+
   template <typename T>
-  __device__ auto operator()(size_type idx) const
+  __device__ bool operator()(column_device_view const& col, key_type i)
   {
     if constexpr (column_device_view::has_element_accessor<T>()) {
-      return cudf::hashing::detail::MurmurHash3_x86_32<T>{}(col.element<T>(idx));
+      using equality_fn_type    = equality_functor<T>;
+      using hash_fn_type        = hash_functor<T>;
+      using probing_scheme_type = cuco::linear_probing<cg_size, hash_fn_type>;
+
+      // Instantiate hash and equality functors.
+      auto hash_fn  = hash_fn_type{col};
+      auto equal_fn = equality_fn_type{col};
+
+      // Make a view of the hash map
+      cuco::static_map_ref<key_type,
+                           mapped_type,
+                           SCOPE,
+                           equality_fn_type,
+                           probing_scheme_type,
+                           storage_ref_type>
+        hash_map_ref{cuco::empty_key{KEY_SENTINEL},
+                     cuco::empty_value{VALUE_SENTINEL},
+                     {equal_fn},
+                     {hash_fn},
+                     {},
+                     storage_ref};
+
+      // Create another map with insert operator
+      auto map_insert_ref = hash_map_ref.with_operators(cuco::insert_and_find);
+      // Insert
+      auto [iter, found] = map_insert_ref.insert_and_find(cuco::pair{i, i});
+      printf("Inserted k=%d, v=%d, unique=%d\n", iter->first, iter->second, found);
+      return found;
     } else {
-      return static_cast<uint32_t>(KEY_SENTINEL);
+      CUDF_UNREACHABLE("Unsupported type to insert in map");
+    }
+  }
+};  // namespace cudf::io::parquet::detail
+
+struct map_find_fn {
+  storage_ref_type const& storage_ref;
+
+  template <typename T>
+  __device__ cuco::pair<key_type, mapped_type> operator()(column_device_view const& col, key_type i)
+  {
+    if constexpr (column_device_view::has_element_accessor<T>()) {
+      using equality_fn_type    = equality_functor<T>;
+      using hash_fn_type        = hash_functor<T>;
+      using probing_scheme_type = cuco::linear_probing<cg_size, hash_fn_type>;
+
+      // Instantiate hash and equality functors.
+      auto hash_fn  = hash_fn_type{col};
+      auto equal_fn = equality_fn_type{col};
+
+      // Make a view of the hash map
+      cuco::static_map_ref<key_type,
+                           mapped_type,
+                           SCOPE,
+                           equality_fn_type,
+                           probing_scheme_type,
+                           storage_ref_type>
+        hash_map_ref{cuco::empty_key{KEY_SENTINEL},
+                     cuco::empty_value{VALUE_SENTINEL},
+                     {equal_fn},
+                     {hash_fn},
+                     {},
+                     storage_ref};
+
+      // Create another map with find operator
+      auto map_find_ref = hash_map_ref.with_operators(cuco::find);
+
+      // Find the key = i
+      auto found_slot = map_find_ref.find(i);
+
+      // Check if we found the previously inserted key.
+      cudf_assert(found_slot != map_find_ref.end() &&
+                  "Unable to find value in map in dictionary index construction");
+
+      // Return a pair of the found key and value.
+      printf("Find=%d, Found slot: k=%d, v=%d\n", i, found_slot->first, found_slot->second);
+      return {found_slot->first, found_slot->second};
+    } else {
+      CUDF_UNREACHABLE("Unsupported type to find in map");
     }
   }
 };
 
 template <int block_size>
 CUDF_KERNEL void __launch_bounds__(block_size)
-  populate_chunk_hash_maps_kernel(map_ref_type* map_refs,
+  populate_chunk_hash_maps_kernel(storage_type::window_type* map_storage,
                                   cudf::detail::device_2dspan<PageFragment const> frags)
 {
-  auto col_idx = blockIdx.y;
-  auto block_x = blockIdx.x;
-  auto t       = threadIdx.x;
-  auto frag    = frags[col_idx][block_x];
-  auto chunk   = frag.chunk;
-  auto col     = chunk->col_desc;
+  auto const col_idx = blockIdx.y;
+  auto const block_x = blockIdx.x;
+  auto const frag    = frags[col_idx][block_x];
+  auto chunk         = frag.chunk;
+  auto col           = chunk->col_desc;
 
   if (not chunk->use_dictionary) { return; }
 
   using block_reduce = cub::BlockReduce<size_type, block_size>;
   __shared__ typename block_reduce::TempStorage reduce_storage;
 
+  [[maybe_unused]] auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
+  auto const t                     = cg::this_thread_block().thread_rank();
+
   size_type start_row = frag.start_row;
   size_type end_row   = frag.start_row + frag.num_rows;
 
@@ -81,12 +177,11 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   size_type const end_value_idx     = row_to_value_idx(end_row, *col);
 
   column_device_view const& data_col = *col->leaf_column;
-
-  // Make a view of the hash map
-  auto hash_map_ref = *(map_refs + chunk->dict_map_idx);
+  storage_ref_type const storage_ref{chunk->dict_map_size, map_storage + chunk->dict_map_offset};
 
   __shared__ size_type total_num_dict_entries;
   thread_index_type val_idx = s_start_value_idx + t;
+
   while (val_idx - block_size < end_value_idx) {
     auto const is_valid =
       val_idx < end_value_idx and val_idx < data_col.size() and data_col.is_valid(val_idx);
@@ -95,14 +190,8 @@ CUDF_KERNEL void __launch_bounds__(block_size)
     size_type is_unique      = 0;
     size_type uniq_elem_size = 0;
     if (is_valid) {
-      auto const val = type_dispatcher(data_col.type(), hash_functor{data_col}, val_idx);
-      if (val != KEY_SENTINEL) {
-        auto map_insert_ref = hash_map_ref.with_operators(cuco::insert);
-        is_unique           = map_insert_ref.insert(cuco::pair{val, val_idx});
-      } else {
-        is_unique = false;
-      }
-      // if (!t) printf("inserted val = %u, val_idx = %ld\n", val, val_idx);
+      auto const is_unique =
+        type_dispatcher(data_col.type(), map_insert_fn{storage_ref}, data_col, val_idx);
       uniq_elem_size = [&]() -> size_type {
         if (not is_unique) { return 0; }
         switch (col->physical_type) {
@@ -152,24 +241,30 @@ CUDF_KERNEL void __launch_bounds__(block_size)
 
 template <int block_size>
 CUDF_KERNEL void __launch_bounds__(block_size)
-  collect_map_entries_kernel(map_ref_type* map_refs, device_span<EncColumnChunk> chunks)
+  collect_map_entries_kernel(storage_type::window_type* map_storage,
+                             device_span<EncColumnChunk> chunks)
 {
   auto& chunk = chunks[blockIdx.x];
   if (not chunk.use_dictionary) { return; }
 
-  auto t   = threadIdx.x;
-  auto map = *(map_refs + chunk.dict_map_idx);
+  [[maybe_unused]] auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
+  auto const t                     = cg::this_thread_block().thread_rank();
+
+  storage_ref_type const storage_ref{chunk.dict_map_size, map_storage + chunk.dict_map_offset};
+
   __shared__ cuda::atomic<size_type, cuda::thread_scope_block> counter;
   using cuda::std::memory_order_relaxed;
   if (t == 0) { new (&counter) cuda::atomic<size_type, cuda::thread_scope_block>{0}; }
   __syncthreads();
+
   for (size_type i = 0; i < chunk.dict_map_size; i += block_size) {
     if (t + i < chunk.dict_map_size) {
-      auto* slot = reinterpret_cast<map_ref_type::value_type*>(&map + t + i);
+      auto* slot = reinterpret_cast<slot_type*>(storage_ref.data() + chunk.dict_map_offset + t + i);
       auto key   = slot->first;
       if (key != KEY_SENTINEL) {
         auto loc = counter.fetch_add(1, memory_order_relaxed);
         cudf_assert(loc < MAX_DICT_SIZE && "Number of filled slots exceeds max dict size");
+        printf("Writing %d at loc: %d\n", key, loc);
         chunk.dict_data[loc] = key;
         // If sorting dict page ever becomes a hard requirement, enable the following statement and
         // add a dict sorting step before storing into the slot's second field.
@@ -182,18 +277,20 @@ CUDF_KERNEL void __launch_bounds__(block_size)
 
 template <int block_size>
 CUDF_KERNEL void __launch_bounds__(block_size)
-  get_dictionary_indices_kernel(map_ref_type* map_refs,
+  get_dictionary_indices_kernel(storage_type::window_type* map_storage,
                                 cudf::detail::device_2dspan<PageFragment const> frags)
 {
-  auto col_idx = blockIdx.y;
-  auto block_x = blockIdx.x;
-  auto t       = threadIdx.x;
-  auto frag    = frags[col_idx][block_x];
-  auto chunk   = frag.chunk;
-  auto col     = chunk->col_desc;
+  auto const col_idx = blockIdx.y;
+  auto const block_x = blockIdx.x;
+  auto const frag    = frags[col_idx][block_x];
+  auto chunk         = frag.chunk;
+  auto const col     = chunk->col_desc;
 
   if (not chunk->use_dictionary) { return; }
 
+  [[maybe_unused]] auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
+  auto const t                     = cg::this_thread_block().thread_rank();
+
   size_type start_row = frag.start_row;
   size_type end_row   = frag.start_row + frag.num_rows;
 
@@ -203,54 +300,45 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   auto const end_value_idx      = row_to_value_idx(end_row, *col);
 
   column_device_view const& data_col = *col->leaf_column;
-
-  auto map = *(map_refs + chunk->dict_map_idx);
+  storage_ref_type const storage_ref{chunk->dict_map_size, map_storage + chunk->dict_map_offset};
 
   thread_index_type val_idx = s_start_value_idx + t;
   while (val_idx < end_value_idx) {
     if (data_col.is_valid(val_idx)) {
-      auto val = type_dispatcher(data_col.type(), hash_functor{data_col}, val_idx);
-      if (val != static_cast<uint32_t>(KEY_SENTINEL)) {
-        auto map_find_ref = map.with_operators(cuco::find);
-        auto found_slot   = map_find_ref.find(val);
-        cudf_assert(found_slot != map.end() &&
-                    "Unable to find value in map in dictionary index construction");
-        if (found_slot != map.end()) {
-          // No need for atomic as this is not going to be modified by any other thread
-          auto* val_ptr = reinterpret_cast<map_ref_type::mapped_type*>(&found_slot->second);
-          chunk->dict_index[val_idx - s_ck_start_val_idx] = *val_ptr;
-        }
-      }
+      auto [found_key, found_value] =
+        type_dispatcher(data_col.type(), map_find_fn{storage_ref}, data_col, val_idx);
+      // No need for atomic as this is not going to be modified by any other thread
+      chunk->dict_index[val_idx - s_ck_start_val_idx] = found_value;
     }
 
     val_idx += block_size;
   }
 }
 
-void populate_chunk_hash_maps(map_ref_type* map_refs,
+void populate_chunk_hash_maps(storage_type::window_type* map_storage,
                               cudf::detail::device_2dspan<PageFragment const> frags,
                               rmm::cuda_stream_view stream)
 {
   dim3 const dim_grid(frags.size().second, frags.size().first);
   populate_chunk_hash_maps_kernel<DEFAULT_BLOCK_SIZE>
-    <<<dim_grid, DEFAULT_BLOCK_SIZE, 0, stream.value()>>>(map_refs, frags);
+    <<<dim_grid, DEFAULT_BLOCK_SIZE, 0, stream.value()>>>(map_storage, frags);
 }
 
-void collect_map_entries(map_ref_type* map_refs,
+void collect_map_entries(storage_type::window_type* map_storage,
                          device_span<EncColumnChunk> chunks,
                          rmm::cuda_stream_view stream)
 {
   constexpr int block_size = 1024;
   collect_map_entries_kernel<block_size>
-    <<<chunks.size(), block_size, 0, stream.value()>>>(map_refs, chunks);
+    <<<chunks.size(), block_size, 0, stream.value()>>>(map_storage, chunks);
 }
 
-void get_dictionary_indices(map_ref_type* map_refs,
+void get_dictionary_indices(storage_type::window_type* map_storage,
                             cudf::detail::device_2dspan<PageFragment const> frags,
                             rmm::cuda_stream_view stream)
 {
   dim3 const dim_grid(frags.size().second, frags.size().first);
   get_dictionary_indices_kernel<DEFAULT_BLOCK_SIZE>
-    <<<dim_grid, DEFAULT_BLOCK_SIZE, 0, stream.value()>>>(map_refs, frags);
+    <<<dim_grid, DEFAULT_BLOCK_SIZE, 0, stream.value()>>>(map_storage, frags);
 }
 }  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh
index e8d8eeb8dbf..a97cbd02681 100644
--- a/cpp/src/io/parquet/parquet_gpu.cuh
+++ b/cpp/src/io/parquet/parquet_gpu.cuh
@@ -31,14 +31,14 @@
 
 namespace cudf::io::parquet::detail {
 
-using key_type    = uint32_t;
-using mapped_type = uint32_t;
+using key_type    = size_type;
+using mapped_type = size_type;
 
 auto constexpr cg_size     = 1;  ///< A CUDA Cooperative Group of 8 threads to handle each subset
 auto constexpr window_size = 1;  ///< Number of concurrent slots handled by each thread
 
-auto constexpr KEY_SENTINEL   = key_type{std::numeric_limits<uint32_t>::max()};
-auto constexpr VALUE_SENTINEL = mapped_type{std::numeric_limits<uint32_t>::max()};
+auto constexpr KEY_SENTINEL   = key_type{std::numeric_limits<key_type>::max()};
+auto constexpr VALUE_SENTINEL = mapped_type{std::numeric_limits<mapped_type>::max()};
 auto constexpr SCOPE          = cuda::thread_scope_device;
 
 using slot_type = cuco::pair<key_type, mapped_type>;
@@ -46,27 +46,13 @@ using slot_type = cuco::pair<key_type, mapped_type>;
 using storage_type     = cuco::aow_storage<slot_type, window_size>;
 using storage_ref_type = typename storage_type::ref_type;
 
-template <typename T>
-struct my_hasher {
-  __device__ auto operator()(T index) const { return index; }
-};
-
-using probing_scheme_type = cuco::linear_probing<cg_size, my_hasher<key_type>>;
-
-using map_ref_type = cuco::static_map_ref<key_type,
-                                          mapped_type,
-                                          SCOPE,
-                                          thrust::equal_to<key_type>,
-                                          probing_scheme_type,
-                                          storage_ref_type>;  ///< Map ref type
-
 /**
  * @brief Insert chunk values into their respective hash maps
  *
  * @param frags Column fragments
  * @param stream CUDA stream to use
  */
-void populate_chunk_hash_maps(map_ref_type* map_refs,
+void populate_chunk_hash_maps(storage_type::window_type* map_storage,
                               cudf::detail::device_2dspan<PageFragment const> frags,
                               rmm::cuda_stream_view stream);
 
@@ -76,7 +62,7 @@ void populate_chunk_hash_maps(map_ref_type* map_refs,
  * @param chunks Flat span of chunks to compact hash maps for
  * @param stream CUDA stream to use
  */
-void collect_map_entries(map_ref_type* map_refs,
+void collect_map_entries(storage_type::window_type* map_storage,
                          device_span<EncColumnChunk> chunks,
                          rmm::cuda_stream_view stream);
 
@@ -92,7 +78,7 @@ void collect_map_entries(map_ref_type* map_refs,
  * @param frags Column fragments
  * @param stream CUDA stream to use
  */
-void get_dictionary_indices(map_ref_type* map_refs,
+void get_dictionary_indices(storage_type::window_type* map_storage,
                             cudf::detail::device_2dspan<PageFragment const> frags,
                             rmm::cuda_stream_view stream);
 
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index 7044b38f529..daf97a566f9 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -559,7 +559,7 @@ struct EncColumnChunk {
   uint8_t is_compressed;    //!< Nonzero if the chunk uses compression
   uint32_t dictionary_size;    //!< Size of dictionary page including header
   uint32_t ck_stat_size;       //!< Size of chunk-level statistics (included in 1st page header)
-  int32_t dict_map_idx;        //!< Hash map storage for calculating dict encoding for this chunk
+  uint32_t dict_map_offset;    //!< Hash map storage for calculating dict encoding for this chunk
   size_type dict_map_size;     //!< Size of dict_map_slots
   size_type num_dict_entries;  //!< Total number of entries in dictionary
   size_type
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 7aee7f6dc1f..8c30b8f5f79 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -1320,38 +1320,16 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
   map_storage.initialize(cuco::pair{KEY_SENTINEL, VALUE_SENTINEL},
                          cuda::stream_ref{stream.value()});
 
-  std::vector<map_ref_type> h_map_refs;
-  h_map_refs.reserve(h_chunks.size());
-
-  std::for_each(
-    thrust::make_counting_iterator(static_cast<std::size_t>(0)),
-    thrust::make_counting_iterator(h_chunks.size()),
-    [&](auto const idx) {
-      auto& chunk = h_chunks[idx];
-      if (chunk.use_dictionary) {
-        storage_ref_type storage_ref{valid_chunk_sizes[idx], map_storage.data() + map_offsets[idx]};
-
-        h_map_refs.emplace_back(map_ref_type{cuco::empty_key<key_type>{KEY_SENTINEL},
-                                             cuco::empty_value<mapped_type>{VALUE_SENTINEL},
-                                             {},
-                                             {},
-                                             {},
-                                             storage_ref});
-
-        chunk.dict_map_idx = h_map_refs.size() - 1;
-      }
-    });
-
-  rmm::device_uvector<map_ref_type> d_map_refs(h_map_refs.size(), stream);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(d_map_refs.data(),
-                                h_map_refs.data(),
-                                sizeof(map_ref_type) * h_map_refs.size(),
-                                cudaMemcpyDefault,
-                                stream.value()));
+  std::for_each(thrust::make_counting_iterator(static_cast<std::size_t>(0)),
+                thrust::make_counting_iterator(h_chunks.size()),
+                [&](auto const idx) {
+                  auto& chunk = h_chunks[idx];
+                  if (chunk.use_dictionary) { chunk.dict_map_offset = map_offsets[idx]; }
+                });
 
   chunks.host_to_device_async(stream);
 
-  populate_chunk_hash_maps(d_map_refs.data(), frags, stream);
+  populate_chunk_hash_maps(map_storage.data(), frags, stream);
 
   chunks.device_to_host_sync(stream);
 
@@ -1412,8 +1390,8 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
     chunk.dict_index          = inserted_dict_index.data();
   }
   chunks.host_to_device_async(stream);
-  collect_map_entries(d_map_refs.data(), chunks.device_view().flat_view(), stream);
-  get_dictionary_indices(d_map_refs.data(), frags, stream);
+  collect_map_entries(map_storage.data(), chunks.device_view().flat_view(), stream);
+  get_dictionary_indices(map_storage.data(), frags, stream);
 
   return std::pair(std::move(dict_data), std::move(dict_index));
 }
diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp
index e07ebe25322..5e39f9c9cfb 100644
--- a/cpp/tests/io/parquet_writer_test.cpp
+++ b/cpp/tests/io/parquet_writer_test.cpp
@@ -52,7 +52,7 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_sc
 
   auto mask = cudf::detail::make_counting_transform_iterator(0, mask_op);
 
-  constexpr auto num_rows = 100;
+  constexpr auto num_rows = 20;
   // Durations longer than a day are not exactly valid, but cudf should be able to round trip
   auto durations_d = cudf::test::fixed_width_column_wrapper<cudf::duration_D, int64_t>(
     sequence_d, sequence_d + num_rows, mask);
@@ -65,7 +65,7 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_sc
   auto durations_ns = cudf::test::fixed_width_column_wrapper<cudf::duration_ns, int64_t>(
     sequence, sequence + num_rows, mask);
 
-  auto expected = table_view{{durations_d, durations_s, durations_ms, durations_us, durations_ns}};
+  auto expected = table_view{{durations_d}};
 
   if (use_byte_stream_split) {
     cudf::io::table_input_metadata expected_metadata(expected);
@@ -90,30 +90,31 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_sc
     cudf::cast(result.tbl->view().column(0), cudf::data_type{cudf::type_id::DURATION_DAYS});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_d, durations_d_got->view());
 
-  if (arrow_schema) {
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, result.tbl->view().column(1));
-  } else {
-    auto durations_s_got =
-      cudf::cast(result.tbl->view().column(1), cudf::data_type{cudf::type_id::DURATION_SECONDS});
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, durations_s_got->view());
-  }
+  /*
+    if (arrow_schema) {
+      CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, result.tbl->view().column(1));
+    } else {
+      auto durations_s_got =
+        cudf::cast(result.tbl->view().column(1), cudf::data_type{cudf::type_id::DURATION_SECONDS});
+      CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, durations_s_got->view());
+    }
 
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ms, result.tbl->view().column(2));
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_us, result.tbl->view().column(3));
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ns, result.tbl->view().column(4));
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ms, result.tbl->view().column(2));
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_us, result.tbl->view().column(3));
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ns, result.tbl->view().column(4));*/
 }
 
 TEST_F(ParquetWriterTest, Durations)
 {
   test_durations([](auto i) { return true; }, false, false);
-  test_durations([](auto i) { return (i % 2) != 0; }, false, false);
-  test_durations([](auto i) { return (i % 3) != 0; }, false, false);
-  test_durations([](auto i) { return false; }, false, false);
-
-  test_durations([](auto i) { return true; }, false, true);
-  test_durations([](auto i) { return (i % 2) != 0; }, false, true);
-  test_durations([](auto i) { return (i % 3) != 0; }, false, true);
-  test_durations([](auto i) { return false; }, false, true);
+  // test_durations([](auto i) { return (i % 2) != 0; }, false, false);
+  // test_durations([](auto i) { return (i % 3) != 0; }, false, false);
+  // test_durations([](auto i) { return false; }, false, false);
+
+  // test_durations([](auto i) { return true; }, false, true);
+  // test_durations([](auto i) { return (i % 2) != 0; }, false, true);
+  // test_durations([](auto i) { return (i % 3) != 0; }, false, true);
+  // test_durations([](auto i) { return false; }, false, true);
 }
 
 TEST_F(ParquetWriterTest, MultiIndex)
diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu
index fb9bdeb0b22..be4655b0a98 100644
--- a/cpp/tests/utilities/column_utilities.cu
+++ b/cpp/tests/utilities/column_utilities.cu
@@ -460,6 +460,7 @@ std::string stringify_column_differences(cudf::device_span<int const> difference
                                          debug_output_level verbosity,
                                          int depth)
 {
+  verbosity = debug_output_level::ALL_ERRORS;
   CUDF_EXPECTS(not differences.empty(), "Shouldn't enter this function if `differences` is empty");
   std::string const depth_str = depth > 0 ? "depth " + std::to_string(depth) + '\n' : "";
   // move the differences to the host.

From fb742fbbdbdcfc40540fb3e24d68f739d0f0a994 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Tue, 13 Aug 2024 19:00:16 +0000
Subject: [PATCH 03/30] Some updates

---
 cpp/src/io/parquet/chunk_dict.cu     | 10 ++++------
 cpp/src/io/parquet/parquet_gpu.cuh   |  6 +++---
 cpp/src/io/parquet/writer_impl.cu    | 14 +++++++++++---
 cpp/tests/io/parquet_writer_test.cpp | 25 ++++++++++++-------------
 4 files changed, 30 insertions(+), 25 deletions(-)

diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index bd5f27ce9f6..9b333e350db 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -250,8 +250,6 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   [[maybe_unused]] auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
   auto const t                     = cg::this_thread_block().thread_rank();
 
-  storage_ref_type const storage_ref{chunk.dict_map_size, map_storage + chunk.dict_map_offset};
-
   __shared__ cuda::atomic<size_type, cuda::thread_scope_block> counter;
   using cuda::std::memory_order_relaxed;
   if (t == 0) { new (&counter) cuda::atomic<size_type, cuda::thread_scope_block>{0}; }
@@ -259,17 +257,17 @@ CUDF_KERNEL void __launch_bounds__(block_size)
 
   for (size_type i = 0; i < chunk.dict_map_size; i += block_size) {
     if (t + i < chunk.dict_map_size) {
-      auto* slot = reinterpret_cast<slot_type*>(storage_ref.data() + chunk.dict_map_offset + t + i);
-      auto key   = slot->first;
+      auto* slot     = map_storage + chunk.dict_map_offset + t + i;
+      auto const key = slot->data()->first;
       if (key != KEY_SENTINEL) {
         auto loc = counter.fetch_add(1, memory_order_relaxed);
         cudf_assert(loc < MAX_DICT_SIZE && "Number of filled slots exceeds max dict size");
-        printf("Writing %d at loc: %d\n", key, loc);
+        // printf("Writing %d at loc: %d\n", key, loc);
         chunk.dict_data[loc] = key;
         // If sorting dict page ever becomes a hard requirement, enable the following statement and
         // add a dict sorting step before storing into the slot's second field.
         // chunk.dict_data_idx[loc] = t + i;
-        slot->second = loc;
+        slot->data()->second = loc;
       }
     }
   }
diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh
index a97cbd02681..315a9cf4449 100644
--- a/cpp/src/io/parquet/parquet_gpu.cuh
+++ b/cpp/src/io/parquet/parquet_gpu.cuh
@@ -37,9 +37,9 @@ using mapped_type = size_type;
 auto constexpr cg_size     = 1;  ///< A CUDA Cooperative Group of 8 threads to handle each subset
 auto constexpr window_size = 1;  ///< Number of concurrent slots handled by each thread
 
-auto constexpr KEY_SENTINEL   = key_type{std::numeric_limits<key_type>::max()};
-auto constexpr VALUE_SENTINEL = mapped_type{std::numeric_limits<mapped_type>::max()};
-auto constexpr SCOPE          = cuda::thread_scope_device;
+auto constexpr KEY_SENTINEL   = key_type{-1};
+auto constexpr VALUE_SENTINEL = mapped_type{-1};
+auto constexpr SCOPE          = cuda::thread_scope_block;
 
 using slot_type = cuco::pair<key_type, mapped_type>;
 
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 8c30b8f5f79..8f674385f30 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -1301,8 +1301,10 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
       valid_chunk_sizes.emplace_back(static_cast<std::size_t>(0));
     } else {
       chunk.use_dictionary = true;
-      valid_chunk_sizes.emplace_back(static_cast<std::size_t>(
-        cuco::make_window_extent<cg_size, window_size>(static_cast<size_t>(chunk.num_values))));
+      valid_chunk_sizes.emplace_back(
+        static_cast<std::size_t>(cuco::make_window_extent<cg_size, window_size>(
+          // Multiplying by 1/0.7 = 1.43 to target a 70% occupancy factor.
+          static_cast<size_t>(chunk.num_values * 1.43))));
       chunk.dict_map_size = valid_chunk_sizes.back();
     }
   });
@@ -1320,11 +1322,17 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
   map_storage.initialize(cuco::pair{KEY_SENTINEL, VALUE_SENTINEL},
                          cuda::stream_ref{stream.value()});
 
+  std::cout << "Offsets: " << h_chunks.size() << std::endl;
+
   std::for_each(thrust::make_counting_iterator(static_cast<std::size_t>(0)),
                 thrust::make_counting_iterator(h_chunks.size()),
                 [&](auto const idx) {
                   auto& chunk = h_chunks[idx];
-                  if (chunk.use_dictionary) { chunk.dict_map_offset = map_offsets[idx]; }
+                  if (chunk.use_dictionary) {
+                    chunk.dict_map_offset = map_offsets[idx];
+                    std::cout << "off: " << map_offsets[idx] << ", size: " << chunk.dict_map_size
+                              << std::endl;
+                  }
                 });
 
   chunks.host_to_device_async(stream);
diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp
index 5e39f9c9cfb..9f171d23e7d 100644
--- a/cpp/tests/io/parquet_writer_test.cpp
+++ b/cpp/tests/io/parquet_writer_test.cpp
@@ -39,7 +39,7 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_sc
 {
   std::default_random_engine generator;
   std::uniform_int_distribution<int> distribution_d(0, 30);
-  auto sequence_d = cudf::detail::make_counting_transform_iterator(
+  [[maybe_unused]]auto sequence_d = cudf::detail::make_counting_transform_iterator(
     0, [&](auto i) { return distribution_d(generator); });
 
   std::uniform_int_distribution<int> distribution_s(0, 86400);
@@ -52,8 +52,8 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_sc
 
   auto mask = cudf::detail::make_counting_transform_iterator(0, mask_op);
 
-  constexpr auto num_rows = 20;
-  // Durations longer than a day are not exactly valid, but cudf should be able to round trip
+  constexpr auto num_rows = 5650; // WORKS UNTIL 5649, fails beyond 5650
+    // Durations longer than a day are not exactly valid, but cudf should be able to round trip
   auto durations_d = cudf::test::fixed_width_column_wrapper<cudf::duration_D, int64_t>(
     sequence_d, sequence_d + num_rows, mask);
   auto durations_s = cudf::test::fixed_width_column_wrapper<cudf::duration_s, int64_t>(
@@ -65,7 +65,7 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_sc
   auto durations_ns = cudf::test::fixed_width_column_wrapper<cudf::duration_ns, int64_t>(
     sequence, sequence + num_rows, mask);
 
-  auto expected = table_view{{durations_d}};
+  auto expected = table_view{{/*durations_d, */durations_s,/*durations_ms, durations_us, durations_ns*/}};
 
   if (use_byte_stream_split) {
     cudf::io::table_input_metadata expected_metadata(expected);
@@ -85,23 +85,22 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_sc
     cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath})
       .use_arrow_schema(arrow_schema);
   auto result = cudf::io::read_parquet(in_opts);
-
+/*
   auto durations_d_got =
     cudf::cast(result.tbl->view().column(0), cudf::data_type{cudf::type_id::DURATION_DAYS});
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_d, durations_d_got->view());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_d, durations_d_got->view());*/
 
-  /*
     if (arrow_schema) {
-      CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, result.tbl->view().column(1));
+      CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, result.tbl->view().column(0));
     } else {
       auto durations_s_got =
-        cudf::cast(result.tbl->view().column(1), cudf::data_type{cudf::type_id::DURATION_SECONDS});
+        cudf::cast(result.tbl->view().column(0), cudf::data_type{cudf::type_id::DURATION_SECONDS});
       CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, durations_s_got->view());
     }
-
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ms, result.tbl->view().column(2));
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_us, result.tbl->view().column(3));
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ns, result.tbl->view().column(4));*/
+/*
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ms, result.tbl->view().column(1));
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_us, result.tbl->view().column(2));
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ns, result.tbl->view().column(3));*/
 }
 
 TEST_F(ParquetWriterTest, Durations)

From b9067b064df89d2bd3700ef34b27e083df792a81 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Tue, 13 Aug 2024 21:40:13 +0000
Subject: [PATCH 04/30] Fixes and improvements. 330/346 gtests passing

---
 cpp/src/io/parquet/chunk_dict.cu     | 18 ++---
 cpp/src/io/parquet/writer_impl.cu    | 20 +++---
 cpp/tests/io/parquet_writer_test.cpp | 99 ++++++++++++++--------------
 3 files changed, 67 insertions(+), 70 deletions(-)

diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index 9b333e350db..49dd7b0b7bf 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -41,11 +41,11 @@ struct equality_functor {
     //  We don't call this for nulls so this is fine
     auto const equal  = cudf::experimental::row::equality::nan_equal_physical_equality_comparator{};
     auto const result = equal(col.element<T>(lhs_idx), col.element<T>(rhs_idx));
-    printf("col_type_id:%d, equality idx1:%d, idx2:%d, eq:%d\n",
+    /*printf("col_type_id:%d, equality idx1:%d, idx2:%d, eq:%d\n",
            col.type().id(),
            lhs_idx,
            rhs_idx,
-           result);
+           result);*/
     return result;
   }
 };
@@ -56,7 +56,7 @@ struct hash_functor {
   __device__ auto operator()(key_type idx) const
   {
     auto const hashed = cudf::hashing::detail::MurmurHash3_x86_32<T>{}(col.element<T>(idx));
-    printf("hashing idx: %d = %d\n", idx, hashed);
+    // printf("hashing idx: %d = %d\n", idx, hashed);
     return hashed;  // cudf::hashing::detail::MurmurHash3_x86_32<T>{}(col.element<T>(idx));
   }
 };
@@ -94,7 +94,7 @@ struct map_insert_fn {
       auto map_insert_ref = hash_map_ref.with_operators(cuco::insert_and_find);
       // Insert
       auto [iter, found] = map_insert_ref.insert_and_find(cuco::pair{i, i});
-      printf("Inserted k=%d, v=%d, unique=%d\n", iter->first, iter->second, found);
+      // printf("Inserted k=%d, v=%d, unique=%d\n", iter->first, iter->second, found);
       return found;
     } else {
       CUDF_UNREACHABLE("Unsupported type to insert in map");
@@ -142,7 +142,7 @@ struct map_find_fn {
                   "Unable to find value in map in dictionary index construction");
 
       // Return a pair of the found key and value.
-      printf("Find=%d, Found slot: k=%d, v=%d\n", i, found_slot->first, found_slot->second);
+      // printf("Find=%d, Found slot: k=%d, v=%d\n", i, found_slot->first, found_slot->second);
       return {found_slot->first, found_slot->second};
     } else {
       CUDF_UNREACHABLE("Unsupported type to find in map");
@@ -190,8 +190,7 @@ CUDF_KERNEL void __launch_bounds__(block_size)
     size_type is_unique      = 0;
     size_type uniq_elem_size = 0;
     if (is_valid) {
-      auto const is_unique =
-        type_dispatcher(data_col.type(), map_insert_fn{storage_ref}, data_col, val_idx);
+      is_unique = type_dispatcher(data_col.type(), map_insert_fn{storage_ref}, data_col, val_idx);
       uniq_elem_size = [&]() -> size_type {
         if (not is_unique) { return 0; }
         switch (col->physical_type) {
@@ -250,9 +249,9 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   [[maybe_unused]] auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
   auto const t                     = cg::this_thread_block().thread_rank();
 
-  __shared__ cuda::atomic<size_type, cuda::thread_scope_block> counter;
+  __shared__ cuda::atomic<size_type, SCOPE> counter;
   using cuda::std::memory_order_relaxed;
-  if (t == 0) { new (&counter) cuda::atomic<size_type, cuda::thread_scope_block>{0}; }
+  if (t == 0) { new (&counter) cuda::atomic<size_type, SCOPE>{0}; }
   __syncthreads();
 
   for (size_type i = 0; i < chunk.dict_map_size; i += block_size) {
@@ -267,6 +266,7 @@ CUDF_KERNEL void __launch_bounds__(block_size)
         // If sorting dict page ever becomes a hard requirement, enable the following statement and
         // add a dict sorting step before storing into the slot's second field.
         // chunk.dict_data_idx[loc] = t + i;
+        // printf("Replacing slot->data()->second: %d, %d\n", slot->data()->second, loc);
         slot->data()->second = loc;
       }
     }
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 8f674385f30..403a5275e7e 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -1322,18 +1322,13 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
   map_storage.initialize(cuco::pair{KEY_SENTINEL, VALUE_SENTINEL},
                          cuda::stream_ref{stream.value()});
 
-  std::cout << "Offsets: " << h_chunks.size() << std::endl;
-
-  std::for_each(thrust::make_counting_iterator(static_cast<std::size_t>(0)),
-                thrust::make_counting_iterator(h_chunks.size()),
-                [&](auto const idx) {
-                  auto& chunk = h_chunks[idx];
-                  if (chunk.use_dictionary) {
-                    chunk.dict_map_offset = map_offsets[idx];
-                    std::cout << "off: " << map_offsets[idx] << ", size: " << chunk.dict_map_size
-                              << std::endl;
-                  }
-                });
+  std::for_each(
+    thrust::make_zip_iterator(thrust::make_tuple(h_chunks.begin(), map_offsets.begin())),
+    thrust::make_zip_iterator(thrust::make_tuple(h_chunks.end(), map_offsets.end())),
+    [&](auto elem) -> void {
+      auto& chunk = thrust::get<0>(elem);
+      if (chunk.use_dictionary) { chunk.dict_map_offset = thrust::get<1>(elem); }
+    });
 
   chunks.host_to_device_async(stream);
 
@@ -1400,6 +1395,7 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
   chunks.host_to_device_async(stream);
   collect_map_entries(map_storage.data(), chunks.device_view().flat_view(), stream);
   get_dictionary_indices(map_storage.data(), frags, stream);
+  chunks.device_to_host_async(stream);
 
   return std::pair(std::move(dict_data), std::move(dict_index));
 }
diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp
index 9f171d23e7d..a06b4f38196 100644
--- a/cpp/tests/io/parquet_writer_test.cpp
+++ b/cpp/tests/io/parquet_writer_test.cpp
@@ -39,7 +39,7 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_sc
 {
   std::default_random_engine generator;
   std::uniform_int_distribution<int> distribution_d(0, 30);
-  [[maybe_unused]]auto sequence_d = cudf::detail::make_counting_transform_iterator(
+  auto sequence_d = cudf::detail::make_counting_transform_iterator(
     0, [&](auto i) { return distribution_d(generator); });
 
   std::uniform_int_distribution<int> distribution_s(0, 86400);
@@ -52,8 +52,8 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_sc
 
   auto mask = cudf::detail::make_counting_transform_iterator(0, mask_op);
 
-  constexpr auto num_rows = 5650; // WORKS UNTIL 5649, fails beyond 5650
-    // Durations longer than a day are not exactly valid, but cudf should be able to round trip
+  constexpr auto num_rows = 100;
+  // Durations longer than a day are not exactly valid, but cudf should be able to round trip
   auto durations_d = cudf::test::fixed_width_column_wrapper<cudf::duration_D, int64_t>(
     sequence_d, sequence_d + num_rows, mask);
   auto durations_s = cudf::test::fixed_width_column_wrapper<cudf::duration_s, int64_t>(
@@ -65,7 +65,7 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_sc
   auto durations_ns = cudf::test::fixed_width_column_wrapper<cudf::duration_ns, int64_t>(
     sequence, sequence + num_rows, mask);
 
-  auto expected = table_view{{/*durations_d, */durations_s,/*durations_ms, durations_us, durations_ns*/}};
+  auto expected = table_view{{durations_d, durations_s, durations_ms, durations_us, durations_ns}};
 
   if (use_byte_stream_split) {
     cudf::io::table_input_metadata expected_metadata(expected);
@@ -85,35 +85,35 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_sc
     cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath})
       .use_arrow_schema(arrow_schema);
   auto result = cudf::io::read_parquet(in_opts);
-/*
+
   auto durations_d_got =
     cudf::cast(result.tbl->view().column(0), cudf::data_type{cudf::type_id::DURATION_DAYS});
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_d, durations_d_got->view());*/
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_d, durations_d_got->view());
+
+  if (arrow_schema) {
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, result.tbl->view().column(1));
+  } else {
+    auto durations_s_got =
+      cudf::cast(result.tbl->view().column(1), cudf::data_type{cudf::type_id::DURATION_SECONDS});
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, durations_s_got->view());
+  }
 
-    if (arrow_schema) {
-      CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, result.tbl->view().column(0));
-    } else {
-      auto durations_s_got =
-        cudf::cast(result.tbl->view().column(0), cudf::data_type{cudf::type_id::DURATION_SECONDS});
-      CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, durations_s_got->view());
-    }
-/*
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ms, result.tbl->view().column(1));
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_us, result.tbl->view().column(2));
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ns, result.tbl->view().column(3));*/
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ms, result.tbl->view().column(2));
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_us, result.tbl->view().column(3));
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ns, result.tbl->view().column(4));
 }
 
 TEST_F(ParquetWriterTest, Durations)
 {
   test_durations([](auto i) { return true; }, false, false);
-  // test_durations([](auto i) { return (i % 2) != 0; }, false, false);
-  // test_durations([](auto i) { return (i % 3) != 0; }, false, false);
-  // test_durations([](auto i) { return false; }, false, false);
+  test_durations([](auto i) { return (i % 2) != 0; }, false, false);
+  test_durations([](auto i) { return (i % 3) != 0; }, false, false);
+  test_durations([](auto i) { return false; }, false, false);
 
-  // test_durations([](auto i) { return true; }, false, true);
-  // test_durations([](auto i) { return (i % 2) != 0; }, false, true);
-  // test_durations([](auto i) { return (i % 3) != 0; }, false, true);
-  // test_durations([](auto i) { return false; }, false, true);
+  test_durations([](auto i) { return true; }, false, true);
+  test_durations([](auto i) { return (i % 2) != 0; }, false, true);
+  test_durations([](auto i) { return (i % 3) != 0; }, false, true);
+  test_durations([](auto i) { return false; }, false, true);
 }
 
 TEST_F(ParquetWriterTest, MultiIndex)
@@ -2007,7 +2007,7 @@ TEST_F(ParquetWriterTest, WriteFixedLenByteArray)
   srand(31337);
   using cudf::io::parquet::detail::Encoding;
   constexpr int fixed_width          = 16;
-  constexpr cudf::size_type num_rows = 200;
+  constexpr cudf::size_type num_rows = 10;
   std::vector<uint8_t> data(num_rows * fixed_width);
   std::vector<cudf::size_type> offsets(num_rows + 1);
 
@@ -2025,25 +2025,26 @@ TEST_F(ParquetWriterTest, WriteFixedLenByteArray)
   auto off_child  = cudf::test::fixed_width_column_wrapper<int32_t>(offsets.begin(), offsets.end());
   auto col = cudf::make_lists_column(num_rows, off_child.release(), data_child.release(), 0, {});
 
-  auto expected = table_view{{*col, *col, *col, *col}};
+  auto expected = table_view{{/**col, *col, *col,*/ *col}};
   cudf::io::table_input_metadata expected_metadata(expected);
 
+  /*
+    expected_metadata.column_metadata[0]
+      .set_name("flba_plain")
+      .set_type_length(fixed_width)
+      .set_encoding(cudf::io::column_encoding::PLAIN)
+      .set_output_as_binary(true);
+    expected_metadata.column_metadata[1]
+      .set_name("flba_split")
+      .set_type_length(fixed_width)
+      .set_encoding(cudf::io::column_encoding::BYTE_STREAM_SPLIT)
+      .set_output_as_binary(true);
+    expected_metadata.column_metadata[2]
+      .set_name("flba_delta")
+      .set_type_length(fixed_width)
+      .set_encoding(cudf::io::column_encoding::DELTA_BYTE_ARRAY)
+      .set_output_as_binary(true);*/
   expected_metadata.column_metadata[0]
-    .set_name("flba_plain")
-    .set_type_length(fixed_width)
-    .set_encoding(cudf::io::column_encoding::PLAIN)
-    .set_output_as_binary(true);
-  expected_metadata.column_metadata[1]
-    .set_name("flba_split")
-    .set_type_length(fixed_width)
-    .set_encoding(cudf::io::column_encoding::BYTE_STREAM_SPLIT)
-    .set_output_as_binary(true);
-  expected_metadata.column_metadata[2]
-    .set_name("flba_delta")
-    .set_type_length(fixed_width)
-    .set_encoding(cudf::io::column_encoding::DELTA_BYTE_ARRAY)
-    .set_output_as_binary(true);
-  expected_metadata.column_metadata[3]
     .set_name("flba_dict")
     .set_type_length(fixed_width)
     .set_encoding(cudf::io::column_encoding::DICTIONARY)
@@ -2067,7 +2068,7 @@ TEST_F(ParquetWriterTest, WriteFixedLenByteArray)
   read_footer(source, &fmd);
 
   // check that the schema retains the FIXED_LEN_BYTE_ARRAY type
-  for (int i = 1; i <= 4; i++) {
+  for (int i = 1; i <= 1; i++) {
     EXPECT_EQ(fmd.schema[i].type, cudf::io::parquet::detail::Type::FIXED_LEN_BYTE_ARRAY);
     EXPECT_EQ(fmd.schema[i].type_length, fixed_width);
   }
@@ -2078,14 +2079,14 @@ TEST_F(ParquetWriterTest, WriteFixedLenByteArray)
   };
 
   // requested plain
+  /*  expect_enc(0, Encoding::PLAIN);
+    // requested byte_stream_split
+    expect_enc(1, Encoding::BYTE_STREAM_SPLIT);
+    // requested delta_byte_array
+    expect_enc(2, Encoding::DELTA_BYTE_ARRAY);
+    // requested dictionary, but should fall back to plain
+    // TODO: update if we get FLBA working with dictionary encoding*/
   expect_enc(0, Encoding::PLAIN);
-  // requested byte_stream_split
-  expect_enc(1, Encoding::BYTE_STREAM_SPLIT);
-  // requested delta_byte_array
-  expect_enc(2, Encoding::DELTA_BYTE_ARRAY);
-  // requested dictionary, but should fall back to plain
-  // TODO: update if we get FLBA working with dictionary encoding
-  expect_enc(3, Encoding::PLAIN);
 }
 
 /////////////////////////////////////////////////////////////

From 44a97ce265f948e1c05a8204cc1b61ab05d3876f Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Tue, 13 Aug 2024 21:59:06 +0000
Subject: [PATCH 05/30] Fix headers being included

---
 cpp/src/io/parquet/chunk_dict.cu   | 1 +
 cpp/src/io/parquet/parquet_gpu.cuh | 7 +------
 cpp/src/io/parquet/writer_impl.cu  | 4 ++--
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index 49dd7b0b7bf..bda86f8596a 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -23,6 +23,7 @@
 #include <rmm/exec_policy.hpp>
 
 #include <cooperative_groups.h>
+#include <cuco/static_map_ref.cuh>
 #include <cuda/atomic>
 
 namespace cudf::io::parquet::detail {
diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh
index 315a9cf4449..20ca717e01d 100644
--- a/cpp/src/io/parquet/parquet_gpu.cuh
+++ b/cpp/src/io/parquet/parquet_gpu.cuh
@@ -21,13 +21,8 @@
 #include <cudf/lists/lists_column_device_view.cuh>
 #include <cudf/types.hpp>
 
-#include <cooperative_groups.h>
-#include <cuco/static_map.cuh>
-#include <cuco/static_map_ref.cuh>
+#include <cuco/pair.cuh>
 #include <cuco/storage.cuh>
-#include <thrust/device_vector.h>
-
-#include <limits>
 
 namespace cudf::io::parquet::detail {
 
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 403a5275e7e..5065b57e79d 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -1303,7 +1303,8 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
       chunk.use_dictionary = true;
       valid_chunk_sizes.emplace_back(
         static_cast<std::size_t>(cuco::make_window_extent<cg_size, window_size>(
-          // Multiplying by 1/0.7 = 1.43 to target a 70% occupancy factor.
+          // cuCollections suggests using a hash map of size N * (1/0.7) = 1.43 to target a 70%
+          // occupancy factor.
           static_cast<size_t>(chunk.num_values * 1.43))));
       chunk.dict_map_size = valid_chunk_sizes.back();
     }
@@ -1395,7 +1396,6 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
   chunks.host_to_device_async(stream);
   collect_map_entries(map_storage.data(), chunks.device_view().flat_view(), stream);
   get_dictionary_indices(map_storage.data(), frags, stream);
-  chunks.device_to_host_async(stream);
 
   return std::pair(std::move(dict_data), std::move(dict_index));
 }

From 393eaa657df7bdc865e3ae53a2a3493104a115d1 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Tue, 13 Aug 2024 22:59:21 +0000
Subject: [PATCH 06/30] All tests passing

---
 cpp/src/io/parquet/chunk_dict.cu  | 27 +++++++--------------------
 cpp/src/io/parquet/writer_impl.cu | 22 +++++++++++++++-------
 2 files changed, 22 insertions(+), 27 deletions(-)

diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index bda86f8596a..da0f1a726cb 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -40,14 +40,8 @@ struct equality_functor {
   __device__ bool operator()(key_type const lhs_idx, key_type const rhs_idx) const
   {
     //  We don't call this for nulls so this is fine
-    auto const equal  = cudf::experimental::row::equality::nan_equal_physical_equality_comparator{};
-    auto const result = equal(col.element<T>(lhs_idx), col.element<T>(rhs_idx));
-    /*printf("col_type_id:%d, equality idx1:%d, idx2:%d, eq:%d\n",
-           col.type().id(),
-           lhs_idx,
-           rhs_idx,
-           result);*/
-    return result;
+    auto const equal = cudf::experimental::row::equality::nan_equal_physical_equality_comparator{};
+    return equal(col.element<T>(lhs_idx), col.element<T>(rhs_idx));
   }
 };
 
@@ -56,9 +50,7 @@ struct hash_functor {
   column_device_view const& col;
   __device__ auto operator()(key_type idx) const
   {
-    auto const hashed = cudf::hashing::detail::MurmurHash3_x86_32<T>{}(col.element<T>(idx));
-    // printf("hashing idx: %d = %d\n", idx, hashed);
-    return hashed;  // cudf::hashing::detail::MurmurHash3_x86_32<T>{}(col.element<T>(idx));
+    return cudf::hashing::detail::MurmurHash3_x86_32<T>{}(col.element<T>(idx));
   }
 };
 
@@ -91,12 +83,10 @@ struct map_insert_fn {
                      {},
                      storage_ref};
 
-      // Create another map with insert operator
-      auto map_insert_ref = hash_map_ref.with_operators(cuco::insert_and_find);
-      // Insert
-      auto [iter, found] = map_insert_ref.insert_and_find(cuco::pair{i, i});
-      // printf("Inserted k=%d, v=%d, unique=%d\n", iter->first, iter->second, found);
-      return found;
+      // Create another map ref with the insert operator
+      auto map_insert_ref = hash_map_ref.with_operators(cuco::insert);
+      // Insert into the hash map
+      return map_insert_ref.insert(cuco::pair{i, i});
     } else {
       CUDF_UNREACHABLE("Unsupported type to insert in map");
     }
@@ -143,7 +133,6 @@ struct map_find_fn {
                   "Unable to find value in map in dictionary index construction");
 
       // Return a pair of the found key and value.
-      // printf("Find=%d, Found slot: k=%d, v=%d\n", i, found_slot->first, found_slot->second);
       return {found_slot->first, found_slot->second};
     } else {
       CUDF_UNREACHABLE("Unsupported type to find in map");
@@ -262,12 +251,10 @@ CUDF_KERNEL void __launch_bounds__(block_size)
       if (key != KEY_SENTINEL) {
         auto loc = counter.fetch_add(1, memory_order_relaxed);
         cudf_assert(loc < MAX_DICT_SIZE && "Number of filled slots exceeds max dict size");
-        // printf("Writing %d at loc: %d\n", key, loc);
         chunk.dict_data[loc] = key;
         // If sorting dict page ever becomes a hard requirement, enable the following statement and
         // add a dict sorting step before storing into the slot's second field.
         // chunk.dict_data_idx[loc] = t + i;
-        // printf("Replacing slot->data()->second: %d, %d\n", slot->data()->second, loc);
         slot->data()->second = loc;
       }
     }
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 5065b57e79d..e4fc1f00f06 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -1310,19 +1310,26 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
     }
   });
 
-  std::vector<std::size_t> map_offsets(valid_chunk_sizes.size() + 1, 0);
+  std::vector<std::size_t> map_offsets(valid_chunk_sizes.size(), 0);
   std::exclusive_scan(valid_chunk_sizes.begin(),
                       valid_chunk_sizes.end(),
                       map_offsets.begin(),
                       static_cast<size_t>(0));
-  map_offsets.back() = map_offsets[valid_chunk_sizes.size() - 1] + valid_chunk_sizes.back();
 
-  // Create a single bulk storage used by all subsets
-  auto map_storage = storage_type{map_offsets.back()};
-  // Initializes the storage with the given sentinel
+  // Compute total map_storage
+  auto const map_storage_size = map_offsets.back() + valid_chunk_sizes.back();
+
+  // No chunk needs to create a dictionary, exit early
+  if (map_storage_size == 0) { return {std::move(dict_data), std::move(dict_index)}; }
+
+  // Create a single bulk storage used by all sub-hashmaps
+  auto map_storage = storage_type{map_storage_size};
+
+  // Only initialize storage with the given sentinel if and only if non-zero size
   map_storage.initialize(cuco::pair{KEY_SENTINEL, VALUE_SENTINEL},
                          cuda::stream_ref{stream.value()});
 
+  // Populate chunk dictionary offsets
   std::for_each(
     thrust::make_zip_iterator(thrust::make_tuple(h_chunks.begin(), map_offsets.begin())),
     thrust::make_zip_iterator(thrust::make_tuple(h_chunks.end(), map_offsets.end())),
@@ -1331,10 +1338,11 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
       if (chunk.use_dictionary) { chunk.dict_map_offset = thrust::get<1>(elem); }
     });
 
+  // Synchronize
   chunks.host_to_device_async(stream);
-
+  // Populate the hash map for each chunk
   populate_chunk_hash_maps(map_storage.data(), frags, stream);
-
+  // Synchronize again
   chunks.device_to_host_sync(stream);
 
   // Make decision about which chunks have dictionary

From c27c4cf1d289ed2a91124756f4c6cb07bc07efb3 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Tue, 13 Aug 2024 23:16:19 +0000
Subject: [PATCH 07/30] Cosmetic improvements

---
 cpp/src/io/parquet/chunk_dict.cu   | 15 +++++++--------
 cpp/src/io/parquet/parquet_gpu.cuh | 11 +++++++----
 cpp/src/io/parquet/writer_impl.cu  | 18 ++++++++++--------
 3 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index da0f1a726cb..588a5766c08 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -39,7 +39,7 @@ struct equality_functor {
   column_device_view const& col;
   __device__ bool operator()(key_type const lhs_idx, key_type const rhs_idx) const
   {
-    //  We don't call this for nulls so this is fine
+    // We don't call this for nulls so this is fine
     auto const equal = cudf::experimental::row::equality::nan_equal_physical_equality_comparator{};
     return equal(col.element<T>(lhs_idx), col.element<T>(rhs_idx));
   }
@@ -142,7 +142,7 @@ struct map_find_fn {
 
 template <int block_size>
 CUDF_KERNEL void __launch_bounds__(block_size)
-  populate_chunk_hash_maps_kernel(storage_type::window_type* map_storage,
+  populate_chunk_hash_maps_kernel(window_type* map_storage,
                                   cudf::detail::device_2dspan<PageFragment const> frags)
 {
   auto const col_idx = blockIdx.y;
@@ -230,8 +230,7 @@ CUDF_KERNEL void __launch_bounds__(block_size)
 
 template <int block_size>
 CUDF_KERNEL void __launch_bounds__(block_size)
-  collect_map_entries_kernel(storage_type::window_type* map_storage,
-                             device_span<EncColumnChunk> chunks)
+  collect_map_entries_kernel(window_type* map_storage, device_span<EncColumnChunk> chunks)
 {
   auto& chunk = chunks[blockIdx.x];
   if (not chunk.use_dictionary) { return; }
@@ -263,7 +262,7 @@ CUDF_KERNEL void __launch_bounds__(block_size)
 
 template <int block_size>
 CUDF_KERNEL void __launch_bounds__(block_size)
-  get_dictionary_indices_kernel(storage_type::window_type* map_storage,
+  get_dictionary_indices_kernel(window_type* map_storage,
                                 cudf::detail::device_2dspan<PageFragment const> frags)
 {
   auto const col_idx = blockIdx.y;
@@ -301,7 +300,7 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   }
 }
 
-void populate_chunk_hash_maps(storage_type::window_type* map_storage,
+void populate_chunk_hash_maps(window_type* map_storage,
                               cudf::detail::device_2dspan<PageFragment const> frags,
                               rmm::cuda_stream_view stream)
 {
@@ -310,7 +309,7 @@ void populate_chunk_hash_maps(storage_type::window_type* map_storage,
     <<<dim_grid, DEFAULT_BLOCK_SIZE, 0, stream.value()>>>(map_storage, frags);
 }
 
-void collect_map_entries(storage_type::window_type* map_storage,
+void collect_map_entries(window_type* map_storage,
                          device_span<EncColumnChunk> chunks,
                          rmm::cuda_stream_view stream)
 {
@@ -319,7 +318,7 @@ void collect_map_entries(storage_type::window_type* map_storage,
     <<<chunks.size(), block_size, 0, stream.value()>>>(map_storage, chunks);
 }
 
-void get_dictionary_indices(storage_type::window_type* map_storage,
+void get_dictionary_indices(window_type* map_storage,
                             cudf::detail::device_2dspan<PageFragment const> frags,
                             rmm::cuda_stream_view stream)
 {
diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh
index 20ca717e01d..0a0fc9095e0 100644
--- a/cpp/src/io/parquet/parquet_gpu.cuh
+++ b/cpp/src/io/parquet/parquet_gpu.cuh
@@ -40,24 +40,26 @@ using slot_type = cuco::pair<key_type, mapped_type>;
 
 using storage_type     = cuco::aow_storage<slot_type, window_size>;
 using storage_ref_type = typename storage_type::ref_type;
-
+using window_type      = storage_type::window_type;
 /**
  * @brief Insert chunk values into their respective hash maps
  *
+ * @param map_storage Pointer to the bulk hashmap storage
  * @param frags Column fragments
  * @param stream CUDA stream to use
  */
-void populate_chunk_hash_maps(storage_type::window_type* map_storage,
+void populate_chunk_hash_maps(window_type* map_storage,
                               cudf::detail::device_2dspan<PageFragment const> frags,
                               rmm::cuda_stream_view stream);
 
 /**
  * @brief Compact dictionary hash map entries into chunk.dict_data
  *
+ * @param map_storage Pointer to the bulk hashmap storage
  * @param chunks Flat span of chunks to compact hash maps for
  * @param stream CUDA stream to use
  */
-void collect_map_entries(storage_type::window_type* map_storage,
+void collect_map_entries(window_type* map_storage,
                          device_span<EncColumnChunk> chunks,
                          rmm::cuda_stream_view stream);
 
@@ -70,10 +72,11 @@ void collect_map_entries(storage_type::window_type* map_storage,
  * Since dict_data itself contains indices into the original cudf column, this means that
  * col[row] == col[dict_data[dict_index[row - chunk.start_row]]]
  *
+ * @param map_storage Pointer to the bulk hashmap storage
  * @param frags Column fragments
  * @param stream CUDA stream to use
  */
-void get_dictionary_indices(storage_type::window_type* map_storage,
+void get_dictionary_indices(window_type* map_storage,
                             cudf::detail::device_2dspan<PageFragment const> frags,
                             rmm::cuda_stream_view stream);
 
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index e4fc1f00f06..7ec3997b590 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -1285,9 +1285,11 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
     return std::pair(std::move(dict_data), std::move(dict_index));
   }
 
+  // Create a vector to store valid chunk sizes using cuco::make_window_extent
   std::vector<size_t> valid_chunk_sizes;
   valid_chunk_sizes.reserve(h_chunks.size());
 
+  // Populate valid_chunk_sizes for chunks that need to build a dictionary.
   std::for_each(h_chunks.begin(), h_chunks.end(), [&](auto& chunk) {
     auto const& chunk_col_desc = col_desc[chunk.col_desc_id];
     auto const is_requested_non_dict =
@@ -1298,6 +1300,7 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
 
     if (is_type_non_dict || is_requested_non_dict) {
       chunk.use_dictionary = false;
+      // Emplace a zero for 1-1 mapping between h_chunks and valid_chunk_sizes
       valid_chunk_sizes.emplace_back(static_cast<std::size_t>(0));
     } else {
       chunk.use_dictionary = true;
@@ -1310,22 +1313,21 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
     }
   });
 
+  // Create a vector to map offsets from chunk sizes
   std::vector<std::size_t> map_offsets(valid_chunk_sizes.size(), 0);
   std::exclusive_scan(valid_chunk_sizes.begin(),
                       valid_chunk_sizes.end(),
                       map_offsets.begin(),
                       static_cast<size_t>(0));
 
-  // Compute total map_storage
-  auto const map_storage_size = map_offsets.back() + valid_chunk_sizes.back();
-
+  // Compute total map storage
+  auto const total_map_storage_size = map_offsets.back() + valid_chunk_sizes.back();
   // No chunk needs to create a dictionary, exit early
-  if (map_storage_size == 0) { return {std::move(dict_data), std::move(dict_index)}; }
-
-  // Create a single bulk storage used by all sub-hashmaps
-  auto map_storage = storage_type{map_storage_size};
+  if (total_map_storage_size == 0) { return {std::move(dict_data), std::move(dict_index)}; }
 
-  // Only initialize storage with the given sentinel if and only if non-zero size
+  // Create a single bulk storage used by all sub-dictionaries
+  auto map_storage = storage_type{total_map_storage_size};
+  // Initialize storage with the given sentinel iff non-zero size
   map_storage.initialize(cuco::pair{KEY_SENTINEL, VALUE_SENTINEL},
                          cuda::stream_ref{stream.value()});
 

From 76a2d14eed30f4a22b7a68d47e96f40e3170b53e Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Tue, 13 Aug 2024 23:18:11 +0000
Subject: [PATCH 08/30] Revert changes to tests

---
 cpp/tests/io/parquet_writer_test.cpp    | 51 ++++++++++++-------------
 cpp/tests/utilities/column_utilities.cu |  1 -
 2 files changed, 25 insertions(+), 27 deletions(-)

diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp
index a06b4f38196..e07ebe25322 100644
--- a/cpp/tests/io/parquet_writer_test.cpp
+++ b/cpp/tests/io/parquet_writer_test.cpp
@@ -2007,7 +2007,7 @@ TEST_F(ParquetWriterTest, WriteFixedLenByteArray)
   srand(31337);
   using cudf::io::parquet::detail::Encoding;
   constexpr int fixed_width          = 16;
-  constexpr cudf::size_type num_rows = 10;
+  constexpr cudf::size_type num_rows = 200;
   std::vector<uint8_t> data(num_rows * fixed_width);
   std::vector<cudf::size_type> offsets(num_rows + 1);
 
@@ -2025,26 +2025,25 @@ TEST_F(ParquetWriterTest, WriteFixedLenByteArray)
   auto off_child  = cudf::test::fixed_width_column_wrapper<int32_t>(offsets.begin(), offsets.end());
   auto col = cudf::make_lists_column(num_rows, off_child.release(), data_child.release(), 0, {});
 
-  auto expected = table_view{{/**col, *col, *col,*/ *col}};
+  auto expected = table_view{{*col, *col, *col, *col}};
   cudf::io::table_input_metadata expected_metadata(expected);
 
-  /*
-    expected_metadata.column_metadata[0]
-      .set_name("flba_plain")
-      .set_type_length(fixed_width)
-      .set_encoding(cudf::io::column_encoding::PLAIN)
-      .set_output_as_binary(true);
-    expected_metadata.column_metadata[1]
-      .set_name("flba_split")
-      .set_type_length(fixed_width)
-      .set_encoding(cudf::io::column_encoding::BYTE_STREAM_SPLIT)
-      .set_output_as_binary(true);
-    expected_metadata.column_metadata[2]
-      .set_name("flba_delta")
-      .set_type_length(fixed_width)
-      .set_encoding(cudf::io::column_encoding::DELTA_BYTE_ARRAY)
-      .set_output_as_binary(true);*/
   expected_metadata.column_metadata[0]
+    .set_name("flba_plain")
+    .set_type_length(fixed_width)
+    .set_encoding(cudf::io::column_encoding::PLAIN)
+    .set_output_as_binary(true);
+  expected_metadata.column_metadata[1]
+    .set_name("flba_split")
+    .set_type_length(fixed_width)
+    .set_encoding(cudf::io::column_encoding::BYTE_STREAM_SPLIT)
+    .set_output_as_binary(true);
+  expected_metadata.column_metadata[2]
+    .set_name("flba_delta")
+    .set_type_length(fixed_width)
+    .set_encoding(cudf::io::column_encoding::DELTA_BYTE_ARRAY)
+    .set_output_as_binary(true);
+  expected_metadata.column_metadata[3]
     .set_name("flba_dict")
     .set_type_length(fixed_width)
     .set_encoding(cudf::io::column_encoding::DICTIONARY)
@@ -2068,7 +2067,7 @@ TEST_F(ParquetWriterTest, WriteFixedLenByteArray)
   read_footer(source, &fmd);
 
   // check that the schema retains the FIXED_LEN_BYTE_ARRAY type
-  for (int i = 1; i <= 1; i++) {
+  for (int i = 1; i <= 4; i++) {
     EXPECT_EQ(fmd.schema[i].type, cudf::io::parquet::detail::Type::FIXED_LEN_BYTE_ARRAY);
     EXPECT_EQ(fmd.schema[i].type_length, fixed_width);
   }
@@ -2079,14 +2078,14 @@ TEST_F(ParquetWriterTest, WriteFixedLenByteArray)
   };
 
   // requested plain
-  /*  expect_enc(0, Encoding::PLAIN);
-    // requested byte_stream_split
-    expect_enc(1, Encoding::BYTE_STREAM_SPLIT);
-    // requested delta_byte_array
-    expect_enc(2, Encoding::DELTA_BYTE_ARRAY);
-    // requested dictionary, but should fall back to plain
-    // TODO: update if we get FLBA working with dictionary encoding*/
   expect_enc(0, Encoding::PLAIN);
+  // requested byte_stream_split
+  expect_enc(1, Encoding::BYTE_STREAM_SPLIT);
+  // requested delta_byte_array
+  expect_enc(2, Encoding::DELTA_BYTE_ARRAY);
+  // requested dictionary, but should fall back to plain
+  // TODO: update if we get FLBA working with dictionary encoding
+  expect_enc(3, Encoding::PLAIN);
 }
 
 /////////////////////////////////////////////////////////////
diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu
index be4655b0a98..fb9bdeb0b22 100644
--- a/cpp/tests/utilities/column_utilities.cu
+++ b/cpp/tests/utilities/column_utilities.cu
@@ -460,7 +460,6 @@ std::string stringify_column_differences(cudf::device_span<int const> difference
                                          debug_output_level verbosity,
                                          int depth)
 {
-  verbosity = debug_output_level::ALL_ERRORS;
   CUDF_EXPECTS(not differences.empty(), "Shouldn't enter this function if `differences` is empty");
   std::string const depth_str = depth > 0 ? "depth " + std::to_string(depth) + '\n' : "";
   // move the differences to the host.

From ec32c0a878fdc4d12994dfe44eceb55f8bef6e86 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed, 14 Aug 2024 00:44:59 +0000
Subject: [PATCH 09/30] Migrate find_fn to use a tile

---
 cpp/src/io/parquet/chunk_dict.cu   | 45 +++++++++++++++++-------------
 cpp/src/io/parquet/parquet_gpu.cuh |  4 +--
 cpp/src/io/parquet/parquet_gpu.hpp |  2 +-
 cpp/src/io/parquet/writer_impl.cu  |  2 +-
 4 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index 588a5766c08..0d4ae6b5748 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -58,12 +58,14 @@ struct map_insert_fn {
   storage_ref_type const& storage_ref;
 
   template <typename T>
-  __device__ bool operator()(column_device_view const& col, key_type i)
+  __device__ bool operator()(column_device_view const& col,
+                             cg::thread_block_tile<map_cg_size> const& tile,
+                             key_type i)
   {
     if constexpr (column_device_view::has_element_accessor<T>()) {
       using equality_fn_type    = equality_functor<T>;
       using hash_fn_type        = hash_functor<T>;
-      using probing_scheme_type = cuco::linear_probing<cg_size, hash_fn_type>;
+      using probing_scheme_type = cuco::linear_probing<map_cg_size, hash_fn_type>;
 
       // Instantiate hash and equality functors.
       auto hash_fn  = hash_fn_type{col};
@@ -85,8 +87,8 @@ struct map_insert_fn {
 
       // Create another map ref with the insert operator
       auto map_insert_ref = hash_map_ref.with_operators(cuco::insert);
-      // Insert into the hash map
-      return map_insert_ref.insert(cuco::pair{i, i});
+      // Insert into the hash map using the provided thread tile
+      return map_insert_ref.insert(tile, cuco::pair{i, i});
     } else {
       CUDF_UNREACHABLE("Unsupported type to insert in map");
     }
@@ -97,12 +99,13 @@ struct map_find_fn {
   storage_ref_type const& storage_ref;
 
   template <typename T>
-  __device__ cuco::pair<key_type, mapped_type> operator()(column_device_view const& col, key_type i)
+  __device__ cuco::pair<key_type, mapped_type> operator()(
+    column_device_view const& col, cg::thread_block_tile<map_cg_size> const& tile, key_type i)
   {
     if constexpr (column_device_view::has_element_accessor<T>()) {
       using equality_fn_type    = equality_functor<T>;
       using hash_fn_type        = hash_functor<T>;
-      using probing_scheme_type = cuco::linear_probing<cg_size, hash_fn_type>;
+      using probing_scheme_type = cuco::linear_probing<map_cg_size, hash_fn_type>;
 
       // Instantiate hash and equality functors.
       auto hash_fn  = hash_fn_type{col};
@@ -125,8 +128,8 @@ struct map_find_fn {
       // Create another map with find operator
       auto map_find_ref = hash_map_ref.with_operators(cuco::find);
 
-      // Find the key = i
-      auto found_slot = map_find_ref.find(i);
+      // Find the key = i using the provided thread tile
+      auto found_slot = map_find_ref.find(tile, i);
 
       // Check if we found the previously inserted key.
       cudf_assert(found_slot != map_find_ref.end() &&
@@ -156,8 +159,9 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   using block_reduce = cub::BlockReduce<size_type, block_size>;
   __shared__ typename block_reduce::TempStorage reduce_storage;
 
-  [[maybe_unused]] auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
-  auto const t                     = cg::this_thread_block().thread_rank();
+  auto const block                 = cg::this_thread_block();
+  [[maybe_unused]] auto const tile = cg::tiled_partition<map_cg_size>(block);
+  auto const t                     = block.thread_rank();
 
   size_type start_row = frag.start_row;
   size_type end_row   = frag.start_row + frag.num_rows;
@@ -180,7 +184,8 @@ CUDF_KERNEL void __launch_bounds__(block_size)
     size_type is_unique      = 0;
     size_type uniq_elem_size = 0;
     if (is_valid) {
-      is_unique = type_dispatcher(data_col.type(), map_insert_fn{storage_ref}, data_col, val_idx);
+      is_unique =
+        type_dispatcher(data_col.type(), map_insert_fn{storage_ref}, data_col, tile, val_idx);
       uniq_elem_size = [&]() -> size_type {
         if (not is_unique) { return 0; }
         switch (col->physical_type) {
@@ -212,14 +217,14 @@ CUDF_KERNEL void __launch_bounds__(block_size)
       }();
     }
     auto num_unique = block_reduce(reduce_storage).Sum(is_unique);
-    __syncthreads();
+    block.sync();
     auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size);
     if (t == 0) {
       total_num_dict_entries = atomicAdd(&chunk->num_dict_entries, num_unique);
       total_num_dict_entries += num_unique;
       atomicAdd(&chunk->uniq_data_size, uniq_data_size);
     }
-    __syncthreads();
+    block.sync();
 
     // Check if the num unique values in chunk has already exceeded max dict size and early exit
     if (total_num_dict_entries > MAX_DICT_SIZE) { return; }
@@ -235,8 +240,7 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   auto& chunk = chunks[blockIdx.x];
   if (not chunk.use_dictionary) { return; }
 
-  [[maybe_unused]] auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
-  auto const t                     = cg::this_thread_block().thread_rank();
+  auto const t = threadIdx.x;
 
   __shared__ cuda::atomic<size_type, SCOPE> counter;
   using cuda::std::memory_order_relaxed;
@@ -273,8 +277,9 @@ CUDF_KERNEL void __launch_bounds__(block_size)
 
   if (not chunk->use_dictionary) { return; }
 
-  [[maybe_unused]] auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
-  auto const t                     = cg::this_thread_block().thread_rank();
+  auto const block  = cg::this_thread_block();
+  auto const tile   = cg::tiled_partition<map_cg_size>(block);
+  auto const ntiles = tile.meta_group_size();
 
   size_type start_row = frag.start_row;
   size_type end_row   = frag.start_row + frag.num_rows;
@@ -287,16 +292,16 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   column_device_view const& data_col = *col->leaf_column;
   storage_ref_type const storage_ref{chunk->dict_map_size, map_storage + chunk->dict_map_offset};
 
-  thread_index_type val_idx = s_start_value_idx + t;
+  thread_index_type val_idx = s_start_value_idx + tile.meta_group_rank();
   while (val_idx < end_value_idx) {
     if (data_col.is_valid(val_idx)) {
       auto [found_key, found_value] =
-        type_dispatcher(data_col.type(), map_find_fn{storage_ref}, data_col, val_idx);
+        type_dispatcher(data_col.type(), map_find_fn{storage_ref}, data_col, tile, val_idx);
       // No need for atomic as this is not going to be modified by any other thread
       chunk->dict_index[val_idx - s_ck_start_val_idx] = found_value;
     }
 
-    val_idx += block_size;
+    val_idx += ntiles;
   }
 }
 
diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh
index 0a0fc9095e0..66b86140afc 100644
--- a/cpp/src/io/parquet/parquet_gpu.cuh
+++ b/cpp/src/io/parquet/parquet_gpu.cuh
@@ -29,7 +29,7 @@ namespace cudf::io::parquet::detail {
 using key_type    = size_type;
 using mapped_type = size_type;
 
-auto constexpr cg_size     = 1;  ///< A CUDA Cooperative Group of 8 threads to handle each subset
+auto constexpr map_cg_size = 1;  ///< A CUDA Cooperative Group of 1 thread to handle each subset
 auto constexpr window_size = 1;  ///< Number of concurrent slots handled by each thread
 
 auto constexpr KEY_SENTINEL   = key_type{-1};
@@ -40,7 +40,7 @@ using slot_type = cuco::pair<key_type, mapped_type>;
 
 using storage_type     = cuco::aow_storage<slot_type, window_size>;
 using storage_ref_type = typename storage_type::ref_type;
-using window_type      = storage_type::window_type;
+using window_type      = typename storage_type::window_type;
 /**
  * @brief Insert chunk values into their respective hash maps
  *
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index daf97a566f9..b26735d27b7 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -559,7 +559,7 @@ struct EncColumnChunk {
   uint8_t is_compressed;    //!< Nonzero if the chunk uses compression
   uint32_t dictionary_size;    //!< Size of dictionary page including header
   uint32_t ck_stat_size;       //!< Size of chunk-level statistics (included in 1st page header)
-  uint32_t dict_map_offset;    //!< Hash map storage for calculating dict encoding for this chunk
+  uint32_t dict_map_offset;    //!< Offset of the hash map storage for calculating dict encoding for this chunk
   size_type dict_map_size;     //!< Size of dict_map_slots
   size_type num_dict_entries;  //!< Total number of entries in dictionary
   size_type
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 7ec3997b590..e319a39e464 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -1305,7 +1305,7 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
     } else {
       chunk.use_dictionary = true;
       valid_chunk_sizes.emplace_back(
-        static_cast<std::size_t>(cuco::make_window_extent<cg_size, window_size>(
+        static_cast<std::size_t>(cuco::make_window_extent<map_cg_size, window_size>(
           // cuCollections suggests using a hash map of size N * (1/0.7) = 1.43 to target a 70%
           // occupancy factor.
           static_cast<size_t>(chunk.num_values * 1.43))));

From 0546fcc46aee5f90fb560b977c491dd92be12fda Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed, 14 Aug 2024 01:50:37 +0000
Subject: [PATCH 10/30] Functionally correct working solution

---
 cpp/src/io/parquet/chunk_dict.cu   | 105 ++++++++++++++++-------------
 cpp/src/io/parquet/parquet_gpu.cuh |   2 +-
 cpp/src/io/parquet/parquet_gpu.hpp |   3 +-
 3 files changed, 62 insertions(+), 48 deletions(-)

diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index 0d4ae6b5748..82478c01e2d 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -131,10 +131,11 @@ struct map_find_fn {
       // Find the key = i using the provided thread tile
       auto found_slot = map_find_ref.find(tile, i);
 
-      // Check if we found the previously inserted key.
-      cudf_assert(found_slot != map_find_ref.end() &&
-                  "Unable to find value in map in dictionary index construction");
-
+      // Check if didn't find the previously inserted key.
+      if (tile.thread_rank() == 0) {
+        cudf_assert(found_slot != map_find_ref.end() &&
+                    "Unable to find value in map in dictionary index construction");
+      }
       // Return a pair of the found key and value.
       return {found_slot->first, found_slot->second};
     } else {
@@ -145,7 +146,7 @@ struct map_find_fn {
 
 template <int block_size>
 CUDF_KERNEL void __launch_bounds__(block_size)
-  populate_chunk_hash_maps_kernel(window_type* map_storage,
+  populate_chunk_hash_maps_kernel(window_type* const map_storage,
                                   cudf::detail::device_2dspan<PageFragment const> frags)
 {
   auto const col_idx = blockIdx.y;
@@ -159,9 +160,9 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   using block_reduce = cub::BlockReduce<size_type, block_size>;
   __shared__ typename block_reduce::TempStorage reduce_storage;
 
-  auto const block                 = cg::this_thread_block();
-  [[maybe_unused]] auto const tile = cg::tiled_partition<map_cg_size>(block);
-  auto const t                     = block.thread_rank();
+  auto const block  = cg::this_thread_block();
+  auto const tile   = cg::tiled_partition<map_cg_size>(block);
+  auto const ntiles = tile.meta_group_size();
 
   size_type start_row = frag.start_row;
   size_type end_row   = frag.start_row + frag.num_rows;
@@ -174,9 +175,9 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   storage_ref_type const storage_ref{chunk->dict_map_size, map_storage + chunk->dict_map_offset};
 
   __shared__ size_type total_num_dict_entries;
-  thread_index_type val_idx = s_start_value_idx + t;
+  thread_index_type val_idx = s_start_value_idx + tile.meta_group_rank();
 
-  while (val_idx - block_size < end_value_idx) {
+  while (val_idx - ntiles < end_value_idx) {
     auto const is_valid =
       val_idx < end_value_idx and val_idx < data_col.size() and data_col.is_valid(val_idx);
 
@@ -184,42 +185,51 @@ CUDF_KERNEL void __launch_bounds__(block_size)
     size_type is_unique      = 0;
     size_type uniq_elem_size = 0;
     if (is_valid) {
+      // Insert the element to the map
       is_unique =
         type_dispatcher(data_col.type(), map_insert_fn{storage_ref}, data_col, tile, val_idx);
-      uniq_elem_size = [&]() -> size_type {
-        if (not is_unique) { return 0; }
-        switch (col->physical_type) {
-          case Type::INT32: return 4;
-          case Type::INT64: return 8;
-          case Type::INT96: return 12;
-          case Type::FLOAT: return 4;
-          case Type::DOUBLE: return 8;
-          case Type::BYTE_ARRAY: {
-            auto const col_type = data_col.type().id();
-            if (col_type == type_id::STRING) {
-              // Strings are stored as 4 byte length + string bytes
-              return 4 + data_col.element<string_view>(val_idx).size_bytes();
-            } else if (col_type == type_id::LIST) {
-              // Binary is stored as 4 byte length + bytes
-              return 4 + get_element<statistics::byte_array_view>(data_col, val_idx).size_bytes();
+      // First thread in the tile compute the size of the element if unique
+      if (tile.thread_rank() == 0) {
+        uniq_elem_size = [&]() -> size_type {
+          if (not is_unique) { return 0; }
+          switch (col->physical_type) {
+            case Type::INT32: return 4;
+            case Type::INT64: return 8;
+            case Type::INT96: return 12;
+            case Type::FLOAT: return 4;
+            case Type::DOUBLE: return 8;
+            case Type::BYTE_ARRAY: {
+              auto const col_type = data_col.type().id();
+              if (col_type == type_id::STRING) {
+                // Strings are stored as 4 byte length + string bytes
+                return 4 + data_col.element<string_view>(val_idx).size_bytes();
+              } else if (col_type == type_id::LIST) {
+                // Binary is stored as 4 byte length + bytes
+                return 4 + get_element<statistics::byte_array_view>(data_col, val_idx).size_bytes();
+              }
+              CUDF_UNREACHABLE(
+                "Byte array only supports string and list<byte> column types for dictionary "
+                "encoding!");
             }
-            CUDF_UNREACHABLE(
-              "Byte array only supports string and list<byte> column types for dictionary "
-              "encoding!");
+            case Type::FIXED_LEN_BYTE_ARRAY:
+              if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); }
+              CUDF_UNREACHABLE(
+                "Fixed length byte array only supports decimal 128 column types for dictionary "
+                "encoding!");
+            default: CUDF_UNREACHABLE("Unsupported type for dictionary encoding");
           }
-          case Type::FIXED_LEN_BYTE_ARRAY:
-            if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); }
-            CUDF_UNREACHABLE(
-              "Fixed length byte array only supports decimal 128 column types for dictionary "
-              "encoding!");
-          default: CUDF_UNREACHABLE("Unsupported type for dictionary encoding");
-        }
-      }();
+        }();
+      } else {
+        // All threads except the first thread in the tile must reset is_unique to zero
+        is_unique = 0;
+      }
     }
     auto num_unique = block_reduce(reduce_storage).Sum(is_unique);
     block.sync();
     auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size);
-    if (t == 0) {
+    // The first thread in the block atomically updates total num dict entries and unique elements
+    // data size
+    if (block.thread_rank() == 0) {
       total_num_dict_entries = atomicAdd(&chunk->num_dict_entries, num_unique);
       total_num_dict_entries += num_unique;
       atomicAdd(&chunk->uniq_data_size, uniq_data_size);
@@ -229,13 +239,13 @@ CUDF_KERNEL void __launch_bounds__(block_size)
     // Check if the num unique values in chunk has already exceeded max dict size and early exit
     if (total_num_dict_entries > MAX_DICT_SIZE) { return; }
 
-    val_idx += block_size;
+    val_idx += ntiles;
   }  // while
 }
 
 template <int block_size>
 CUDF_KERNEL void __launch_bounds__(block_size)
-  collect_map_entries_kernel(window_type* map_storage, device_span<EncColumnChunk> chunks)
+  collect_map_entries_kernel(window_type* const map_storage, device_span<EncColumnChunk> chunks)
 {
   auto& chunk = chunks[blockIdx.x];
   if (not chunk.use_dictionary) { return; }
@@ -266,7 +276,7 @@ CUDF_KERNEL void __launch_bounds__(block_size)
 
 template <int block_size>
 CUDF_KERNEL void __launch_bounds__(block_size)
-  get_dictionary_indices_kernel(window_type* map_storage,
+  get_dictionary_indices_kernel(window_type* const map_storage,
                                 cudf::detail::device_2dspan<PageFragment const> frags)
 {
   auto const col_idx = blockIdx.y;
@@ -297,15 +307,18 @@ CUDF_KERNEL void __launch_bounds__(block_size)
     if (data_col.is_valid(val_idx)) {
       auto [found_key, found_value] =
         type_dispatcher(data_col.type(), map_find_fn{storage_ref}, data_col, tile, val_idx);
-      // No need for atomic as this is not going to be modified by any other thread
-      chunk->dict_index[val_idx - s_ck_start_val_idx] = found_value;
+      // First thread in the tile updates the dict_index
+      if (tile.thread_rank() == 0) {
+        // No need for atomic as this is not going to be modified by any other thread
+        chunk->dict_index[val_idx - s_ck_start_val_idx] = found_value;
+      }
     }
 
     val_idx += ntiles;
   }
 }
 
-void populate_chunk_hash_maps(window_type* map_storage,
+void populate_chunk_hash_maps(window_type* const map_storage,
                               cudf::detail::device_2dspan<PageFragment const> frags,
                               rmm::cuda_stream_view stream)
 {
@@ -314,7 +327,7 @@ void populate_chunk_hash_maps(window_type* map_storage,
     <<<dim_grid, DEFAULT_BLOCK_SIZE, 0, stream.value()>>>(map_storage, frags);
 }
 
-void collect_map_entries(window_type* map_storage,
+void collect_map_entries(window_type* const map_storage,
                          device_span<EncColumnChunk> chunks,
                          rmm::cuda_stream_view stream)
 {
@@ -323,7 +336,7 @@ void collect_map_entries(window_type* map_storage,
     <<<chunks.size(), block_size, 0, stream.value()>>>(map_storage, chunks);
 }
 
-void get_dictionary_indices(window_type* map_storage,
+void get_dictionary_indices(window_type* const map_storage,
                             cudf::detail::device_2dspan<PageFragment const> frags,
                             rmm::cuda_stream_view stream)
 {
diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh
index 66b86140afc..bfaab7cfcd2 100644
--- a/cpp/src/io/parquet/parquet_gpu.cuh
+++ b/cpp/src/io/parquet/parquet_gpu.cuh
@@ -29,7 +29,7 @@ namespace cudf::io::parquet::detail {
 using key_type    = size_type;
 using mapped_type = size_type;
 
-auto constexpr map_cg_size = 1;  ///< A CUDA Cooperative Group of 1 thread to handle each subset
+auto constexpr map_cg_size = 2;  ///< A CUDA Cooperative Group of 1 thread to handle each subset
 auto constexpr window_size = 1;  ///< Number of concurrent slots handled by each thread
 
 auto constexpr KEY_SENTINEL   = key_type{-1};
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index b26735d27b7..b2eb20708b1 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -559,7 +559,8 @@ struct EncColumnChunk {
   uint8_t is_compressed;    //!< Nonzero if the chunk uses compression
   uint32_t dictionary_size;    //!< Size of dictionary page including header
   uint32_t ck_stat_size;       //!< Size of chunk-level statistics (included in 1st page header)
-  uint32_t dict_map_offset;    //!< Offset of the hash map storage for calculating dict encoding for this chunk
+  uint32_t dict_map_offset;    //!< Offset of the hash map storage for calculating dict encoding for
+                               //!< this chunk
   size_type dict_map_size;     //!< Size of dict_map_slots
   size_type num_dict_entries;  //!< Total number of entries in dictionary
   size_type

From a14007d5b0c58e0db230e3de6f83965a7bd7c044 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed, 14 Aug 2024 03:30:45 +0000
Subject: [PATCH 11/30] Updated insert with tiles

---
 cpp/src/io/parquet/chunk_dict.cu   | 121 +++++++++++++++--------------
 cpp/src/io/parquet/parquet_gpu.cuh |  79 +++++++++----------
 2 files changed, 104 insertions(+), 96 deletions(-)

diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index 82478c01e2d..c3998c36079 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -39,7 +39,7 @@ struct equality_functor {
   column_device_view const& col;
   __device__ bool operator()(key_type const lhs_idx, key_type const rhs_idx) const
   {
-    // We don't call this for nulls so this is fine
+    // We don't call this for nulls so this is fine.
     auto const equal = cudf::experimental::row::equality::nan_equal_physical_equality_comparator{};
     return equal(col.element<T>(lhs_idx), col.element<T>(rhs_idx));
   }
@@ -71,7 +71,7 @@ struct map_insert_fn {
       auto hash_fn  = hash_fn_type{col};
       auto equal_fn = equality_fn_type{col};
 
-      // Make a view of the hash map
+      // Make a view of the hash map.
       cuco::static_map_ref<key_type,
                            mapped_type,
                            SCOPE,
@@ -85,9 +85,9 @@ struct map_insert_fn {
                      {},
                      storage_ref};
 
-      // Create another map ref with the insert operator
+      // Create another map ref with the insert operator.
       auto map_insert_ref = hash_map_ref.with_operators(cuco::insert);
-      // Insert into the hash map using the provided thread tile
+      // Insert into the hash map using the provided thread tile.
       return map_insert_ref.insert(tile, cuco::pair{i, i});
     } else {
       CUDF_UNREACHABLE("Unsupported type to insert in map");
@@ -125,10 +125,10 @@ struct map_find_fn {
                      {},
                      storage_ref};
 
-      // Create another map with find operator
+      // Create another map with find operator.
       auto map_find_ref = hash_map_ref.with_operators(cuco::find);
 
-      // Find the key = i using the provided thread tile
+      // Find the key = i using the provided thread tile.
       auto found_slot = map_find_ref.find(tile, i);
 
       // Check if didn't find the previously inserted key.
@@ -167,7 +167,7 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   size_type start_row = frag.start_row;
   size_type end_row   = frag.start_row + frag.num_rows;
 
-  // Find the bounds of values in leaf column to be inserted into the map for current chunk
+  // Find the bounds of values in leaf column to be inserted into the map for current chunk.
   size_type const s_start_value_idx = row_to_value_idx(start_row, *col);
   size_type const end_value_idx     = row_to_value_idx(end_row, *col);
 
@@ -175,60 +175,70 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   storage_ref_type const storage_ref{chunk->dict_map_size, map_storage + chunk->dict_map_offset};
 
   __shared__ size_type total_num_dict_entries;
-  thread_index_type val_idx = s_start_value_idx + tile.meta_group_rank();
 
-  while (val_idx - ntiles < end_value_idx) {
-    auto const is_valid =
-      val_idx < end_value_idx and val_idx < data_col.size() and data_col.is_valid(val_idx);
+  for (thread_index_type val_idx = s_start_value_idx + block.thread_rank();
+       val_idx - block_size < end_value_idx;
+       val_idx += block_size) {
+    // Compute the index to the start of the tile.
+    auto const tile_val_idx =
+      val_idx - block.thread_rank() + (tile.meta_group_rank() * tile.num_threads());
 
-    // insert element at val_idx to hash map and count successful insertions
     size_type is_unique      = 0;
     size_type uniq_elem_size = 0;
-    if (is_valid) {
-      // Insert the element to the map
-      is_unique =
-        type_dispatcher(data_col.type(), map_insert_fn{storage_ref}, data_col, tile, val_idx);
-      // First thread in the tile compute the size of the element if unique
-      if (tile.thread_rank() == 0) {
-        uniq_elem_size = [&]() -> size_type {
-          if (not is_unique) { return 0; }
-          switch (col->physical_type) {
-            case Type::INT32: return 4;
-            case Type::INT64: return 8;
-            case Type::INT96: return 12;
-            case Type::FLOAT: return 4;
-            case Type::DOUBLE: return 8;
-            case Type::BYTE_ARRAY: {
-              auto const col_type = data_col.type().id();
-              if (col_type == type_id::STRING) {
-                // Strings are stored as 4 byte length + string bytes
-                return 4 + data_col.element<string_view>(val_idx).size_bytes();
-              } else if (col_type == type_id::LIST) {
-                // Binary is stored as 4 byte length + bytes
-                return 4 + get_element<statistics::byte_array_view>(data_col, val_idx).size_bytes();
+
+    // Insert all elements within each tile.
+    for (auto tile_offset = 0; tile_offset < tile.num_threads(); tile_offset++) {
+      // Compute the index to the element being inserted within the tile.
+      auto const tile_val_idx_plus_offset = tile_val_idx + tile_offset;
+      // Insert element at val_idx to hash map and count successful insertions.
+      auto const is_valid = tile_val_idx_plus_offset < end_value_idx and
+                            tile_val_idx_plus_offset < data_col.size() and
+                            data_col.is_valid(tile_val_idx_plus_offset);
+      if (is_valid) {
+        // Insert the element to the map using the entire tile.
+        auto const tile_is_unique = type_dispatcher(
+          data_col.type(), map_insert_fn{storage_ref}, data_col, tile, tile_val_idx_plus_offset);
+        // Only the tile_offset thread rank computes the size of the element if unique.
+        if (tile.thread_rank() == tile_offset) {
+          is_unique      = tile_is_unique;
+          uniq_elem_size = [&]() -> size_type {
+            if (not is_unique) { return 0; }
+            switch (col->physical_type) {
+              case Type::INT32: return 4;
+              case Type::INT64: return 8;
+              case Type::INT96: return 12;
+              case Type::FLOAT: return 4;
+              case Type::DOUBLE: return 8;
+              case Type::BYTE_ARRAY: {
+                auto const col_type = data_col.type().id();
+                if (col_type == type_id::STRING) {
+                  // Strings are stored as 4 byte length + string bytes
+                  return 4 + data_col.element<string_view>(val_idx).size_bytes();
+                } else if (col_type == type_id::LIST) {
+                  // Binary is stored as 4 byte length + bytes
+                  return 4 +
+                         get_element<statistics::byte_array_view>(data_col, val_idx).size_bytes();
+                }
+                CUDF_UNREACHABLE(
+                  "Byte array only supports string and list<byte> column types for dictionary "
+                  "encoding!");
               }
-              CUDF_UNREACHABLE(
-                "Byte array only supports string and list<byte> column types for dictionary "
-                "encoding!");
+              case Type::FIXED_LEN_BYTE_ARRAY:
+                if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); }
+                CUDF_UNREACHABLE(
+                  "Fixed length byte array only supports decimal 128 column types for dictionary "
+                  "encoding!");
+              default: CUDF_UNREACHABLE("Unsupported type for dictionary encoding");
             }
-            case Type::FIXED_LEN_BYTE_ARRAY:
-              if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); }
-              CUDF_UNREACHABLE(
-                "Fixed length byte array only supports decimal 128 column types for dictionary "
-                "encoding!");
-            default: CUDF_UNREACHABLE("Unsupported type for dictionary encoding");
-          }
-        }();
-      } else {
-        // All threads except the first thread in the tile must reset is_unique to zero
-        is_unique = 0;
+          }();
+        }
       }
     }
+    // All elements in all tiles inserted, reduce num_unique and uniq_data_size across the block.
     auto num_unique = block_reduce(reduce_storage).Sum(is_unique);
     block.sync();
     auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size);
-    // The first thread in the block atomically updates total num dict entries and unique elements
-    // data size
+    // The first thread in the block atomically updates total num_unique and uniq_data_size
     if (block.thread_rank() == 0) {
       total_num_dict_entries = atomicAdd(&chunk->num_dict_entries, num_unique);
       total_num_dict_entries += num_unique;
@@ -238,9 +248,7 @@ CUDF_KERNEL void __launch_bounds__(block_size)
 
     // Check if the num unique values in chunk has already exceeded max dict size and early exit
     if (total_num_dict_entries > MAX_DICT_SIZE) { return; }
-
-    val_idx += ntiles;
-  }  // while
+  }  // for loop
 }
 
 template <int block_size>
@@ -302,8 +310,9 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   column_device_view const& data_col = *col->leaf_column;
   storage_ref_type const storage_ref{chunk->dict_map_size, map_storage + chunk->dict_map_offset};
 
-  thread_index_type val_idx = s_start_value_idx + tile.meta_group_rank();
-  while (val_idx < end_value_idx) {
+  for (thread_index_type val_idx = s_start_value_idx + tile.meta_group_rank();
+       val_idx < end_value_idx;
+       val_idx += ntiles) {
     if (data_col.is_valid(val_idx)) {
       auto [found_key, found_value] =
         type_dispatcher(data_col.type(), map_find_fn{storage_ref}, data_col, tile, val_idx);
@@ -313,8 +322,6 @@ CUDF_KERNEL void __launch_bounds__(block_size)
         chunk->dict_index[val_idx - s_ck_start_val_idx] = found_value;
       }
     }
-
-    val_idx += ntiles;
   }
 }
 
diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh
index bfaab7cfcd2..5bfb531a1c0 100644
--- a/cpp/src/io/parquet/parquet_gpu.cuh
+++ b/cpp/src/io/parquet/parquet_gpu.cuh
@@ -29,7 +29,7 @@ namespace cudf::io::parquet::detail {
 using key_type    = size_type;
 using mapped_type = size_type;
 
-auto constexpr map_cg_size = 2;  ///< A CUDA Cooperative Group of 1 thread to handle each subset
+auto constexpr map_cg_size = 2;  ///< A CUDA Cooperative Group of 4 thread to handle each subset
 auto constexpr window_size = 1;  ///< Number of concurrent slots handled by each thread
 
 auto constexpr KEY_SENTINEL   = key_type{-1};
@@ -41,44 +41,6 @@ using slot_type = cuco::pair<key_type, mapped_type>;
 using storage_type     = cuco::aow_storage<slot_type, window_size>;
 using storage_ref_type = typename storage_type::ref_type;
 using window_type      = typename storage_type::window_type;
-/**
- * @brief Insert chunk values into their respective hash maps
- *
- * @param map_storage Pointer to the bulk hashmap storage
- * @param frags Column fragments
- * @param stream CUDA stream to use
- */
-void populate_chunk_hash_maps(window_type* map_storage,
-                              cudf::detail::device_2dspan<PageFragment const> frags,
-                              rmm::cuda_stream_view stream);
-
-/**
- * @brief Compact dictionary hash map entries into chunk.dict_data
- *
- * @param map_storage Pointer to the bulk hashmap storage
- * @param chunks Flat span of chunks to compact hash maps for
- * @param stream CUDA stream to use
- */
-void collect_map_entries(window_type* map_storage,
-                         device_span<EncColumnChunk> chunks,
-                         rmm::cuda_stream_view stream);
-
-/**
- * @brief Get the Dictionary Indices for each row
- *
- * For each row of a chunk, gets the indices into chunk.dict_data which contains the value otherwise
- * stored in input column [row]. Stores these indices into chunk.dict_index.
- *
- * Since dict_data itself contains indices into the original cudf column, this means that
- * col[row] == col[dict_data[dict_index[row - chunk.start_row]]]
- *
- * @param map_storage Pointer to the bulk hashmap storage
- * @param frags Column fragments
- * @param stream CUDA stream to use
- */
-void get_dictionary_indices(window_type* map_storage,
-                            cudf::detail::device_2dspan<PageFragment const> frags,
-                            rmm::cuda_stream_view stream);
 
 /**
  * @brief Return the byte length of parquet dtypes that are physically represented by INT32
@@ -123,4 +85,43 @@ inline size_type __device__ row_to_value_idx(size_type idx,
   return idx;
 }
 
+/**
+ * @brief Insert chunk values into their respective hash maps
+ *
+ * @param map_storage Pointer to the bulk hashmap storage
+ * @param frags Column fragments
+ * @param stream CUDA stream to use
+ */
+void populate_chunk_hash_maps(window_type* map_storage,
+                              cudf::detail::device_2dspan<PageFragment const> frags,
+                              rmm::cuda_stream_view stream);
+
+/**
+ * @brief Compact dictionary hash map entries into chunk.dict_data
+ *
+ * @param map_storage Pointer to the bulk hashmap storage
+ * @param chunks Flat span of chunks to compact hash maps for
+ * @param stream CUDA stream to use
+ */
+void collect_map_entries(window_type* map_storage,
+                         device_span<EncColumnChunk> chunks,
+                         rmm::cuda_stream_view stream);
+
+/**
+ * @brief Get the Dictionary Indices for each row
+ *
+ * For each row of a chunk, gets the indices into chunk.dict_data which contains the value otherwise
+ * stored in input column [row]. Stores these indices into chunk.dict_index.
+ *
+ * Since dict_data itself contains indices into the original cudf column, this means that
+ * col[row] == col[dict_data[dict_index[row - chunk.start_row]]]
+ *
+ * @param map_storage Pointer to the bulk hashmap storage
+ * @param frags Column fragments
+ * @param stream CUDA stream to use
+ */
+void get_dictionary_indices(window_type* map_storage,
+                            cudf::detail::device_2dspan<PageFragment const> frags,
+                            rmm::cuda_stream_view stream);
+
 }  // namespace cudf::io::parquet::detail

From 4935c661f185677e0ed386dd5f15193f91537bcc Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed, 14 Aug 2024 05:15:22 +0000
Subject: [PATCH 12/30] Cosmetic updates

---
 cpp/src/io/parquet/chunk_dict.cu | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index c3998c36079..e6b31fce764 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -176,11 +176,12 @@ CUDF_KERNEL void __launch_bounds__(block_size)
 
   __shared__ size_type total_num_dict_entries;
 
+  // Insert all column chunk elements to the hash map to build the dict.
   for (thread_index_type val_idx = s_start_value_idx + block.thread_rank();
        val_idx - block_size < end_value_idx;
        val_idx += block_size) {
     // Compute the index to the start of the tile.
-    auto const tile_val_idx =
+    auto const val_idx_base =
       val_idx - block.thread_rank() + (tile.meta_group_rank() * tile.num_threads());
 
     size_type is_unique      = 0;
@@ -189,16 +190,19 @@ CUDF_KERNEL void __launch_bounds__(block_size)
     // Insert all elements within each tile.
     for (auto tile_offset = 0; tile_offset < tile.num_threads(); tile_offset++) {
       // Compute the index to the element being inserted within the tile.
-      auto const tile_val_idx_plus_offset = tile_val_idx + tile_offset;
-      // Insert element at val_idx to hash map and count successful insertions.
-      auto const is_valid = tile_val_idx_plus_offset < end_value_idx and
-                            tile_val_idx_plus_offset < data_col.size() and
-                            data_col.is_valid(tile_val_idx_plus_offset);
+      auto const tile_val_idx = val_idx_base + tile_offset;
+
+      // Check if this index is valid.
+      auto const is_valid = tile_val_idx < end_value_idx and tile_val_idx < data_col.size() and
+                            data_col.is_valid(tile_val_idx);
+
+      // Insert tile_val_idx to hash map and count successful insertions.
       if (is_valid) {
         // Insert the element to the map using the entire tile.
         auto const tile_is_unique = type_dispatcher(
-          data_col.type(), map_insert_fn{storage_ref}, data_col, tile, tile_val_idx_plus_offset);
-        // Only the tile_offset thread rank computes the size of the element if unique.
+          data_col.type(), map_insert_fn{storage_ref}, data_col, tile, tile_val_idx);
+
+        // tile_offset'th thread updates its number and size of unique element.
         if (tile.thread_rank() == tile_offset) {
           is_unique      = tile_is_unique;
           uniq_elem_size = [&]() -> size_type {
@@ -213,11 +217,11 @@ CUDF_KERNEL void __launch_bounds__(block_size)
                 auto const col_type = data_col.type().id();
                 if (col_type == type_id::STRING) {
                   // Strings are stored as 4 byte length + string bytes
-                  return 4 + data_col.element<string_view>(val_idx).size_bytes();
+                  return 4 + data_col.element<string_view>(tile_val_idx).size_bytes();
                 } else if (col_type == type_id::LIST) {
                   // Binary is stored as 4 byte length + bytes
-                  return 4 +
-                         get_element<statistics::byte_array_view>(data_col, val_idx).size_bytes();
+                  return 4 + get_element<statistics::byte_array_view>(data_col, tile_val_idx)
+                               .size_bytes();
                 }
                 CUDF_UNREACHABLE(
                   "Byte array only supports string and list<byte> column types for dictionary "

From 63dab9de55400594a3a02c518877fd4da8bebb18 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed, 14 Aug 2024 18:13:19 +0000
Subject: [PATCH 13/30] Minor improvements

---
 cpp/src/io/parquet/chunk_dict.cu   | 17 +++++++----------
 cpp/src/io/parquet/parquet_gpu.cuh |  3 +--
 2 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index e6b31fce764..f92d0eb8337 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -160,9 +160,8 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   using block_reduce = cub::BlockReduce<size_type, block_size>;
   __shared__ typename block_reduce::TempStorage reduce_storage;
 
-  auto const block  = cg::this_thread_block();
-  auto const tile   = cg::tiled_partition<map_cg_size>(block);
-  auto const ntiles = tile.meta_group_size();
+  auto const block = cg::this_thread_block();
+  auto const tile  = cg::tiled_partition<map_cg_size>(block);
 
   size_type start_row = frag.start_row;
   size_type end_row   = frag.start_row + frag.num_rows;
@@ -177,20 +176,18 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   __shared__ size_type total_num_dict_entries;
 
   // Insert all column chunk elements to the hash map to build the dict.
-  for (thread_index_type val_idx = s_start_value_idx + block.thread_rank();
-       val_idx - block_size < end_value_idx;
+  for (thread_index_type val_idx = s_start_value_idx;
+       val_idx + block.thread_rank() - block_size < end_value_idx;
        val_idx += block_size) {
     // Compute the index to the start of the tile.
-    auto const val_idx_base =
-      val_idx - block.thread_rank() + (tile.meta_group_rank() * tile.num_threads());
-
     size_type is_unique      = 0;
     size_type uniq_elem_size = 0;
 
     // Insert all elements within each tile.
     for (auto tile_offset = 0; tile_offset < tile.num_threads(); tile_offset++) {
       // Compute the index to the element being inserted within the tile.
-      auto const tile_val_idx = val_idx_base + tile_offset;
+      auto const tile_val_idx =
+        val_idx + tile_offset + (tile.meta_group_rank() * tile.num_threads());
 
       // Check if this index is valid.
       auto const is_valid = tile_val_idx < end_value_idx and tile_val_idx < data_col.size() and
@@ -202,7 +199,7 @@ CUDF_KERNEL void __launch_bounds__(block_size)
         auto const tile_is_unique = type_dispatcher(
           data_col.type(), map_insert_fn{storage_ref}, data_col, tile, tile_val_idx);
 
-        // tile_offset'th thread updates its number and size of unique element.
+        // tile_offset'th thread in the tile updates its number and size of unique element.
         if (tile.thread_rank() == tile_offset) {
           is_unique      = tile_is_unique;
           uniq_elem_size = [&]() -> size_type {
diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh
index 5bfb531a1c0..95d4471fac9 100644
--- a/cpp/src/io/parquet/parquet_gpu.cuh
+++ b/cpp/src/io/parquet/parquet_gpu.cuh
@@ -28,6 +28,7 @@ namespace cudf::io::parquet::detail {
 
 using key_type    = size_type;
 using mapped_type = size_type;
+using slot_type   = cuco::pair<key_type, mapped_type>;
 
 auto constexpr map_cg_size = 2;  ///< A CUDA Cooperative Group of 4 thread to handle each subset
 auto constexpr window_size = 1;  ///< Number of concurrent slots handled by each thread
@@ -36,8 +37,6 @@ auto constexpr KEY_SENTINEL   = key_type{-1};
 auto constexpr VALUE_SENTINEL = mapped_type{-1};
 auto constexpr SCOPE          = cuda::thread_scope_block;
 
-using slot_type = cuco::pair<key_type, mapped_type>;
-
 using storage_type     = cuco::aow_storage<slot_type, window_size>;
 using storage_ref_type = typename storage_type::ref_type;
 using window_type      = typename storage_type::window_type;

From 40a2f395ecb338ec1c36772dec6c1adfe321fb8d Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed, 14 Aug 2024 18:51:08 +0000
Subject: [PATCH 14/30] Use span instead of raw pointers.

---
 cpp/src/io/parquet/chunk_dict.cu   | 21 ++++++++++++---------
 cpp/src/io/parquet/parquet_gpu.cuh | 14 +++++++-------
 cpp/src/io/parquet/writer_impl.cu  |  8 +++++---
 3 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index f92d0eb8337..62ba593f20f 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -146,7 +146,7 @@ struct map_find_fn {
 
 template <int block_size>
 CUDF_KERNEL void __launch_bounds__(block_size)
-  populate_chunk_hash_maps_kernel(window_type* const map_storage,
+  populate_chunk_hash_maps_kernel(device_span<window_type> const map_storage,
                                   cudf::detail::device_2dspan<PageFragment const> frags)
 {
   auto const col_idx = blockIdx.y;
@@ -171,7 +171,8 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   size_type const end_value_idx     = row_to_value_idx(end_row, *col);
 
   column_device_view const& data_col = *col->leaf_column;
-  storage_ref_type const storage_ref{chunk->dict_map_size, map_storage + chunk->dict_map_offset};
+  storage_ref_type const storage_ref{chunk->dict_map_size,
+                                     map_storage.data() + chunk->dict_map_offset};
 
   __shared__ size_type total_num_dict_entries;
 
@@ -254,7 +255,8 @@ CUDF_KERNEL void __launch_bounds__(block_size)
 
 template <int block_size>
 CUDF_KERNEL void __launch_bounds__(block_size)
-  collect_map_entries_kernel(window_type* const map_storage, device_span<EncColumnChunk> chunks)
+  collect_map_entries_kernel(device_span<window_type> const map_storage,
+                             device_span<EncColumnChunk> chunks)
 {
   auto& chunk = chunks[blockIdx.x];
   if (not chunk.use_dictionary) { return; }
@@ -268,7 +270,7 @@ CUDF_KERNEL void __launch_bounds__(block_size)
 
   for (size_type i = 0; i < chunk.dict_map_size; i += block_size) {
     if (t + i < chunk.dict_map_size) {
-      auto* slot     = map_storage + chunk.dict_map_offset + t + i;
+      auto* slot     = map_storage.data() + chunk.dict_map_offset + t + i;
       auto const key = slot->data()->first;
       if (key != KEY_SENTINEL) {
         auto loc = counter.fetch_add(1, memory_order_relaxed);
@@ -285,7 +287,7 @@ CUDF_KERNEL void __launch_bounds__(block_size)
 
 template <int block_size>
 CUDF_KERNEL void __launch_bounds__(block_size)
-  get_dictionary_indices_kernel(window_type* const map_storage,
+  get_dictionary_indices_kernel(device_span<window_type> const map_storage,
                                 cudf::detail::device_2dspan<PageFragment const> frags)
 {
   auto const col_idx = blockIdx.y;
@@ -309,7 +311,8 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   auto const end_value_idx      = row_to_value_idx(end_row, *col);
 
   column_device_view const& data_col = *col->leaf_column;
-  storage_ref_type const storage_ref{chunk->dict_map_size, map_storage + chunk->dict_map_offset};
+  storage_ref_type const storage_ref{chunk->dict_map_size,
+                                     map_storage.data() + chunk->dict_map_offset};
 
   for (thread_index_type val_idx = s_start_value_idx + tile.meta_group_rank();
        val_idx < end_value_idx;
@@ -326,7 +329,7 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   }
 }
 
-void populate_chunk_hash_maps(window_type* const map_storage,
+void populate_chunk_hash_maps(device_span<window_type> const map_storage,
                               cudf::detail::device_2dspan<PageFragment const> frags,
                               rmm::cuda_stream_view stream)
 {
@@ -335,7 +338,7 @@ void populate_chunk_hash_maps(window_type* const map_storage,
     <<<dim_grid, DEFAULT_BLOCK_SIZE, 0, stream.value()>>>(map_storage, frags);
 }
 
-void collect_map_entries(window_type* const map_storage,
+void collect_map_entries(device_span<window_type> const map_storage,
                          device_span<EncColumnChunk> chunks,
                          rmm::cuda_stream_view stream)
 {
@@ -344,7 +347,7 @@ void collect_map_entries(window_type* const map_storage,
     <<<chunks.size(), block_size, 0, stream.value()>>>(map_storage, chunks);
 }
 
-void get_dictionary_indices(window_type* const map_storage,
+void get_dictionary_indices(device_span<window_type> const map_storage,
                             cudf::detail::device_2dspan<PageFragment const> frags,
                             rmm::cuda_stream_view stream)
 {
diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh
index 95d4471fac9..34dc74d1880 100644
--- a/cpp/src/io/parquet/parquet_gpu.cuh
+++ b/cpp/src/io/parquet/parquet_gpu.cuh
@@ -30,7 +30,7 @@ using key_type    = size_type;
 using mapped_type = size_type;
 using slot_type   = cuco::pair<key_type, mapped_type>;
 
-auto constexpr map_cg_size = 2;  ///< A CUDA Cooperative Group of 4 thread to handle each subset
+auto constexpr map_cg_size = 2;  ///< A CUDA Cooperative Group of 2 threads to handle each subset
 auto constexpr window_size = 1;  ///< Number of concurrent slots handled by each thread
 
 auto constexpr KEY_SENTINEL   = key_type{-1};
@@ -87,22 +87,22 @@ inline size_type __device__ row_to_value_idx(size_type idx,
 /**
  * @brief Insert chunk values into their respective hash maps
  *
- * @param map_storage Pointer to the bulk hashmap storage
+ * @param map_storage Bulk hashmap storage
  * @param frags Column fragments
  * @param stream CUDA stream to use
  */
-void populate_chunk_hash_maps(window_type* map_storage,
+void populate_chunk_hash_maps(device_span<window_type> const map_storage,
                               cudf::detail::device_2dspan<PageFragment const> frags,
                               rmm::cuda_stream_view stream);
 
 /**
  * @brief Compact dictionary hash map entries into chunk.dict_data
  *
- * @param map_storage Pointer to the bulk hashmap storage
+ * @param map_storage Bulk hashmap storage
  * @param chunks Flat span of chunks to compact hash maps for
  * @param stream CUDA stream to use
  */
-void collect_map_entries(window_type* map_storage,
+void collect_map_entries(device_span<window_type> const map_storage,
                          device_span<EncColumnChunk> chunks,
                          rmm::cuda_stream_view stream);
 
@@ -115,11 +115,11 @@ void collect_map_entries(window_type* map_storage,
  * Since dict_data itself contains indices into the original cudf column, this means that
  * col[row] == col[dict_data[dict_index[row - chunk.start_row]]]
  *
- * @param map_storage Pointer to the bulk hashmap storage
+ * @param map_storage Bulk hashmap storage
  * @param frags Column fragments
  * @param stream CUDA stream to use
  */
-void get_dictionary_indices(window_type* map_storage,
+void get_dictionary_indices(device_span<window_type> const map_storage,
                             cudf::detail::device_2dspan<PageFragment const> frags,
                             rmm::cuda_stream_view stream);
 
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index e319a39e464..f026cd047e8 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -1330,6 +1330,8 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
   // Initialize storage with the given sentinel iff non-zero size
   map_storage.initialize(cuco::pair{KEY_SENTINEL, VALUE_SENTINEL},
                          cuda::stream_ref{stream.value()});
+  // Create a span of non-const map_storage as map_storage_ref takes in a non-const pointer.
+  device_span<window_type> const map_storage_data{map_storage.data(), total_map_storage_size};
 
   // Populate chunk dictionary offsets
   std::for_each(
@@ -1343,7 +1345,7 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
   // Synchronize
   chunks.host_to_device_async(stream);
   // Populate the hash map for each chunk
-  populate_chunk_hash_maps(map_storage.data(), frags, stream);
+  populate_chunk_hash_maps(map_storage_data, frags, stream);
   // Synchronize again
   chunks.device_to_host_sync(stream);
 
@@ -1404,8 +1406,8 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
     chunk.dict_index          = inserted_dict_index.data();
   }
   chunks.host_to_device_async(stream);
-  collect_map_entries(map_storage.data(), chunks.device_view().flat_view(), stream);
-  get_dictionary_indices(map_storage.data(), frags, stream);
+  collect_map_entries(map_storage_data, chunks.device_view().flat_view(), stream);
+  get_dictionary_indices(map_storage_data, frags, stream);
 
   return std::pair(std::move(dict_data), std::move(dict_index));
 }

From 8f2e65091b23871e0ace6b0613e3112487f463dc Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed, 14 Aug 2024 23:05:43 +0000
Subject: [PATCH 15/30] Minor improvements

---
 cpp/src/io/parquet/parquet_gpu.cuh |  2 +-
 cpp/src/io/parquet/writer_impl.cu  | 42 +++++++++---------------------
 2 files changed, 13 insertions(+), 31 deletions(-)

diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh
index 34dc74d1880..0f3cfe1343b 100644
--- a/cpp/src/io/parquet/parquet_gpu.cuh
+++ b/cpp/src/io/parquet/parquet_gpu.cuh
@@ -30,7 +30,7 @@ using key_type    = size_type;
 using mapped_type = size_type;
 using slot_type   = cuco::pair<key_type, mapped_type>;
 
-auto constexpr map_cg_size = 2;  ///< A CUDA Cooperative Group of 2 threads to handle each subset
+auto constexpr map_cg_size = 1;  ///< A CUDA Cooperative Group of 2 threads to handle each subset
 auto constexpr window_size = 1;  ///< Number of concurrent slots handled by each thread
 
 auto constexpr KEY_SENTINEL   = key_type{-1};
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index f026cd047e8..e9efc8f7dde 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -1285,11 +1285,9 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
     return std::pair(std::move(dict_data), std::move(dict_index));
   }
 
-  // Create a vector to store valid chunk sizes using cuco::make_window_extent
-  std::vector<size_t> valid_chunk_sizes;
-  valid_chunk_sizes.reserve(h_chunks.size());
-
-  // Populate valid_chunk_sizes for chunks that need to build a dictionary.
+  // Variable to keep track of the current offset
+  uint32_t curr_offset = 0;
+  // Populate dict offsets and sizes for each chunk that need to build a dictionary.
   std::for_each(h_chunks.begin(), h_chunks.end(), [&](auto& chunk) {
     auto const& chunk_col_desc = col_desc[chunk.col_desc_id];
     auto const is_requested_non_dict =
@@ -1300,28 +1298,21 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
 
     if (is_type_non_dict || is_requested_non_dict) {
       chunk.use_dictionary = false;
-      // Emplace a zero for 1-1 mapping between h_chunks and valid_chunk_sizes
-      valid_chunk_sizes.emplace_back(static_cast<std::size_t>(0));
     } else {
       chunk.use_dictionary = true;
-      valid_chunk_sizes.emplace_back(
-        static_cast<std::size_t>(cuco::make_window_extent<map_cg_size, window_size>(
-          // cuCollections suggests using a hash map of size N * (1/0.7) = 1.43 to target a 70%
-          // occupancy factor.
-          static_cast<size_t>(chunk.num_values * 1.43))));
-      chunk.dict_map_size = valid_chunk_sizes.back();
+      // cuCollections suggests using a hash map of size N * (1/0.7) = 1.43 to target a 70%
+      // occupancy factor.
+      chunk.dict_map_size =
+        static_cast<cudf::size_type>(cuco::make_window_extent<map_cg_size, window_size>(
+          static_cast<cudf::size_type>(1.43 * chunk.num_values)));
+      chunk.dict_map_offset = curr_offset;
+      curr_offset += chunk.dict_map_size;
     }
   });
 
-  // Create a vector to map offsets from chunk sizes
-  std::vector<std::size_t> map_offsets(valid_chunk_sizes.size(), 0);
-  std::exclusive_scan(valid_chunk_sizes.begin(),
-                      valid_chunk_sizes.end(),
-                      map_offsets.begin(),
-                      static_cast<size_t>(0));
-
   // Compute total map storage
-  auto const total_map_storage_size = map_offsets.back() + valid_chunk_sizes.back();
+  auto const total_map_storage_size = static_cast<std::size_t>(curr_offset);
+
   // No chunk needs to create a dictionary, exit early
   if (total_map_storage_size == 0) { return {std::move(dict_data), std::move(dict_index)}; }
 
@@ -1333,15 +1324,6 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
   // Create a span of non-const map_storage as map_storage_ref takes in a non-const pointer.
   device_span<window_type> const map_storage_data{map_storage.data(), total_map_storage_size};
 
-  // Populate chunk dictionary offsets
-  std::for_each(
-    thrust::make_zip_iterator(thrust::make_tuple(h_chunks.begin(), map_offsets.begin())),
-    thrust::make_zip_iterator(thrust::make_tuple(h_chunks.end(), map_offsets.end())),
-    [&](auto elem) -> void {
-      auto& chunk = thrust::get<0>(elem);
-      if (chunk.use_dictionary) { chunk.dict_map_offset = thrust::get<1>(elem); }
-    });
-
   // Synchronize
   chunks.host_to_device_async(stream);
   // Populate the hash map for each chunk

From 7bcca4529373808b55c55d1c3c914951dcf05286 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed, 14 Aug 2024 23:30:24 +0000
Subject: [PATCH 16/30] Minor improvements

---
 cpp/src/io/parquet/chunk_dict.cu   | 45 +++++++++++-------------------
 cpp/src/io/parquet/parquet_gpu.cuh |  2 +-
 2 files changed, 17 insertions(+), 30 deletions(-)

diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index 62ba593f20f..6aa434eb59d 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -56,39 +56,32 @@ struct hash_functor {
 
 struct map_insert_fn {
   storage_ref_type const& storage_ref;
-
+  column_device_view const& col;
   template <typename T>
-  __device__ bool operator()(column_device_view const& col,
-                             cg::thread_block_tile<map_cg_size> const& tile,
-                             key_type i)
+  __device__ bool operator()(cg::thread_block_tile<map_cg_size> const& tile, key_type i)
   {
     if constexpr (column_device_view::has_element_accessor<T>()) {
       using equality_fn_type    = equality_functor<T>;
       using hash_fn_type        = hash_functor<T>;
       using probing_scheme_type = cuco::linear_probing<map_cg_size, hash_fn_type>;
 
-      // Instantiate hash and equality functors.
-      auto hash_fn  = hash_fn_type{col};
-      auto equal_fn = equality_fn_type{col};
-
       // Make a view of the hash map.
       cuco::static_map_ref<key_type,
                            mapped_type,
                            SCOPE,
                            equality_fn_type,
                            probing_scheme_type,
-                           storage_ref_type>
+                           storage_ref_type,
+                           cuco::op::insert_tag>
         hash_map_ref{cuco::empty_key{KEY_SENTINEL},
                      cuco::empty_value{VALUE_SENTINEL},
-                     {equal_fn},
-                     {hash_fn},
+                     equality_fn_type{col},
+                     hash_fn_type{col},
                      {},
                      storage_ref};
 
-      // Create another map ref with the insert operator.
-      auto map_insert_ref = hash_map_ref.with_operators(cuco::insert);
       // Insert into the hash map using the provided thread tile.
-      return map_insert_ref.insert(tile, cuco::pair{i, i});
+      return hash_map_ref.insert(tile, cuco::pair{i, i});
     } else {
       CUDF_UNREACHABLE("Unsupported type to insert in map");
     }
@@ -97,39 +90,33 @@ struct map_insert_fn {
 
 struct map_find_fn {
   storage_ref_type const& storage_ref;
-
+  column_device_view const& col;
   template <typename T>
   __device__ cuco::pair<key_type, mapped_type> operator()(
-    column_device_view const& col, cg::thread_block_tile<map_cg_size> const& tile, key_type i)
+    cg::thread_block_tile<map_cg_size> const& tile, key_type i)
   {
     if constexpr (column_device_view::has_element_accessor<T>()) {
       using equality_fn_type    = equality_functor<T>;
       using hash_fn_type        = hash_functor<T>;
       using probing_scheme_type = cuco::linear_probing<map_cg_size, hash_fn_type>;
 
-      // Instantiate hash and equality functors.
-      auto hash_fn  = hash_fn_type{col};
-      auto equal_fn = equality_fn_type{col};
-
       // Make a view of the hash map
       cuco::static_map_ref<key_type,
                            mapped_type,
                            SCOPE,
                            equality_fn_type,
                            probing_scheme_type,
-                           storage_ref_type>
+                           storage_ref_type,
+                           cuco::op::find_tag>
         hash_map_ref{cuco::empty_key{KEY_SENTINEL},
                      cuco::empty_value{VALUE_SENTINEL},
-                     {equal_fn},
-                     {hash_fn},
+                     equality_fn_type{col},
+                     hash_fn_type{col},
                      {},
                      storage_ref};
 
-      // Create another map with find operator.
-      auto map_find_ref = hash_map_ref.with_operators(cuco::find);
-
       // Find the key = i using the provided thread tile.
-      auto found_slot = map_find_ref.find(tile, i);
+      auto found_slot = hash_map_ref.find(tile, i);
 
       // Check if didn't find the previously inserted key.
       if (tile.thread_rank() == 0) {
@@ -198,7 +185,7 @@ CUDF_KERNEL void __launch_bounds__(block_size)
       if (is_valid) {
         // Insert the element to the map using the entire tile.
         auto const tile_is_unique = type_dispatcher(
-          data_col.type(), map_insert_fn{storage_ref}, data_col, tile, tile_val_idx);
+          data_col.type(), map_insert_fn{storage_ref, data_col}, tile, tile_val_idx);
 
         // tile_offset'th thread in the tile updates its number and size of unique element.
         if (tile.thread_rank() == tile_offset) {
@@ -319,7 +306,7 @@ CUDF_KERNEL void __launch_bounds__(block_size)
        val_idx += ntiles) {
     if (data_col.is_valid(val_idx)) {
       auto [found_key, found_value] =
-        type_dispatcher(data_col.type(), map_find_fn{storage_ref}, data_col, tile, val_idx);
+        type_dispatcher(data_col.type(), map_find_fn{storage_ref, data_col}, tile, val_idx);
       // First thread in the tile updates the dict_index
       if (tile.thread_rank() == 0) {
         // No need for atomic as this is not going to be modified by any other thread
diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh
index 0f3cfe1343b..34dc74d1880 100644
--- a/cpp/src/io/parquet/parquet_gpu.cuh
+++ b/cpp/src/io/parquet/parquet_gpu.cuh
@@ -30,7 +30,7 @@ using key_type    = size_type;
 using mapped_type = size_type;
 using slot_type   = cuco::pair<key_type, mapped_type>;
 
-auto constexpr map_cg_size = 1;  ///< A CUDA Cooperative Group of 2 threads to handle each subset
+auto constexpr map_cg_size = 2;  ///< A CUDA Cooperative Group of 2 threads to handle each subset
 auto constexpr window_size = 1;  ///< Number of concurrent slots handled by each thread
 
 auto constexpr KEY_SENTINEL   = key_type{-1};

From b0e482ed0657cd840c57dbebc741944b6a0ce19f Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Thu, 15 Aug 2024 02:12:43 +0000
Subject: [PATCH 17/30] Perf improvements

---
 cpp/src/io/parquet/chunk_dict.cu | 125 ++++++++++++++-----------------
 1 file changed, 56 insertions(+), 69 deletions(-)

diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index 6aa434eb59d..20773ab2e27 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -164,66 +164,57 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   __shared__ size_type total_num_dict_entries;
 
   // Insert all column chunk elements to the hash map to build the dict.
-  for (thread_index_type val_idx = s_start_value_idx;
-       val_idx + block.thread_rank() - block_size < end_value_idx;
-       val_idx += block_size) {
-    // Compute the index to the start of the tile.
+  for (thread_index_type val_idx = s_start_value_idx + tile.meta_group_rank();
+       val_idx - tile.meta_group_size() < end_value_idx;
+       val_idx += tile.meta_group_size()) {
     size_type is_unique      = 0;
     size_type uniq_elem_size = 0;
 
-    // Insert all elements within each tile.
-    for (auto tile_offset = 0; tile_offset < tile.num_threads(); tile_offset++) {
-      // Compute the index to the element being inserted within the tile.
-      auto const tile_val_idx =
-        val_idx + tile_offset + (tile.meta_group_rank() * tile.num_threads());
-
-      // Check if this index is valid.
-      auto const is_valid = tile_val_idx < end_value_idx and tile_val_idx < data_col.size() and
-                            data_col.is_valid(tile_val_idx);
-
-      // Insert tile_val_idx to hash map and count successful insertions.
-      if (is_valid) {
-        // Insert the element to the map using the entire tile.
-        auto const tile_is_unique = type_dispatcher(
-          data_col.type(), map_insert_fn{storage_ref, data_col}, tile, tile_val_idx);
-
-        // tile_offset'th thread in the tile updates its number and size of unique element.
-        if (tile.thread_rank() == tile_offset) {
-          is_unique      = tile_is_unique;
-          uniq_elem_size = [&]() -> size_type {
-            if (not is_unique) { return 0; }
-            switch (col->physical_type) {
-              case Type::INT32: return 4;
-              case Type::INT64: return 8;
-              case Type::INT96: return 12;
-              case Type::FLOAT: return 4;
-              case Type::DOUBLE: return 8;
-              case Type::BYTE_ARRAY: {
-                auto const col_type = data_col.type().id();
-                if (col_type == type_id::STRING) {
-                  // Strings are stored as 4 byte length + string bytes
-                  return 4 + data_col.element<string_view>(tile_val_idx).size_bytes();
-                } else if (col_type == type_id::LIST) {
-                  // Binary is stored as 4 byte length + bytes
-                  return 4 + get_element<statistics::byte_array_view>(data_col, tile_val_idx)
-                               .size_bytes();
-                }
-                CUDF_UNREACHABLE(
-                  "Byte array only supports string and list<byte> column types for dictionary "
-                  "encoding!");
+    // Check if this index is valid.
+    auto const is_valid =
+      val_idx < end_value_idx and val_idx < data_col.size() and data_col.is_valid(val_idx);
+
+    // Insert tile_val_idx to hash map and count successful insertions.
+    if (is_valid) {
+      // Insert the element to the map using the entire tile.
+      auto const tile_is_unique =
+        type_dispatcher(data_col.type(), map_insert_fn{storage_ref, data_col}, tile, val_idx);
+
+      // First thread in the tile updates its number and size of unique element.
+      if (tile.thread_rank() == 0) {
+        is_unique      = tile_is_unique;
+        uniq_elem_size = [&]() -> size_type {
+          if (not is_unique) { return 0; }
+          switch (col->physical_type) {
+            case Type::INT32: return 4;
+            case Type::INT64: return 8;
+            case Type::INT96: return 12;
+            case Type::FLOAT: return 4;
+            case Type::DOUBLE: return 8;
+            case Type::BYTE_ARRAY: {
+              auto const col_type = data_col.type().id();
+              if (col_type == type_id::STRING) {
+                // Strings are stored as 4 byte length + string bytes
+                return 4 + data_col.element<string_view>(val_idx).size_bytes();
+              } else if (col_type == type_id::LIST) {
+                // Binary is stored as 4 byte length + bytes
+                return 4 + get_element<statistics::byte_array_view>(data_col, val_idx).size_bytes();
               }
-              case Type::FIXED_LEN_BYTE_ARRAY:
-                if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); }
-                CUDF_UNREACHABLE(
-                  "Fixed length byte array only supports decimal 128 column types for dictionary "
-                  "encoding!");
-              default: CUDF_UNREACHABLE("Unsupported type for dictionary encoding");
+              CUDF_UNREACHABLE(
+                "Byte array only supports string and list<byte> column types for dictionary "
+                "encoding!");
             }
-          }();
-        }
+            case Type::FIXED_LEN_BYTE_ARRAY:
+              if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); }
+              CUDF_UNREACHABLE(
+                "Fixed length byte array only supports decimal 128 column types for dictionary "
+                "encoding!");
+            default: CUDF_UNREACHABLE("Unsupported type for dictionary encoding");
+          }
+        }();
       }
     }
-    // All elements in all tiles inserted, reduce num_unique and uniq_data_size across the block.
+    // Reduce num_unique and uniq_data_size from all tiles.
     auto num_unique = block_reduce(reduce_storage).Sum(is_unique);
     block.sync();
     auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size);
@@ -248,26 +239,22 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   auto& chunk = chunks[blockIdx.x];
   if (not chunk.use_dictionary) { return; }
 
-  auto const t = threadIdx.x;
-
   __shared__ cuda::atomic<size_type, SCOPE> counter;
   using cuda::std::memory_order_relaxed;
-  if (t == 0) { new (&counter) cuda::atomic<size_type, SCOPE>{0}; }
+  if (threadIdx.x == 0) { new (&counter) cuda::atomic<size_type, SCOPE>{0}; }
   __syncthreads();
 
-  for (size_type i = 0; i < chunk.dict_map_size; i += block_size) {
-    if (t + i < chunk.dict_map_size) {
-      auto* slot     = map_storage.data() + chunk.dict_map_offset + t + i;
-      auto const key = slot->data()->first;
-      if (key != KEY_SENTINEL) {
-        auto loc = counter.fetch_add(1, memory_order_relaxed);
-        cudf_assert(loc < MAX_DICT_SIZE && "Number of filled slots exceeds max dict size");
-        chunk.dict_data[loc] = key;
-        // If sorting dict page ever becomes a hard requirement, enable the following statement and
-        // add a dict sorting step before storing into the slot's second field.
-        // chunk.dict_data_idx[loc] = t + i;
-        slot->data()->second = loc;
-      }
+  for (size_type idx = threadIdx.x; idx < chunk.dict_map_size; idx += block_size) {
+    auto* slot     = map_storage.data() + chunk.dict_map_offset + idx;
+    auto const key = slot->data()->first;
+    if (key != KEY_SENTINEL) {
+      auto loc = counter.fetch_add(1, memory_order_relaxed);
+      cudf_assert(loc < MAX_DICT_SIZE && "Number of filled slots exceeds max dict size");
+      chunk.dict_data[loc] = key;
+      // If sorting dict page ever becomes a hard requirement, enable the following statement and
+      // add a dict sorting step before storing into the slot's second field.
+      // chunk.dict_data_idx[loc] = idx;
+      slot->data()->second = loc;
     }
   }
 }

From 4f51253e57aaf1eed02d119004defae3da89f197 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Thu, 15 Aug 2024 03:01:59 +0000
Subject: [PATCH 18/30] Cosmetic improvements

---
 cpp/src/io/parquet/chunk_dict.cu | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index 20773ab2e27..46f7acb2a37 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -86,7 +86,7 @@ struct map_insert_fn {
       CUDF_UNREACHABLE("Unsupported type to insert in map");
     }
   }
-};  // namespace cudf::io::parquet::detail
+};
 
 struct map_find_fn {
   storage_ref_type const& storage_ref;
@@ -116,13 +116,12 @@ struct map_find_fn {
                      storage_ref};
 
       // Find the key = i using the provided thread tile.
-      auto found_slot = hash_map_ref.find(tile, i);
+      auto const found_slot = hash_map_ref.find(tile, i);
 
       // Check if didn't find the previously inserted key.
-      if (tile.thread_rank() == 0) {
-        cudf_assert(found_slot != map_find_ref.end() &&
-                    "Unable to find value in map in dictionary index construction");
-      }
+      cudf_assert(found_slot != hash_map_ref.end() &&
+                  "Unable to find value in map in dictionary index construction");
+
       // Return a pair of the found key and value.
       return {found_slot->first, found_slot->second};
     } else {

From 16fa57ead03d4f3dd5d716e8627f17b8337806b6 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Thu, 15 Aug 2024 03:10:01 +0000
Subject: [PATCH 19/30] Change cg_size to 1 for best perf so far

---
 cpp/src/io/parquet/parquet_gpu.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh
index 34dc74d1880..0f3cfe1343b 100644
--- a/cpp/src/io/parquet/parquet_gpu.cuh
+++ b/cpp/src/io/parquet/parquet_gpu.cuh
@@ -30,7 +30,7 @@ using key_type    = size_type;
 using mapped_type = size_type;
 using slot_type   = cuco::pair<key_type, mapped_type>;
 
-auto constexpr map_cg_size = 2;  ///< A CUDA Cooperative Group of 2 threads to handle each subset
+auto constexpr map_cg_size = 1;  ///< A CUDA Cooperative Group of 2 threads to handle each subset
 auto constexpr window_size = 1;  ///< Number of concurrent slots handled by each thread
 
 auto constexpr KEY_SENTINEL   = key_type{-1};

From e394a7108caec5be54323c6623336353bddae38a Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Thu, 15 Aug 2024 19:01:12 +0000
Subject: [PATCH 20/30] Incorporate for window_size

---
 cpp/src/io/parquet/chunk_dict.cu   | 22 ++++++++++++----------
 cpp/src/io/parquet/parquet_gpu.cuh |  4 ++--
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index 46f7acb2a37..70ddb8a230f 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -244,16 +244,18 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   __syncthreads();
 
   for (size_type idx = threadIdx.x; idx < chunk.dict_map_size; idx += block_size) {
-    auto* slot     = map_storage.data() + chunk.dict_map_offset + idx;
-    auto const key = slot->data()->first;
-    if (key != KEY_SENTINEL) {
-      auto loc = counter.fetch_add(1, memory_order_relaxed);
-      cudf_assert(loc < MAX_DICT_SIZE && "Number of filled slots exceeds max dict size");
-      chunk.dict_data[loc] = key;
-      // If sorting dict page ever becomes a hard requirement, enable the following statement and
-      // add a dict sorting step before storing into the slot's second field.
-      // chunk.dict_data_idx[loc] = idx;
-      slot->data()->second = loc;
+    auto* window = map_storage.data() + chunk.dict_map_offset + idx;
+    for (auto& slot : *window) {
+      auto const key = slot.first;
+      if (key != KEY_SENTINEL) {
+        auto loc = counter.fetch_add(1, memory_order_relaxed);
+        cudf_assert(loc < MAX_DICT_SIZE && "Number of filled slots exceeds max dict size");
+        chunk.dict_data[loc] = key;
+        // If sorting dict page ever becomes a hard requirement, enable the following statement and
+        // add a dict sorting step before storing into the slot's second field.
+        // chunk.dict_data_idx[loc] = idx;
+        slot.second = loc;
+      }
     }
   }
 }
diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh
index 0f3cfe1343b..1904b98a87f 100644
--- a/cpp/src/io/parquet/parquet_gpu.cuh
+++ b/cpp/src/io/parquet/parquet_gpu.cuh
@@ -30,8 +30,8 @@ using key_type    = size_type;
 using mapped_type = size_type;
 using slot_type   = cuco::pair<key_type, mapped_type>;
 
-auto constexpr map_cg_size = 1;  ///< A CUDA Cooperative Group of 2 threads to handle each subset
-auto constexpr window_size = 1;  ///< Number of concurrent slots handled by each thread
+auto constexpr map_cg_size = 2;  ///< A CUDA Cooperative Group of 2 threads to handle each subset
+auto constexpr window_size = 2;  ///< Number of concurrent slots handled by each thread
 
 auto constexpr KEY_SENTINEL   = key_type{-1};
 auto constexpr VALUE_SENTINEL = mapped_type{-1};

From 3e6a0b7e6551ea3ccface05847b01a52ecd8fffd Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Thu, 15 Aug 2024 23:41:21 +0000
Subject: [PATCH 21/30] Use double hashing

---
 cpp/src/io/parquet/chunk_dict.cu   | 117 ++++++++++++++++-------------
 cpp/src/io/parquet/parquet_gpu.cuh |   6 +-
 cpp/src/io/parquet/writer_impl.cu  |   9 +--
 3 files changed, 70 insertions(+), 62 deletions(-)

diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index 70ddb8a230f..72592399575 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -48,9 +48,10 @@ struct equality_functor {
 template <typename T>
 struct hash_functor {
   column_device_view const& col;
+  uint32_t seed = 0;
   __device__ auto operator()(key_type idx) const
   {
-    return cudf::hashing::detail::MurmurHash3_x86_32<T>{}(col.element<T>(idx));
+    return cudf::hashing::detail::MurmurHash3_x86_32<T>{seed}(col.element<T>(idx));
   }
 };
 
@@ -58,12 +59,12 @@ struct map_insert_fn {
   storage_ref_type const& storage_ref;
   column_device_view const& col;
   template <typename T>
-  __device__ bool operator()(cg::thread_block_tile<map_cg_size> const& tile, key_type i)
+  __device__ bool operator()(key_type i, cg::thread_block_tile<map_cg_size> const& tile)
   {
     if constexpr (column_device_view::has_element_accessor<T>()) {
       using equality_fn_type    = equality_functor<T>;
       using hash_fn_type        = hash_functor<T>;
-      using probing_scheme_type = cuco::linear_probing<map_cg_size, hash_fn_type>;
+      using probing_scheme_type = cuco::double_hashing<map_cg_size, hash_fn_type>;
 
       // Make a view of the hash map.
       cuco::static_map_ref<key_type,
@@ -75,8 +76,8 @@ struct map_insert_fn {
                            cuco::op::insert_tag>
         hash_map_ref{cuco::empty_key{KEY_SENTINEL},
                      cuco::empty_value{VALUE_SENTINEL},
-                     equality_fn_type{col},
-                     hash_fn_type{col},
+                     {col},
+                     {{col}, {col, 1}},
                      {},
                      storage_ref};
 
@@ -93,14 +94,14 @@ struct map_find_fn {
   column_device_view const& col;
   template <typename T>
   __device__ cuco::pair<key_type, mapped_type> operator()(
-    cg::thread_block_tile<map_cg_size> const& tile, key_type i)
+    key_type i, cg::thread_block_tile<map_cg_size> const& tile)
   {
     if constexpr (column_device_view::has_element_accessor<T>()) {
       using equality_fn_type    = equality_functor<T>;
       using hash_fn_type        = hash_functor<T>;
-      using probing_scheme_type = cuco::linear_probing<map_cg_size, hash_fn_type>;
+      using probing_scheme_type = cuco::double_hashing<map_cg_size, hash_fn_type>;
 
-      // Make a view of the hash map
+      // Make a view of the hash map.
       cuco::static_map_ref<key_type,
                            mapped_type,
                            SCOPE,
@@ -110,8 +111,8 @@ struct map_find_fn {
                            cuco::op::find_tag>
         hash_map_ref{cuco::empty_key{KEY_SENTINEL},
                      cuco::empty_value{VALUE_SENTINEL},
-                     equality_fn_type{col},
-                     hash_fn_type{col},
+                     {col},
+                     {{col}, {col, 1}},
                      {},
                      storage_ref};
 
@@ -163,54 +164,62 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   __shared__ size_type total_num_dict_entries;
 
   // Insert all column chunk elements to the hash map to build the dict.
-  for (thread_index_type val_idx = s_start_value_idx + tile.meta_group_rank();
-       val_idx - tile.meta_group_size() < end_value_idx;
-       val_idx += tile.meta_group_size()) {
+  for (thread_index_type val_idx = s_start_value_idx;
+       val_idx + block.thread_rank() - block_size < end_value_idx;
+       val_idx += block_size) {
     size_type is_unique      = 0;
     size_type uniq_elem_size = 0;
 
-    // Check if this index is valid.
-    auto const is_valid =
-      val_idx < end_value_idx and val_idx < data_col.size() and data_col.is_valid(val_idx);
-
-    // Insert tile_val_idx to hash map and count successful insertions.
-    if (is_valid) {
-      // Insert the element to the map using the entire tile.
-      auto const tile_is_unique =
-        type_dispatcher(data_col.type(), map_insert_fn{storage_ref, data_col}, tile, val_idx);
-
-      // First thread in the tile updates its number and size of unique element.
-      if (tile.thread_rank() == 0) {
-        is_unique      = tile_is_unique;
-        uniq_elem_size = [&]() -> size_type {
-          if (not is_unique) { return 0; }
-          switch (col->physical_type) {
-            case Type::INT32: return 4;
-            case Type::INT64: return 8;
-            case Type::INT96: return 12;
-            case Type::FLOAT: return 4;
-            case Type::DOUBLE: return 8;
-            case Type::BYTE_ARRAY: {
-              auto const col_type = data_col.type().id();
-              if (col_type == type_id::STRING) {
-                // Strings are stored as 4 byte length + string bytes
-                return 4 + data_col.element<string_view>(val_idx).size_bytes();
-              } else if (col_type == type_id::LIST) {
-                // Binary is stored as 4 byte length + bytes
-                return 4 + get_element<statistics::byte_array_view>(data_col, val_idx).size_bytes();
+    // Each tile inserts a portion of the elements to the hash map.
+    for (auto tile_offset = 0; tile_offset < tile.num_threads(); tile_offset++) {
+      // Compute the index of the value currently being inserted by this tile.
+      auto const tile_val_idx =
+        val_idx + tile_offset + (tile.meta_group_rank() * tile.num_threads());
+
+      // Check if this index is valid.
+      auto const is_valid = tile_val_idx < end_value_idx and tile_val_idx < data_col.size() and
+                            data_col.is_valid(tile_val_idx);
+
+      // Insert tile_val_idx to hash map and count successful insertions.
+      if (is_valid) {
+        // Insert the element to the map using the entire tile.
+        auto const tile_is_unique = type_dispatcher(
+          data_col.type(), map_insert_fn{storage_ref, data_col}, tile_val_idx, tile);
+
+        // tile_offset'th thread in the tile updates its number and size of unique element.
+        if (tile.thread_rank() == tile_offset) {
+          is_unique      = tile_is_unique;
+          uniq_elem_size = [&]() -> size_type {
+            if (not is_unique) { return 0; }
+            switch (col->physical_type) {
+              case Type::INT32: return 4;
+              case Type::INT64: return 8;
+              case Type::INT96: return 12;
+              case Type::FLOAT: return 4;
+              case Type::DOUBLE: return 8;
+              case Type::BYTE_ARRAY: {
+                auto const col_type = data_col.type().id();
+                if (col_type == type_id::STRING) {
+                  // Strings are stored as 4 byte length + string bytes
+                  return 4 + data_col.element<string_view>(tile_val_idx).size_bytes();
+                } else if (col_type == type_id::LIST) {
+                  // Binary is stored as 4 byte length + bytes
+                  return 4 + get_element<statistics::byte_array_view>(data_col, tile_val_idx)
+                               .size_bytes();
+                }
+                CUDF_UNREACHABLE(
+                  "Byte array only supports string and list<byte> column types for dictionary "
+                  "encoding!");
               }
-              CUDF_UNREACHABLE(
-                "Byte array only supports string and list<byte> column types for dictionary "
-                "encoding!");
+              case Type::FIXED_LEN_BYTE_ARRAY:
+                if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); }
+                CUDF_UNREACHABLE(
+                  "Fixed length byte array only supports decimal 128 column types for dictionary "
+                  "encoding!");
+              default: CUDF_UNREACHABLE("Unsupported type for dictionary encoding");
             }
-            case Type::FIXED_LEN_BYTE_ARRAY:
-              if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); }
-              CUDF_UNREACHABLE(
-                "Fixed length byte array only supports decimal 128 column types for dictionary "
-                "encoding!");
-            default: CUDF_UNREACHABLE("Unsupported type for dictionary encoding");
-          }
-        }();
+          }();
+        }
       }
     }
     // Reduce num_unique and uniq_data_size from all tiles.
@@ -294,7 +303,7 @@ CUDF_KERNEL void __launch_bounds__(block_size)
        val_idx += ntiles) {
     if (data_col.is_valid(val_idx)) {
       auto [found_key, found_value] =
-        type_dispatcher(data_col.type(), map_find_fn{storage_ref, data_col}, tile, val_idx);
+        type_dispatcher(data_col.type(), map_find_fn{storage_ref, data_col}, val_idx, tile);
       // First thread in the tile updates the dict_index
       if (tile.thread_rank() == 0) {
         // No need for atomic as this is not going to be modified by any other thread
diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh
index 1904b98a87f..2c5c41d642d 100644
--- a/cpp/src/io/parquet/parquet_gpu.cuh
+++ b/cpp/src/io/parquet/parquet_gpu.cuh
@@ -30,8 +30,10 @@ using key_type    = size_type;
 using mapped_type = size_type;
 using slot_type   = cuco::pair<key_type, mapped_type>;
 
-auto constexpr map_cg_size = 2;  ///< A CUDA Cooperative Group of 2 threads to handle each subset
-auto constexpr window_size = 2;  ///< Number of concurrent slots handled by each thread
+auto constexpr map_cg_size = 1;  ///< A CUDA Cooperative Group of 1 thread(s) to handle each subset
+auto constexpr window_size = 1;  ///< Number of concurrent slots handled by each thread
+auto constexpr occupancy_factor = 1.43f;  ///< cuCollections suggests using a hash map of size N *
+                                          ///< (1/0.7) = 1.43 to target a 70% occupancy factor.
 
 auto constexpr KEY_SENTINEL   = key_type{-1};
 auto constexpr VALUE_SENTINEL = mapped_type{-1};
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index e9efc8f7dde..ad1b39e0fe7 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -1300,11 +1300,9 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
       chunk.use_dictionary = false;
     } else {
       chunk.use_dictionary = true;
-      // cuCollections suggests using a hash map of size N * (1/0.7) = 1.43 to target a 70%
-      // occupancy factor.
       chunk.dict_map_size =
         static_cast<cudf::size_type>(cuco::make_window_extent<map_cg_size, window_size>(
-          static_cast<cudf::size_type>(1.43 * chunk.num_values)));
+          static_cast<cudf::size_type>(occupancy_factor * chunk.num_values)));
       chunk.dict_map_offset = curr_offset;
       curr_offset += chunk.dict_map_size;
     }
@@ -1318,14 +1316,13 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
 
   // Create a single bulk storage used by all sub-dictionaries
   auto map_storage = storage_type{total_map_storage_size};
-  // Initialize storage with the given sentinel iff non-zero size
-  map_storage.initialize(cuco::pair{KEY_SENTINEL, VALUE_SENTINEL},
-                         cuda::stream_ref{stream.value()});
   // Create a span of non-const map_storage as map_storage_ref takes in a non-const pointer.
   device_span<window_type> const map_storage_data{map_storage.data(), total_map_storage_size};
 
   // Synchronize
   chunks.host_to_device_async(stream);
+  // Initialize storage with the given sentinel
+  map_storage.initialize({KEY_SENTINEL, VALUE_SENTINEL}, {stream.value()});
   // Populate the hash map for each chunk
   populate_chunk_hash_maps(map_storage_data, frags, stream);
   // Synchronize again

From 27bee058782a2e28d2b1e160eb31429d3765c3f3 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Fri, 16 Aug 2024 07:28:01 +0000
Subject: [PATCH 22/30] Avoid reconstructing hash_map_ref again and again.

---
 cpp/src/io/parquet/chunk_dict.cu | 257 ++++++++++++++++---------------
 1 file changed, 135 insertions(+), 122 deletions(-)

diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index 72592399575..0a5769d0dd8 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -55,13 +55,21 @@ struct hash_functor {
   }
 };
 
+template <int block_size>
 struct map_insert_fn {
   storage_ref_type const& storage_ref;
-  column_device_view const& col;
+  EncColumnChunk* const& chunk;
+
   template <typename T>
-  __device__ bool operator()(key_type i, cg::thread_block_tile<map_cg_size> const& tile)
+  __device__ void operator()(size_type const s_start_value_idx, size_type const end_value_idx)
   {
     if constexpr (column_device_view::has_element_accessor<T>()) {
+      using block_reduce = cub::BlockReduce<size_type, block_size>;
+      __shared__ typename block_reduce::TempStorage reduce_storage;
+
+      auto const col                     = chunk->col_desc;
+      column_device_view const& data_col = *col->leaf_column;
+
       using equality_fn_type    = equality_functor<T>;
       using hash_fn_type        = hash_functor<T>;
       using probing_scheme_type = cuco::double_hashing<map_cg_size, hash_fn_type>;
@@ -76,13 +84,91 @@ struct map_insert_fn {
                            cuco::op::insert_tag>
         hash_map_ref{cuco::empty_key{KEY_SENTINEL},
                      cuco::empty_value{VALUE_SENTINEL},
-                     {col},
-                     {{col}, {col, 1}},
+                     {data_col},
+                     {{data_col}, {data_col, 1}},
                      {},
                      storage_ref};
 
-      // Insert into the hash map using the provided thread tile.
-      return hash_map_ref.insert(tile, cuco::pair{i, i});
+      __shared__ size_type total_num_dict_entries;
+
+      auto const block = cg::this_thread_block();
+      auto const tile  = cg::tiled_partition<map_cg_size>(block);
+
+      // Insert all column chunk elements to the hash map to build the dict.
+      for (thread_index_type val_idx = s_start_value_idx;
+           val_idx + block.thread_rank() - block_size < end_value_idx;
+           val_idx += block_size) {
+        size_type is_unique      = 0;
+        size_type uniq_elem_size = 0;
+
+        // Each tile inserts a portion of the elements to the hash map.
+        for (auto tile_offset = 0; tile_offset < tile.num_threads(); tile_offset++) {
+          // Compute the index of the value currently being inserted by this tile.
+          auto const tile_val_idx =
+            val_idx + tile_offset + (tile.meta_group_rank() * tile.num_threads());
+          // Check if this index is valid.
+          auto const is_valid = tile_val_idx < end_value_idx and tile_val_idx < data_col.size() and
+                                data_col.is_valid(tile_val_idx);
+
+          // Insert tile_val_idx to hash map and count successful insertions.
+          if (is_valid) {
+            // Insert the element to the map using the entire tile.
+            auto const tile_is_unique =
+              // Insert into the hash map using the provided thread tile.
+              hash_map_ref.insert(tile, cuco::pair{tile_val_idx, tile_val_idx});
+            // tile_offset'th thread in the tile updates its number and size of unique element.
+            if (tile.thread_rank() == tile_offset) {
+              is_unique      = tile_is_unique;
+              uniq_elem_size = [&]() -> size_type {
+                if (not is_unique) { return 0; }
+                switch (col->physical_type) {
+                  case Type::INT32: return 4;
+                  case Type::INT64: return 8;
+                  case Type::INT96: return 12;
+                  case Type::FLOAT: return 4;
+                  case Type::DOUBLE: return 8;
+                  case Type::BYTE_ARRAY: {
+                    auto const col_type = data_col.type().id();
+                    if (col_type == type_id::STRING) {
+                      // Strings are stored as 4 byte length + string bytes
+                      return 4 + data_col.element<string_view>(tile_val_idx).size_bytes();
+                    } else if (col_type == type_id::LIST) {
+                      // Binary is stored as 4 byte length + bytes
+                      return 4 + get_element<statistics::byte_array_view>(data_col, tile_val_idx)
+                                   .size_bytes();
+                    }
+                    CUDF_UNREACHABLE(
+                      "Byte array only supports string and list<byte> column types for dictionary "
+                      "encoding!");
+                  }
+                  case Type::FIXED_LEN_BYTE_ARRAY:
+                    if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); }
+                    CUDF_UNREACHABLE(
+                      "Fixed length byte array only supports decimal 128 column types for "
+                      "dictionary "
+                      "encoding!");
+                  default: CUDF_UNREACHABLE("Unsupported type for dictionary encoding");
+                }
+              }();
+            }
+          }
+        }
+        // Reduce num_unique and uniq_data_size from all tiles.
+        auto num_unique = block_reduce(reduce_storage).Sum(is_unique);
+        block.sync();
+        auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size);
+        // The first thread in the block atomically updates total num_unique and uniq_data_size
+        if (block.thread_rank() == 0) {
+          total_num_dict_entries = atomicAdd(&chunk->num_dict_entries, num_unique);
+          total_num_dict_entries += num_unique;
+          atomicAdd(&chunk->uniq_data_size, uniq_data_size);
+        }
+        block.sync();
+
+        // Check if the num unique values in chunk has already exceeded max dict size and early
+        // exit
+        if (total_num_dict_entries > MAX_DICT_SIZE) { return; }
+      }  // for loop
     } else {
       CUDF_UNREACHABLE("Unsupported type to insert in map");
     }
@@ -91,12 +177,16 @@ struct map_insert_fn {
 
 struct map_find_fn {
   storage_ref_type const& storage_ref;
-  column_device_view const& col;
+  EncColumnChunk* const& chunk;
   template <typename T>
-  __device__ cuco::pair<key_type, mapped_type> operator()(
-    key_type i, cg::thread_block_tile<map_cg_size> const& tile)
+  __device__ void operator()(size_type const s_start_value_idx,
+                             size_type const end_value_idx,
+                             size_type const s_ck_start_val_idx)
   {
     if constexpr (column_device_view::has_element_accessor<T>()) {
+      auto const col                     = chunk->col_desc;
+      column_device_view const& data_col = *col->leaf_column;
+
       using equality_fn_type    = equality_functor<T>;
       using hash_fn_type        = hash_functor<T>;
       using probing_scheme_type = cuco::double_hashing<map_cg_size, hash_fn_type>;
@@ -111,20 +201,34 @@ struct map_find_fn {
                            cuco::op::find_tag>
         hash_map_ref{cuco::empty_key{KEY_SENTINEL},
                      cuco::empty_value{VALUE_SENTINEL},
-                     {col},
-                     {{col}, {col, 1}},
+                     {data_col},
+                     {{data_col}, {data_col, 1}},
                      {},
                      storage_ref};
 
       // Find the key = i using the provided thread tile.
-      auto const found_slot = hash_map_ref.find(tile, i);
-
-      // Check if didn't find the previously inserted key.
-      cudf_assert(found_slot != hash_map_ref.end() &&
-                  "Unable to find value in map in dictionary index construction");
-
-      // Return a pair of the found key and value.
-      return {found_slot->first, found_slot->second};
+      auto const tile   = cg::tiled_partition<map_cg_size>(cg::this_thread_block());
+      auto const ntiles = tile.meta_group_size();
+
+      for (thread_index_type val_idx = s_start_value_idx + tile.meta_group_rank();
+           val_idx < end_value_idx;
+           val_idx += tile.meta_group_size()) {
+        if (data_col.is_valid(val_idx)) {
+          // No need for atomic as this is not going to be modified by any other thread
+          auto const value = [&]() {
+            auto const found_slot = hash_map_ref.find(tile, val_idx);
+
+            // Check if didn't find the previously inserted key.
+            cudf_assert(found_slot != hash_map_ref.end() &&
+                        "Unable to find value in map in dictionary index construction");
+
+            // Return a pair of the found key and value.
+            return found_slot->second;
+          }();
+          // First thread in the tile updates the dict_index
+          if (tile.thread_rank() == 0) { chunk->dict_index[val_idx - s_ck_start_val_idx] = value; }
+        }
+      }
     } else {
       CUDF_UNREACHABLE("Unsupported type to find in map");
     }
@@ -144,12 +248,6 @@ CUDF_KERNEL void __launch_bounds__(block_size)
 
   if (not chunk->use_dictionary) { return; }
 
-  using block_reduce = cub::BlockReduce<size_type, block_size>;
-  __shared__ typename block_reduce::TempStorage reduce_storage;
-
-  auto const block = cg::this_thread_block();
-  auto const tile  = cg::tiled_partition<map_cg_size>(block);
-
   size_type start_row = frag.start_row;
   size_type end_row   = frag.start_row + frag.num_rows;
 
@@ -160,83 +258,10 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   column_device_view const& data_col = *col->leaf_column;
   storage_ref_type const storage_ref{chunk->dict_map_size,
                                      map_storage.data() + chunk->dict_map_offset};
-
-  __shared__ size_type total_num_dict_entries;
-
-  // Insert all column chunk elements to the hash map to build the dict.
-  for (thread_index_type val_idx = s_start_value_idx;
-       val_idx + block.thread_rank() - block_size < end_value_idx;
-       val_idx += block_size) {
-    size_type is_unique      = 0;
-    size_type uniq_elem_size = 0;
-
-    // Each tile inserts a portion of the elements to the hash map.
-    for (auto tile_offset = 0; tile_offset < tile.num_threads(); tile_offset++) {
-      // Compute the index of the value currently being inserted by this tile.
-      auto const tile_val_idx =
-        val_idx + tile_offset + (tile.meta_group_rank() * tile.num_threads());
-
-      // Check if this index is valid.
-      auto const is_valid = tile_val_idx < end_value_idx and tile_val_idx < data_col.size() and
-                            data_col.is_valid(tile_val_idx);
-
-      // Insert tile_val_idx to hash map and count successful insertions.
-      if (is_valid) {
-        // Insert the element to the map using the entire tile.
-        auto const tile_is_unique = type_dispatcher(
-          data_col.type(), map_insert_fn{storage_ref, data_col}, tile_val_idx, tile);
-
-        // tile_offset'th thread in the tile updates its number and size of unique element.
-        if (tile.thread_rank() == tile_offset) {
-          is_unique      = tile_is_unique;
-          uniq_elem_size = [&]() -> size_type {
-            if (not is_unique) { return 0; }
-            switch (col->physical_type) {
-              case Type::INT32: return 4;
-              case Type::INT64: return 8;
-              case Type::INT96: return 12;
-              case Type::FLOAT: return 4;
-              case Type::DOUBLE: return 8;
-              case Type::BYTE_ARRAY: {
-                auto const col_type = data_col.type().id();
-                if (col_type == type_id::STRING) {
-                  // Strings are stored as 4 byte length + string bytes
-                  return 4 + data_col.element<string_view>(tile_val_idx).size_bytes();
-                } else if (col_type == type_id::LIST) {
-                  // Binary is stored as 4 byte length + bytes
-                  return 4 + get_element<statistics::byte_array_view>(data_col, tile_val_idx)
-                               .size_bytes();
-                }
-                CUDF_UNREACHABLE(
-                  "Byte array only supports string and list<byte> column types for dictionary "
-                  "encoding!");
-              }
-              case Type::FIXED_LEN_BYTE_ARRAY:
-                if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); }
-                CUDF_UNREACHABLE(
-                  "Fixed length byte array only supports decimal 128 column types for dictionary "
-                  "encoding!");
-              default: CUDF_UNREACHABLE("Unsupported type for dictionary encoding");
-            }
-          }();
-        }
-      }
-    }
-    // Reduce num_unique and uniq_data_size from all tiles.
-    auto num_unique = block_reduce(reduce_storage).Sum(is_unique);
-    block.sync();
-    auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size);
-    // The first thread in the block atomically updates total num_unique and uniq_data_size
-    if (block.thread_rank() == 0) {
-      total_num_dict_entries = atomicAdd(&chunk->num_dict_entries, num_unique);
-      total_num_dict_entries += num_unique;
-      atomicAdd(&chunk->uniq_data_size, uniq_data_size);
-    }
-    block.sync();
-
-    // Check if the num unique values in chunk has already exceeded max dict size and early exit
-    if (total_num_dict_entries > MAX_DICT_SIZE) { return; }
-  }  // for loop
+  type_dispatcher(data_col.type(),
+                  map_insert_fn<block_size>{storage_ref, chunk},
+                  s_start_value_idx,
+                  end_value_idx);
 }
 
 template <int block_size>
@@ -260,8 +285,8 @@ CUDF_KERNEL void __launch_bounds__(block_size)
         auto loc = counter.fetch_add(1, memory_order_relaxed);
         cudf_assert(loc < MAX_DICT_SIZE && "Number of filled slots exceeds max dict size");
         chunk.dict_data[loc] = key;
-        // If sorting dict page ever becomes a hard requirement, enable the following statement and
-        // add a dict sorting step before storing into the slot's second field.
+        // If sorting dict page ever becomes a hard requirement, enable the following statement
+        // and add a dict sorting step before storing into the slot's second field.
         // chunk.dict_data_idx[loc] = idx;
         slot.second = loc;
       }
@@ -278,17 +303,13 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   auto const block_x = blockIdx.x;
   auto const frag    = frags[col_idx][block_x];
   auto chunk         = frag.chunk;
-  auto const col     = chunk->col_desc;
 
   if (not chunk->use_dictionary) { return; }
 
-  auto const block  = cg::this_thread_block();
-  auto const tile   = cg::tiled_partition<map_cg_size>(block);
-  auto const ntiles = tile.meta_group_size();
-
   size_type start_row = frag.start_row;
   size_type end_row   = frag.start_row + frag.num_rows;
 
+  auto const col = chunk->col_desc;
   // Find the bounds of values in leaf column to be searched in the map for current chunk
   auto const s_start_value_idx  = row_to_value_idx(start_row, *col);
   auto const s_ck_start_val_idx = row_to_value_idx(chunk->start_row, *col);
@@ -298,19 +319,11 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   storage_ref_type const storage_ref{chunk->dict_map_size,
                                      map_storage.data() + chunk->dict_map_offset};
 
-  for (thread_index_type val_idx = s_start_value_idx + tile.meta_group_rank();
-       val_idx < end_value_idx;
-       val_idx += ntiles) {
-    if (data_col.is_valid(val_idx)) {
-      auto [found_key, found_value] =
-        type_dispatcher(data_col.type(), map_find_fn{storage_ref, data_col}, val_idx, tile);
-      // First thread in the tile updates the dict_index
-      if (tile.thread_rank() == 0) {
-        // No need for atomic as this is not going to be modified by any other thread
-        chunk->dict_index[val_idx - s_ck_start_val_idx] = found_value;
-      }
-    }
-  }
+  type_dispatcher(data_col.type(),
+                  map_find_fn{storage_ref, chunk},
+                  s_start_value_idx,
+                  end_value_idx,
+                  s_ck_start_val_idx);
 }
 
 void populate_chunk_hash_maps(device_span<window_type> const map_storage,

From fd2cb7f768a86ddee36b3f397ab0794d685e085c Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Fri, 16 Aug 2024 23:11:30 +0000
Subject: [PATCH 23/30] Add allocator to map's aow_storage

---
 cpp/src/io/parquet/parquet_gpu.cuh | 6 +++++-
 cpp/src/io/parquet/writer_impl.cu  | 6 ++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh
index 2c5c41d642d..35d9c7846aa 100644
--- a/cpp/src/io/parquet/parquet_gpu.cuh
+++ b/cpp/src/io/parquet/parquet_gpu.cuh
@@ -18,6 +18,7 @@
 
 #include "parquet_gpu.hpp"
 
+#include <cudf/detail/cuco_helpers.hpp>
 #include <cudf/lists/lists_column_device_view.cuh>
 #include <cudf/types.hpp>
 
@@ -39,7 +40,10 @@ auto constexpr KEY_SENTINEL   = key_type{-1};
 auto constexpr VALUE_SENTINEL = mapped_type{-1};
 auto constexpr SCOPE          = cuda::thread_scope_block;
 
-using storage_type     = cuco::aow_storage<slot_type, window_size>;
+using storage_type     = cuco::aow_storage<slot_type,
+                                       window_size,
+                                       cuco::extent<std::size_t>,
+                                       cudf::detail::cuco_allocator<char>>;
 using storage_ref_type = typename storage_type::ref_type;
 using window_type      = typename storage_type::window_type;
 
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index ad1b39e0fe7..14504a4ef1c 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -1315,14 +1315,16 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
   if (total_map_storage_size == 0) { return {std::move(dict_data), std::move(dict_index)}; }
 
   // Create a single bulk storage used by all sub-dictionaries
-  auto map_storage = storage_type{total_map_storage_size};
+  auto map_storage = storage_type{
+    total_map_storage_size,
+    cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream}};
   // Create a span of non-const map_storage as map_storage_ref takes in a non-const pointer.
   device_span<window_type> const map_storage_data{map_storage.data(), total_map_storage_size};
 
   // Synchronize
   chunks.host_to_device_async(stream);
   // Initialize storage with the given sentinel
-  map_storage.initialize({KEY_SENTINEL, VALUE_SENTINEL}, {stream.value()});
+  map_storage.initialize_async({KEY_SENTINEL, VALUE_SENTINEL}, {stream.value()});
   // Populate the hash map for each chunk
   populate_chunk_hash_maps(map_storage_data, frags, stream);
   // Synchronize again

From 39c2a3514d448e11086f184453c378ed2b827ccd Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Sat, 17 Aug 2024 00:55:47 +0000
Subject: [PATCH 24/30] Perf optimization

---
 cpp/src/io/parquet/chunk_dict.cu   | 168 +++++++++++++----------------
 cpp/src/io/parquet/parquet_gpu.cuh |  11 +-
 2 files changed, 82 insertions(+), 97 deletions(-)

diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index 0a5769d0dd8..653b6ddef6b 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -22,14 +22,11 @@
 
 #include <rmm/exec_policy.hpp>
 
-#include <cooperative_groups.h>
 #include <cuco/static_map_ref.cuh>
 #include <cuda/atomic>
 
 namespace cudf::io::parquet::detail {
 
-namespace cg = cooperative_groups;
-
 namespace {
 constexpr int DEFAULT_BLOCK_SIZE = 256;
 }
@@ -40,7 +37,8 @@ struct equality_functor {
   __device__ bool operator()(key_type const lhs_idx, key_type const rhs_idx) const
   {
     // We don't call this for nulls so this is fine.
-    auto const equal = cudf::experimental::row::equality::nan_equal_physical_equality_comparator{};
+    auto constexpr equal =
+      cudf::experimental::row::equality::nan_equal_physical_equality_comparator{};
     return equal(col.element<T>(lhs_idx), col.element<T>(rhs_idx));
   }
 };
@@ -48,7 +46,7 @@ struct equality_functor {
 template <typename T>
 struct hash_functor {
   column_device_view const& col;
-  uint32_t seed = 0;
+  uint32_t const seed = 0;
   __device__ auto operator()(key_type idx) const
   {
     return cudf::hashing::detail::MurmurHash3_x86_32<T>{seed}(col.element<T>(idx));
@@ -69,10 +67,11 @@ struct map_insert_fn {
 
       auto const col                     = chunk->col_desc;
       column_device_view const& data_col = *col->leaf_column;
+      __shared__ size_type total_num_dict_entries;
 
       using equality_fn_type    = equality_functor<T>;
       using hash_fn_type        = hash_functor<T>;
-      using probing_scheme_type = cuco::double_hashing<map_cg_size, hash_fn_type>;
+      using probing_scheme_type = cuco::linear_probing<map_cg_size, hash_fn_type>;
 
       // Make a view of the hash map.
       cuco::static_map_ref<key_type,
@@ -80,93 +79,76 @@ struct map_insert_fn {
                            SCOPE,
                            equality_fn_type,
                            probing_scheme_type,
-                           storage_ref_type,
-                           cuco::op::insert_tag>
+                           storage_ref_type>
         hash_map_ref{cuco::empty_key{KEY_SENTINEL},
                      cuco::empty_value{VALUE_SENTINEL},
                      {data_col},
-                     {{data_col}, {data_col, 1}},
+                     hash_fn_type{data_col},
                      {},
                      storage_ref};
 
-      __shared__ size_type total_num_dict_entries;
-
-      auto const block = cg::this_thread_block();
-      auto const tile  = cg::tiled_partition<map_cg_size>(block);
+      // Create a map ref with `cuco::insert` operator
+      auto map_insert_ref = hash_map_ref.with_operators(cuco::insert);
+      auto const t        = threadIdx.x;
 
-      // Insert all column chunk elements to the hash map to build the dict.
-      for (thread_index_type val_idx = s_start_value_idx;
-           val_idx + block.thread_rank() - block_size < end_value_idx;
+      // Note: Adjust the following loop to use `cg::tile<map_cg_size>` if needed in the future.
+      for (thread_index_type val_idx = s_start_value_idx + t; val_idx - block_size < end_value_idx;
            val_idx += block_size) {
         size_type is_unique      = 0;
         size_type uniq_elem_size = 0;
 
-        // Each tile inserts a portion of the elements to the hash map.
-        for (auto tile_offset = 0; tile_offset < tile.num_threads(); tile_offset++) {
-          // Compute the index of the value currently being inserted by this tile.
-          auto const tile_val_idx =
-            val_idx + tile_offset + (tile.meta_group_rank() * tile.num_threads());
-          // Check if this index is valid.
-          auto const is_valid = tile_val_idx < end_value_idx and tile_val_idx < data_col.size() and
-                                data_col.is_valid(tile_val_idx);
-
-          // Insert tile_val_idx to hash map and count successful insertions.
-          if (is_valid) {
-            // Insert the element to the map using the entire tile.
-            auto const tile_is_unique =
-              // Insert into the hash map using the provided thread tile.
-              hash_map_ref.insert(tile, cuco::pair{tile_val_idx, tile_val_idx});
-            // tile_offset'th thread in the tile updates its number and size of unique element.
-            if (tile.thread_rank() == tile_offset) {
-              is_unique      = tile_is_unique;
-              uniq_elem_size = [&]() -> size_type {
-                if (not is_unique) { return 0; }
-                switch (col->physical_type) {
-                  case Type::INT32: return 4;
-                  case Type::INT64: return 8;
-                  case Type::INT96: return 12;
-                  case Type::FLOAT: return 4;
-                  case Type::DOUBLE: return 8;
-                  case Type::BYTE_ARRAY: {
-                    auto const col_type = data_col.type().id();
-                    if (col_type == type_id::STRING) {
-                      // Strings are stored as 4 byte length + string bytes
-                      return 4 + data_col.element<string_view>(tile_val_idx).size_bytes();
-                    } else if (col_type == type_id::LIST) {
-                      // Binary is stored as 4 byte length + bytes
-                      return 4 + get_element<statistics::byte_array_view>(data_col, tile_val_idx)
-                                   .size_bytes();
-                    }
-                    CUDF_UNREACHABLE(
-                      "Byte array only supports string and list<byte> column types for dictionary "
-                      "encoding!");
-                  }
-                  case Type::FIXED_LEN_BYTE_ARRAY:
-                    if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); }
-                    CUDF_UNREACHABLE(
-                      "Fixed length byte array only supports decimal 128 column types for "
-                      "dictionary "
-                      "encoding!");
-                  default: CUDF_UNREACHABLE("Unsupported type for dictionary encoding");
+        // Check if this index is valid.
+        auto const is_valid =
+          val_idx < end_value_idx and val_idx < data_col.size() and data_col.is_valid(val_idx);
+
+        // Insert tile_val_idx to hash map and count successful insertions.
+        if (is_valid) {
+          // Insert the keys using a single thread for best performance for now.
+          is_unique      = map_insert_ref.insert(cuco::pair{val_idx, val_idx});
+          uniq_elem_size = [&]() -> size_type {
+            if (not is_unique) { return 0; }
+            switch (col->physical_type) {
+              case Type::INT32: return 4;
+              case Type::INT64: return 8;
+              case Type::INT96: return 12;
+              case Type::FLOAT: return 4;
+              case Type::DOUBLE: return 8;
+              case Type::BYTE_ARRAY: {
+                auto const col_type = data_col.type().id();
+                if (col_type == type_id::STRING) {
+                  // Strings are stored as 4 byte length + string bytes
+                  return 4 + data_col.element<string_view>(val_idx).size_bytes();
+                } else if (col_type == type_id::LIST) {
+                  // Binary is stored as 4 byte length + bytes
+                  return 4 +
+                         get_element<statistics::byte_array_view>(data_col, val_idx).size_bytes();
                 }
-              }();
+                CUDF_UNREACHABLE(
+                  "Byte array only supports string and list<byte> column types for dictionary "
+                  "encoding!");
+              }
+              case Type::FIXED_LEN_BYTE_ARRAY:
+                if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); }
+                CUDF_UNREACHABLE(
+                  "Fixed length byte array only supports decimal 128 column types for dictionary "
+                  "encoding!");
+              default: CUDF_UNREACHABLE("Unsupported type for dictionary encoding");
             }
-          }
+          }();
         }
         // Reduce num_unique and uniq_data_size from all tiles.
         auto num_unique = block_reduce(reduce_storage).Sum(is_unique);
-        block.sync();
+        __syncthreads();
         auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size);
         // The first thread in the block atomically updates total num_unique and uniq_data_size
-        if (block.thread_rank() == 0) {
+        if (t == 0) {
           total_num_dict_entries = atomicAdd(&chunk->num_dict_entries, num_unique);
           total_num_dict_entries += num_unique;
           atomicAdd(&chunk->uniq_data_size, uniq_data_size);
         }
-        block.sync();
+        __syncthreads();
 
-        // Check if the num unique values in chunk has already exceeded max dict size and early
-        // exit
+        // Check if the num unique values in chunk has already exceeded max dict size and early exit
         if (total_num_dict_entries > MAX_DICT_SIZE) { return; }
       }  // for loop
     } else {
@@ -175,6 +157,7 @@ struct map_insert_fn {
   }
 };
 
+template <int block_size>
 struct map_find_fn {
   storage_ref_type const& storage_ref;
   EncColumnChunk* const& chunk;
@@ -189,7 +172,7 @@ struct map_find_fn {
 
       using equality_fn_type    = equality_functor<T>;
       using hash_fn_type        = hash_functor<T>;
-      using probing_scheme_type = cuco::double_hashing<map_cg_size, hash_fn_type>;
+      using probing_scheme_type = cuco::linear_probing<map_cg_size, hash_fn_type>;
 
       // Make a view of the hash map.
       cuco::static_map_ref<key_type,
@@ -197,36 +180,34 @@ struct map_find_fn {
                            SCOPE,
                            equality_fn_type,
                            probing_scheme_type,
-                           storage_ref_type,
-                           cuco::op::find_tag>
+                           storage_ref_type>
         hash_map_ref{cuco::empty_key{KEY_SENTINEL},
                      cuco::empty_value{VALUE_SENTINEL},
                      {data_col},
-                     {{data_col}, {data_col, 1}},
+                     hash_fn_type{data_col},
                      {},
                      storage_ref};
 
-      // Find the key = i using the provided thread tile.
-      auto const tile   = cg::tiled_partition<map_cg_size>(cg::this_thread_block());
-      auto const ntiles = tile.meta_group_size();
+      // Create a map ref with `cuco::find` operator
+      auto const map_find_ref = hash_map_ref.with_operators(cuco::find);
+      auto const t            = threadIdx.x;
 
-      for (thread_index_type val_idx = s_start_value_idx + tile.meta_group_rank();
-           val_idx < end_value_idx;
-           val_idx += tile.meta_group_size()) {
+      // Note: Adjust the following loop to use `cg::tiles<map_cg_size>` if needed in the future.
+      for (thread_index_type val_idx = s_start_value_idx + t; val_idx < end_value_idx;
+           val_idx += block_size) {
+        // Find the key using a single thread for best performance for now.
         if (data_col.is_valid(val_idx)) {
-          // No need for atomic as this is not going to be modified by any other thread
-          auto const value = [&]() {
-            auto const found_slot = hash_map_ref.find(tile, val_idx);
+          // No need for atomic as this is not going to be modified by any other thread.
+          chunk->dict_index[val_idx - s_ck_start_val_idx] = [&]() {
+            auto const found_slot = map_find_ref.find(val_idx);
 
-            // Check if didn't find the previously inserted key.
-            cudf_assert(found_slot != hash_map_ref.end() &&
+            // Fail if we didn't find the previously inserted key.
+            cudf_assert(found_slot != map_find_ref.end() &&
                         "Unable to find value in map in dictionary index construction");
 
-            // Return a pair of the found key and value.
+            // Return the found value.
             return found_slot->second;
           }();
-          // First thread in the tile updates the dict_index
-          if (tile.thread_rank() == 0) { chunk->dict_index[val_idx - s_ck_start_val_idx] = value; }
         }
       }
     } else {
@@ -272,17 +253,18 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   auto& chunk = chunks[blockIdx.x];
   if (not chunk.use_dictionary) { return; }
 
+  auto t = threadIdx.x;
   __shared__ cuda::atomic<size_type, SCOPE> counter;
   using cuda::std::memory_order_relaxed;
-  if (threadIdx.x == 0) { new (&counter) cuda::atomic<size_type, SCOPE>{0}; }
+  if (t == 0) { new (&counter) cuda::atomic<size_type, SCOPE>{0}; }
   __syncthreads();
 
-  for (size_type idx = threadIdx.x; idx < chunk.dict_map_size; idx += block_size) {
-    auto* window = map_storage.data() + chunk.dict_map_offset + idx;
+  for (; t < chunk.dict_map_size; t += block_size) {
+    auto* window = map_storage.data() + chunk.dict_map_offset + t;
     for (auto& slot : *window) {
       auto const key = slot.first;
       if (key != KEY_SENTINEL) {
-        auto loc = counter.fetch_add(1, memory_order_relaxed);
+        auto const loc = counter.fetch_add(1, memory_order_relaxed);
         cudf_assert(loc < MAX_DICT_SIZE && "Number of filled slots exceeds max dict size");
         chunk.dict_data[loc] = key;
         // If sorting dict page ever becomes a hard requirement, enable the following statement
@@ -320,7 +302,7 @@ CUDF_KERNEL void __launch_bounds__(block_size)
                                      map_storage.data() + chunk->dict_map_offset};
 
   type_dispatcher(data_col.type(),
-                  map_find_fn{storage_ref, chunk},
+                  map_find_fn<block_size>{storage_ref, chunk},
                   s_start_value_idx,
                   end_value_idx,
                   s_ck_start_val_idx);
diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh
index 35d9c7846aa..7c09764da2d 100644
--- a/cpp/src/io/parquet/parquet_gpu.cuh
+++ b/cpp/src/io/parquet/parquet_gpu.cuh
@@ -31,10 +31,13 @@ using key_type    = size_type;
 using mapped_type = size_type;
 using slot_type   = cuco::pair<key_type, mapped_type>;
 
-auto constexpr map_cg_size = 1;  ///< A CUDA Cooperative Group of 1 thread(s) to handle each subset
-auto constexpr window_size = 1;  ///< Number of concurrent slots handled by each thread
-auto constexpr occupancy_factor = 1.43f;  ///< cuCollections suggests using a hash map of size N *
-                                          ///< (1/0.7) = 1.43 to target a 70% occupancy factor.
+auto constexpr map_cg_size =
+  1;  ///< A CUDA Cooperative Group of 1 thread (set for best performance) to handle each subset.
+      ///< Note: Adjust insert and find loops to use `cg::tile<map_cg_size>` if increasing this.
+auto constexpr window_size =
+  1;  ///< Number of concurrent slots (set for best performance) handled by each thread.
+auto constexpr occupancy_factor = 1.43f;  ///< cuCollections suggests using a hash map of size
+                                          ///< N * (1/0.7) = 1.43 to target a 70% occupancy factor.
 
 auto constexpr KEY_SENTINEL   = key_type{-1};
 auto constexpr VALUE_SENTINEL = mapped_type{-1};

From 1b6454072440bdf6adc310fa59fe4b3a8c8554d3 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Sat, 17 Aug 2024 01:20:03 +0000
Subject: [PATCH 25/30] Update comments

---
 cpp/src/io/parquet/chunk_dict.cu | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index 653b6ddef6b..eb51714b8de 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -69,8 +69,10 @@ struct map_insert_fn {
       column_device_view const& data_col = *col->leaf_column;
       __shared__ size_type total_num_dict_entries;
 
-      using equality_fn_type    = equality_functor<T>;
-      using hash_fn_type        = hash_functor<T>;
+      using equality_fn_type = equality_functor<T>;
+      using hash_fn_type     = hash_functor<T>;
+      // Choosing `linear_probing` over `double_hashing` for slighhhtly better performance seen in
+      // benchmarks.
       using probing_scheme_type = cuco::linear_probing<map_cg_size, hash_fn_type>;
 
       // Make a view of the hash map.
@@ -170,8 +172,10 @@ struct map_find_fn {
       auto const col                     = chunk->col_desc;
       column_device_view const& data_col = *col->leaf_column;
 
-      using equality_fn_type    = equality_functor<T>;
-      using hash_fn_type        = hash_functor<T>;
+      using equality_fn_type = equality_functor<T>;
+      using hash_fn_type     = hash_functor<T>;
+      // Choosing `linear_probing` over `double_hashing` for slighhhtly better performance seen in
+      // benchmarks.
       using probing_scheme_type = cuco::linear_probing<map_cg_size, hash_fn_type>;
 
       // Make a view of the hash map.

From a0f5aaba9b6e153bf0a0d3dffe5b1fc863456922 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Mon, 19 Aug 2024 21:09:17 +0000
Subject: [PATCH 26/30] Minor improvement

---
 cpp/src/io/parquet/chunk_dict.cu | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index eb51714b8de..98c1d5ae206 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -263,8 +263,10 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   if (t == 0) { new (&counter) cuda::atomic<size_type, SCOPE>{0}; }
   __syncthreads();
 
+  // Iterate over all windows in the map.
   for (; t < chunk.dict_map_size; t += block_size) {
-    auto* window = map_storage.data() + chunk.dict_map_offset + t;
+    auto window = map_storage.data() + chunk.dict_map_offset + t;
+    // Collect all slots from each window.
     for (auto& slot : *window) {
       auto const key = slot.first;
       if (key != KEY_SENTINEL) {

From 9a16e305ba0e579ddc70962e537312d76e2faea0 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Mon, 19 Aug 2024 18:42:49 -0700
Subject: [PATCH 27/30] Minor updates

---
 cpp/src/io/parquet/writer_impl.cu | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 14504a4ef1c..fb4cad0713f 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -1286,7 +1286,7 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
   }
 
   // Variable to keep track of the current offset
-  uint32_t curr_offset = 0;
+  size_t total_map_storage_size = 0;
   // Populate dict offsets and sizes for each chunk that need to build a dictionary.
   std::for_each(h_chunks.begin(), h_chunks.end(), [&](auto& chunk) {
     auto const& chunk_col_desc = col_desc[chunk.col_desc_id];
@@ -1303,13 +1303,11 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
       chunk.dict_map_size =
         static_cast<cudf::size_type>(cuco::make_window_extent<map_cg_size, window_size>(
           static_cast<cudf::size_type>(occupancy_factor * chunk.num_values)));
-      chunk.dict_map_offset = curr_offset;
-      curr_offset += chunk.dict_map_size;
+      chunk.dict_map_offset = total_map_storage_size;
+      total_map_storage_size += chunk.dict_map_size;
     }
   });
 
-  // Compute total map storage
-  auto const total_map_storage_size = static_cast<std::size_t>(curr_offset);
 
   // No chunk needs to create a dictionary, exit early
   if (total_map_storage_size == 0) { return {std::move(dict_data), std::move(dict_index)}; }

From 24d16eb256b1251035890ba4248a66c0ffc9d8c5 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Tue, 20 Aug 2024 01:46:51 +0000
Subject: [PATCH 28/30] Apply clang-format

---
 cpp/src/io/parquet/writer_impl.cu | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index fb4cad0713f..9df53bc9b7e 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -1285,7 +1285,7 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
     return std::pair(std::move(dict_data), std::move(dict_index));
   }
 
-  // Variable to keep track of the current offset
+  // Variable to keep track of the current total map storage size
   size_t total_map_storage_size = 0;
   // Populate dict offsets and sizes for each chunk that need to build a dictionary.
   std::for_each(h_chunks.begin(), h_chunks.end(), [&](auto& chunk) {
@@ -1308,7 +1308,6 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
     }
   });
 
-
   // No chunk needs to create a dictionary, exit early
   if (total_map_storage_size == 0) { return {std::move(dict_data), std::move(dict_index)}; }
 

From f18c1f77a064a2ceff3e292ad4dcde05286ec33c Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed, 21 Aug 2024 23:15:32 +0000
Subject: [PATCH 29/30] Address reviewer comments

---
 cpp/src/io/parquet/chunk_dict.cu | 47 ++++++++++++++------------------
 1 file changed, 20 insertions(+), 27 deletions(-)

diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index 98c1d5ae206..7a6d470a027 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -34,7 +34,7 @@ constexpr int DEFAULT_BLOCK_SIZE = 256;
 template <typename T>
 struct equality_functor {
   column_device_view const& col;
-  __device__ bool operator()(key_type const lhs_idx, key_type const rhs_idx) const
+  __device__ bool operator()(key_type lhs_idx, key_type rhs_idx) const
   {
     // We don't call this for nulls so this is fine.
     auto constexpr equal =
@@ -76,23 +76,21 @@ struct map_insert_fn {
       using probing_scheme_type = cuco::linear_probing<map_cg_size, hash_fn_type>;
 
       // Make a view of the hash map.
-      cuco::static_map_ref<key_type,
-                           mapped_type,
-                           SCOPE,
-                           equality_fn_type,
-                           probing_scheme_type,
-                           storage_ref_type>
-        hash_map_ref{cuco::empty_key{KEY_SENTINEL},
-                     cuco::empty_value{VALUE_SENTINEL},
-                     {data_col},
-                     hash_fn_type{data_col},
-                     {},
-                     storage_ref};
+      auto hash_map_ref = cuco::static_map_ref{cuco::empty_key{KEY_SENTINEL},
+                                               cuco::empty_value{VALUE_SENTINEL},
+                                               equality_fn_type{data_col},
+                                               probing_scheme_type{hash_fn_type{data_col}},
+                                               cuco::thread_scope_block,
+                                               storage_ref};
 
       // Create a map ref with `cuco::insert` operator
       auto map_insert_ref = hash_map_ref.with_operators(cuco::insert);
       auto const t        = threadIdx.x;
 
+      // Create atomic refs to the current chunk's num_dict_entries and uniq_data_size
+      cuda::atomic_ref<size_type, SCOPE> const chunk_num_dict_entries{chunk->num_dict_entries};
+      cuda::atomic_ref<size_type, SCOPE> const chunk_uniq_data_size{chunk->uniq_data_size};
+
       // Note: Adjust the following loop to use `cg::tile<map_cg_size>` if needed in the future.
       for (thread_index_type val_idx = s_start_value_idx + t; val_idx - block_size < end_value_idx;
            val_idx += block_size) {
@@ -144,9 +142,10 @@ struct map_insert_fn {
         auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size);
         // The first thread in the block atomically updates total num_unique and uniq_data_size
         if (t == 0) {
-          total_num_dict_entries = atomicAdd(&chunk->num_dict_entries, num_unique);
+          total_num_dict_entries =
+            chunk_num_dict_entries.fetch_add(num_unique, cuda::std::memory_order_relaxed);
           total_num_dict_entries += num_unique;
-          atomicAdd(&chunk->uniq_data_size, uniq_data_size);
+          chunk_uniq_data_size.fetch_add(uniq_data_size, cuda::std::memory_order_relaxed);
         }
         __syncthreads();
 
@@ -179,18 +178,12 @@ struct map_find_fn {
       using probing_scheme_type = cuco::linear_probing<map_cg_size, hash_fn_type>;
 
       // Make a view of the hash map.
-      cuco::static_map_ref<key_type,
-                           mapped_type,
-                           SCOPE,
-                           equality_fn_type,
-                           probing_scheme_type,
-                           storage_ref_type>
-        hash_map_ref{cuco::empty_key{KEY_SENTINEL},
-                     cuco::empty_value{VALUE_SENTINEL},
-                     {data_col},
-                     hash_fn_type{data_col},
-                     {},
-                     storage_ref};
+      auto hash_map_ref = cuco::static_map_ref{cuco::empty_key{KEY_SENTINEL},
+                                               cuco::empty_value{VALUE_SENTINEL},
+                                               equality_fn_type{data_col},
+                                               probing_scheme_type{hash_fn_type{data_col}},
+                                               cuco::thread_scope_block,
+                                               storage_ref};
 
       // Create a map ref with `cuco::find` operator
       auto const map_find_ref = hash_map_ref.with_operators(cuco::find);

From 8f94acab58dd65198237f2e228e5b13cde3ab0fb Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Thu, 29 Aug 2024 14:08:07 -0700
Subject: [PATCH 30/30] Apply suggestion

Co-authored-by: Yunsong Wang <yunsongw@nvidia.com>
---
 cpp/src/io/parquet/chunk_dict.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index 7a6d470a027..17ccb73c0a8 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -92,7 +92,7 @@ struct map_insert_fn {
       cuda::atomic_ref<size_type, SCOPE> const chunk_uniq_data_size{chunk->uniq_data_size};
 
       // Note: Adjust the following loop to use `cg::tile<map_cg_size>` if needed in the future.
-      for (thread_index_type val_idx = s_start_value_idx + t; val_idx - block_size < end_value_idx;
+      for (thread_index_type val_idx = s_start_value_idx + t; val_idx - t < end_value_idx;
            val_idx += block_size) {
         size_type is_unique      = 0;
         size_type uniq_elem_size = 0;