diff --git a/cpp/benchmarks/join/conditional_join.cu b/cpp/benchmarks/join/conditional_join.cu index e332d09d31b..2deb888cc5c 100644 --- a/cpp/benchmarks/join/conditional_join.cu +++ b/cpp/benchmarks/join/conditional_join.cu @@ -16,6 +16,8 @@ #include "join_common.hpp" +auto const CONDITIONAL_JOIN_SIZE_RANGE = std::vector{1000, 100'000}; + template void nvbench_conditional_inner_join(nvbench::state& state, nvbench::type_list>) @@ -46,12 +48,12 @@ NVBENCH_BENCH_TYPES(nvbench_conditional_inner_join, NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE)) .set_name("conditional_inner_join") .set_type_axes_names({"Key", "Nullable"}) - .add_int64_axis("left_size", JOIN_SIZE_RANGE) - .add_int64_axis("right_size", JOIN_SIZE_RANGE); + .add_int64_axis("left_size", CONDITIONAL_JOIN_SIZE_RANGE) + .add_int64_axis("right_size", CONDITIONAL_JOIN_SIZE_RANGE); NVBENCH_BENCH_TYPES(nvbench_conditional_left_join, NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE)) .set_name("conditional_left_join") .set_type_axes_names({"Key", "Nullable"}) - .add_int64_axis("left_size", JOIN_SIZE_RANGE) - .add_int64_axis("right_size", JOIN_SIZE_RANGE); + .add_int64_axis("left_size", CONDITIONAL_JOIN_SIZE_RANGE) + .add_int64_axis("right_size", CONDITIONAL_JOIN_SIZE_RANGE); diff --git a/cpp/include/cudf/rolling.hpp b/cpp/include/cudf/rolling.hpp index 2cd34f48265..d55322dd3e8 100644 --- a/cpp/include/cudf/rolling.hpp +++ b/cpp/include/cudf/rolling.hpp @@ -57,6 +57,7 @@ namespace cudf { * @param[in] min_periods Minimum number of observations in window required to have a value, * otherwise element `i` is null. * @param[in] agg The rolling window aggregation type (SUM, MAX, MIN, etc.) + * @param[in] stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate the returned column's device memory * * @returns A nullable output column containing the rolling window results @@ -67,6 +68,7 @@ std::unique_ptr rolling_window( size_type following_window, size_type min_periods, rolling_aggregation const& agg, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -77,6 +79,7 @@ std::unique_ptr rolling_window( * size_type following_window, * size_type min_periods, * rolling_aggregation const& agg, + * rmm::cuda_stream_view stream, * rmm::device_async_resource_ref mr) * * @param default_outputs A column of per-row default values to be returned instead @@ -90,6 +93,7 @@ std::unique_ptr rolling_window( size_type following_window, size_type min_periods, rolling_aggregation const& agg, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -227,6 +231,7 @@ struct window_bounds { * @param[in] min_periods Minimum number of observations in window required to have a value, * otherwise element `i` is null. * @param[in] aggr The rolling window aggregation type (SUM, MAX, MIN, etc.) 
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate the returned column's device memory * * @returns A nullable output column containing the rolling window results @@ -238,6 +243,7 @@ std::unique_ptr grouped_rolling_window( size_type following_window, size_type min_periods, rolling_aggregation const& aggr, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -249,6 +255,7 @@ std::unique_ptr grouped_rolling_window( * size_type following_window, * size_type min_periods, * rolling_aggregation const& aggr, + * rmm::cuda_stream_view stream, * rmm::device_async_resource_ref mr) */ std::unique_ptr grouped_rolling_window( @@ -258,6 +265,7 @@ std::unique_ptr grouped_rolling_window( window_bounds following_window, size_type min_periods, rolling_aggregation const& aggr, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -269,6 +277,7 @@ std::unique_ptr grouped_rolling_window( * size_type following_window, * size_type min_periods, * rolling_aggregation const& aggr, + * rmm::cuda_stream_view stream, * rmm::device_async_resource_ref mr) * * @param default_outputs A column of per-row default values to be returned instead @@ -283,6 +292,7 @@ std::unique_ptr grouped_rolling_window( size_type following_window, size_type min_periods, rolling_aggregation const& aggr, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -295,6 +305,7 @@ std::unique_ptr grouped_rolling_window( * size_type following_window, * size_type min_periods, * rolling_aggregation const& aggr, + * rmm::cuda_stream_view stream, * rmm::device_async_resource_ref mr) */ std::unique_ptr grouped_rolling_window( @@ -305,6 +316,7 @@ std::unique_ptr grouped_rolling_window( window_bounds following_window, size_type min_periods, rolling_aggregation const& aggr, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -387,6 +399,7 @@ std::unique_ptr grouped_rolling_window( * @param[in] min_periods Minimum number of observations in window required to have a value, * otherwise element `i` is null. * @param[in] aggr The rolling window aggregation type (SUM, MAX, MIN, etc.)
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate the returned column's device memory * * @returns A nullable output column containing the rolling window results @@ -400,6 +413,7 @@ std::unique_ptr grouped_time_range_rolling_window( size_type following_window_in_days, size_type min_periods, rolling_aggregation const& aggr, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -415,6 +429,7 @@ std::unique_ptr grouped_time_range_rolling_window( * size_type following_window_in_days, * size_type min_periods, * rolling_aggregation const& aggr, + * rmm::cuda_stream_view stream, * rmm::device_async_resource_ref mr) * * The `preceding_window_in_days` and `following_window_in_days` are specified as a `window_bounds` @@ -429,6 +444,7 @@ std::unique_ptr grouped_time_range_rolling_window( window_bounds following_window_in_days, size_type min_periods, rolling_aggregation const& aggr, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -536,6 +552,7 @@ std::unique_ptr grouped_time_range_rolling_window( * @param[in] min_periods Minimum number of observations in window required to have a value, * otherwise element `i` is null. * @param[in] aggr The rolling window aggregation type (SUM, MAX, MIN, etc.) + * @param[in] stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate the returned column's device memory * * @returns A nullable output column containing the rolling window results @@ -549,6 +566,7 @@ std::unique_ptr grouped_range_rolling_window( range_window_bounds const& following, size_type min_periods, rolling_aggregation const& aggr, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -582,6 +600,7 @@ std::unique_ptr grouped_range_rolling_window( * @param[in] min_periods Minimum number of observations in window required to have a value, * otherwise element `i` is null. * @param[in] agg The rolling window aggregation type (sum, max, min, etc.) + * @param[in] stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate the returned column's device memory * * @returns A nullable output column containing the rolling window results @@ -592,6 +611,7 @@ std::unique_ptr rolling_window( column_view const& following_window, size_type min_periods, rolling_aggregation const& agg, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/rolling/range_window_bounds.hpp b/cpp/include/cudf/rolling/range_window_bounds.hpp index 81885ade2f0..a9ee12cea27 100644 --- a/cpp/include/cudf/rolling/range_window_bounds.hpp +++ b/cpp/include/cudf/rolling/range_window_bounds.hpp @@ -56,18 +56,22 @@ struct range_window_bounds { * @brief Factory method to construct a bounded window boundary. 
* * @param boundary Finite window boundary + * @param stream CUDA stream used for device memory operations and kernel launches * @return A bounded window boundary object */ - static range_window_bounds get(scalar const& boundary); + static range_window_bounds get(scalar const& boundary, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Factory method to construct a window boundary * limited to the value of the current row * * @param type The datatype of the window boundary + * @param stream CUDA stream used for device memory operations and kernel launches * @return A "current row" window boundary object */ - static range_window_bounds current_row(data_type type); + static range_window_bounds current_row(data_type type, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Whether or not the window is bounded to the current row @@ -81,9 +85,11 @@ struct range_window_bounds { * @brief Factory method to construct an unbounded window boundary. * * @param type The datatype of the window boundary + * @param stream CUDA stream used for device memory operations and kernel launches * @return An unbounded window boundary object */ - static range_window_bounds unbounded(data_type type); + static range_window_bounds unbounded(data_type type, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Whether or not the window is unbounded @@ -107,7 +113,9 @@ struct range_window_bounds { extent_type _extent{extent_type::UNBOUNDED}; std::shared_ptr _range_scalar{nullptr}; // To enable copy construction/assignment. - range_window_bounds(extent_type extent_, std::unique_ptr range_scalar_); + range_window_bounds(extent_type extent_, + std::unique_ptr range_scalar_, + rmm::cuda_stream_view = cudf::get_default_stream()); }; /** @} */ // end of group diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu index d461ed7a109..1158bf22494 100644 --- a/cpp/src/rolling/grouped_rolling.cu +++ b/cpp/src/rolling/grouped_rolling.cu @@ -40,59 +40,6 @@ #include namespace cudf { -std::unique_ptr grouped_rolling_window(table_view const& group_keys, - column_view const& input, - size_type preceding_window, - size_type following_window, - size_type min_periods, - rolling_aggregation const& aggr, - rmm::device_async_resource_ref mr) -{ - return grouped_rolling_window(group_keys, - input, - window_bounds::get(preceding_window), - window_bounds::get(following_window), - min_periods, - aggr, - mr); -} - -std::unique_ptr grouped_rolling_window(table_view const& group_keys, - column_view const& input, - window_bounds preceding_window, - window_bounds following_window, - size_type min_periods, - rolling_aggregation const& aggr, - rmm::device_async_resource_ref mr) -{ - return grouped_rolling_window(group_keys, - input, - empty_like(input)->view(), - preceding_window, - following_window, - min_periods, - aggr, - mr); -} - -std::unique_ptr grouped_rolling_window(table_view const& group_keys, - column_view const& input, - column_view const& default_outputs, - size_type preceding_window, - size_type following_window, - size_type min_periods, - rolling_aggregation const& aggr, - rmm::device_async_resource_ref mr) -{ - return grouped_rolling_window(group_keys, - input, - default_outputs, - window_bounds::get(preceding_window), - window_bounds::get(following_window), - min_periods, - aggr, - mr); -} namespace detail { @@ -237,8 +184,8 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys, if (group_keys.num_columns() == 0) { // No 
Groupby columns specified. Treat as one big group. - return rolling_window( - input, default_outputs, preceding_window, following_window, min_periods, aggr, mr); + return detail::rolling_window( + input, default_outputs, preceding_window, following_window, min_periods, aggr, stream, mr); } using sort_groupby_helper = cudf::groupby::detail::sort::sort_groupby_helper; @@ -306,6 +253,7 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys, window_bounds following_window_bounds, size_type min_periods, rolling_aggregation const& aggr, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { return detail::grouped_rolling_window(group_keys, @@ -315,7 +263,67 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys, following_window_bounds, min_periods, aggr, - cudf::get_default_stream(), + stream, + mr); +} + +std::unique_ptr grouped_rolling_window(table_view const& group_keys, + column_view const& input, + size_type preceding_window, + size_type following_window, + size_type min_periods, + rolling_aggregation const& aggr, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + return grouped_rolling_window(group_keys, + input, + window_bounds::get(preceding_window), + window_bounds::get(following_window), + min_periods, + aggr, + stream, + mr); +} + +std::unique_ptr grouped_rolling_window(table_view const& group_keys, + column_view const& input, + window_bounds preceding_window, + window_bounds following_window, + size_type min_periods, + rolling_aggregation const& aggr, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + return detail::grouped_rolling_window(group_keys, + input, + empty_like(input)->view(), + preceding_window, + following_window, + min_periods, + aggr, + stream, + mr); +} + +std::unique_ptr grouped_rolling_window(table_view const& group_keys, + column_view const& input, + column_view const& default_outputs, + size_type preceding_window, + size_type following_window, + size_type min_periods, + rolling_aggregation const& aggr, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + return detail::grouped_rolling_window(group_keys, + input, + default_outputs, + window_bounds::get(preceding_window), + window_bounds::get(following_window), + min_periods, + aggr, + stream, mr); } @@ -1047,14 +1055,15 @@ struct dispatch_grouped_range_rolling_window { */ struct to_duration_bounds { template (), void>* = nullptr> - range_window_bounds operator()(size_type num_days) const + range_window_bounds operator()(size_type num_days, rmm::cuda_stream_view stream) const { using DurationT = typename OrderBy::duration; - return range_window_bounds::get(duration_scalar{duration_D{num_days}, true}); + return range_window_bounds::get(duration_scalar{duration_D{num_days}, true, stream}, + stream); } template (), void>* = nullptr> - range_window_bounds operator()(size_type) const + range_window_bounds operator()(size_type, rmm::cuda_stream_view) const { CUDF_FAIL("Expected timestamp orderby column."); } @@ -1085,9 +1094,11 @@ data_type get_duration_type_for(cudf::data_type timestamp_type) * @param timestamp_type Data-type of the orderby column to which the `num_days` is to be adapted. * @return range_window_bounds A `range_window_bounds` to be used with the new API. 
*/ -range_window_bounds to_range_bounds(cudf::size_type num_days, cudf::data_type timestamp_type) +range_window_bounds to_range_bounds(cudf::size_type num_days, + cudf::data_type timestamp_type, + rmm::cuda_stream_view stream) { - return cudf::type_dispatcher(timestamp_type, to_duration_bounds{}, num_days); + return cudf::type_dispatcher(timestamp_type, to_duration_bounds{}, num_days, stream); } /** @@ -1101,11 +1112,13 @@ range_window_bounds to_range_bounds(cudf::size_type num_days, cudf::data_type ti * @return range_window_bounds A `range_window_bounds` to be used with the new API. */ range_window_bounds to_range_bounds(cudf::window_bounds const& days_bounds, - cudf::data_type timestamp_type) + cudf::data_type timestamp_type, + rmm::cuda_stream_view stream) { return days_bounds.is_unbounded() - ? range_window_bounds::unbounded(get_duration_type_for(timestamp_type)) - : cudf::type_dispatcher(timestamp_type, to_duration_bounds{}, days_bounds.value()); + ? range_window_bounds::unbounded(get_duration_type_for(timestamp_type), stream) + : cudf::type_dispatcher( + timestamp_type, to_duration_bounds{}, days_bounds.value(), stream); } } // namespace @@ -1199,11 +1212,12 @@ std::unique_ptr grouped_time_range_rolling_window(table_view const& grou size_type following_window_in_days, size_type min_periods, rolling_aggregation const& aggr, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - auto preceding = to_range_bounds(preceding_window_in_days, timestamp_column.type()); - auto following = to_range_bounds(following_window_in_days, timestamp_column.type()); + auto preceding = to_range_bounds(preceding_window_in_days, timestamp_column.type(), stream); + auto following = to_range_bounds(following_window_in_days, timestamp_column.type(), stream); return detail::grouped_range_rolling_window(group_keys, timestamp_column, @@ -1213,7 +1227,7 @@ std::unique_ptr grouped_time_range_rolling_window(table_view const& grou following, min_periods, aggr, - cudf::get_default_stream(), + stream, mr); } @@ -1237,13 +1251,14 @@ std::unique_ptr grouped_time_range_rolling_window(table_view const& grou window_bounds following_window_in_days, size_type min_periods, rolling_aggregation const& aggr, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); range_window_bounds preceding = - to_range_bounds(preceding_window_in_days, timestamp_column.type()); + to_range_bounds(preceding_window_in_days, timestamp_column.type(), stream); range_window_bounds following = - to_range_bounds(following_window_in_days, timestamp_column.type()); + to_range_bounds(following_window_in_days, timestamp_column.type(), stream); return detail::grouped_range_rolling_window(group_keys, timestamp_column, @@ -1253,7 +1268,7 @@ std::unique_ptr grouped_time_range_rolling_window(table_view const& grou following, min_periods, aggr, - cudf::get_default_stream(), + stream, mr); } @@ -1277,6 +1292,7 @@ std::unique_ptr grouped_range_rolling_window(table_view const& group_key range_window_bounds const& following, size_type min_periods, rolling_aggregation const& aggr, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -1288,7 +1304,7 @@ std::unique_ptr grouped_range_rolling_window(table_view const& group_key following, min_periods, aggr, - cudf::get_default_stream(), + stream, mr); } diff --git a/cpp/src/rolling/range_window_bounds.cpp b/cpp/src/rolling/range_window_bounds.cpp index 68e80c6e84e..69792136c64 100644 --- 
a/cpp/src/rolling/range_window_bounds.cpp +++ b/cpp/src/rolling/range_window_bounds.cpp @@ -32,7 +32,8 @@ namespace { */ struct range_scalar_constructor { template ())> - std::unique_ptr operator()(scalar const& range_scalar_) const + std::unique_ptr operator()(scalar const& range_scalar_, + rmm::cuda_stream_view stream) const { CUDF_FAIL( "Unsupported range type. " @@ -40,51 +41,57 @@ struct range_scalar_constructor { } template ())> - std::unique_ptr operator()(scalar const& range_scalar_) const + std::unique_ptr operator()(scalar const& range_scalar_, + rmm::cuda_stream_view stream) const { return std::make_unique>( - static_cast const&>(range_scalar_)); + static_cast const&>(range_scalar_), stream); } template () && not cudf::is_boolean())> - std::unique_ptr operator()(scalar const& range_scalar_) const + std::unique_ptr operator()(scalar const& range_scalar_, + rmm::cuda_stream_view stream) const { - return std::make_unique>( - static_cast const&>(range_scalar_)); + return std::make_unique>(static_cast const&>(range_scalar_), + stream); } template ())> - std::unique_ptr operator()(scalar const& range_scalar_) const + std::unique_ptr operator()(scalar const& range_scalar_, + rmm::cuda_stream_view stream) const { return std::make_unique>( - static_cast const&>(range_scalar_)); + static_cast const&>(range_scalar_), stream); } }; } // namespace -range_window_bounds::range_window_bounds(extent_type extent_, std::unique_ptr range_scalar_) +range_window_bounds::range_window_bounds(extent_type extent_, + std::unique_ptr range_scalar_, + rmm::cuda_stream_view stream) : _extent{extent_}, _range_scalar{std::move(range_scalar_)} { CUDF_EXPECTS(_range_scalar.get(), "Range window scalar cannot be null."); CUDF_EXPECTS(_extent == extent_type::UNBOUNDED || _extent == extent_type::CURRENT_ROW || - _range_scalar->is_valid(), + _range_scalar->is_valid(stream), "Bounded Range window scalar must be valid."); } -range_window_bounds range_window_bounds::unbounded(data_type type) +range_window_bounds range_window_bounds::unbounded(data_type type, rmm::cuda_stream_view stream) { - return {extent_type::UNBOUNDED, make_default_constructed_scalar(type)}; + return {extent_type::UNBOUNDED, make_default_constructed_scalar(type, stream), stream}; } -range_window_bounds range_window_bounds::current_row(data_type type) +range_window_bounds range_window_bounds::current_row(data_type type, rmm::cuda_stream_view stream) { - return {extent_type::CURRENT_ROW, make_default_constructed_scalar(type)}; + return {extent_type::CURRENT_ROW, make_default_constructed_scalar(type, stream), stream}; } -range_window_bounds range_window_bounds::get(scalar const& boundary) +range_window_bounds range_window_bounds::get(scalar const& boundary, rmm::cuda_stream_view stream) { return {extent_type::BOUNDED, - cudf::type_dispatcher(boundary.type(), range_scalar_constructor{}, boundary)}; + cudf::type_dispatcher(boundary.type(), range_scalar_constructor{}, boundary, stream), + stream}; } } // namespace cudf diff --git a/cpp/src/rolling/rolling.cu b/cpp/src/rolling/rolling.cu index a308ed8a7a6..e612bd01118 100644 --- a/cpp/src/rolling/rolling.cu +++ b/cpp/src/rolling/rolling.cu @@ -32,17 +32,12 @@ std::unique_ptr rolling_window(column_view const& input, size_type following_window, size_type min_periods, rolling_aggregation const& agg, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::rolling_window(input, - default_outputs, - preceding_window, - following_window, - min_periods, - agg, - 
cudf::get_default_stream(), - mr); + return detail::rolling_window( + input, default_outputs, preceding_window, following_window, min_periods, agg, stream, mr); } // Applies a fixed-size rolling window function to the values in a column, without default specified @@ -51,6 +46,7 @@ std::unique_ptr rolling_window(column_view const& input, size_type following_window, size_type min_periods, rolling_aggregation const& agg, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -62,7 +58,7 @@ std::unique_ptr rolling_window(column_view const& input, following_window, min_periods, agg, - cudf::get_default_stream(), + stream, mr); } @@ -72,11 +68,12 @@ std::unique_ptr rolling_window(column_view const& input, column_view const& following_window, size_type min_periods, rolling_aggregation const& agg, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::rolling_window( - input, preceding_window, following_window, min_periods, agg, cudf::get_default_stream(), mr); + input, preceding_window, following_window, min_periods, agg, stream, mr); } } // namespace cudf diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index c1688d20791..27befdea209 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -294,7 +294,7 @@ CUDF_KERNEL void has_multibytes_kernel(char const* d_input_chars, auto const mb_total = block_reduce(temp_storage).Reduce(mb_count, cub::Sum()); if ((lane_idx == 0) && (mb_total > 0)) { - cuda::atomic_ref ref{*d_output}; + cuda::atomic_ref ref{*d_output}; ref.fetch_add(mb_total, cuda::std::memory_order_relaxed); } } diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index 8e5c5cf60b8..43a3d69091a 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -283,7 +283,7 @@ CUDF_KERNEL void count_targets(replace_multi_parallel_fn fn, int64_t chars_bytes auto const total = block_reduce(temp_storage).Reduce(count, cub::Sum()); if ((lane_idx == 0) && (total > 0)) { - cuda::atomic_ref ref{*d_output}; + cuda::atomic_ref ref{*d_output}; ref.fetch_add(total, cuda::std::memory_order_relaxed); } } diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index 501e6d547e6..f7a3a3aea5c 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -238,6 +238,31 @@ struct replace_parallel_chars_fn { cudf::size_type maxrepl; }; +template +CUDF_KERNEL void count_targets_kernel(replace_parallel_chars_fn fn, + int64_t chars_bytes, + int64_t* d_output) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + auto const byte_idx = static_cast(idx) * bytes_per_thread; + auto const lane_idx = static_cast(threadIdx.x); + + using block_reduce = cub::BlockReduce; + __shared__ typename block_reduce::TempStorage temp_storage; + + int64_t count = 0; + // each thread processes multiple bytes + for (auto i = byte_idx; (i < (byte_idx + bytes_per_thread)) && (i < chars_bytes); ++i) { + count += fn.has_target(i); + } + auto const total = block_reduce(temp_storage).Reduce(count, cub::Sum()); + + if ((lane_idx == 0) && (total > 0)) { + cuda::atomic_ref ref{*d_output}; + ref.fetch_add(total, cuda::std::memory_order_relaxed); + } +} + std::unique_ptr replace_character_parallel(strings_column_view const& input, string_view const& d_target, string_view const& d_replacement, @@ -260,10 +285,14 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in // Count the number of targets 
in the entire column. // Note this may over-count in the case where a target spans adjacent strings. - auto target_count = thrust::count_if(rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(chars_bytes), - [fn] __device__(int64_t idx) { return fn.has_target(idx); }); + rmm::device_scalar d_target_count(0, stream); + constexpr int64_t block_size = 512; + constexpr size_type bytes_per_thread = 4; + auto const num_blocks = util::div_rounding_up_safe( + util::div_rounding_up_safe(chars_bytes, static_cast(bytes_per_thread)), block_size); + count_targets_kernel + <<>>(fn, chars_bytes, d_target_count.data()); + auto target_count = d_target_count.value(stream); // Create a vector of every target position in the chars column. // These may also include overlapping targets which will be resolved later. diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh index ae3c0b3aa12..23614ac0733 100644 --- a/cpp/src/strings/split/split.cuh +++ b/cpp/src/strings/split/split.cuh @@ -327,7 +327,7 @@ CUDF_KERNEL void count_delimiters_kernel(Tokenizer tokenizer, auto const total = block_reduce(temp_storage).Reduce(count, cub::Sum()); if ((lane_idx == 0) && (total > 0)) { - cuda::atomic_ref ref{*d_output}; + cuda::atomic_ref ref{*d_output}; ref.fetch_add(total, cuda::std::memory_order_relaxed); } } diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu index 25406bce759..3ce6064d9c2 100644 --- a/cpp/src/text/tokenize.cu +++ b/cpp/src/text/tokenize.cu @@ -121,7 +121,7 @@ CUDF_KERNEL void count_characters(uint8_t const* d_chars, int64_t chars_bytes, i auto const total = block_reduce(temp_storage).Reduce(count, cub::Sum()); if ((lane_idx == 0) && (total > 0)) { - cuda::atomic_ref ref{*d_output}; + cuda::atomic_ref ref{*d_output}; ref.fetch_add(total, cuda::std::memory_order_relaxed); } } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index f6d762cc2ec..329edbe4d36 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -571,9 +571,9 @@ ConfigureTest( large_strings/concatenate_tests.cpp large_strings/case_tests.cpp large_strings/large_strings_fixture.cpp - large_strings/many_strings_tests.cpp large_strings/merge_tests.cpp large_strings/parquet_tests.cpp + large_strings/replace_tests.cpp large_strings/reshape_tests.cpp large_strings/split_strings_tests.cpp GPUS 1 @@ -693,10 +693,11 @@ ConfigureTest(STREAM_NULL_MASK_TEST streams/null_mask_test.cpp STREAM_MODE testi ConfigureTest(STREAM_ORCIO_TEST streams/io/orc_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_PARQUETIO_TEST streams/io/parquet_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_POOL_TEST streams/pool_test.cu STREAM_MODE testing) +ConfigureTest(STREAM_REDUCTION_TEST streams/reduction_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_ROLLING_TEST streams/rolling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) -ConfigureTest(STREAM_REDUCTION_TEST streams/reduction_test.cpp STREAM_MODE testing) ConfigureTest( STREAM_STRINGS_TEST streams/strings/case_test.cpp diff --git a/cpp/tests/large_strings/many_strings_tests.cpp b/cpp/tests/large_strings/replace_tests.cpp similarity index 72% rename from cpp/tests/large_strings/many_strings_tests.cpp rename to cpp/tests/large_strings/replace_tests.cpp index 73fbb21d014..aa65ec0c010 
100644 --- a/cpp/tests/large_strings/many_strings_tests.cpp +++ b/cpp/tests/large_strings/replace_tests.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -28,9 +29,9 @@ #include #include -struct StringsManyTest : public cudf::test::StringsLargeTest {}; +struct ReplaceTest : public cudf::test::StringsLargeTest {}; -TEST_F(StringsManyTest, Replace) +TEST_F(ReplaceTest, ReplaceLong) { auto const expected = this->very_long_column(); auto const view = cudf::column_view(expected); @@ -65,3 +66,22 @@ TEST_F(StringsManyTest, Replace) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, expected); } } + +TEST_F(ReplaceTest, ReplaceWide) +{ + auto const expected = this->long_column(); + auto const view = cudf::column_view(expected); + auto const multiplier = 10; + auto const separator = cudf::string_scalar("|"); + auto const input = cudf::strings::concatenate( + cudf::table_view(std::vector(multiplier, view)), separator); + + auto const input_view = cudf::strings_column_view(input->view()); + auto const target = cudf::string_scalar("3"); // fake the actual replace; + auto const repl = cudf::string_scalar("3"); // logic still builds the output + auto result = cudf::strings::replace(input_view, target, repl); + + auto sv = cudf::strings_column_view(result->view()); + EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(input->view(), result->view()); +} diff --git a/cpp/tests/streams/rolling_test.cpp b/cpp/tests/streams/rolling_test.cpp new file mode 100644 index 00000000000..b352ad2c0d2 --- /dev/null +++ b/cpp/tests/streams/rolling_test.cpp @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +class RollingTest : public cudf::test::BaseFixture {}; + +TEST_F(RollingTest, FixedSize) +{ + cudf::test::fixed_width_column_wrapper input({1, 2, 3, 4, 5, 6, 7, 8, 9}); + + cudf::rolling_window(input, + 2, + 3, + 1, + *cudf::make_min_aggregation(), + cudf::test::get_default_stream()); +} + +TEST_F(RollingTest, FixedSizeDefault) +{ + cudf::test::fixed_width_column_wrapper input({1, 2, 3, 4, 5, 6, 7, 8, 9}); + cudf::test::fixed_width_column_wrapper defaults({42, 42, 42, 42, 9, 9, 7, 1, 1}); + + cudf::rolling_window(input, + defaults, + 2, + 3, + 1, + *cudf::make_lead_aggregation(1), + cudf::test::get_default_stream()); +} + +TEST_F(RollingTest, VariableSize) +{ + cudf::test::fixed_width_column_wrapper input({1, 2, 3, 4, 5, 6, 7, 8, 9}); + cudf::test::fixed_width_column_wrapper preceding({2, 2, 2, 2, 3, 3, 3, 3, 3}); + cudf::test::fixed_width_column_wrapper following({3, 3, 3, 3, 3, 2, 2, 2, 2}); + + cudf::rolling_window(input, + preceding, + following, + 1, + *cudf::make_min_aggregation(), + cudf::test::get_default_stream()); +} + +class GroupedRollingTest : public cudf::test::BaseFixture {}; + +TEST_F(GroupedRollingTest, FixedSize) +{ + cudf::test::fixed_width_column_wrapper input({1, 2, 3, 4, 5, 6, 7, 8, 9}); + + cudf::test::fixed_width_column_wrapper key_0({1, 1, 1, 2, 2, 2, 3, 3, 3}); + + cudf::test::fixed_width_column_wrapper key_1({4, 4, 4, 5, 5, 5, 6, 6, 6}); + + cudf::table_view grouping_keys{std::vector{key_0, key_1}}; + + cudf::grouped_rolling_window(grouping_keys, + input, + 2, + 3, + 1, + *cudf::make_min_aggregation(), + cudf::test::get_default_stream()); +} + +TEST_F(GroupedRollingTest, FixedSizeDefault) +{ + cudf::test::fixed_width_column_wrapper input({1, 2, 3, 4, 5, 6, 7, 8, 9}); + + cudf::test::fixed_width_column_wrapper key_0({1, 1, 1, 2, 2, 2, 3, 3, 3}); + + cudf::test::fixed_width_column_wrapper key_1({4, 4, 4, 5, 5, 5, 6, 6, 6}); + + cudf::test::fixed_width_column_wrapper defaults({42, 42, 42, 42, 9, 9, 7, 1, 1}); + + cudf::table_view grouping_keys{std::vector{key_0, key_1}}; + + cudf::grouped_rolling_window(grouping_keys, + input, + defaults, + 2, + 3, + 1, + *cudf::make_lead_aggregation(1), + cudf::test::get_default_stream()); +} + +TEST_F(GroupedRollingTest, WindowBounds) +{ + cudf::test::fixed_width_column_wrapper input({1, 2, 3, 4, 5, 6, 7, 8, 9}); + + cudf::test::fixed_width_column_wrapper key_0({1, 1, 1, 2, 2, 2, 3, 3, 3}); + + cudf::test::fixed_width_column_wrapper key_1({4, 4, 4, 5, 5, 5, 6, 6, 6}); + + auto const unbounded_preceding = cudf::window_bounds::unbounded(); + auto const following = cudf::window_bounds::get(1L); + + cudf::table_view grouping_keys{std::vector{key_0, key_1}}; + + cudf::grouped_rolling_window(grouping_keys, + input, + unbounded_preceding, + following, + 1, + *cudf::make_min_aggregation(), + cudf::test::get_default_stream()); +} + +TEST_F(GroupedRollingTest, WindowBoundsDefault) +{ + cudf::test::fixed_width_column_wrapper input({1, 2, 3, 4, 5, 6, 7, 8, 9}); + + cudf::test::fixed_width_column_wrapper key_0({1, 1, 1, 2, 2, 2, 3, 3, 3}); + + cudf::test::fixed_width_column_wrapper key_1({4, 4, 4, 5, 5, 5, 6, 6, 6}); + + cudf::test::fixed_width_column_wrapper defaults({42, 42, 42, 42, 9, 9, 7, 1, 1}); + + auto const unbounded_preceding = cudf::window_bounds::unbounded(); + auto const following = cudf::window_bounds::get(1L); + + cudf::table_view grouping_keys{std::vector{key_0, key_1}}; + + cudf::grouped_rolling_window(grouping_keys, + input, + 
defaults, + unbounded_preceding, + following, + 1, + *cudf::make_lead_aggregation(1), + cudf::test::get_default_stream()); +} + +class GroupedTimeRollingTest : public cudf::test::BaseFixture {}; + +TEST_F(GroupedTimeRollingTest, FixedSize) +{ + auto const grp_col = + cudf::test::fixed_width_column_wrapper{0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + auto const agg_col = cudf::test::fixed_width_column_wrapper{ + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {1, 1, 1, 1, 1, 0, 1, 1, 1, 1}}; + auto const time_col = + cudf::test::fixed_width_column_wrapper{ + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 0, 0, 0, 1, 1, 1, 1, 1, 1}}; + + auto const grouping_keys = cudf::table_view{std::vector{grp_col}}; + auto const preceding = 1L; + auto const following = 1L; + auto const min_periods = 1L; + cudf::grouped_time_range_rolling_window( + grouping_keys, + time_col, + cudf::order::ASCENDING, + agg_col, + preceding, + following, + min_periods, + *cudf::make_count_aggregation(), + cudf::test::get_default_stream()); +} + +TEST_F(GroupedTimeRollingTest, WindowBounds) +{ + auto const grp_col = + cudf::test::fixed_width_column_wrapper{0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + auto const agg_col = cudf::test::fixed_width_column_wrapper{ + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {1, 1, 1, 1, 1, 0, 1, 1, 1, 1}}; + auto const time_col = + cudf::test::fixed_width_column_wrapper{ + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 0, 0, 0, 1, 1, 1, 1, 1, 1}}; + + auto const grouping_keys = cudf::table_view{std::vector{grp_col}}; + auto const unbounded_preceding = cudf::window_bounds::unbounded(); + auto const following = cudf::window_bounds::get(1L); + + auto const min_periods = 1L; + cudf::grouped_time_range_rolling_window( + grouping_keys, + time_col, + cudf::order::ASCENDING, + agg_col, + unbounded_preceding, + following, + min_periods, + *cudf::make_count_aggregation(), + cudf::test::get_default_stream()); +} + +class GroupedRangeRollingTest : public cudf::test::BaseFixture {}; + +TEST_F(GroupedRangeRollingTest, RangeWindowBounds) +{ + auto const grp_col = cudf::test::fixed_width_column_wrapper{0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + auto const agg_col = cudf::test::fixed_width_column_wrapper{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, + {1, 1, 1, 1, 1, 0, 1, 1, 1, 1}}; + + auto const order_by = cudf::test::fixed_width_column_wrapper{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, + {0, 0, 0, 0, 1, 1, 1, 1, 1, 1}}; + + cudf::range_window_bounds preceding = cudf::range_window_bounds::get( + cudf::numeric_scalar{int{1}, true, cudf::test::get_default_stream()}, + cudf::test::get_default_stream()); + + cudf::range_window_bounds following = cudf::range_window_bounds::get( + cudf::numeric_scalar{int{1}, true, cudf::test::get_default_stream()}, + cudf::test::get_default_stream()); + + auto const min_periods = cudf::size_type{1}; + + auto const grouping_keys = cudf::table_view{std::vector{grp_col}}; + + cudf::grouped_range_rolling_window(grouping_keys, + order_by, + cudf::order::ASCENDING, + agg_col, + preceding, + following, + min_periods, + *cudf::make_count_aggregation(), + cudf::test::get_default_stream()); +} diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 7fdebda7d76..9ac761b6be1 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -377,6 +377,16 @@ def floor(self, freq: str) -> ColumnBase: def round(self, freq: str) -> ColumnBase: return libcudf.datetime.round_datetime(self, freq) + def isocalendar(self) -> dict[str, ColumnBase]: + return { + field: self.as_string_column("str", format=directive).astype( 
+ "uint32" + ) + for field, directive in zip( + ["year", "week", "day"], ["%G", "%V", "%u"] + ) + } + def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike: if isinstance(other, (cudf.Scalar, ColumnBase, cudf.DateOffset)): return other diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 8eec84b64f7..26b449f1863 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -392,7 +392,7 @@ def corr(self, other: TimeDeltaColumn) -> float: other.as_numerical_column("int64") ) - def components(self, index=None) -> "cudf.DataFrame": + def components(self) -> dict[str, ColumnBase]: """ Return a Dataframe of the components of the Timedeltas. @@ -484,11 +484,7 @@ def components(self, index=None) -> "cudf.DataFrame": if self.nullable: res_col = res_col.set_mask(self.mask) data[name] = res_col - - return cudf.DataFrame( - data=data, - index=index, - ) + return data @property def days(self) -> "cudf.core.column.NumericalColumn": diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 13fa187842d..df21d392311 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2157,7 +2157,7 @@ def month_name(self, locale: str | None = None) -> Index: return Index._from_data({self.name: month_names}) @_cudf_nvtx_annotate - def isocalendar(self): + def isocalendar(self) -> cudf.DataFrame: """ Returns a DataFrame with the year, week, and day calculated according to the ISO 8601 standard. @@ -2176,7 +2176,10 @@ def isocalendar(self): 2020-05-31 08:00:00 2020 22 7 1999-12-31 18:40:00 1999 52 5 """ - return cudf.core.tools.datetimes._to_iso_calendar(self) + ca = cudf.core.column_accessor.ColumnAccessor( + self._column.isocalendar(), verify=False + ) + return cudf.DataFrame._from_data(ca, index=self) @_cudf_nvtx_annotate def to_pandas( @@ -2546,7 +2549,10 @@ def components(self): Return a dataframe of the components (days, hours, minutes, seconds, milliseconds, microseconds, nanoseconds) of the Timedeltas. """ - return self._values.components() + ca = cudf.core.column_accessor.ColumnAccessor( + self._column.components(), verify=False + ) + return cudf.DataFrame._from_data(ca) @property def inferred_freq(self): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index e532948fd11..c0716d7709a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -778,9 +778,9 @@ def dt(self): ------ TypeError if the Series does not contain datetimelike values. """ - if isinstance(self._column, DatetimeColumn): + if self.dtype.kind == "M": return DatetimeProperties(self) - elif isinstance(self._column, TimeDeltaColumn): + elif self.dtype.kind == "m": return TimedeltaProperties(self) else: raise AttributeError( @@ -3677,7 +3677,21 @@ def wrapper(self, other, level=None, fill_value=None, axis=0): setattr(Series, binop, make_binop_func(binop)) -class DatetimeProperties: +class BaseDatelikeProperties: + """ + Base accessor class for Series values. + """ + + def __init__(self, series: Series): + self.series = series + + def _return_result_like_self(self, column: ColumnBase) -> Series: + """Return the method result like self.series""" + data = ColumnAccessor({self.series.name: column}, verify=False) + return self.series._from_data_like_self(data) + + +class DatetimeProperties(BaseDatelikeProperties): """ Accessor object for datetimelike properties of the Series values. 
@@ -3727,12 +3741,9 @@ class DatetimeProperties: dtype: int16 """ - def __init__(self, series): - self.series = series - @property # type: ignore @_cudf_nvtx_annotate - def year(self): + def year(self) -> Series: """ The year of the datetime. @@ -3757,7 +3768,7 @@ def year(self): @property # type: ignore @_cudf_nvtx_annotate - def month(self): + def month(self) -> Series: """ The month as January=1, December=12. @@ -3782,7 +3793,7 @@ def month(self): @property # type: ignore @_cudf_nvtx_annotate - def day(self): + def day(self) -> Series: """ The day of the datetime. @@ -3807,7 +3818,7 @@ def day(self): @property # type: ignore @_cudf_nvtx_annotate - def hour(self): + def hour(self) -> Series: """ The hours of the datetime. @@ -3832,7 +3843,7 @@ def hour(self): @property # type: ignore @_cudf_nvtx_annotate - def minute(self): + def minute(self) -> Series: """ The minutes of the datetime. @@ -3857,7 +3868,7 @@ def minute(self): @property # type: ignore @_cudf_nvtx_annotate - def second(self): + def second(self) -> Series: """ The seconds of the datetime. @@ -3882,7 +3893,7 @@ def second(self): @property # type: ignore @_cudf_nvtx_annotate - def microsecond(self): + def microsecond(self) -> Series: """ The microseconds of the datetime. @@ -3903,22 +3914,18 @@ def microsecond(self): 2 2 dtype: int32 """ - return Series( - data=( - # Need to manually promote column to int32 because - # pandas-matching binop behaviour requires that this - # __mul__ returns an int16 column. - self.series._column.get_dt_field("millisecond").astype("int32") - * cudf.Scalar(1000, dtype="int32") - ) - + self.series._column.get_dt_field("microsecond"), - index=self.series.index, - name=self.series.name, - ) + micro = self.series._column.get_dt_field("microsecond") + # Need to manually promote column to int32 because + # pandas-matching binop behaviour requires that this + # __mul__ returns an int16 column. + extra = self.series._column.get_dt_field("millisecond").astype( + "int32" + ) * cudf.Scalar(1000, dtype="int32") + return self._return_result_like_self(micro + extra) @property # type: ignore @_cudf_nvtx_annotate - def nanosecond(self): + def nanosecond(self) -> Series: """ The nanoseconds of the datetime. @@ -3943,7 +3950,7 @@ def nanosecond(self): @property # type: ignore @_cudf_nvtx_annotate - def weekday(self): + def weekday(self) -> Series: """ The day of the week with Monday=0, Sunday=6. @@ -3980,7 +3987,7 @@ def weekday(self): @property # type: ignore @_cudf_nvtx_annotate - def dayofweek(self): + def dayofweek(self) -> Series: """ The day of the week with Monday=0, Sunday=6. @@ -4017,7 +4024,7 @@ def dayofweek(self): @property # type: ignore @_cudf_nvtx_annotate - def dayofyear(self): + def dayofyear(self) -> Series: """ The day of the year, from 1-365 in non-leap years and from 1-366 in leap years. @@ -4055,7 +4062,7 @@ def dayofyear(self): @property # type: ignore @_cudf_nvtx_annotate - def day_of_year(self): + def day_of_year(self) -> Series: """ The day of the year, from 1-365 in non-leap years and from 1-366 in leap years. @@ -4093,7 +4100,7 @@ def day_of_year(self): @property # type: ignore @_cudf_nvtx_annotate - def is_leap_year(self): + def is_leap_year(self) -> Series: """ Boolean indicator if the date belongs to a leap year. 
@@ -4144,15 +4151,11 @@ def is_leap_year(self): dtype: bool """ res = libcudf.datetime.is_leap_year(self.series._column).fillna(False) - return Series._from_data( - ColumnAccessor({None: res}), - index=self.series.index, - name=self.series.name, - ) + return self._return_result_like_self(res) @property # type: ignore @_cudf_nvtx_annotate - def quarter(self): + def quarter(self) -> Series: """ Integer indicator for which quarter of the year the date belongs in. @@ -4178,14 +4181,10 @@ def quarter(self): res = libcudf.datetime.extract_quarter(self.series._column).astype( np.int8 ) - return Series._from_data( - {None: res}, - index=self.series.index, - name=self.series.name, - ) + return self._return_result_like_self(res) @_cudf_nvtx_annotate - def day_name(self, locale=None): + def day_name(self, locale: str | None = None) -> Series: """ Return the day names. Currently supports English locale only. @@ -4216,11 +4215,8 @@ def day_name(self, locale=None): 7 Saturday dtype: object """ - day_names = self.series._column.get_day_names(locale) - return Series._from_data( - ColumnAccessor({None: day_names}), - index=self.series.index, - name=self.series.name, + return self._return_result_like_self( + self.series._column.get_day_names(locale) ) @_cudf_nvtx_annotate @@ -4249,15 +4245,12 @@ def month_name(self, locale: str | None = None) -> Series: 5 February dtype: object """ - month_names = self.series._column.get_month_names(locale) - return Series._from_data( - ColumnAccessor({None: month_names}), - index=self.series.index, - name=self.series.name, + return self._return_result_like_self( + self.series._column.get_month_names(locale) ) @_cudf_nvtx_annotate - def isocalendar(self): + def isocalendar(self) -> cudf.DataFrame: """ Returns a DataFrame with the year, week, and day calculated according to the ISO 8601 standard. @@ -4298,11 +4291,14 @@ def isocalendar(self): 1 Name: year, dtype: object """ - return cudf.core.tools.datetimes._to_iso_calendar(self) + ca = ColumnAccessor(self.series._column.isocalendar(), verify=False) + return self.series._constructor_expanddim._from_data( + ca, index=self.series.index + ) @property # type: ignore @_cudf_nvtx_annotate - def is_month_start(self): + def is_month_start(self) -> Series: """ Booleans indicating if dates are the first day of the month. """ @@ -4310,7 +4306,7 @@ def is_month_start(self): @property # type: ignore @_cudf_nvtx_annotate - def days_in_month(self): + def days_in_month(self) -> Series: """ Get the total number of days in the month that the date falls on. @@ -4353,16 +4349,13 @@ def days_in_month(self): 11 31 dtype: int16 """ - res = libcudf.datetime.days_in_month(self.series._column) - return Series._from_data( - ColumnAccessor({None: res}), - index=self.series.index, - name=self.series.name, + return self._return_result_like_self( + libcudf.datetime.days_in_month(self.series._column) ) @property # type: ignore @_cudf_nvtx_annotate - def is_month_end(self): + def is_month_end(self) -> Series: """ Boolean indicator if the date is the last day of the month. 
@@ -4399,17 +4392,13 @@ def is_month_end(self): 8 False dtype: bool """ # noqa: E501 - last_day = libcudf.datetime.last_day_of_month(self.series._column) - last_day = Series._from_data( - ColumnAccessor({None: last_day}), - index=self.series.index, - name=self.series.name, - ) + last_day_col = libcudf.datetime.last_day_of_month(self.series._column) + last_day = self._return_result_like_self(last_day_col) return (self.day == last_day.dt.day).fillna(False) @property # type: ignore @_cudf_nvtx_annotate - def is_quarter_start(self): + def is_quarter_start(self) -> Series: """ Boolean indicator if the date is the first day of a quarter. @@ -4450,15 +4439,11 @@ def is_quarter_start(self): ) result = ((day == cudf.Scalar(1)) & first_month).fillna(False) - return Series._from_data( - {None: result}, - index=self.series.index, - name=self.series.name, - ) + return self._return_result_like_self(result) @property # type: ignore @_cudf_nvtx_annotate - def is_quarter_end(self): + def is_quarter_end(self) -> Series: """ Boolean indicator if the date is the last day of a quarter. @@ -4501,15 +4486,11 @@ def is_quarter_end(self): ) result = ((day == last_day) & last_month).fillna(False) - return Series._from_data( - {None: result}, - index=self.series.index, - name=self.series.name, - ) + return self._return_result_like_self(result) @property # type: ignore @_cudf_nvtx_annotate - def is_year_start(self): + def is_year_start(self) -> Series: """ Boolean indicator if the date is the first day of the year. @@ -4536,15 +4517,11 @@ def is_year_start(self): outcol = self.series._column.get_dt_field( "day_of_year" ) == cudf.Scalar(1) - return Series._from_data( - {None: outcol.fillna(False)}, - index=self.series.index, - name=self.series.name, - ) + return self._return_result_like_self(outcol.fillna(False)) @property # type: ignore @_cudf_nvtx_annotate - def is_year_end(self): + def is_year_end(self) -> Series: """ Boolean indicator if the date is the last day of the year. @@ -4574,22 +4551,16 @@ def is_year_end(self): leap = day_of_year == cudf.Scalar(366) non_leap = day_of_year == cudf.Scalar(365) result = cudf._lib.copying.copy_if_else(leap, non_leap, leap_dates) - result = result.fillna(False) - return Series._from_data( - {None: result}, - index=self.series.index, - name=self.series.name, - ) + return self._return_result_like_self(result.fillna(False)) @_cudf_nvtx_annotate - def _get_dt_field(self, field): - out_column = self.series._column.get_dt_field(field) - return Series( - data=out_column, index=self.series.index, name=self.series.name + def _get_dt_field(self, field: str) -> Series: + return self._return_result_like_self( + self.series._column.get_dt_field(field) ) @_cudf_nvtx_annotate - def ceil(self, freq): + def ceil(self, freq: str) -> Series: """ Perform ceil operation on the data to the specified freq. @@ -4619,14 +4590,10 @@ def ceil(self, freq): 2 2001-01-01 00:06:00 dtype: datetime64[ns] """ - out_column = self.series._column.ceil(freq) - - return Series._from_data( - data={self.series.name: out_column}, index=self.series.index - ) + return self._return_result_like_self(self.series._column.ceil(freq)) @_cudf_nvtx_annotate - def floor(self, freq): + def floor(self, freq: str) -> Series: """ Perform floor operation on the data to the specified freq. 
@@ -4656,14 +4623,10 @@ def floor(self, freq): 2 2001-01-01 00:05:00 dtype: datetime64[ns] """ - out_column = self.series._column.floor(freq) - - return Series._from_data( - data={self.series.name: out_column}, index=self.series.index - ) + return self._return_result_like_self(self.series._column.floor(freq)) @_cudf_nvtx_annotate - def round(self, freq): + def round(self, freq: str) -> Series: """ Perform round operation on the data to the specified freq. @@ -4696,14 +4659,10 @@ def round(self, freq): 2 2001-01-01 00:05:00 dtype: datetime64[ns] """ - out_column = self.series._column.round(freq) - - return Series._from_data( - data={self.series.name: out_column}, index=self.series.index - ) + return self._return_result_like_self(self.series._column.round(freq)) @_cudf_nvtx_annotate - def strftime(self, date_format, *args, **kwargs): + def strftime(self, date_format: str, *args, **kwargs) -> Series: """ Convert to Series using specified ``date_format``. @@ -4777,11 +4736,10 @@ def strftime(self, date_format, *args, **kwargs): f"https://github.com/rapidsai/cudf/issues/5991 " f"for tracking purposes." ) - str_col = self.series._column.as_string_column( - dtype="str", format=date_format - ) - return Series( - data=str_col, index=self.series.index, name=self.series.name + return self._return_result_like_self( + self.series._column.as_string_column( + dtype="str", format=date_format + ) ) @copy_docstring(DatetimeIndex.tz_localize) @@ -4790,17 +4748,13 @@ def tz_localize( tz: str | None, ambiguous: Literal["NaT"] = "NaT", nonexistent: Literal["NaT"] = "NaT", - ): - result_col = self.series._column.tz_localize( - tz, ambiguous, nonexistent - ) - return Series._from_data( - data={self.series.name: result_col}, - index=self.series.index, + ) -> Series: + return self._return_result_like_self( + self.series._column.tz_localize(tz, ambiguous, nonexistent) ) @copy_docstring(DatetimeIndex.tz_convert) - def tz_convert(self, tz: str | None): + def tz_convert(self, tz: str | None) -> Series: """ Parameters ---------- @@ -4810,13 +4764,12 @@ def tz_convert(self, tz: str | None): A `tz` of None will convert to UTC and remove the timezone information. """ - result_col = self.series._column.tz_convert(tz) - return Series._from_data( - {self.series.name: result_col}, index=self.series.index + return self._return_result_like_self( + self.series._column.tz_convert(tz) ) -class TimedeltaProperties: +class TimedeltaProperties(BaseDatelikeProperties): """ Accessor object for timedelta-like properties of the Series values. @@ -4884,12 +4837,9 @@ class TimedeltaProperties: dtype: int64 """ - def __init__(self, series): - self.series = series - @property # type: ignore @_cudf_nvtx_annotate - def days(self): + def days(self) -> Series: """ Number of days. @@ -4921,7 +4871,7 @@ def days(self): @property # type: ignore @_cudf_nvtx_annotate - def seconds(self): + def seconds(self) -> Series: """ Number of seconds (>= 0 and less than 1 day). @@ -4960,7 +4910,7 @@ def seconds(self): @property # type: ignore @_cudf_nvtx_annotate - def microseconds(self): + def microseconds(self) -> Series: """ Number of microseconds (>= 0 and less than 1 second). @@ -4992,7 +4942,7 @@ def microseconds(self): @property # type: ignore @_cudf_nvtx_annotate - def nanoseconds(self): + def nanoseconds(self) -> Series: """ Return the number of nanoseconds (n), where 0 <= n < 1 microsecond. 
@@ -5024,7 +4974,7 @@ def nanoseconds(self): @property # type: ignore @_cudf_nvtx_annotate - def components(self): + def components(self) -> cudf.DataFrame: """ Return a Dataframe of the components of the Timedeltas. @@ -5050,13 +5000,15 @@ def components(self): 3 0 0 35 35 656 0 0 4 37 13 12 14 234 0 0 """ # noqa: E501 - return self.series._column.components(index=self.series.index) + ca = ColumnAccessor(self.series._column.components(), verify=False) + return self.series._constructor_expanddim._from_data( + ca, index=self.series.index + ) @_cudf_nvtx_annotate - def _get_td_field(self, field): - out_column = getattr(self.series._column, field) - return Series( - data=out_column, index=self.series.index, name=self.series.name + def _get_td_field(self, field: str) -> Series: + return self._return_result_like_self( + getattr(self.series._column, field) ) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 29130130732..397bfe1d472 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -1048,22 +1048,3 @@ def _offset_to_nanoseconds_lower_bound(offset: DateOffset) -> int: + kwds.get("microseconds", 0) * 10**3 + kwds.get("nanoseconds", 0) ) - - -def _to_iso_calendar(arg): - formats = ["%G", "%V", "%u"] - if not isinstance(arg, (cudf.Index, cudf.core.series.DatetimeProperties)): - raise AttributeError( - "Can only use .isocalendar accessor with series or index" - ) - if isinstance(arg, cudf.Index): - iso_params = [ - arg._column.as_string_column(arg.dtype, fmt) for fmt in formats - ] - index = arg._column - elif isinstance(arg.series, cudf.Series): - iso_params = [arg.strftime(fmt) for fmt in formats] - index = arg.series.index - - data = dict(zip(["year", "week", "day"], iso_params)) - return cudf.DataFrame(data, index=index, dtype=np.int32) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 0605bba6642..c92e0714d54 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -512,13 +512,17 @@ def do_evaluate( ] if self.name == pl_expr.BooleanFunction.Any: (column,) = columns - return plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.any(), self.dtype), 1 + return Column( + plc.Column.from_scalar( + plc.reduce.reduce(column.obj, plc.aggregation.any(), self.dtype), 1 + ) ) elif self.name == pl_expr.BooleanFunction.All: (column,) = columns - return plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.all(), self.dtype), 1 + return Column( + plc.Column.from_scalar( + plc.reduce.reduce(column.obj, plc.aggregation.all(), self.dtype), 1 + ) ) if self.name == pl_expr.BooleanFunction.IsNull: (column,) = columns @@ -612,20 +616,32 @@ def do_evaluate( column, lo, hi = columns (closed,) = self.options lop, rop = self._BETWEEN_OPS[closed] + lo_obj = ( + lo.obj_scalar + if lo.is_scalar and lo.obj.size() != column.obj.size() + else lo.obj + ) + hi_obj = ( + hi.obj_scalar + if hi.is_scalar and hi.obj.size() != column.obj.size() + else hi.obj + ) return Column( plc.binaryop.binary_operation( plc.binaryop.binary_operation( - column.obj, lo.obj, lop, output_type=self.dtype + column.obj, lo_obj, lop, output_type=self.dtype ), plc.binaryop.binary_operation( - column.obj, hi.obj, rop, output_type=self.dtype + column.obj, hi_obj, rop, output_type=self.dtype ), plc.binaryop.BinaryOperator.LOGICAL_AND, self.dtype, ) ) else: - raise NotImplementedError(f"BooleanFunction 
+            raise NotImplementedError(
+                f"BooleanFunction {self.name}"
+            )  # pragma: no cover; handled by init raising
 
 
 class StringFunction(Expr):
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index 7f0920e1b57..83957e4286d 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -17,7 +17,7 @@
 import itertools
 import types
 from functools import cache
-from typing import TYPE_CHECKING, Any, Callable, ClassVar, NoReturn
+from typing import TYPE_CHECKING, Any, Callable, ClassVar
 
 import pyarrow as pa
 from typing_extensions import assert_never
@@ -56,7 +56,6 @@
     "MapFunction",
     "Union",
     "HConcat",
-    "ExtContext",
 ]
 
 
@@ -153,7 +152,9 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         since the translation phase should pick up things that we cannot
         handle.
         """
-        raise NotImplementedError
+        raise NotImplementedError(
+            f"Evaluation of plan {type(self).__name__}"
+        )  # pragma: no cover
 
 
 @dataclasses.dataclass(slots=True)
@@ -346,7 +347,9 @@ class Reduce(IR):
     expr: list[expr.NamedExpr]
     """List of expressions to evaluate to form the new dataframe."""
 
-    def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
+    def evaluate(
+        self, *, cache: MutableMapping[int, DataFrame]
+    ) -> DataFrame:  # pragma: no cover; polars doesn't emit this node yet
         """Evaluate and return a dataframe."""
         df = self.df.evaluate(cache=cache)
         columns = broadcast(*(e.evaluate(df) for e in self.expr))
@@ -938,23 +941,3 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         return DataFrame(
             list(itertools.chain.from_iterable(df.columns for df in dfs)),
         )
-
-
-@dataclasses.dataclass(slots=True)
-class ExtContext(IR):
-    """
-    Concatenate dataframes horizontally.
-
-    Prefer HConcat, since this is going to be deprecated on the polars side.
-    """
-
-    df: IR
-    """Input."""
-    extra: list[IR]
-    """List of extra inputs."""
-
-    def __post_init__(self) -> NoReturn:
-        """Validate preconditions."""
-        raise NotImplementedError(
-            "ExtContext will be deprecated, use horizontal concat instead."
-        )
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index adde3b1a9dc..41bc3032bc5 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -63,7 +63,9 @@ def __exit__(self, *args: Any) -> None:
 def _translate_ir(
     node: Any, visitor: NodeTraverser, schema: dict[str, plc.DataType]
 ) -> ir.IR:
-    raise NotImplementedError(f"Translation for {type(node).__name__}")
+    raise NotImplementedError(
+        f"Translation for {type(node).__name__}"
+    )  # pragma: no cover
 
 
 @_translate_ir.register
@@ -172,7 +174,7 @@ def _(
 @_translate_ir.register
 def _(
     node: pl_ir.Reduce, visitor: NodeTraverser, schema: dict[str, plc.DataType]
-) -> ir.IR:
+) -> ir.IR:  # pragma: no cover; polars doesn't emit this node yet
     with set_node(visitor, node.input):
         inp = translate_ir(visitor, n=None)
         exprs = [translate_named_expr(visitor, n=e) for e in node.expr]
@@ -256,17 +258,6 @@ def _(
     return ir.HConcat(schema, [translate_ir(visitor, n=n) for n in node.inputs])
 
 
-@_translate_ir.register
-def _(
-    node: pl_ir.ExtContext, visitor: NodeTraverser, schema: dict[str, plc.DataType]
-) -> ir.IR:
-    return ir.ExtContext(
-        schema,
-        translate_ir(visitor, n=node.input),
-        [translate_ir(visitor, n=n) for n in node.contexts],
-    )
-
-
 def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR:
     """
     Translate a polars-internal IR node to our representation.
@@ -333,7 +324,9 @@ def translate_named_expr(
 def _translate_expr(
     node: Any, visitor: NodeTraverser, dtype: plc.DataType
 ) -> expr.Expr:
-    raise NotImplementedError(f"Translation for {type(node).__name__}")
+    raise NotImplementedError(
+        f"Translation for {type(node).__name__}"
+    )  # pragma: no cover
 
 
 @_translate_expr.register
diff --git a/python/cudf_polars/tests/expressions/test_booleanfunction.py b/python/cudf_polars/tests/expressions/test_booleanfunction.py
new file mode 100644
index 00000000000..951b749e670
--- /dev/null
+++ b/python/cudf_polars/tests/expressions/test_booleanfunction.py
@@ -0,0 +1,129 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.fixture(params=[False, True], ids=["no_nulls", "nulls"])
+def has_nulls(request):
+    return request.param
+
+
+@pytest.mark.parametrize(
+    "ignore_nulls",
+    [
+        pytest.param(
+            False, marks=pytest.mark.xfail(reason="No support for Kleene logic")
+        ),
+        True,
+    ],
+)
+def test_booleanfunction_reduction(ignore_nulls):
+    ldf = pl.LazyFrame(
+        {
+            "a": [1, 2, 3.0, 2, 5],
+            "b": [0, 3, 1, -1, None],
+            "c": [1, 6, 5, 3, 2],
+        }
+    )
+
+    query = ldf.select(
+        (pl.col("a") > 3).any(ignore_nulls=ignore_nulls),
+        (pl.col("b") > 2).all(ignore_nulls=ignore_nulls),
+    )
+
+    assert_gpu_result_equal(query)
+
+
+@pytest.mark.parametrize(
+    "expr",
+    [
+        pl.Expr.is_null,
+        pl.Expr.is_not_null,
+        pl.Expr.is_nan,
+        pl.Expr.is_not_nan,
+    ],
+    ids=lambda f: f"{f.__name__}()",
+)
+@pytest.mark.parametrize("has_nans", [False, True], ids=["no_nans", "nans"])
+def test_boolean_function_unary(request, expr, has_nans, has_nulls):
+    if has_nulls and expr in (pl.Expr.is_nan, pl.Expr.is_not_nan):
+        request.applymarker(
+            pytest.mark.xfail(
+                reason="Need to copy null mask since is_{not_}nan(null) => null"
+            )
+        )
+
+    values: list[float | None] = [1, 2, 3, 4, 5]
+    if has_nans:
+        values[3] = float("nan")
+    if has_nulls:
+        values[0] = None
+
+    df = pl.LazyFrame({"a": pl.Series(values, dtype=pl.Float32())})
+
+    q = df.select(expr(pl.col("a")))
+
+    assert_gpu_result_equal(q)
+
+
+@pytest.mark.xfail(reason="Evaluation handlers not yet implemented")
+@pytest.mark.parametrize(
+    "expr",
+    [
+        pl.col("a").is_finite(),
+        pl.col("a").is_infinite(),
+        pl.col("a").is_in(pl.col("b")),
+    ],
+)
+def test_unsupported_boolean_function(expr):
+    df = pl.LazyFrame({"a": [1, float("nan"), 2, 4], "b": [1, 2, 3, 4]})
+
+    q = df.select(expr)
+
+    assert_gpu_result_equal(q)
+
+
+@pytest.mark.parametrize("closed", ["both", "left", "right", "none"])
+@pytest.mark.parametrize(
+    "bounds", [(1, 2), (-1, 10), (11, 10), (pl.col("lo"), pl.col("hi"))]
+)
+def test_boolean_isbetween(closed, bounds):
+    df = pl.LazyFrame(
+        {"a": [1, float("nan"), 2, 4], "lo": [1, 2, 2, 3], "hi": [10, 4, 2, 4]}
+    )
+
+    q = df.select(pl.col("a").is_between(*bounds, closed=closed))
+
+    assert_gpu_result_equal(q)
+
+
+@pytest.mark.parametrize(
+    "expr", [pl.any_horizontal("*"), pl.all_horizontal("*")], ids=["any", "all"]
+)
+@pytest.mark.parametrize("wide", [False, True], ids=["narrow", "wide"])
+def test_boolean_horizontal(request, expr, has_nulls, wide):
+    if has_nulls:
+        request.applymarker(pytest.mark.xfail(reason="No support for Kleene logic"))
+    ldf = pl.LazyFrame(
+        {
+            "a": [False, False, False, False, False, True],
+            "b": [False, False, True, None, None, True],
+            "c": [False, True, True, False, True, True],
+        }
+    )
+    if not has_nulls:
+        ldf = ldf.select(pl.col("a"), pl.col("c"))
+
+    # To see the All/Any Horizontal nodes, we need a dataframe with
+    # more than 128 columns
+    if wide:
+        ldf = ldf.with_columns(pl.col("c").alias(f"col{i}") for i in range(128))
+    q = ldf.select(expr)
+
+    assert_gpu_result_equal(q)
diff --git a/python/cudf_polars/tests/test_extcontext.py b/python/cudf_polars/tests/test_extcontext.py
deleted file mode 100644
index 9daf88b4338..00000000000
--- a/python/cudf_polars/tests/test_extcontext.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
-# SPDX-License-Identifier: Apache-2.0
-from __future__ import annotations
-
-import pytest
-
-import polars as pl
-
-from cudf_polars.testing.asserts import assert_gpu_result_equal
-
-
-def test_extcontext():
-    ldf = pl.DataFrame(
-        {
-            "a": [1, 2, 3, 4, 5, 6, 7],
-            "b": [1, 1, 1, 1, 1, 1, 1],
-        }
-    ).lazy()
-    ldf2 = ldf.select((pl.col("b") + pl.col("a")).alias("c"))
-    query = ldf.with_context(ldf2).select(pl.col("b"), pl.col("c"))
-    with pytest.raises(pl.exceptions.ComputeError):
-        # ExtContext to be deprecated so we're not implementing it.
-        assert_gpu_result_equal(query)
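
Note on the DatetimeProperties/TimedeltaProperties hunks above: every dt/td accessor method now delegates to a shared `_return_result_like_self` helper inherited from the new `BaseDatelikeProperties` base class, instead of re-wrapping the result column by hand at each call site. The helper's definition is not part of the hunks shown here, so the following is only a minimal sketch of the pattern the removed call sites suggest it follows (class and method names come from the diff; the body is an assumption, not the actual implementation):

# Sketch only -- the real helper is defined elsewhere in this patch and may differ.
import cudf
from cudf.core.column import ColumnBase


class BaseDatelikeProperties:
    """Assumed base accessor that holds the wrapped Series."""

    def __init__(self, series: cudf.Series) -> None:
        self.series = series

    def _return_result_like_self(self, column: ColumnBase) -> cudf.Series:
        # Rebuild a Series around the result column, reusing the calling
        # accessor's name and index -- the same construction the removed
        # call sites performed by hand via Series._from_data(...).
        return cudf.Series._from_data(
            {self.series.name: column}, index=self.series.index
        )

With a helper like that in place, methods such as `Series.dt.round(...)`, `Series.dt.tz_convert(...)`, and the timedelta field accessors reduce to a one-line delegation to the corresponding column method, which is exactly what the replaced lines above show.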