diff --git a/cub/agent/agent_select_if.cuh b/cub/agent/agent_select_if.cuh
index f88d2f384f..807ea18c6d 100644
--- a/cub/agent/agent_select_if.cuh
+++ b/cub/agent/agent_select_if.cuh
@@ -93,7 +93,7 @@ template <
     typename    AgentSelectIfPolicyT,           ///< Parameterized AgentSelectIfPolicy tuning policy type
     typename    InputIteratorT,                 ///< Random-access input iterator type for selection items
     typename    FlagsInputIteratorT,            ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection)
-    typename    SelectedOutputIteratorT,        ///< Random-access input iterator type for selection_flags items
+    typename    SelectedOutputIteratorT,        ///< Random-access output iterator type for selection_flags items
     typename    SelectOpT,                      ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection)
     typename    EqualityOpT,                    ///< Equality operator type (NullType if selection functor or selections is to be used for selection)
     typename    OffsetT,                        ///< Signed integer type for global offsets
@@ -511,7 +511,7 @@ struct AgentSelectIf
      */
     template <bool IS_LAST_TILE>
     __device__ __forceinline__ OffsetT ConsumeFirstTile(
-        int                 num_tile_items,      ///< Number of input items comprising this tile
+        int                 num_tile_items,     ///< Number of input items comprising this tile
         OffsetT             tile_offset,        ///< Tile offset
         ScanTileStateT&     tile_state)         ///< Global tile state descriptor
     {
@@ -570,7 +570,7 @@ struct AgentSelectIf
      */
     template <bool IS_LAST_TILE>
     __device__ __forceinline__ OffsetT ConsumeSubsequentTile(
-        int                 num_tile_items,      ///< Number of input items comprising this tile
+        int                 num_tile_items,     ///< Number of input items comprising this tile
         int                 tile_idx,           ///< Tile index
         OffsetT             tile_offset,        ///< Tile offset
         ScanTileStateT&     tile_state)         ///< Global tile state descriptor
@@ -632,7 +632,7 @@ struct AgentSelectIf
      */
     template <bool IS_LAST_TILE>
     __device__ __forceinline__ OffsetT ConsumeTile(
-        int                 num_tile_items,         ///< Number of input items comprising this tile
+        int                 num_tile_items,     ///< Number of input items comprising this tile
         int                 tile_idx,           ///< Tile index
         OffsetT             tile_offset,        ///< Tile offset
         ScanTileStateT&     tile_state)         ///< Global tile state descriptor
@@ -688,4 +688,3 @@ struct AgentSelectIf
 
 
 CUB_NAMESPACE_END
-
diff --git a/cub/agent/agent_unique_by_key.cuh b/cub/agent/agent_unique_by_key.cuh
new file mode 100644
index 0000000000..d5925c43be
--- /dev/null
+++ b/cub/agent/agent_unique_by_key.cuh
@@ -0,0 +1,571 @@
+/******************************************************************************
+ * Copyright (c) NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentUniqueByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide unique-by-key.
+ */
+
+#pragma once
+
+#include <iterator>
+#include <type_traits>
+
+#include "../thread/thread_operators.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_scan.cuh"
+#include "../agent/single_pass_scan_operators.cuh"
+#include "../block/block_discontinuity.cuh"
+
+CUB_NAMESPACE_BEGIN
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentUniqueByKey
+ */
+template <int                     _BLOCK_THREADS,
+          int                     _ITEMS_PER_THREAD = 1,
+          cub::BlockLoadAlgorithm _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
+          cub::CacheLoadModifier  _LOAD_MODIFIER    = cub::LOAD_LDG,
+          cub::BlockScanAlgorithm _SCAN_ALGORITHM   = cub::BLOCK_SCAN_WARP_SCANS>
+struct AgentUniqueByKeyPolicy
+{
+    enum
+    {
+        BLOCK_THREADS    = _BLOCK_THREADS,
+        ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
+    };
+    static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
+    static const cub::CacheLoadModifier  LOAD_MODIFIER  = _LOAD_MODIFIER;
+    static const cub::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;
+};
+
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+
+/**
+ * \brief AgentUniqueByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide unique-by-key
+ */
+template <
+    typename AgentUniqueByKeyPolicyT,           ///< Parameterized AgentUniqueByKeyPolicy tuning policy type
+    typename KeyInputIteratorT,                 ///< Random-access input iterator type for keys
+    typename ValueInputIteratorT,               ///< Random-access input iterator type for values
+    typename KeyOutputIteratorT,                ///< Random-access output iterator type for keys
+    typename ValueOutputIteratorT,              ///< Random-access output iterator type for values
+    typename EqualityOpT,                       ///< Equality operator type
+    typename OffsetT>                           ///< Signed integer type for global offsets
+struct AgentUniqueByKey
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // The input key and value type
+    using KeyT = typename std::iterator_traits<KeyInputIteratorT>::value_type;
+    using ValueT = typename std::iterator_traits<ValueInputIteratorT>::value_type;
+
+    // Tile status descriptor interface type
+    using ScanTileStateT = ScanTileState<OffsetT>;
+
+    // Constants
+    enum
+    {
+        BLOCK_THREADS           = AgentUniqueByKeyPolicyT::BLOCK_THREADS,
+        ITEMS_PER_THREAD        = AgentUniqueByKeyPolicyT::ITEMS_PER_THREAD,
+        ITEMS_PER_TILE          = BLOCK_THREADS * ITEMS_PER_THREAD,
+    };
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys
+    using WrappedKeyInputIteratorT = typename std::conditional<std::is_pointer<KeyInputIteratorT>::value,
+            CacheModifiedInputIterator<AgentUniqueByKeyPolicyT::LOAD_MODIFIER, KeyT, OffsetT>,     // Wrap the native input pointer with CacheModifiedValuesInputIterator
+            KeyInputIteratorT>::type;                                                              // Directly use the supplied input iterator type
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for values
+    using WrappedValueInputIteratorT = typename std::conditional<std::is_pointer<ValueInputIteratorT>::value,
+            CacheModifiedInputIterator<AgentUniqueByKeyPolicyT::LOAD_MODIFIER, ValueT, OffsetT>,   // Wrap the native input pointer with CacheModifiedValuesInputIterator
+            ValueInputIteratorT>::type;                                                            // Directly use the supplied input iterator type
+
+    // Parameterized BlockLoad type for input data
+    using BlockLoadKeys = BlockLoad<
+            KeyT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD,
+            AgentUniqueByKeyPolicyT::LOAD_ALGORITHM>;
+
+    // Parameterized BlockLoad type for flags
+    using BlockLoadValues = BlockLoad<
+            ValueT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD,
+            AgentUniqueByKeyPolicyT::LOAD_ALGORITHM>;
+
+    // Parameterized BlockDiscontinuity type for items
+    using BlockDiscontinuityKeys = cub::BlockDiscontinuity<KeyT, BLOCK_THREADS>;
+
+    // Parameterized BlockScan type
+    using BlockScanT = cub::BlockScan<OffsetT, BLOCK_THREADS, AgentUniqueByKeyPolicyT::SCAN_ALGORITHM>;
+
+    // Parameterized BlockDiscontinuity type for items
+    using TilePrefixCallback = cub::TilePrefixCallbackOp<OffsetT, cub::Sum, ScanTileStateT>;
+
+    // Key exchange type
+    using KeyExchangeT = KeyT[ITEMS_PER_TILE];
+
+    // Value exchange type
+    using ValueExchangeT = ValueT[ITEMS_PER_TILE];
+
+    // Shared memory type for this thread block
+    union _TempStorage
+    {
+        struct ScanStorage
+        {
+            typename BlockScanT::TempStorage             scan;
+            typename TilePrefixCallback::TempStorage     prefix;
+            typename BlockDiscontinuityKeys::TempStorage discontinuity;
+        } scan_storage;
+
+        // Smem needed for loading keys
+        typename BlockLoadKeys::TempStorage   load_keys;
+
+        // Smem needed for loading values
+        typename BlockLoadValues::TempStorage load_values;
+
+        // Smem needed for compacting items (allows non POD items in this union)
+        Uninitialized<KeyExchangeT>   shared_keys;
+        Uninitialized<ValueExchangeT> shared_values;
+    };
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage&                       temp_storage;
+    WrappedKeyInputIteratorT            d_keys_in;
+    WrappedValueInputIteratorT          d_values_in;
+    KeyOutputIteratorT                  d_keys_out;
+    ValueOutputIteratorT                d_values_out;
+    cub::InequalityWrapper<EqualityOpT> inequality_op;
+    OffsetT                             num_items;
+
+
+    //---------------------------------------------------------------------
+    // Constructor
+    //---------------------------------------------------------------------
+
+    // Constructor
+    __device__ __forceinline__
+    AgentUniqueByKey(
+        TempStorage                  &temp_storage_,
+        WrappedKeyInputIteratorT     d_keys_in_,
+        WrappedValueInputIteratorT   d_values_in_,
+        KeyOutputIteratorT           d_keys_out_,
+        ValueOutputIteratorT         d_values_out_,
+        EqualityOpT                  equality_op_,
+        OffsetT                      num_items_)
+    : 
+        temp_storage(temp_storage_.Alias()),
+        d_keys_in(d_keys_in_),
+        d_values_in(d_values_in_),
+        d_keys_out(d_keys_out_),
+        d_values_out(d_values_out_),
+        inequality_op(equality_op_),
+        num_items(num_items_)
+    {}
+
+
+
+    //---------------------------------------------------------------------
+    // Utility functions
+    //---------------------------------------------------------------------
+
+    struct KeyTagT {};
+    struct ValueTagT {};
+
+    __device__ __forceinline__
+    KeyExchangeT &GetShared(KeyTagT)
+    {
+        return temp_storage.shared_keys.Alias();
+    }
+    __device__ __forceinline__
+    ValueExchangeT &GetShared(ValueTagT)
+    {
+        return temp_storage.shared_values.Alias();
+    }
+
+
+    //---------------------------------------------------------------------
+    // Scatter utility methods
+    //---------------------------------------------------------------------
+    template <typename Tag,
+              typename OutputIt,
+              typename T>
+    __device__ __forceinline__ void Scatter(
+        Tag      tag,
+        OutputIt items_out,
+        T (&items)[ITEMS_PER_THREAD],
+        OffsetT (&selection_flags)[ITEMS_PER_THREAD],
+        OffsetT (&selection_indices)[ITEMS_PER_THREAD],
+        int  /*num_tile_items*/,
+        int  num_tile_selections,
+        OffsetT num_selections_prefix,
+        OffsetT /*num_selections*/)
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            int local_scatter_offset = selection_indices[ITEM] -
+                                       num_selections_prefix;
+            if (selection_flags[ITEM])
+            {
+                GetShared(tag)[local_scatter_offset] = items[ITEM];
+            }
+        }
+
+        CTA_SYNC();
+
+        for (int item = threadIdx.x;
+             item < num_tile_selections;
+             item += BLOCK_THREADS)
+        {
+            items_out[num_selections_prefix + item] = GetShared(tag)[item];
+        }
+
+        CTA_SYNC();
+    }
+
+
+    //---------------------------------------------------------------------
+    // Cooperatively scan a device-wide sequence of tiles with other CTAs
+    //---------------------------------------------------------------------
+
+
+    /**
+     * Process first tile of input (dynamic chained scan).  Returns the running count of selections (including this tile)
+     */
+    template <bool IS_LAST_TILE>
+    __device__ __forceinline__ OffsetT ConsumeFirstTile(
+        int                 num_tile_items,     ///< Number of input items comprising this tile
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
+    {
+        KeyT        keys[ITEMS_PER_THREAD];
+        OffsetT     selection_flags[ITEMS_PER_THREAD];
+        OffsetT     selection_idx[ITEMS_PER_THREAD];
+
+        if (IS_LAST_TILE)
+        {
+            // Fill last elements with the first element
+            // because collectives are not suffix guarded
+            BlockLoadKeys(temp_storage.load_keys)
+                .Load(d_keys_in + tile_offset,
+                      keys,
+                      num_tile_items,
+                      *(d_keys_in + tile_offset));
+        }
+        else
+        {
+            BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys);
+        }
+
+
+        CTA_SYNC();
+
+        ValueT values[ITEMS_PER_THREAD];
+        if (IS_LAST_TILE)
+        {
+            // Fill last elements with the first element
+            // because collectives are not suffix guarded
+            BlockLoadValues(temp_storage.load_values)
+                .Load(d_values_in + tile_offset,
+                      values,
+                      num_tile_items,
+                      *(d_values_in + tile_offset));
+        }
+        else
+        {
+            BlockLoadValues(temp_storage.load_values)
+                .Load(d_values_in + tile_offset, values);
+        }
+
+        CTA_SYNC();
+
+        BlockDiscontinuityKeys(temp_storage.scan_storage.discontinuity)
+             .FlagHeads(selection_flags, keys, inequality_op);
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            // Set selection_flags for out-of-bounds items
+            if ((IS_LAST_TILE) && (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items))
+                selection_flags[ITEM] = 1;
+        }
+
+        CTA_SYNC();
+
+
+        OffsetT num_tile_selections   = 0;
+        OffsetT num_selections        = 0;
+        OffsetT num_selections_prefix = 0;
+
+        BlockScanT(temp_storage.scan_storage.scan)
+             .ExclusiveSum(selection_flags,
+                           selection_idx,
+                           num_tile_selections);
+
+        if (threadIdx.x == 0)
+        {
+            // Update tile status if this is not the last tile
+            if (!IS_LAST_TILE)
+                tile_state.SetInclusive(0, num_tile_selections);
+        }
+
+        // Do not count any out-of-bounds selections
+        if (IS_LAST_TILE)
+        {
+            int num_discount = ITEMS_PER_TILE - num_tile_items;
+            num_tile_selections -= num_discount;
+        }
+        num_selections = num_tile_selections;
+
+        CTA_SYNC();
+
+        Scatter(KeyTagT(),
+                d_keys_out,
+                keys,
+                selection_flags,
+                selection_idx,
+                num_tile_items,
+                num_tile_selections,
+                num_selections_prefix,
+                num_selections);
+
+        CTA_SYNC();
+
+        Scatter(ValueTagT(),
+                d_values_out,
+                values,
+                selection_flags,
+                selection_idx,
+                num_tile_items,
+                num_tile_selections,
+                num_selections_prefix,
+                num_selections);
+
+        return num_selections;
+    }
+
+    /**
+     * Process subsequent tile of input (dynamic chained scan).  Returns the running count of selections (including this tile)
+     */
+    template <bool IS_LAST_TILE>
+    __device__ __forceinline__ OffsetT ConsumeSubsequentTile(
+        int                 num_tile_items,     ///< Number of input items comprising this tile
+        int                 tile_idx,           ///< Tile index
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
+    {
+        KeyT        keys[ITEMS_PER_THREAD];
+        OffsetT     selection_flags[ITEMS_PER_THREAD];
+        OffsetT     selection_idx[ITEMS_PER_THREAD];
+
+        if (IS_LAST_TILE)
+        {
+            // Fill last elements with the first element
+            // because collectives are not suffix guarded
+            BlockLoadKeys(temp_storage.load_keys)
+                .Load(d_keys_in + tile_offset,
+                      keys,
+                      num_tile_items,
+                      *(d_keys_in + tile_offset));
+        }
+        else
+        {
+            BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys);
+        }
+
+
+        CTA_SYNC();
+
+        ValueT values[ITEMS_PER_THREAD];
+        if (IS_LAST_TILE)
+        {
+            // Fill last elements with the first element
+            // because collectives are not suffix guarded
+            BlockLoadValues(temp_storage.load_values)
+                .Load(d_values_in + tile_offset,
+                      values,
+                      num_tile_items,
+                      *(d_values_in + tile_offset));
+        }
+        else
+        {
+            BlockLoadValues(temp_storage.load_values)
+                .Load(d_values_in + tile_offset, values);
+        }
+
+        CTA_SYNC();
+
+        KeyT tile_predecessor = d_keys_in[tile_offset - 1];
+        BlockDiscontinuityKeys(temp_storage.scan_storage.discontinuity)
+            .FlagHeads(selection_flags, keys, inequality_op, tile_predecessor);
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            // Set selection_flags for out-of-bounds items
+            if ((IS_LAST_TILE) && (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items))
+                selection_flags[ITEM] = 1;
+        }
+
+        CTA_SYNC();
+
+
+        OffsetT num_tile_selections   = 0;
+        OffsetT num_selections        = 0;
+        OffsetT num_selections_prefix = 0;
+
+        TilePrefixCallback prefix_cb(tile_state,
+                                     temp_storage.scan_storage.prefix,
+                                     cub::Sum(),
+                                     tile_idx);
+        BlockScanT(temp_storage.scan_storage.scan)
+            .ExclusiveSum(selection_flags,
+                          selection_idx,
+                          prefix_cb);
+
+        num_selections        = prefix_cb.GetInclusivePrefix();
+        num_tile_selections   = prefix_cb.GetBlockAggregate();
+        num_selections_prefix = prefix_cb.GetExclusivePrefix();
+
+        if (IS_LAST_TILE)
+        {
+            int num_discount = ITEMS_PER_TILE - num_tile_items;
+            num_tile_selections -= num_discount;
+            num_selections -= num_discount;
+        }
+
+        CTA_SYNC();
+
+        Scatter(KeyTagT(),
+                d_keys_out,
+                keys,
+                selection_flags,
+                selection_idx,
+                num_tile_items,
+                num_tile_selections,
+                num_selections_prefix,
+                num_selections);
+
+        CTA_SYNC();
+
+        Scatter(ValueTagT(),
+                d_values_out,
+                values,
+                selection_flags,
+                selection_idx,
+                num_tile_items,
+                num_tile_selections,
+                num_selections_prefix,
+                num_selections);
+
+        return num_selections;
+    }
+
+
+    /**
+     * Process a tile of input
+     */
+    template <bool IS_LAST_TILE>
+    __device__ __forceinline__ OffsetT ConsumeTile(
+        int                 num_tile_items,     ///< Number of input items comprising this tile
+        int                 tile_idx,           ///< Tile index
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
+    {
+        OffsetT num_selections;
+        if (tile_idx == 0)
+        {
+            num_selections = ConsumeFirstTile<IS_LAST_TILE>(num_tile_items, tile_offset, tile_state);
+        }
+        else
+        {
+            num_selections = ConsumeSubsequentTile<IS_LAST_TILE>(num_tile_items, tile_idx, tile_offset, tile_state);
+        }
+
+        return num_selections;
+    }
+
+    /**
+     * Scan tiles of items as part of a dynamic chained scan
+     */
+    template <typename NumSelectedIteratorT>        ///< Output iterator type for recording number of items selection_flags
+    __device__ __forceinline__ void ConsumeRange(
+        int                     num_tiles,          ///< Total number of input tiles
+        ScanTileStateT&         tile_state,         ///< Global tile state descriptor
+        NumSelectedIteratorT    d_num_selected_out) ///< Output total number selection_flags
+    {
+        // Blocks are launched in increasing order, so just assign one tile per block
+        int     tile_idx        = (blockIdx.x * gridDim.y) + blockIdx.y;    // Current tile index
+        OffsetT tile_offset     = tile_idx * ITEMS_PER_TILE;                // Global offset for the current tile
+
+        if (tile_idx < num_tiles - 1)
+        {
+            ConsumeTile<false>(ITEMS_PER_TILE,
+                               tile_idx,
+                               tile_offset,
+                               tile_state);
+        }
+        else
+        {
+            int  num_remaining  = static_cast<int>(num_items - tile_offset);
+            OffsetT num_selections = ConsumeTile<true>(num_remaining,
+                                                       tile_idx,                                    
+                                                       tile_offset,
+                                                       tile_state);
+            if (threadIdx.x == 0)                                                               
+            {
+                *d_num_selected_out = num_selections;
+            }
+        }
+    }
+};
+
+
+
+CUB_NAMESPACE_END
diff --git a/cub/device/device_select.cuh b/cub/device/device_select.cuh
index ed896a68d2..f9ed6d0a25 100644
--- a/cub/device/device_select.cuh
+++ b/cub/device/device_select.cuh
@@ -1,7 +1,7 @@
 
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -38,6 +38,7 @@
 #include <iterator>
 
 #include "dispatch/dispatch_select_if.cuh"
+#include "dispatch/dispatch_unique_by_key.cuh"
 #include "../config.cuh"
 
 CUB_NAMESPACE_BEGIN
@@ -323,7 +324,7 @@ struct DeviceSelect
         typename                    NumSelectedIteratorT>
     CUB_RUNTIME_FUNCTION __forceinline__
     static cudaError_t Unique(
-        void*                        d_temp_storage,                ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        void*                       d_temp_storage,                 ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
         size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
         InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
         OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of selected data items
@@ -332,7 +333,7 @@ struct DeviceSelect
         cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
         bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
     {
-        typedef int                     OffsetT;         // Signed integer type for global offsets
+        typedef int                     OffsetT;        // Signed integer type for global offsets
         typedef NullType*               FlagIterator;   // FlagT iterator type (not used)
         typedef NullType                SelectOp;       // Selection op (not used)
         typedef Equality                EqualityOp;     // Default == operator
@@ -351,6 +352,88 @@ struct DeviceSelect
             debug_synchronous);
     }
 
+
+    /**
+     * \brief Given an input sequence \p d_keys_in and \p d_values_in with runs of key-value pairs with consecutive equal-valued keys, only the first key and its value from each run is selectively copied to \p d_keys_out and \p d_values_out.  The total number of items selected is written to \p d_num_selected_out. ![](unique_logo.png)
+     *
+     * \par
+     * - The <tt>==</tt> equality operator is used to determine whether keys are equivalent
+     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>       // or equivalently <cub/device/device_select.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int  num_items;              // e.g., 8
+     * int  *d_keys_in;             // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
+     * int  *d_values_in;           // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
+     * int  *d_keys_out;            // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
+     * int  *d_values_out;          // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
+     * int  *d_num_selected_out;    // e.g., [ ]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSelect::UniqueByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run selection
+     * cub::DeviceSelect::UniqueByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items);
+     *
+     * // d_keys_out            <-- [0, 2, 9, 5, 8]
+     * // d_values_out          <-- [1, 2, 4, 5, 8]
+     * // d_num_selected_out    <-- [5]
+     *
+     * \endcode
+     *
+     * \tparam KeyInputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input keys \iterator
+     * \tparam ValueInputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input values \iterator
+     * \tparam KeyOutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing selected keys \iterator
+     * \tparam ValueOutputIteratorT    <b>[inferred]</b> Random-access output iterator type for writing selected values \iterator
+     * \tparam NumSelectedIteratorT    <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
+     */
+    template <
+        typename                    KeyInputIteratorT,
+        typename                    ValueInputIteratorT,
+        typename                    KeyOutputIteratorT,
+        typename                    ValueOutputIteratorT,
+        typename                    NumSelectedIteratorT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t UniqueByKey(
+        void*                       d_temp_storage,                 ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        KeyInputIteratorT           d_keys_in,                      ///< [in] Pointer to the input sequence of keys
+        ValueInputIteratorT         d_values_in,                    ///< [in] Pointer to the input sequence of values
+        KeyOutputIteratorT          d_keys_out,                     ///< [out] Pointer to the output sequence of selected keys
+        ValueOutputIteratorT        d_values_out,                   ///< [out] Pointer to the output sequence of selected values
+        NumSelectedIteratorT        d_num_selected_out,             ///< [out] Pointer to the total number of items selected (i.e., length of \p d_keys_out or \p d_values_out)
+        int                         num_items,                      ///< [in] Total number of input items (i.e., length of \p d_keys_in or \p d_values_in)
+        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        using OffsetT = int;
+        using EqualityOp = Equality;
+
+        return DispatchUniqueByKey<KeyInputIteratorT, ValueInputIteratorT, KeyOutputIteratorT, ValueOutputIteratorT, NumSelectedIteratorT, EqualityOp, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys_in,
+            d_values_in,
+            d_keys_out,
+            d_values_out,
+            d_num_selected_out,
+            EqualityOp(),
+            num_items,
+            stream,
+            debug_synchronous);
+    }
 };
 
 /**
diff --git a/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/device/dispatch/dispatch_merge_sort.cuh
index 8f4f0be638..0b480899c5 100644
--- a/cub/device/dispatch/dispatch_merge_sort.cuh
+++ b/cub/device/dispatch/dispatch_merge_sort.cuh
@@ -488,21 +488,6 @@ struct DispatchMergeSort : SelectedPolicy
   bool debug_synchronous;
   int ptx_version;
 
-  CUB_RUNTIME_FUNCTION __forceinline__ std::size_t
-  vshmem_size(std::size_t max_shmem,
-              std::size_t shmem_per_block,
-              std::size_t num_blocks)
-  {
-    if (shmem_per_block > max_shmem)
-    {
-      return shmem_per_block * num_blocks;
-    }
-    else
-    {
-      return 0;
-    }
-  }
-
   // Constructor
   CUB_RUNTIME_FUNCTION __forceinline__
   DispatchMergeSort(void *d_temp_storage,
@@ -637,9 +622,9 @@ struct DispatchMergeSort : SelectedPolicy
                                 static_cast<std::size_t>(max_shmem);
 
         virtual_shared_memory_size =
-          vshmem_size(static_cast<std::size_t>(max_shmem),
-                      (cub::max)(block_sort_shmem_size, merge_shmem_size),
-                      static_cast<std::size_t>(num_tiles));
+          detail::VshmemSize(static_cast<std::size_t>(max_shmem),
+                     (cub::max)(block_sort_shmem_size, merge_shmem_size),
+                     static_cast<std::size_t>(num_tiles));
       }
 
 
diff --git a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh
index 2aac82a262..c2d04588be 100644
--- a/cub/device/dispatch/dispatch_scan.cuh
+++ b/cub/device/dispatch/dispatch_scan.cuh
@@ -348,7 +348,7 @@ struct DispatchScan:
 
             // Get max x-dimension of grid
             int max_dim_x;
-            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
+            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;
 
             // Run grids in epochs (in case number of tiles exceeds max x-dimension
             int scan_grid_size = CUB_MIN(num_tiles, max_dim_x);
diff --git a/cub/device/dispatch/dispatch_select_if.cuh b/cub/device/dispatch/dispatch_select_if.cuh
index a052252317..94a17419a8 100644
--- a/cub/device/dispatch/dispatch_select_if.cuh
+++ b/cub/device/dispatch/dispatch_select_if.cuh
@@ -333,7 +333,7 @@ struct DispatchSelectIf
 
             // Get max x-dimension of grid
             int max_dim_x;
-            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
+            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;
 
             // Get grid size for scanning tiles
             dim3 scan_grid_size;
@@ -446,5 +446,3 @@ struct DispatchSelectIf
 
 
 CUB_NAMESPACE_END
-
-
diff --git a/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/device/dispatch/dispatch_unique_by_key.cuh
new file mode 100644
index 0000000000..8597e3c821
--- /dev/null
+++ b/cub/device/dispatch/dispatch_unique_by_key.cuh
@@ -0,0 +1,430 @@
+
+/******************************************************************************
+ * Copyright (c) NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceSelect::UniqueByKey provides device-wide, parallel operations for selecting unique items by key from sequences of data items residing within device-accessible memory.
+ */
+
+#include "../../agent/agent_unique_by_key.cuh"
+#include "../../util_math.cuh"
+#include "../../util_macro.cuh"
+
+#include "dispatch_scan.cuh"
+
+CUB_NAMESPACE_BEGIN
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Unique by key kernel entry point (multi-block)
+ */
+template <
+    typename AgentUniqueByKeyPolicyT,               ///< Parameterized AgentUniqueByKeyPolicy tuning policy type
+    typename KeyInputIteratorT,                     ///< Random-access input iterator type for keys
+    typename ValueInputIteratorT,                   ///< Random-access input iterator type for values
+    typename KeyOutputIteratorT,                    ///< Random-access output iterator type for keys
+    typename ValueOutputIteratorT,                  ///< Random-access output iterator type for values
+    typename NumSelectedIteratorT,                  ///< Output iterator type for recording the number of items selected
+    typename ScanTileStateT,                        ///< Tile status interface type
+    typename EqualityOpT,                           ///< Equality operator type
+    typename OffsetT>                               ///< Signed integer type for global offsets
+__launch_bounds__ (int(AgentUniqueByKeyPolicyT::UniqueByKeyPolicyT::BLOCK_THREADS))
+__global__ void DeviceUniqueByKeySweepKernel(
+    KeyInputIteratorT       d_keys_in,              ///< [in] Pointer to the input sequence of keys
+    ValueInputIteratorT     d_values_in,            ///< [in] Pointer to the input sequence of values
+    KeyOutputIteratorT      d_keys_out,             ///< [out] Pointer to the output sequence of selected data items
+    ValueOutputIteratorT    d_values_out,           ///< [out] Pointer to the output sequence of selected data items
+    NumSelectedIteratorT    d_num_selected_out,     ///< [out] Pointer to the total number of items selected (i.e., length of \p d_keys_out or \p d_values_out)
+    ScanTileStateT          tile_state,             ///< [in] Tile status interface
+    EqualityOpT             equality_op,            ///< [in] Equality operator
+    OffsetT                 num_items,              ///< [in] Total number of input items (i.e., length of \p d_keys_in or \p d_values_in)
+    int                     num_tiles)              ///< [in] Total number of tiles for the entire problem
+{
+    // Thread block type for selecting data from input tiles
+    using AgentUniqueByKeyT = AgentUniqueByKey<
+        typename AgentUniqueByKeyPolicyT::UniqueByKeyPolicyT,
+        KeyInputIteratorT,
+        ValueInputIteratorT,
+        KeyOutputIteratorT,
+        ValueOutputIteratorT,
+        EqualityOpT,
+        OffsetT>;
+
+    // Shared memory for AgentSelectIf
+    __shared__ typename AgentUniqueByKeyT::TempStorage temp_storage;
+
+    // Process tiles
+    AgentUniqueByKeyT(temp_storage, d_keys_in, d_values_in, d_keys_out, d_values_out, equality_op, num_items).ConsumeRange(
+        num_tiles,
+        tile_state,
+        d_num_selected_out);
+}
+
+
+/******************************************************************************
+ * Policy
+ ******************************************************************************/
+
+template <typename KeyInputIteratorT>
+struct DeviceUniqueByKeyPolicy
+{
+    using KeyT = typename std::iterator_traits<KeyInputIteratorT>::value_type;
+
+    // SM350
+    struct Policy350 : ChainedPolicy<350, Policy350, Policy350> {
+        const static int INPUT_SIZE = sizeof(KeyT);
+        enum
+        {
+            NOMINAL_4B_ITEMS_PER_THREAD = 9,
+            ITEMS_PER_THREAD = Nominal4BItemsToItems<KeyT>(NOMINAL_4B_ITEMS_PER_THREAD),
+        };
+
+        using UniqueByKeyPolicyT = AgentUniqueByKeyPolicy<128,
+                          ITEMS_PER_THREAD,
+                          cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                          cub::LOAD_LDG,
+                          cub::BLOCK_SCAN_WARP_SCANS>;
+    };
+
+    // SM520
+    struct Policy520 : ChainedPolicy<520, Policy520, Policy350>
+    {
+        const static int INPUT_SIZE = sizeof(KeyT);
+        enum
+        {
+            NOMINAL_4B_ITEMS_PER_THREAD = 11,
+            ITEMS_PER_THREAD = Nominal4BItemsToItems<KeyT>(NOMINAL_4B_ITEMS_PER_THREAD),
+        };
+
+        using UniqueByKeyPolicyT =  AgentUniqueByKeyPolicy<64,
+                            ITEMS_PER_THREAD,
+                            cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                            cub::LOAD_LDG,
+                            cub::BLOCK_SCAN_WARP_SCANS>;
+    };
+
+    /// MaxPolicy
+    using MaxPolicy = Policy520;
+};
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceSelect
+ */
+template <
+    typename KeyInputIteratorT,                 ///< Random-access input iterator type for keys
+    typename ValueInputIteratorT,               ///< Random-access input iterator type for values
+    typename KeyOutputIteratorT,                ///< Random-access output iterator type for keys
+    typename ValueOutputIteratorT,              ///< Random-access output iterator type for values
+    typename NumSelectedIteratorT,              ///< Output iterator type for recording the number of items selected
+    typename EqualityOpT,                       ///< Equality operator type
+    typename OffsetT,                           ///< Signed integer type for global offsets
+    typename SelectedPolicy = DeviceUniqueByKeyPolicy<KeyInputIteratorT>>
+struct DispatchUniqueByKey: SelectedPolicy
+{
+    /******************************************************************************
+     * Types and constants
+     ******************************************************************************/
+
+    enum
+    {
+        INIT_KERNEL_THREADS = 128,
+    };
+
+    // The input key and value type
+    using KeyT = typename std::iterator_traits<KeyInputIteratorT>::value_type;
+    using ValueT = typename std::iterator_traits<ValueInputIteratorT>::value_type;
+
+    // Tile status descriptor interface type
+    using ScanTileStateT = ScanTileState<OffsetT>;
+
+
+    void*                   d_temp_storage;             ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+    size_t&                 temp_storage_bytes;         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+    KeyInputIteratorT       d_keys_in;                  ///< [in] Pointer to the input sequence of keys
+    ValueInputIteratorT     d_values_in;                ///< [in] Pointer to the input sequence of values
+    KeyOutputIteratorT      d_keys_out;                 ///< [out] Pointer to the output sequence of selected data items
+    ValueOutputIteratorT    d_values_out;               ///< [out] Pointer to the output sequence of selected data items
+    NumSelectedIteratorT    d_num_selected_out;         ///< [out] Pointer to the total number of items selected (i.e., length of \p d_keys_out or \p d_values_out)
+    EqualityOpT             equality_op;                ///< [in] Equality operator
+    OffsetT                 num_items;                  ///< [in] Total number of input items (i.e., length of \p d_keys_in or \p d_values_in)
+    cudaStream_t            stream;                     ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+    bool                    debug_synchronous;
+
+    CUB_RUNTIME_FUNCTION __forceinline__
+    DispatchUniqueByKey(
+        void*                   d_temp_storage,         ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                 temp_storage_bytes,     ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        KeyInputIteratorT       d_keys_in,              ///< [in] Pointer to the input sequence of keys
+        ValueInputIteratorT     d_values_in,            ///< [in] Pointer to the input sequence of values
+        KeyOutputIteratorT      d_keys_out,             ///< [out] Pointer to the output sequence of selected data items
+        ValueOutputIteratorT    d_values_out,           ///< [out] Pointer to the output sequence of selected data items
+        NumSelectedIteratorT    d_num_selected_out,     ///< [out] Pointer to the total number of items selected (i.e., length of \p d_keys_out or \p d_values_out)
+        EqualityOpT             equality_op,            ///< [in] Equality operator
+        OffsetT                 num_items,              ///< [in] Total number of input items (i.e., length of \p d_keys_in or \p d_values_in)
+        cudaStream_t            stream,                 ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous
+    ):
+        d_temp_storage(d_temp_storage),
+        temp_storage_bytes(temp_storage_bytes),
+        d_keys_in(d_keys_in),
+        d_values_in(d_values_in),
+        d_keys_out(d_keys_out),
+        d_values_out(d_values_out),
+        d_num_selected_out(d_num_selected_out),
+        equality_op(equality_op),
+        num_items(num_items),
+        stream(stream),
+        debug_synchronous(debug_synchronous)
+    {}
+
+
+    /******************************************************************************
+     * Dispatch entrypoints
+     ******************************************************************************/
+
+    template <typename ActivePolicyT, typename InitKernel, typename ScanKernel>
+    CUB_RUNTIME_FUNCTION __host__  __forceinline__
+    cudaError_t Invoke(InitKernel init_kernel, ScanKernel scan_kernel)
+    {
+#ifndef CUB_RUNTIME_ENABLED
+
+        (void)init_kernel;
+        (void)scan_kernel;
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported);
+
+#else
+
+        using Policy = typename ActivePolicyT::UniqueByKeyPolicyT;
+        using UniqueByKeyAgentT = AgentUniqueByKey<Policy,
+                                                   KeyInputIteratorT,
+                                                   ValueInputIteratorT,
+                                                   KeyOutputIteratorT,
+                                                   ValueOutputIteratorT,
+                                                   EqualityOpT,
+                                                   OffsetT>;
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Number of input tiles
+            int tile_size = Policy::BLOCK_THREADS * Policy::ITEMS_PER_THREAD;
+            int num_tiles = static_cast<int>(cub::DivideAndRoundUp(num_items, tile_size));
+
+            // Size of virtual shared memory
+            int max_shmem = 0;
+            if (CubDebug(
+                error = cudaDeviceGetAttribute(&max_shmem,
+                                               cudaDevAttrMaxSharedMemoryPerBlock,
+                                               device_ordinal)))
+            {
+                break;
+            }
+            std::size_t vshmem_size = detail::VshmemSize(max_shmem, sizeof(typename UniqueByKeyAgentT::TempStorage), num_tiles);
+
+            // Specify temporary storage allocation requirements
+            size_t allocation_sizes[2] = {0, vshmem_size};
+            if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors
+
+            // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
+            void *allocations[2] = {NULL, NULL};
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                break;
+            }
+
+            // Construct the tile status interface
+            ScanTileStateT tile_state;
+            if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
+
+            // Log init_kernel configuration
+            num_tiles = CUB_MAX(1, num_tiles);
+            int init_grid_size = cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS);
+            if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
+
+            // Invoke init_kernel to initialize tile descriptors
+            THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
+                init_grid_size, INIT_KERNEL_THREADS, 0, stream
+            ).doit(init_kernel, tile_state, num_tiles, d_num_selected_out);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Return if empty problem
+            if (num_items == 0) break;
+
+            // Get max x-dimension of grid
+            int max_dim_x;
+            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;
+
+            // Get grid size for scanning tiles
+            dim3 scan_grid_size;
+            scan_grid_size.z = 1;
+            scan_grid_size.y = cub::DivideAndRoundUp(num_tiles, max_dim_x);
+            scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
+
+            // Log select_if_kernel configuration
+            if (debug_synchronous)
+            {
+              // Get SM occupancy for unique_by_key_kernel
+              int scan_sm_occupancy;
+              if (CubDebug(error = MaxSmOccupancy(scan_sm_occupancy, // out
+                                                  scan_kernel,
+                                                  Policy::BLOCK_THREADS)))
+              {
+                break;
+              }
+
+              _CubLog("Invoking unique_by_key_kernel<<<{%d,%d,%d}, %d, 0, "
+                      "%lld>>>(), %d items per thread, %d SM occupancy\n",
+                      scan_grid_size.x,
+                      scan_grid_size.y,
+                      scan_grid_size.z,
+                      Policy::BLOCK_THREADS,
+                      (long long)stream,
+                      Policy::ITEMS_PER_THREAD,
+                      scan_sm_occupancy);
+            }
+
+            // Invoke select_if_kernel
+            THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
+                scan_grid_size, Policy::BLOCK_THREADS, 0, stream
+            ).doit(scan_kernel,
+                   d_keys_in,
+                   d_values_in,
+                   d_keys_out,
+                   d_values_out,
+                   d_num_selected_out,
+                   tile_state,
+                   equality_op,
+                   num_items,
+                   num_tiles);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+        }
+        while(0);
+
+        return error;
+
+#endif  // CUB_RUNTIME_ENABLED
+    }
+
+    template <typename ActivePolicyT>
+    CUB_RUNTIME_FUNCTION __host__  __forceinline__
+    cudaError_t Invoke()
+    {
+        // Ensure kernels are instantiated.
+        return Invoke<ActivePolicyT>(
+            DeviceCompactInitKernel<ScanTileStateT, NumSelectedIteratorT>,
+            DeviceUniqueByKeySweepKernel<
+                ActivePolicyT,
+                KeyInputIteratorT,
+                ValueInputIteratorT,
+                KeyOutputIteratorT,
+                ValueOutputIteratorT,
+                NumSelectedIteratorT,
+                ScanTileStateT,
+                EqualityOpT,
+                OffsetT>
+        );
+    }
+
+
+    /**
+    * Internal dispatch routine
+    */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                   d_temp_storage,         ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                  &temp_storage_bytes,    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        KeyInputIteratorT       d_keys_in,              ///< [in] Pointer to the input sequence of keys
+        ValueInputIteratorT     d_values_in,            ///< [in] Pointer to the input sequence of values
+        KeyOutputIteratorT      d_keys_out,             ///< [out] Pointer to the output sequence of selected data items
+        ValueOutputIteratorT    d_values_out,           ///< [out] Pointer to the output sequence of selected data items
+        NumSelectedIteratorT    d_num_selected_out,     ///< [out] Pointer to the total number of items selected (i.e., length of \p d_keys_out or \p d_values_out)
+        EqualityOpT             equality_op,            ///< [in] Equality operator
+        OffsetT                 num_items,              ///< [in] Total number of input items (i.e., the length of \p d_in)
+        cudaStream_t            stream,                 ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous)      ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        using MaxPolicyT = typename DispatchUniqueByKey::MaxPolicy;
+
+        cudaError_t error;
+        do
+        {
+            // Get PTX version
+            int ptx_version = 0;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+
+            // Create dispatch functor
+            DispatchUniqueByKey dispatch(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_keys_in,
+                d_values_in,
+                d_keys_out,
+                d_values_out,
+                d_num_selected_out,
+                equality_op,
+                num_items,
+                stream,
+                debug_synchronous
+            );
+
+            // Dispatch to chained policy
+            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
+        }
+        while (0);
+
+        return error;
+    }
+};
+
+CUB_NAMESPACE_END
diff --git a/cub/util_math.cuh b/cub/util_math.cuh
index 3c5406fa46..d69fc2ee2d 100644
--- a/cub/util_math.cuh
+++ b/cub/util_math.cuh
@@ -47,6 +47,14 @@ using is_integral_or_enum =
   std::integral_constant<bool,
                          std::is_integral<T>::value || std::is_enum<T>::value>;
 
+__host__ __device__ __forceinline__ constexpr  std::size_t
+VshmemSize(std::size_t max_shmem,
+           std::size_t shmem_per_block,
+           std::size_t num_blocks)
+{
+  return shmem_per_block > max_shmem ? shmem_per_block * num_blocks : 0;
+}
+
 }
 
 /**
diff --git a/test/test_device_select_unique.cu b/test/test_device_select_unique.cu
index d33e4895ac..497b43fc36 100644
--- a/test/test_device_select_unique.cu
+++ b/test/test_device_select_unique.cu
@@ -78,11 +78,11 @@ cudaError_t Dispatch(
     size_t                      */*d_temp_storage_bytes*/,
     cudaError_t                 */*d_cdp_error*/,
 
-    void*               d_temp_storage,
+    void*                       d_temp_storage,
     size_t                      &temp_storage_bytes,
     InputIteratorT              d_in,
-    OutputIteratorT              d_out,
-    NumSelectedIteratorT         d_num_selected_out,
+    OutputIteratorT             d_out,
+    NumSelectedIteratorT        d_num_selected_out,
     OffsetT                     num_items,
     cudaStream_t                stream,
     bool                        debug_synchronous)
@@ -147,11 +147,11 @@ cudaError_t Dispatch(
     size_t                      *d_temp_storage_bytes,
     cudaError_t                 *d_cdp_error,
 
-    void*               d_temp_storage,
+    void*                       d_temp_storage,
     size_t                      &temp_storage_bytes,
     InputIteratorT              d_in,
-    OutputIteratorT              d_out,
-    NumSelectedIteratorT         d_num_selected_out,
+    OutputIteratorT             d_out,
+    NumSelectedIteratorT        d_num_selected_out,
     OffsetT                     num_items,
     cudaStream_t                stream,
     bool                        debug_synchronous)
@@ -275,8 +275,8 @@ void Test(
     int                 num_items)
 {
     // Allocate device output array and num selected
-    T       *d_out            = NULL;
-    int     *d_num_selected_out   = NULL;
+    T       *d_out = NULL;
+    int     *d_num_selected_out = NULL;
     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * num_items));
     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
 
diff --git a/test/test_device_select_unique_by_key.cu b/test/test_device_select_unique_by_key.cu
new file mode 100644
index 0000000000..d8db2f7935
--- /dev/null
+++ b/test/test_device_select_unique_by_key.cu
@@ -0,0 +1,602 @@
+/******************************************************************************
+ * Copyright (c) NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of DeviceSelect::Unique utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <typeinfo>
+
+#include <cub/util_allocator.cuh>
+#include <cub/iterator/counting_input_iterator.cuh>
+#include <cub/device/device_select.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose               = false;
+int                     g_timing_iterations     = 0;
+int                     g_repeat                = 0;
+float                   g_device_giga_bandwidth;
+CachingDeviceAllocator  g_allocator(true);
+
+// Dispatch types
+enum Backend
+{
+    CUB,        // CUB method
+    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
+};
+
+
+//---------------------------------------------------------------------
+// Dispatch to different CUB DeviceSelect entrypoints
+//---------------------------------------------------------------------
+
+
+/**
+ * Dispatch to unique entrypoint
+ */
+template <typename KeyInputIteratorT, typename ValueInputIteratorT, typename KeyOutputIteratorT, typename ValueOutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>               /*dispatch_to*/,
+    int                         timing_timing_iterations,
+    size_t                      */*d_temp_storage_bytes*/,
+    cudaError_t                 */*d_cdp_error*/,
+
+    void*                       d_temp_storage,
+    size_t                      &temp_storage_bytes,
+    KeyInputIteratorT           d_keys_in,
+    ValueInputIteratorT         d_values_in,
+    KeyOutputIteratorT          d_keys_out,
+    ValueOutputIteratorT        d_values_out,
+    NumSelectedIteratorT        d_num_selected_out,
+    OffsetT                     num_items,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    cudaError_t error = cudaSuccess;
+    for (int i = 0; i < timing_timing_iterations; ++i)
+    {
+        error = DeviceSelect::UniqueByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items, stream, debug_synchronous);
+    }
+    return error;
+}
+
+//---------------------------------------------------------------------
+// CUDA Nested Parallelism Test Kernel
+//---------------------------------------------------------------------
+
+/**
+ * Simple wrapper kernel to invoke DeviceSelect
+ */
+template <typename KeyInputIteratorT, typename ValueInputIteratorT, typename KeyOutputIteratorT, typename ValueOutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+__global__ void CnpDispatchKernel(
+    int                         timing_timing_iterations,
+    size_t                      *d_temp_storage_bytes,
+    cudaError_t                 *d_cdp_error,
+
+    void*                       d_temp_storage,
+    size_t                      temp_storage_bytes,
+    KeyInputIteratorT           d_keys_in,
+    ValueInputIteratorT         d_values_in,
+    KeyOutputIteratorT          d_keys_out,
+    ValueOutputIteratorT        d_values_out,
+    NumSelectedIteratorT        d_num_selected_out,
+    OffsetT                     num_items,
+    bool                        debug_synchronous)
+{
+
+#ifndef CUB_CDP
+    (void)timing_timing_iterations;
+    (void)d_temp_storage_bytes;
+    (void)d_cdp_error;
+    (void)d_temp_storage;
+    (void)temp_storage_bytes;
+    (void)d_keys_in;
+    (void)d_values_in;
+    (void)d_keys_out;
+    (void)d_values_out;
+    (void)d_num_selected_out;
+    (void)num_items;
+    (void)debug_synchronous;
+    *d_cdp_error = cudaErrorNotSupported;
+#else
+    *d_cdp_error = Dispatch(Int2Type<CUB>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items, 0, debug_synchronous);
+    *d_temp_storage_bytes = temp_storage_bytes;
+#endif
+}
+
+
+/**
+ * Dispatch to CDP kernel
+ */
+template <typename KeyInputIteratorT, typename ValueInputIteratorT, typename KeyOutputIteratorT, typename ValueOutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+cudaError_t Dispatch(
+    Int2Type<CDP>               dispatch_to,
+    int                         timing_timing_iterations,
+    size_t                      *d_temp_storage_bytes,
+    cudaError_t                 *d_cdp_error,
+
+    void*                       d_temp_storage,
+    size_t                      &temp_storage_bytes,
+    KeyInputIteratorT           d_keys_in,
+    ValueInputIteratorT         d_values_in,
+    KeyOutputIteratorT          d_keys_out,
+    ValueOutputIteratorT        d_values_out,
+    NumSelectedIteratorT        d_num_selected_out,
+    OffsetT                     num_items,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    // Invoke kernel to invoke device-side dispatch
+    CnpDispatchKernel<<<1,1>>>(timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items, debug_synchronous);
+
+    // Copy out temp_storage_bytes
+    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
+
+    // Copy out error
+    cudaError_t retval;
+    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
+    return retval;
+}
+
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Initialize problem
+ */
+template <typename T>
+void Initialize(
+    int         entropy_reduction,
+    T           *h_in,
+    int         num_items,
+    int         max_segment)
+{
+    unsigned int max_int = (unsigned int) -1;
+
+    int key = 0;
+    int i = 0;
+    while (i < num_items)
+    {
+        // Select number of repeating occurrences for the current run
+        int repeat;
+        if (max_segment < 0)
+        {
+            repeat = num_items;
+        }
+        else if (max_segment < 2)
+        {
+            repeat = 1;
+        }
+        else
+        {
+            RandomBits(repeat, entropy_reduction);
+            repeat = (int) ((double(repeat) * double(max_segment)) / double(max_int));
+            repeat = CUB_MAX(1, repeat);
+        }
+
+        int j = i;
+        while (j < CUB_MIN(i + repeat, num_items))
+        {
+            InitValue(INTEGER_SEED, h_in[j], key);
+            j++;
+        }
+
+        i = j;
+        key++;
+    }
+
+    if (g_verbose)
+    {
+        printf("Input:\n");
+        DisplayResults(h_in, num_items);
+        printf("\n\n");
+    }
+}
+
+
+/**
+ * Solve unique problem
+ */
+template <
+    typename        KeyInputIteratorT,
+    typename        ValueInputIteratorT,
+    typename        KeyT,
+    typename        ValueT>
+int Solve(
+    KeyInputIteratorT    h_keys_in,
+    ValueInputIteratorT  h_values_in,
+    KeyT                 *h_keys_reference,
+    ValueT               *h_values_reference,
+    int                  num_items)
+{
+    int num_selected = 0;
+    if (num_items > 0)
+    {
+        h_keys_reference[num_selected] = h_keys_in[0];
+        h_values_reference[num_selected] = h_values_in[0];
+        num_selected++;
+    }
+
+    for (int i = 1; i < num_items; ++i)
+    {
+        if (h_keys_in[i] != h_keys_in[i - 1])
+        {
+            h_keys_reference[num_selected] = h_keys_in[i];
+            h_values_reference[num_selected] = h_values_in[i];
+            num_selected++;
+        }
+    }
+
+    return num_selected;
+}
+
+
+
+/**
+ * Test DeviceSelect for a given problem input
+ */
+template <
+    Backend             BACKEND,
+    typename            KeyInputIteratorT,
+    typename            ValueInputIteratorT,
+    typename            KeyT,
+    typename            ValueT>
+void Test(
+    KeyInputIteratorT    d_keys_in,
+    ValueInputIteratorT  d_values_in,
+    KeyT                 *h_keys_reference,
+    ValueT               *h_values_reference,
+    int                  num_selected,
+    int                  num_items)
+{
+    // Allocate device output array and num selected
+    KeyT    *d_keys_out = NULL;
+    ValueT  *d_values_out = NULL;
+    int     *d_num_selected_out = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_out, sizeof(KeyT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values_out, sizeof(ValueT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
+
+    // Allocate CDP device arrays
+    size_t          *d_temp_storage_bytes = NULL;
+    cudaError_t     *d_cdp_error = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
+
+    // Allocate temporary storage
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items, 0, true));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Clear device output array
+    CubDebugExit(cudaMemset(d_keys_out, 0, sizeof(KeyT) * num_items));
+    CubDebugExit(cudaMemset(d_values_out, 0, sizeof(ValueT) * num_items));
+    CubDebugExit(cudaMemset(d_num_selected_out, 0, sizeof(int)));
+
+    // Run warmup/correctness iteration
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items, 0, true));
+
+    // Check for correctness (and display results, if specified)
+    int compare11 = CompareDeviceResults(h_keys_reference, d_keys_out, num_selected, true, g_verbose);
+    int compare12 = CompareDeviceResults(h_values_reference, d_values_out, num_selected, true, g_verbose);
+    int compare1 = compare11 && compare12;
+    printf("\t Data %s ", compare1 ? "FAIL" : "PASS");
+
+    int compare2 = CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
+    printf("\t Count %s ", compare2 ? "FAIL" : "PASS");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Performance
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items, 0, false));
+    gpu_timer.Stop();
+    float elapsed_millis = gpu_timer.ElapsedMillis();
+
+    // Display performance
+    if (g_timing_iterations > 0)
+    {
+        float avg_millis        = elapsed_millis / g_timing_iterations;
+        float giga_rate         = float(num_items) / avg_millis / 1000.0f / 1000.0f;
+        float giga_bandwidth    = float((num_items + num_selected) * (sizeof(KeyT) + sizeof(ValueT))) / avg_millis / 1000.0f / 1000.0f;
+        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);
+    }
+    printf("\n\n");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Cleanup
+    if (d_keys_out) CubDebugExit(g_allocator.DeviceFree(d_keys_out));
+    if (d_values_out) CubDebugExit(g_allocator.DeviceFree(d_values_out));
+    if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
+    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
+    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    // Correctness asserts
+    AssertEquals(0, compare1 | compare2);
+}
+
+
+/**
+ * Test DeviceSelect on pointer type
+ */
+template <
+    Backend         BACKEND,
+    typename        KeyT,
+    typename        ValueT>
+void TestPointer(
+    int             num_items,
+    int             entropy_reduction,
+    int             max_segment)
+{
+    // Allocate host arrays
+    KeyT*    h_keys_in        = new KeyT[num_items];
+    ValueT*  h_values_in      = new ValueT[num_items];
+    KeyT*    h_keys_reference = new KeyT[num_items];
+    ValueT*  h_values_reference = new ValueT[num_items];
+
+    // Initialize problem and solution
+    Initialize(entropy_reduction, h_keys_in, num_items, max_segment);
+    Initialize(entropy_reduction, h_values_in, num_items, max_segment);
+    int num_selected = Solve(h_keys_in, h_values_in, h_keys_reference, h_values_reference, num_items);
+
+    printf("\nPointer %s cub::DeviceSelect::Unique %d items, %d selected (avg run length %.3f), %s %d-byte elements, entropy_reduction %d\n",
+        (BACKEND == CDP) ? "CDP CUB" : "CUB",
+        num_items, num_selected, float(num_items) / num_selected,
+        typeid(KeyT).name(),
+        (int) sizeof(KeyT),
+        entropy_reduction);
+    fflush(stdout);
+
+    // Allocate problem device arrays
+    KeyT *d_keys_in = NULL;
+    ValueT *d_values_in = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_in, sizeof(KeyT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values_in, sizeof(ValueT) * num_items));
+
+    // Initialize device input
+    CubDebugExit(cudaMemcpy(d_keys_in, h_keys_in, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(d_values_in, h_values_in, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
+
+    // Run Test
+    Test<BACKEND>(d_keys_in, d_values_in, h_keys_reference, h_values_reference, num_selected, num_items);
+
+    // Cleanup
+    if (h_keys_in) delete[] h_keys_in;
+    if (h_values_in) delete[] h_values_in;
+    if (h_keys_reference) delete[] h_keys_reference;
+    if (h_values_reference) delete[] h_values_reference;
+    if (d_keys_in) CubDebugExit(g_allocator.DeviceFree(d_keys_in));
+    if (d_values_in) CubDebugExit(g_allocator.DeviceFree(d_values_in));
+}
+
+
+/**
+ * Test DeviceSelect on iterator type
+ */
+template <
+    Backend         BACKEND,
+    typename        KeyT,
+    typename        ValueT>
+void TestIterator(
+    int             num_items)
+{
+    // Use a counting iterator as the input
+    CountingInputIterator<KeyT, int> h_keys_in(0);
+    CountingInputIterator<ValueT, int> h_values_in(0);
+
+    // Allocate host arrays
+    KeyT*    h_keys_reference   = new KeyT[num_items];
+    ValueT*  h_values_reference = new ValueT[num_items];
+
+    // Initialize problem and solution
+    int num_selected = Solve(h_keys_in, h_values_in, h_keys_reference, h_values_reference, num_items);
+
+    printf("\nIterator %s cub::DeviceSelect::Unique %d items, %d selected (avg run length %.3f), %s %d-byte elements\n",
+        (BACKEND == CDP) ? "CDP CUB" : "CUB",
+        num_items, num_selected, float(num_items) / num_selected,
+        typeid(KeyT).name(),
+        (int) sizeof(ValueT));
+    fflush(stdout);
+
+    // Run Test
+    Test<BACKEND>(h_keys_in, h_values_in, h_keys_reference, h_values_reference, num_selected, num_items);
+
+    // Cleanup
+    if (h_keys_reference) delete[] h_keys_reference;
+    if (h_values_reference) delete[] h_values_reference;
+}
+
+
+/**
+ * Test different gen modes
+ */
+template <
+    Backend         BACKEND,
+    typename        KeyT,
+    typename        ValueT>
+void Test(
+    int             num_items)
+{
+    for (int max_segment = 1; ((max_segment > 0) && (max_segment < num_items)); max_segment *= 11)
+    {
+        TestPointer<BACKEND, KeyT, ValueT>(num_items, 0, max_segment);
+        TestPointer<BACKEND, KeyT, ValueT>(num_items, 2, max_segment);
+        TestPointer<BACKEND, KeyT, ValueT>(num_items, 7, max_segment);
+    }
+}
+
+
+/**
+ * Test different dispatch
+ */
+template <
+    typename        KeyT,
+    typename        ValueT>
+void TestOp(
+    int             num_items)
+{
+    Test<CUB, KeyT, ValueT>(num_items);
+#ifdef CUB_CDP
+    Test<CDP, KeyT, ValueT>(num_items);
+#endif
+}
+
+
+/**
+ * Test different input sizes
+ */
+template <
+    typename        KeyT,
+    typename        ValueT>
+void Test(
+    int             num_items)
+{
+    if (num_items < 0)
+    {
+        TestOp<KeyT, ValueT>(0);
+        TestOp<KeyT, ValueT>(1);
+        TestOp<KeyT, ValueT>(100);
+        TestOp<KeyT, ValueT>(10000);
+        TestOp<KeyT, ValueT>(1000000);
+    }
+    else
+    {
+        TestOp<KeyT, ValueT>(num_items);
+    }
+}
+
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    int num_items           = -1;
+    int entropy_reduction   = 0;
+    int maxseg              = 1000;
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("i", g_timing_iterations);
+    args.GetCmdLineArgument("repeat", g_repeat);
+    args.GetCmdLineArgument("maxseg", maxseg);
+    args.GetCmdLineArgument("entropy", entropy_reduction);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items> "
+            "[--i=<timing iterations> "
+            "[--device=<device-id>] "
+            "[--maxseg=<max segment length>]"
+            "[--entropy=<segment length bit entropy reduction rounds>]"
+            "[--v] "
+            "[--cdp]"
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+    g_device_giga_bandwidth = args.device_giga_bandwidth;
+    printf("\n");
+
+    // Test different input types
+    Test<unsigned char, int>(num_items);
+    Test<unsigned char, long>(num_items);
+    Test<unsigned short, int>(num_items);
+    Test<unsigned short, long>(num_items);
+    Test<unsigned int, int>(num_items);
+    Test<unsigned int, long>(num_items);
+    Test<unsigned long long, int>(num_items);
+    Test<unsigned long long, long>(num_items);
+
+    Test<uchar2, uint2>(num_items);
+    Test<uchar2, ulonglong2>(num_items);
+    Test<ushort2, uint2>(num_items);
+    Test<ushort2, ulonglong2>(num_items);
+    Test<uint2, uint2>(num_items);
+    Test<uint2, ulonglong2>(num_items);
+    Test<ulonglong2, uint2>(num_items);
+    Test<ulonglong2, ulonglong2>(num_items);
+
+    Test<uchar4, uint4>(num_items);
+    Test<uchar4, ulonglong4>(num_items);
+    Test<ushort4, uint4>(num_items);
+    Test<ushort4, ulonglong4>(num_items);
+    Test<uint4, uint4>(num_items);
+    Test<uint4, ulonglong4>(num_items);
+    Test<ulonglong4, uint4>(num_items);
+    Test<ulonglong4, ulonglong4>(num_items);
+
+    Test<TestFoo, TestFoo>(num_items);
+    Test<TestFoo, TestBar>(num_items);
+    Test<TestFoo, int>(num_items);
+    Test<TestBar, TestFoo>(num_items);
+    Test<TestBar, TestBar>(num_items);
+    Test<TestBar, int>(num_items);
+
+    return 0;
+}
+
+
+