
Use data type OffsetT for DispatchRadixSort correctly in InitPassConfig #258

Merged: 1 commit into NVIDIA:main on Jan 22, 2021

Conversation

fkallen (Contributor) commented Jan 20, 2021

I observed a problem in my code when trying to use DeviceRadixSort::SortPairs with num_items >= 2^32, despite correctly instantiating the template cub::DispatchRadixSort<false, KeyType, ValueType, IndexType>::Dispatch.

With num_items = 4333734380ull the results were incorrect, and num_items = std::numeric_limits<std::uint32_t>::max() caused a crash.

The reason is that InitPassConfig uses int for the number of items instead of the provided OffsetT. This commit fixes that.
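For illustration only, here is a minimal sketch of the kind of change this implies (hypothetical names, not the actual CUB source): the pass configuration should carry the problem size in the caller-provided OffsetT rather than in int, which truncates any count at or above 2^31.

// Hypothetical sketch, not the real InitPassConfig/DispatchRadixSort code.
// It only demonstrates why storing the item count as int loses large sizes.
#include <cstdint>
#include <iostream>

template <typename OffsetT>
struct PassConfigSketch {
    // int num_items;      // before the fix: silently truncates sizes >= 2^31
    OffsetT num_items;     // after the fix: keeps the dispatcher's offset type
};

int main(){
    const std::uint64_t n = 4333734380ull;  // the failing size from this report
    std::cout << "as int:      " << static_cast<int>(n) << "\n";  // wrapped/truncated value
    std::cout << "as uint64_t: " << PassConfigSketch<std::uint64_t>{n}.num_items << "\n";
}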

I used the following test code. It requires approximately 140 GB of device (managed) memory.

#include <iostream>
#include <algorithm>
#include <fstream>
#include <cassert>
#include <vector>
#include <numeric>
#include <limits>
#include <cstdint>   // std::uint64_t

#include <cub/cub.cuh>

__global__
void initKernel(
    std::uint64_t* keys,
    std::uint64_t* values,
    std::uint64_t N
){
    const std::uint64_t tid = std::uint64_t(threadIdx.x) + std::uint64_t(blockIdx.x) * std::uint64_t(blockDim.x);
    const std::uint64_t stride = std::uint64_t(blockDim.x) * std::uint64_t(gridDim.x);

    for(std::uint64_t i = tid; i < N; i += stride){
        keys[i] = N - 1 - i;
        values[i] = i;
    }
}

__global__
void checkOrderingKernel(
    std::uint64_t* keys,
    std::uint64_t* values,
    std::uint64_t N
){
    const std::uint64_t tid = std::uint64_t(threadIdx.x) + std::uint64_t(blockIdx.x) * std::uint64_t(blockDim.x);
    const std::uint64_t stride = std::uint64_t(blockDim.x) * std::uint64_t(gridDim.x);

    for(std::uint64_t i = tid; i < N; i += stride){
        assert(keys[i] == i);
        assert(values[i] == N - 1 - i);
    }
}

int main(int argc, char** argv){
    using KeyType = std::uint64_t;
    using ValueType = std::uint64_t;
    using IndexType = std::uint64_t;

    auto DeviceRadixSort_SortPairs = [](
        void* d_temp_storage, 
        std::size_t& temp_storage_bytes, 
        cub::DoubleBuffer<KeyType>& d_keys, 
        cub::DoubleBuffer<ValueType>& d_values,
        IndexType num_items,
        cudaStream_t stream
    ){
        return cub::DispatchRadixSort<false, KeyType, ValueType, IndexType>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            0,                    // begin_bit
            sizeof(KeyType) * 8,  // end_bit
            true,                 // is_overwrite_okay
            stream,
            true                  // debug_synchronous
        );
    };

    IndexType N = std::numeric_limits<std::uint32_t>::max();
    //IndexType N = 4333734380ull;
    

    std::size_t sizeOfKeys = sizeof(KeyType) * N;
    std::size_t sizeOfValues = sizeof(ValueType) * N;

    cub::DoubleBuffer<KeyType> d_keys_dbl{nullptr, nullptr};
    cub::DoubleBuffer<ValueType> d_values_dbl{nullptr, nullptr};

    std::size_t requiredCubSize = 0;

    cudaError_t cubstatus = DeviceRadixSort_SortPairs(
        nullptr,
        requiredCubSize,
        d_keys_dbl,
        d_values_dbl,
        N,
        (cudaStream_t)0
    );

    assert(cubstatus == cudaSuccess);

    void* temp_allocations[5]{};
    std::size_t temp_allocation_sizes[5]{};

    temp_allocation_sizes[0] = sizeOfKeys; // d_keys
    temp_allocation_sizes[1] = sizeOfKeys; // d_keys_temp
    temp_allocation_sizes[2] = sizeOfValues; // d_values
    temp_allocation_sizes[3] = sizeOfValues; // d_values_temp
    temp_allocation_sizes[4] = requiredCubSize;

    std::size_t temp_storage_bytes = 0;
    cubstatus = cub::AliasTemporaries(
        nullptr,
        temp_storage_bytes,
        temp_allocations,
        temp_allocation_sizes
    );
    assert(cubstatus == cudaSuccess);

    std::size_t freeMem,totalMem;
    cudaMemGetInfo(&freeMem, &totalMem);

    std::cerr << "free gpu mem: " << freeMem << ", temp_storage_bytes: " << temp_storage_bytes << "\n";

    void* temp_storage = nullptr;
    if(freeMem > temp_storage_bytes){
        cudaMalloc(&temp_storage, temp_storage_bytes);
    }else{
        cudaMallocManaged(&temp_storage, temp_storage_bytes);
        int deviceId = 0;
        cudaGetDevice(&deviceId);
        cudaMemAdvise(temp_storage, temp_storage_bytes, cudaMemAdviseSetAccessedBy, deviceId);      
    }

    cudaError_t cudastatus = cudaGetLastError();
    assert(cudastatus == cudaSuccess);

    cubstatus = cub::AliasTemporaries(
        temp_storage,
        temp_storage_bytes,
        temp_allocations,
        temp_allocation_sizes
    );
    assert(cubstatus == cudaSuccess);

    d_keys_dbl = cub::DoubleBuffer<KeyType>{(KeyType*)temp_allocations[0], (KeyType*)temp_allocations[1]};
    d_values_dbl = cub::DoubleBuffer<ValueType>{(ValueType*)temp_allocations[2], (ValueType*)temp_allocations[3]};

    dim3 block = 128;
    dim3 grid = 4096;

    initKernel<<<grid, block>>>(d_keys_dbl.Current(), d_values_dbl.Current(), N);

    cudastatus = cudaDeviceSynchronize();

    assert(cudastatus == cudaSuccess);

    cubstatus = DeviceRadixSort_SortPairs(
        temp_allocations[4],
        requiredCubSize,
        d_keys_dbl,
        d_values_dbl,
        N,
        (cudaStream_t)0
    );
    cudastatus = cudaDeviceSynchronize();

    assert(cudastatus == cudaSuccess);
    assert(cubstatus == cudaSuccess);

    checkOrderingKernel<<<grid, block>>>(d_keys_dbl.Current(), d_values_dbl.Current(), N);

    cudastatus = cudaDeviceSynchronize();

    assert(cudastatus == cudaSuccess);

    cudaFree(temp_storage);

}

Commit: Use data type OffsetT for DispatchRadixSort correctly in InitPassConfig. This allows using radix sort on problem sizes that cannot be represented by int.
alliepiper (Collaborator) commented:
Thanks for the patch! This looks good to me, I'll start tests/integrations today.

Related to the ongoing #212.

alliepiper added this to the 1.12.0 milestone on Jan 21, 2021
alliepiper self-assigned this on Jan 21, 2021
alliepiper (Collaborator) commented:
DVS CL: 29528902

alliepiper added the "testing: internal ci in progress" and "testing: gpuCI in progress" labels, then replaced them with "testing: internal ci passed" and "testing: gpuCI passed" on Jan 21, 2021
alliepiper merged commit 8128ac8 into NVIDIA:main on Jan 22, 2021