
Use data type OffsetT for DispatchRadixSort correctly in InitPassConfig #258

Merged: 1 commit into NVIDIA:main on Jan 22, 2021

Conversation

fkallen (Contributor) commented Jan 20, 2021

I observed a problem in my code when trying to use DeviceRadixSort::SortPairs with num_items >= 2^32, despite correctly instantiating the template cub::DispatchRadixSort<false, KeyType, ValueType, IndexType>::Dispatch.

With num_items = 4333734380ull the results were incorrect, and num_items = std::numeric_limits<std::uint32_t>::max() caused a crash.

The reason is that InitPassConfig uses int for the number of items instead of the provided OffsetT. This commit fixes that.
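For illustration only, here is a minimal sketch of the kind of change this implies (hypothetical names, not the actual CUB source): the pass configuration should carry the problem size in the caller-provided OffsetT rather than in int, which truncates any count at or above 2^31.

// Hypothetical sketch, not the real InitPassConfig/DispatchRadixSort code.
// It only demonstrates why storing the item count as int loses large sizes.
#include <cstdint>
#include <iostream>

template <typename OffsetT>
struct PassConfigSketch {
    // int num_items;      // before the fix: silently truncates sizes >= 2^31
    OffsetT num_items;     // after the fix: keeps the dispatcher's offset type
};

int main(){
    const std::uint64_t n = 4333734380ull;  // the failing size from this report
    std::cout << "as int:      " << static_cast<int>(n) << "\n";  // wrapped/truncated value
    std::cout << "as uint64_t: " << PassConfigSketch<std::uint64_t>{n}.num_items << "\n";
}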

I used the following test code. It requires approximately 140 GB of device (managed) memory.

#include <iostream>
#include <algorithm>
#include <fstream>
#include <cassert>
#include <vector>
#include <numeric>
#include <limits>
#include <cstdint>   // std::uint64_t

#include <cub/cub.cuh>

__global__
void initKernel(
    std::uint64_t* keys,
    std::uint64_t* values,
    std::uint64_t N
){
    const std::uint64_t tid = std::uint64_t(threadIdx.x) + std::uint64_t(blockIdx.x) * std::uint64_t(blockDim.x);
    const std::uint64_t stride = std::uint64_t(blockDim.x) * std::uint64_t(gridDim.x);

    for(std::uint64_t i = tid; i < N; i += stride){
        keys[i] = N - 1 - i;
        values[i] = i;
    }
}

__global__
void checkOrderingKernel(
    std::uint64_t* keys,
    std::uint64_t* values,
    std::uint64_t N
){
    const std::uint64_t tid = std::uint64_t(threadIdx.x) + std::uint64_t(blockIdx.x) * std::uint64_t(blockDim.x);
    const std::uint64_t stride = std::uint64_t(blockDim.x) * std::uint64_t(gridDim.x);

    for(std::uint64_t i = tid; i < N; i += stride){
        assert(keys[i] == i);
        assert(values[i] == N - 1 - i);
    }
}

int main(int argc, char** argv){
    using KeyType = std::uint64_t;
    using ValueType = std::uint64_t;
    using IndexType = std::uint64_t;

    auto DeviceRadixSort_SortPairs = [](
        void* d_temp_storage, 
        std::size_t& temp_storage_bytes, 
        cub::DoubleBuffer<KeyType>& d_keys, 
        cub::DoubleBuffer<ValueType>& d_values,
        IndexType num_items,
        cudaStream_t stream
    ){
        return cub::DispatchRadixSort<false, KeyType, ValueType, IndexType>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            0,                    // begin_bit
            sizeof(KeyType) * 8,  // end_bit
            true,                 // is_overwrite_okay
            stream,
            true                  // debug_synchronous
        );
    };

    IndexType N = std::numeric_limits<std::uint32_t>::max();
    //IndexType N = 4333734380ull;
    

    std::size_t sizeOfKeys = sizeof(KeyType) * N;
    std::size_t sizeOfValues = sizeof(ValueType) * N;

    cub::DoubleBuffer<KeyType> d_keys_dbl{nullptr, nullptr};
    cub::DoubleBuffer<ValueType> d_values_dbl{nullptr, nullptr};

    std::size_t requiredCubSize = 0;

    cudaError_t cubstatus = DeviceRadixSort_SortPairs(
        nullptr,
        requiredCubSize,
        d_keys_dbl,
        d_values_dbl,
        N,
        (cudaStream_t)0
    );

    assert(cubstatus == cudaSuccess);

    void* temp_allocations[5]{};
    std::size_t temp_allocation_sizes[5]{};

    temp_allocation_sizes[0] = sizeOfKeys; // d_keys
    temp_allocation_sizes[1] = sizeOfKeys; // d_keys_temp
    temp_allocation_sizes[2] = sizeOfValues; // d_values
    temp_allocation_sizes[3] = sizeOfValues; // d_values_temp
    temp_allocation_sizes[4] = requiredCubSize;

    std::size_t temp_storage_bytes = 0;
    cubstatus = cub::AliasTemporaries(
        nullptr,
        temp_storage_bytes,
        temp_allocations,
        temp_allocation_sizes
    );
    assert(cubstatus == cudaSuccess);

    std::size_t freeMem,totalMem;
    cudaMemGetInfo(&freeMem, &totalMem);

    std::cerr << "free gpu mem: " << freeMem << ", temp_storage_bytes: " << temp_storage_bytes << "\n";

    void* temp_storage = nullptr;
    if(freeMem > temp_storage_bytes){
        cudaMalloc(&temp_storage, temp_storage_bytes);
    }else{
        cudaMallocManaged(&temp_storage, temp_storage_bytes);
        int deviceId = 0;
        cudaGetDevice(&deviceId);
        cudaMemAdvise(temp_storage, temp_storage_bytes, cudaMemAdviseSetAccessedBy, deviceId);      
    }

    cudaError_t cudastatus = cudaGetLastError();
    assert(cudastatus == cudaSuccess);

    cubstatus = cub::AliasTemporaries(
        temp_storage,
        temp_storage_bytes,
        temp_allocations,
        temp_allocation_sizes
    );
    assert(cubstatus == cudaSuccess);

    d_keys_dbl = cub::DoubleBuffer<KeyType>{(KeyType*)temp_allocations[0], (KeyType*)temp_allocations[1]};
    d_values_dbl = cub::DoubleBuffer<ValueType>{(ValueType*)temp_allocations[2], (ValueType*)temp_allocations[3]};

    dim3 block = 128;
    dim3 grid = 4096;

    initKernel<<<grid, block>>>(d_keys_dbl.Current(), d_values_dbl.Current(), N);

    cudastatus = cudaDeviceSynchronize();

    assert(cudastatus == cudaSuccess);

    cubstatus = DeviceRadixSort_SortPairs(
        temp_allocations[4],
        requiredCubSize,
        d_keys_dbl,
        d_values_dbl,
        N,
        (cudaStream_t)0
    );
    cudastatus = cudaDeviceSynchronize();

    assert(cudastatus == cudaSuccess);
    assert(cubstatus == cudaSuccess);

    checkOrderingKernel<<<grid, block>>>(d_keys_dbl.Current(), d_values_dbl.Current(), N);

    cudastatus = cudaDeviceSynchronize();

    assert(cudastatus == cudaSuccess);

    cudaFree(temp_storage);

}

Commit: Use data type OffsetT for DispatchRadixSort correctly in InitPassConfig. This allows using radix sort on problem sizes that cannot be represented by int.
alliepiper (Collaborator) commented:
Thanks for the patch! This looks good to me, I'll start tests/integrations today.

Related to the ongoing #212.

alliepiper added this to the 1.12.0 milestone on Jan 21, 2021
alliepiper self-assigned this on Jan 21, 2021
alliepiper (Collaborator) commented:
DVS CL: 29528902

alliepiper added the "testing: internal ci in progress" and "testing: gpuCI in progress" labels, then replaced them with "testing: internal ci passed" and "testing: gpuCI passed" on Jan 21, 2021
alliepiper merged commit 8128ac8 into NVIDIA:main on Jan 22, 2021