Skip to content
This repository was archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
After making a CUDA API call, always clear the global CUDA error state by
Browse files Browse the repository at this point in the history

calling cudaGetLastError. Otherwise, if the CUDA API call is followed directly
by a kernel launch, checking for a synchronous error during the kernel launch
by calling cudaGetLastError may potentially return the error code from the CUDA
API call. This type of error leakage is very subtle and difficult to trace.

Also, update Makefile to remove old architectures and allow you to override the
NVCC variant used.

Bug 2720132
Bug 2808654

Reviewed-by: Michał 'Griwes' Dominiak <griwes@griwes.info>
  • Loading branch information
brycelelbach committed Feb 6, 2020
1 parent 11755ca commit 22b0573
Show file tree
Hide file tree
Showing 6 changed files with 62 additions and 75 deletions.
74 changes: 22 additions & 52 deletions common.mk
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#/******************************************************************************
# * Copyright (c) 2011, Duane Merrill. All rights reserved.
# * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
# *
# *
# * Redistribution and use in source and binary forms, with or without
# * modification, are permitted provided that the following conditions are met:
# * * Redistributions of source code must retain the above copyright
Expand All @@ -12,7 +12,7 @@
# * * Neither the name of the NVIDIA CORPORATION nor the
# * names of its contributors may be used to endorse or promote products
# * derived from this software without specific prior written permission.
# *
# *
# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
Expand All @@ -32,46 +32,46 @@
#-------------------------------------------------------------------------------

# [sm=<XXX,...>] Compute-capability to compile for, e.g., "sm=200,300,350" (SM20 by default).

COMMA = ,
ifdef sm
SM_ARCH = $(subst $(COMMA),-,$(sm))
else
SM_ARCH = 200
else
SM_ARCH = 600
endif

ifeq (700, $(findstring 700, $(SM_ARCH)))
SM_TARGETS += -gencode=arch=compute_70,code=\"sm_70,compute_70\"
SM_TARGETS += -gencode=arch=compute_70,code=\"sm_70,compute_70\"
SM_DEF += -DSM700
TEST_ARCH = 700
endif
ifeq (620, $(findstring 620, $(SM_ARCH)))
SM_TARGETS += -gencode=arch=compute_62,code=\"sm_62,compute_62\"
SM_TARGETS += -gencode=arch=compute_62,code=\"sm_62,compute_62\"
SM_DEF += -DSM620
TEST_ARCH = 620
endif
ifeq (610, $(findstring 610, $(SM_ARCH)))
SM_TARGETS += -gencode=arch=compute_61,code=\"sm_61,compute_61\"
SM_TARGETS += -gencode=arch=compute_61,code=\"sm_61,compute_61\"
SM_DEF += -DSM610
TEST_ARCH = 610
endif
ifeq (600, $(findstring 600, $(SM_ARCH)))
SM_TARGETS += -gencode=arch=compute_60,code=\"sm_60,compute_60\"
SM_TARGETS += -gencode=arch=compute_60,code=\"sm_60,compute_60\"
SM_DEF += -DSM600
TEST_ARCH = 600
endif
ifeq (520, $(findstring 520, $(SM_ARCH)))
SM_TARGETS += -gencode=arch=compute_52,code=\"sm_52,compute_52\"
SM_TARGETS += -gencode=arch=compute_52,code=\"sm_52,compute_52\"
SM_DEF += -DSM520
TEST_ARCH = 520
endif
ifeq (370, $(findstring 370, $(SM_ARCH)))
SM_TARGETS += -gencode=arch=compute_37,code=\"sm_37,compute_37\"
SM_TARGETS += -gencode=arch=compute_37,code=\"sm_37,compute_37\"
SM_DEF += -DSM370
TEST_ARCH = 370
endif
ifeq (350, $(findstring 350, $(SM_ARCH)))
SM_TARGETS += -gencode=arch=compute_35,code=\"sm_35,compute_35\"
SM_TARGETS += -gencode=arch=compute_35,code=\"sm_35,compute_35\"
SM_DEF += -DSM350
TEST_ARCH = 350
endif
Expand All @@ -80,36 +80,6 @@ ifeq (300, $(findstring 300, $(SM_ARCH)))
SM_DEF += -DSM300
TEST_ARCH = 300
endif
ifeq (210, $(findstring 210, $(SM_ARCH)))
SM_TARGETS += -gencode=arch=compute_20,code=\"sm_21,compute_20\"
SM_DEF += -DSM210
TEST_ARCH = 210
endif
ifeq (200, $(findstring 200, $(SM_ARCH)))
SM_TARGETS += -gencode=arch=compute_20,code=\"sm_20,compute_20\"
SM_DEF += -DSM200
TEST_ARCH = 200
endif
ifeq (130, $(findstring 130, $(SM_ARCH)))
SM_TARGETS += -gencode=arch=compute_13,code=\"sm_13,compute_13\"
SM_DEF += -DSM130
TEST_ARCH = 130
endif
ifeq (120, $(findstring 120, $(SM_ARCH)))
SM_TARGETS += -gencode=arch=compute_12,code=\"sm_12,compute_12\"
SM_DEF += -DSM120
TEST_ARCH = 120
endif
ifeq (110, $(findstring 110, $(SM_ARCH)))
SM_TARGETS += -gencode=arch=compute_11,code=\"sm_11,compute_11\"
SM_DEF += -DSM110
TEST_ARCH = 110
endif
ifeq (100, $(findstring 100, $(SM_ARCH)))
SM_TARGETS += -gencode=arch=compute_10,code=\"sm_10,compute_10\"
SM_DEF += -DSM100
TEST_ARCH = 100
endif


# [cdp=<0|1>] CDP enable option (default: no)
Expand All @@ -122,7 +92,7 @@ else
endif


# [force32=<0|1>] Device addressing mode option (64-bit device pointers by default)
# [force32=<0|1>] Device addressing mode option (64-bit device pointers by default)
ifeq ($(force32), 1)
CPU_ARCH = -m32
CPU_ARCH_SUFFIX = i386
Expand All @@ -133,10 +103,10 @@ else
endif


# [abi=<0|1>] CUDA ABI option (enabled by default)
# [abi=<0|1>] CUDA ABI option (enabled by default)
ifneq ($(abi), 0)
ABI_SUFFIX = abi
else
else
NVCCFLAGS += -Xptxas -abi=no
ABI_SUFFIX = noabi
endif
Expand All @@ -146,7 +116,7 @@ endif
ifeq ($(open64), 1)
NVCCFLAGS += -open64
PTX_SUFFIX = open64
else
else
PTX_SUFFIX = nvvm
endif

Expand Down Expand Up @@ -174,7 +144,7 @@ endif

CUB_DIR = $(dir $(lastword $(MAKEFILE_LIST)))

NVCC = "$(shell which nvcc)"
NVCC ?= "$(shell which nvcc)"
ifdef nvccver
NVCC_VERSION = $(nvccver)
else
Expand All @@ -184,8 +154,8 @@ endif
# detect OS
OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])

# Default flags: verbose kernel properties (regs, smem, cmem, etc.); runtimes for compilation phases
NVCCFLAGS += $(SM_DEF) -Xptxas -v -Xcudafe -\#
# Default flags: verbose kernel properties (regs, smem, cmem, etc.); runtimes for compilation phases
NVCCFLAGS += $(SM_DEF) -Xptxas -v -Xcudafe -\#

ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER)))
# For MSVC
Expand All @@ -196,10 +166,10 @@ ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER)))
# Help the compiler/linker work with huge numbers of kernels on Windows
NVCCFLAGS += -Xcompiler /bigobj -Xcompiler /Zm500
CC = cl

# Multithreaded runtime
NVCCFLAGS += -Xcompiler /MT

ifneq ($(force32), 1)
CUDART_CYG = "$(shell dirname $(NVCC))/../lib/Win32/cudart.lib"
else
Expand Down Expand Up @@ -230,4 +200,4 @@ rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst

CUB_DEPS = $(call rwildcard, $(CUB_DIR),*.cuh) \
$(CUB_DIR)common.mk

6 changes: 3 additions & 3 deletions cub/block/block_load.cuh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
*
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
Expand All @@ -12,7 +12,7 @@
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
Expand Down Expand Up @@ -418,7 +418,7 @@ __device__ __forceinline__ void LoadDirectWarpStriped(
*
* \tparam T <b>[inferred]</b> The data type to load.
* \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
* \tparam InputIteratorT <b>[inferred]</b> The random-access iterator type for input \iterator.
* \tparam InputIteratorT <b>[inferred]</b> The random-access iterator type for input \iterator.
*/
template <
typename InputT,
Expand Down
8 changes: 4 additions & 4 deletions cub/iterator/tex_obj_input_iterator.cuh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
Expand All @@ -12,7 +12,7 @@
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
Expand Down Expand Up @@ -177,13 +177,13 @@ public:
res_desc.res.linear.desc = channel_desc;
res_desc.res.linear.sizeInBytes = bytes;
tex_desc.readMode = cudaReadModeElementType;
return cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL);
return CubDebug(cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL));
}

/// Unbind this iterator from its texture reference
cudaError_t UnbindTexture()
{
return cudaDestroyTextureObject(tex_obj);
return CubDebug(cudaDestroyTextureObject(tex_obj));
}

/// Postfix increment
Expand Down
2 changes: 1 addition & 1 deletion cub/util_allocator.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -406,7 +406,7 @@ struct CachingDeviceAllocator
// in use by the device, only consider cached blocks that are
// either (from the active stream) or (from an idle stream)
if ((active_stream == block_itr->associated_stream) ||
(cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady))
(CubDebug(cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)))
{
// Reuse existing cache block. Insert into live blocks.
found = true;
Expand Down
11 changes: 9 additions & 2 deletions cub/util_debug.cuh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
Expand All @@ -12,7 +12,7 @@
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
Expand Down Expand Up @@ -72,6 +72,13 @@ __host__ __device__ __forceinline__ cudaError_t Debug(
{
(void)filename;
(void)line;

#ifdef CUB_RUNTIME_ENABLED
// Clear the global CUDA error state which may have been set by the last
// call. Otherwise, errors may "leak" to unrelated kernel launches.
cudaGetLastError();
#endif

#ifdef CUB_STDERR
if (error)
{
Expand Down
36 changes: 23 additions & 13 deletions cub/util_device.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,7 @@ CUB_RUNTIME_FUNCTION __forceinline__ int CurrentDevice()
#if defined(CUB_RUNTIME_ENABLED) // Host code or device code with the CUDA runtime.

int device = -1;
cudaError_t const error = cudaGetDevice(&device);
if (CubDebug(error)) return -1;
if (CubDebug(cudaGetDevice(&device))) return -1;
return device;

#else // Device code without the CUDA runtime.
Expand Down Expand Up @@ -179,8 +178,13 @@ struct PerDeviceAttributeCache
// If this fails, we haven't compiled device code that can run on
// this device. This is only an error if we actually use this device,
// so we don't use CubDebug here.
if (error[device] = uncached_function(attribute[device], device))
if (error[device] = uncached_function(attribute[device], device)) {
// Clear the global CUDA error state which may have been set by
// the last call. Otherwise, errors may "leak" to unrelated
// kernel launches.
cudaGetLastError();
break;
}
}

// Make sure the entries for non-existent devices are initialized.
Expand Down Expand Up @@ -210,11 +214,16 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersionUncached(int &ptx_ver
cudaFuncAttributes empty_kernel_attrs;

do {
if (error = cudaFuncGetAttributes(&empty_kernel_attrs, empty_kernel))
// We do not `CubDebug` here because failure is not a hard error.
// We may be querying a device that we do not have code for but
// never use.
// We do not `CubDebug` here because failure is not a hard error.
// We may be querying a device that we do not have code for but
// never use.
if (error = cudaFuncGetAttributes(&empty_kernel_attrs, empty_kernel)) {
// Clear the global CUDA error state which may have been set by
// the last call. Otherwise, errors may "leak" to unrelated
// kernel launches.
cudaGetLastError();
break;
}
}
while(0);

Expand All @@ -225,6 +234,7 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersionUncached(int &ptx_ver
#else // Device code.

// The `reinterpret_cast` is necessary to suppress a set-but-unused warnings.
// This is a meme now: https://twitter.com/blelbach/status/1222391615576100864
(void)reinterpret_cast<EmptyKernelPtr>(empty_kernel);

ptx_version = CUB_PTX_ARCH;
Expand Down Expand Up @@ -318,7 +328,7 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersionUncached(int &sm_versi
(void)device;

// CUDA API calls are not supported from this device.
return cudaErrorInvalidConfiguration;
return CubDebug(cudaErrorInvalidConfiguration);

#endif
}
Expand Down Expand Up @@ -357,19 +367,19 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SyncStream(cudaStream_t stream)
{
#if (CUB_PTX_ARCH == 0) // Host code.

return cudaStreamSynchronize(stream);
return CubDebug(cudaStreamSynchronize(stream));

#elif defined(CUB_RUNTIME_ENABLED) // Device code with the CUDA runtime.

(void)stream;
// Device can't yet sync on a specific stream
return cudaDeviceSynchronize();
return CubDebug(cudaDeviceSynchronize());

#else // Device code without the CUDA runtime.

(void)stream;
// CUDA API calls are not supported from this device.
return cudaErrorInvalidConfiguration;
return CubDebug(cudaErrorInvalidConfiguration);

#endif
}
Expand Down Expand Up @@ -426,11 +436,11 @@ cudaError_t MaxSmOccupancy(

#else

return cudaOccupancyMaxActiveBlocksPerMultiprocessor(
return CubDebug(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_sm_occupancy,
kernel_ptr,
block_threads,
dynamic_smem_bytes);
dynamic_smem_bytes));

#endif // CUB_RUNTIME_ENABLED
}
Expand Down

0 comments on commit 22b0573

Please sign in to comment.