Skip to content
This repository was archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
After making a CUDA API call, always clear the global CUDA error state by
Browse files Browse the repository at this point in the history

calling cudaGetLastError. Otherwise, if the CUDA API call is followed directly
by a kernel launch, checking for a synchronous error during the kernel launch
by calling cudaGetLastError may potentially return the error code from the CUDA
API call. This type of error leakage is very subtle and difficult to trace.

Also, update Makefile to remove old architectures and allow you to override the
NVCC variant used.

Bug 2720132
Bug 2808654

Reviewed-by: Michał 'Griwes' Dominiak <griwes@griwes.info>
  • Loading branch information
brycelelbach committed Feb 6, 2020
1 parent 11755ca commit 22b0573
Show file tree
Hide file tree
Showing 6 changed files with 62 additions and 75 deletions.
74 changes: 22 additions & 52 deletions common.mk
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#/******************************************************************************
# * Copyright (c) 2011, Duane Merrill. All rights reserved.
# * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
# *
# *
# * Redistribution and use in source and binary forms, with or without
# * modification, are permitted provided that the following conditions are met:
# * * Redistributions of source code must retain the above copyright
Expand All @@ -12,7 +12,7 @@
# * * Neither the name of the NVIDIA CORPORATION nor the
# * names of its contributors may be used to endorse or promote products
# * derived from this software without specific prior written permission.
# *
# *
# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
Expand All @@ -32,46 +32,46 @@
#-------------------------------------------------------------------------------

# [sm=<XXX,...>] Compute-capability to compile for, e.g., "sm=200,300,350" (SM20 by default).

COMMA = ,
ifdef sm
SM_ARCH = $(subst $(COMMA),-,$(sm))
else
SM_ARCH = 200
else
SM_ARCH = 600
endif

ifeq (700, $(findstring 700, $(SM_ARCH)))
SM_TARGETS += -gencode=arch=compute_70,code=\"sm_70,compute_70\"
SM_TARGETS += -gencode=arch=compute_70,code=\"sm_70,compute_70\"
SM_DEF += -DSM700
TEST_ARCH = 700
endif
ifeq (620, $(findstring 620, $(SM_ARCH)))
SM_TARGETS += -gencode=arch=compute_62,code=\"sm_62,compute_62\"
SM_TARGETS += -gencode=arch=compute_62,code=\"sm_62,compute_62\"
SM_DEF += -DSM620
TEST_ARCH = 620
endif
ifeq (610, $(findstring 610, $(SM_ARCH)))
SM_TARGETS += -gencode=arch=compute_61,code=\"sm_61,compute_61\"
SM_TARGETS += -gencode=arch=compute_61,code=\"sm_61,compute_61\"
SM_DEF += -DSM610
TEST_ARCH = 610
endif
ifeq (600, $(findstring 600, $(SM_ARCH)))
SM_TARGETS += -gencode=arch=compute_60,code=\"sm_60,compute_60\"
SM_TARGETS += -gencode=arch=compute_60,code=\"sm_60,compute_60\"
SM_DEF += -DSM600
TEST_ARCH = 600
endif
ifeq (520, $(findstring 520, $(SM_ARCH)))
SM_TARGETS += -gencode=arch=compute_52,code=\"sm_52,compute_52\"
SM_TARGETS += -gencode=arch=compute_52,code=\"sm_52,compute_52\"
SM_DEF += -DSM520
TEST_ARCH = 520
endif
ifeq (370, $(findstring 370, $(SM_ARCH)))
SM_TARGETS += -gencode=arch=compute_37,code=\"sm_37,compute_37\"
SM_TARGETS += -gencode=arch=compute_37,code=\"sm_37,compute_37\"
SM_DEF += -DSM370
TEST_ARCH = 370
endif
ifeq (350, $(findstring 350, $(SM_ARCH)))
SM_TARGETS += -gencode=arch=compute_35,code=\"sm_35,compute_35\"
SM_TARGETS += -gencode=arch=compute_35,code=\"sm_35,compute_35\"
SM_DEF += -DSM350
TEST_ARCH = 350
endif
Expand All @@ -80,36 +80,6 @@ ifeq (300, $(findstring 300, $(SM_ARCH)))
SM_DEF += -DSM300
TEST_ARCH = 300
endif
ifeq (210, $(findstring 210, $(SM_ARCH)))
SM_TARGETS += -gencode=arch=compute_20,code=\"sm_21,compute_20\"
SM_DEF += -DSM210
TEST_ARCH = 210
endif
ifeq (200, $(findstring 200, $(SM_ARCH)))
SM_TARGETS += -gencode=arch=compute_20,code=\"sm_20,compute_20\"
SM_DEF += -DSM200
TEST_ARCH = 200
endif
ifeq (130, $(findstring 130, $(SM_ARCH)))
SM_TARGETS += -gencode=arch=compute_13,code=\"sm_13,compute_13\"
SM_DEF += -DSM130
TEST_ARCH = 130
endif
ifeq (120, $(findstring 120, $(SM_ARCH)))
SM_TARGETS += -gencode=arch=compute_12,code=\"sm_12,compute_12\"
SM_DEF += -DSM120
TEST_ARCH = 120
endif
ifeq (110, $(findstring 110, $(SM_ARCH)))
SM_TARGETS += -gencode=arch=compute_11,code=\"sm_11,compute_11\"
SM_DEF += -DSM110
TEST_ARCH = 110
endif
ifeq (100, $(findstring 100, $(SM_ARCH)))
SM_TARGETS += -gencode=arch=compute_10,code=\"sm_10,compute_10\"
SM_DEF += -DSM100
TEST_ARCH = 100
endif


# [cdp=<0|1>] CDP enable option (default: no)
Expand All @@ -122,7 +92,7 @@ else
endif


# [force32=<0|1>] Device addressing mode option (64-bit device pointers by default)
# [force32=<0|1>] Device addressing mode option (64-bit device pointers by default)
ifeq ($(force32), 1)
CPU_ARCH = -m32
CPU_ARCH_SUFFIX = i386
Expand All @@ -133,10 +103,10 @@ else
endif


# [abi=<0|1>] CUDA ABI option (enabled by default)
# [abi=<0|1>] CUDA ABI option (enabled by default)
ifneq ($(abi), 0)
ABI_SUFFIX = abi
else
else
NVCCFLAGS += -Xptxas -abi=no
ABI_SUFFIX = noabi
endif
Expand All @@ -146,7 +116,7 @@ endif
ifeq ($(open64), 1)
NVCCFLAGS += -open64
PTX_SUFFIX = open64
else
else
PTX_SUFFIX = nvvm
endif

Expand Down Expand Up @@ -174,7 +144,7 @@ endif

CUB_DIR = $(dir $(lastword $(MAKEFILE_LIST)))

NVCC = "$(shell which nvcc)"
NVCC ?= "$(shell which nvcc)"
ifdef nvccver
NVCC_VERSION = $(nvccver)
else
Expand All @@ -184,8 +154,8 @@ endif
# detect OS
OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])

# Default flags: verbose kernel properties (regs, smem, cmem, etc.); runtimes for compilation phases
NVCCFLAGS += $(SM_DEF) -Xptxas -v -Xcudafe -\#
# Default flags: verbose kernel properties (regs, smem, cmem, etc.); runtimes for compilation phases
NVCCFLAGS += $(SM_DEF) -Xptxas -v -Xcudafe -\#

ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER)))
# For MSVC
Expand All @@ -196,10 +166,10 @@ ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER)))
# Help the compiler/linker work with huge numbers of kernels on Windows
NVCCFLAGS += -Xcompiler /bigobj -Xcompiler /Zm500
CC = cl

# Multithreaded runtime
NVCCFLAGS += -Xcompiler /MT

ifneq ($(force32), 1)
CUDART_CYG = "$(shell dirname $(NVCC))/../lib/Win32/cudart.lib"
else
Expand Down Expand Up @@ -230,4 +200,4 @@ rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst

CUB_DEPS = $(call rwildcard, $(CUB_DIR),*.cuh) \
$(CUB_DIR)common.mk

6 changes: 3 additions & 3 deletions cub/block/block_load.cuh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
*
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
Expand All @@ -12,7 +12,7 @@
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
Expand Down Expand Up @@ -418,7 +418,7 @@ __device__ __forceinline__ void LoadDirectWarpStriped(
*
* \tparam T <b>[inferred]</b> The data type to load.
* \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
* \tparam InputIteratorT <b>[inferred]</b> The random-access iterator type for input \iterator.
* \tparam InputIteratorT <b>[inferred]</b> The random-access iterator type for input \iterator.
*/
template <
typename InputT,
Expand Down
8 changes: 4 additions & 4 deletions cub/iterator/tex_obj_input_iterator.cuh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
Expand All @@ -12,7 +12,7 @@
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
Expand Down Expand Up @@ -177,13 +177,13 @@ public:
res_desc.res.linear.desc = channel_desc;
res_desc.res.linear.sizeInBytes = bytes;
tex_desc.readMode = cudaReadModeElementType;
return cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL);
return CubDebug(cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL));
}

/// Unbind this iterator from its texture reference
cudaError_t UnbindTexture()
{
return cudaDestroyTextureObject(tex_obj);
return CubDebug(cudaDestroyTextureObject(tex_obj));
}

/// Postfix increment
Expand Down
2 changes: 1 addition & 1 deletion cub/util_allocator.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -406,7 +406,7 @@ struct CachingDeviceAllocator
// in use by the device, only consider cached blocks that are
// either (from the active stream) or (from an idle stream)
if ((active_stream == block_itr->associated_stream) ||
(cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady))
(CubDebug(cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)))
{
// Reuse existing cache block. Insert into live blocks.
found = true;
Expand Down
11 changes: 9 additions & 2 deletions cub/util_debug.cuh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
Expand All @@ -12,7 +12,7 @@
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
Expand Down Expand Up @@ -72,6 +72,13 @@ __host__ __device__ __forceinline__ cudaError_t Debug(
{
(void)filename;
(void)line;

#ifdef CUB_RUNTIME_ENABLED
// Clear the global CUDA error state which may have been set by the last
// call. Otherwise, errors may "leak" to unrelated kernel launches.
cudaGetLastError();
#endif

#ifdef CUB_STDERR
if (error)
{
Expand Down
36 changes: 23 additions & 13 deletions cub/util_device.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,7 @@ CUB_RUNTIME_FUNCTION __forceinline__ int CurrentDevice()
#if defined(CUB_RUNTIME_ENABLED) // Host code or device code with the CUDA runtime.

int device = -1;
cudaError_t const error = cudaGetDevice(&device);
if (CubDebug(error)) return -1;
if (CubDebug(cudaGetDevice(&device))) return -1;
return device;

#else // Device code without the CUDA runtime.
Expand Down Expand Up @@ -179,8 +178,13 @@ struct PerDeviceAttributeCache
// If this fails, we haven't compiled device code that can run on
// this device. This is only an error if we actually use this device,
// so we don't use CubDebug here.
if (error[device] = uncached_function(attribute[device], device))
if (error[device] = uncached_function(attribute[device], device)) {
// Clear the global CUDA error state which may have been set by
// the last call. Otherwise, errors may "leak" to unrelated
// kernel launches.
cudaGetLastError();
break;
}
}

// Make sure the entries for non-existent devices are initialized.
Expand Down Expand Up @@ -210,11 +214,16 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersionUncached(int &ptx_ver
cudaFuncAttributes empty_kernel_attrs;

do {
if (error = cudaFuncGetAttributes(&empty_kernel_attrs, empty_kernel))
// We do not `CubDebug` here because failure is not a hard error.
// We may be querying a device that we do not have code for but
// never use.
// We do not `CubDebug` here because failure is not a hard error.
// We may be querying a device that we do not have code for but
// never use.
if (error = cudaFuncGetAttributes(&empty_kernel_attrs, empty_kernel)) {
// Clear the global CUDA error state which may have been set by
// the last call. Otherwise, errors may "leak" to unrelated
// kernel launches.
cudaGetLastError();
break;
}
}
while(0);

Expand All @@ -225,6 +234,7 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersionUncached(int &ptx_ver
#else // Device code.

// The `reinterpret_cast` is necessary to suppress a set-but-unused warnings.
// This is a meme now: https://twitter.com/blelbach/status/1222391615576100864
(void)reinterpret_cast<EmptyKernelPtr>(empty_kernel);

ptx_version = CUB_PTX_ARCH;
Expand Down Expand Up @@ -318,7 +328,7 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersionUncached(int &sm_versi
(void)device;

// CUDA API calls are not supported from this device.
return cudaErrorInvalidConfiguration;
return CubDebug(cudaErrorInvalidConfiguration);

#endif
}
Expand Down Expand Up @@ -357,19 +367,19 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SyncStream(cudaStream_t stream)
{
#if (CUB_PTX_ARCH == 0) // Host code.

return cudaStreamSynchronize(stream);
return CubDebug(cudaStreamSynchronize(stream));

#elif defined(CUB_RUNTIME_ENABLED) // Device code with the CUDA runtime.

(void)stream;
// Device can't yet sync on a specific stream
return cudaDeviceSynchronize();
return CubDebug(cudaDeviceSynchronize());

#else // Device code without the CUDA runtime.

(void)stream;
// CUDA API calls are not supported from this device.
return cudaErrorInvalidConfiguration;
return CubDebug(cudaErrorInvalidConfiguration);

#endif
}
Expand Down Expand Up @@ -426,11 +436,11 @@ cudaError_t MaxSmOccupancy(

#else

return cudaOccupancyMaxActiveBlocksPerMultiprocessor(
return CubDebug(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_sm_occupancy,
kernel_ptr,
block_threads,
dynamic_smem_bytes);
dynamic_smem_bytes));

#endif // CUB_RUNTIME_ENABLED
}
Expand Down

0 comments on commit 22b0573

Please sign in to comment.