Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
WarpReduce segmented reduction broken in CUDA 9 for logical warp sizes <
32
  • Loading branch information
dumerrill committed Sep 20, 2017
1 parent d759b2e commit d622848
Show file tree
Hide file tree
Showing 8 changed files with 59 additions and 31 deletions.
16 changes: 14 additions & 2 deletions .cproject
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,19 @@
<builder buildPath="${workspace_loc:/PrivateCub}/Default" id="cdt.managedbuild.target.gnu.builder.cygwin.base.412463247" keepEnvironmentInBuildfile="false" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.cygwin.base"/>
<tool id="cdt.managedbuild.tool.gnu.assembler.cygwin.base.996758685" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.cygwin.base">
<option id="gnu.both.asm.option.include.paths.900454792" name="Include paths (-I)" superClass="gnu.both.asm.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${CUDA_PATH}/include/device_launch_parameters.h&quot;"/>
<listOptionValue builtIn="false" value="&quot;${CUDA_PATH}/include/crt/device_functions.h&quot;"/>
<listOptionValue builtIn="false" value="&quot;${CUDA_PATH}/include&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.221302756" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.archiver.cygwin.base.1353653670" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.cygwin.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.cygwin.base.1401626953" name="Cygwin C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.cygwin.base">
<option id="gnu.cpp.compiler.option.include.paths.1909687606" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" useByScannerDiscovery="false" valueType="includePath"/>
<option id="gnu.cpp.compiler.option.include.paths.1909687606" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" useByScannerDiscovery="false" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${CUDA_PATH}/include/device_launch_parameters.h&quot;"/>
<listOptionValue builtIn="false" value="&quot;${CUDA_PATH}/include/device_functions.h&quot;"/>
<listOptionValue builtIn="false" value="&quot;${CUDA_PATH}/include&quot;"/>
</option>
<option id="gnu.cpp.compiler.option.preprocessor.def.1893619952" name="Defined symbols (-D)" superClass="gnu.cpp.compiler.option.preprocessor.def" useByScannerDiscovery="false" valueType="definedSymbols">
<listOptionValue builtIn="false" value="__device__"/>
<listOptionValue builtIn="false" value="__global__"/>
Expand All @@ -42,12 +48,17 @@
<listOptionValue builtIn="false" value="__launch_bounds__(...)"/>
<listOptionValue builtIn="false" value="__align__(...)"/>
<listOptionValue builtIn="false" value="__CUDA_ARCH__=350"/>
<listOptionValue builtIn="false" value="__CUDACC__=1"/>
</option>
<option id="gnu.cpp.compiler.option.dialect.std.49639338" name="Language standard" superClass="gnu.cpp.compiler.option.dialect.std" useByScannerDiscovery="true" value="gnu.cpp.compiler.dialect.default" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.cygwin.1708330939" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input.cygwin"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.c.compiler.cygwin.base.1940954787" name="Cygwin C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.cygwin.base">
<option id="gnu.c.compiler.option.include.paths.1945618846" name="Include paths (-I)" superClass="gnu.c.compiler.option.include.paths" useByScannerDiscovery="false" valueType="includePath"/>
<option id="gnu.c.compiler.option.include.paths.1945618846" name="Include paths (-I)" superClass="gnu.c.compiler.option.include.paths" useByScannerDiscovery="false" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${CUDA_PATH}/include/device_launch_parameters.h&quot;"/>
<listOptionValue builtIn="false" value="&quot;${CUDA_PATH}/include/crt/device_functions.h&quot;"/>
<listOptionValue builtIn="false" value="&quot;${CUDA_PATH}/include&quot;"/>
</option>
<option id="gnu.c.compiler.option.preprocessor.def.symbols.1005509663" name="Defined symbols (-D)" superClass="gnu.c.compiler.option.preprocessor.def.symbols" useByScannerDiscovery="false" valueType="definedSymbols">
<listOptionValue builtIn="false" value="__device__"/>
<listOptionValue builtIn="false" value="__global__"/>
Expand All @@ -60,6 +71,7 @@
<listOptionValue builtIn="false" value="__launch_bounds__(...)"/>
<listOptionValue builtIn="false" value="__align__(...)"/>
<listOptionValue builtIn="false" value="__CUDA_ARCH__=350"/>
<listOptionValue builtIn="false" value="__CUDACC__=1"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.cygwin.469104331" superClass="cdt.managedbuild.tool.gnu.c.compiler.input.cygwin"/>
</tool>
Expand Down
5 changes: 3 additions & 2 deletions CHANGE_LOG.TXT
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
1.7.4 09/19/2017
1.7.4 09/20/2017
- Bug fixes:
- Issue #114: Can't pair non-trivially-constructible values in radix sort
- Issue #114: Can't pair non-trivially-constructible values in radix sort
- Issue #115: WarpReduce segmented reduction broken in CUDA 9 for logical warp sizes < 32

//-----------------------------------------------------------------------------

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<hr>
<h3>About CUB</h3>

Current release: v1.7.4 (09/19/2017)
Current release: v1.7.4 (09/20/2017)

We recommend the [CUB Project Website](http://nvlabs.github.com/cub) and the [cub-users discussion forum](http://groups.google.com/group/cub-users) for further information and examples.

Expand Down
26 changes: 14 additions & 12 deletions cub/warp/specializations/warp_reduce_shfl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,10 @@ struct WarpReduceShfl
// Thread fields
//---------------------------------------------------------------------

int lane_id;

int member_mask;
unsigned int lane_id;

unsigned int member_mask;

//---------------------------------------------------------------------
// Construction
Expand All @@ -126,9 +127,9 @@ struct WarpReduceShfl
:
lane_id(LaneId()),

member_mask(IS_ARCH_WARP ?
0xffffffff :
(0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << (LaneId() / LOGICAL_WARP_THREADS))
member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP) ?
0 : // arch-width subwarps need not be tiled within the arch-warp
((lane_id / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
{}


Expand Down Expand Up @@ -470,22 +471,22 @@ struct WarpReduceShfl
int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp
ReductionOp reduction_op) ///< [in] Binary reduction operator
{
// Get the last thread in the logical warp
int first_warp_thread = 0;
int last_warp_thread = LOGICAL_WARP_THREADS - 1;
// Get the lane of the first and last thread in the logical warp
int first_thread = 0;
int last_thread = LOGICAL_WARP_THREADS - 1;
if (!IS_ARCH_WARP)
{
first_warp_thread = lane_id & (~(LOGICAL_WARP_THREADS - 1));
last_warp_thread |= lane_id;
first_thread = lane_id & (~(LOGICAL_WARP_THREADS - 1));
last_thread |= lane_id;
}

// Common case is FOLDED_ITEMS_PER_LANE = 1 (or a multiple of 32)
int lanes_with_valid_data = (folded_items_per_warp - 1) / FOLDED_ITEMS_PER_LANE;

// Get the last valid lane
int last_lane = (ALL_LANES_VALID) ?
last_warp_thread :
CUB_MIN(last_warp_thread, first_warp_thread + lanes_with_valid_data);
last_thread :
CUB_MIN(last_thread, first_thread + lanes_with_valid_data);

T output = input;

Expand Down Expand Up @@ -516,6 +517,7 @@ struct WarpReduceShfl
// Get the start flags for each thread in the warp.
int warp_flags = WARP_BALLOT(flag, member_mask);

// Convert to tail-segmented
if (HEAD_SEGMENTED)
warp_flags >>= 1;

Expand Down
8 changes: 5 additions & 3 deletions cub/warp/specializations/warp_reduce_smem.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -113,12 +113,14 @@ struct WarpReduceSmem
TempStorage &temp_storage)
:
temp_storage(temp_storage.Alias()),

lane_id(IS_ARCH_WARP ?
LaneId() :
LaneId() % LOGICAL_WARP_THREADS),
member_mask(!IS_POW_OF_TWO ?
(0xffffffff >> (32 - LOGICAL_WARP_THREADS)) : // non-power-of-two subwarps cannot be tiled
(0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << (LaneId() / LOGICAL_WARP_THREADS))

member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ?
0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp
((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
{}

/******************************************************************************
Expand Down
20 changes: 13 additions & 7 deletions cub/warp/specializations/warp_scan_shfl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ namespace cub {

/**
* \brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
*
* LOGICAL_WARP_THREADS must be a power-of-two
*/
template <
typename T, ///< Data type being scanned
Expand Down Expand Up @@ -98,12 +100,11 @@ struct WarpScanShfl
__device__ __forceinline__ WarpScanShfl(
TempStorage &/*temp_storage*/)
:
lane_id(IS_ARCH_WARP ?
LaneId() :
LaneId() % LOGICAL_WARP_THREADS),
member_mask(IS_ARCH_WARP ?
0xffffffff :
(0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << (LaneId() / LOGICAL_WARP_THREADS))
lane_id(LaneId()),

member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP) ?
0 : // arch-width subwarps need not be tiled within the arch-warp
((lane_id / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
{}


Expand Down Expand Up @@ -594,7 +595,12 @@ struct WarpScanShfl
{
inclusive = scan_op(initial_value, inclusive);
exclusive = ShuffleUp(inclusive, 1, 0, member_mask);
if (lane_id == 0)

unsigned int segment_id = (IS_ARCH_WARP) ?
lane_id :
lane_id % LOGICAL_WARP_THREADS;

if (segment_id == 0)
exclusive = initial_value;
}

Expand Down
8 changes: 5 additions & 3 deletions cub/warp/specializations/warp_scan_smem.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,14 @@ struct WarpScanSmem
TempStorage &temp_storage)
:
temp_storage(temp_storage.Alias()),

lane_id(IS_ARCH_WARP ?
LaneId() :
LaneId() % LOGICAL_WARP_THREADS),
member_mask(!IS_POW_OF_TWO ?
(0xffffffff >> (32 - LOGICAL_WARP_THREADS)) : // non-power-of-two subwarps cannot be tiled
(0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << (LaneId() / LOGICAL_WARP_THREADS))

member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ?
0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp
((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
{}


Expand Down
5 changes: 4 additions & 1 deletion test/test_warp_reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -778,7 +778,10 @@ template <int LOGICAL_WARP_THREADS>
void Test()
{
Test<1, LOGICAL_WARP_THREADS>();
Test<2, LOGICAL_WARP_THREADS>();

// Only power-of-two subwarps can be tiled
if ((LOGICAL_WARP_THREADS == 32) || PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE)
Test<2, LOGICAL_WARP_THREADS>();
}


Expand Down

0 comments on commit d622848

Please sign in to comment.