From 090f6b886ad0ebef62ffb0ea25adc42f5b059081 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 4 Apr 2022 17:08:56 -0700 Subject: [PATCH] Revert Thrust 1.16 to Thrust 1.15 (#10586) PR #10489 updated from Thrust 1.15 to Thrust 1.16. However, this appears to be causing conflicts with other repositories -- [cuSpatial](https://github.com/rapidsai/cuspatial/pull/511#issuecomment-1087738627) and cuGraph have reported issues where their builds are finding Thrust 1.16 from libcudf instead of Thrust 1.15, which is [currently pinned by rapids-cmake](https://github.com/rapidsai/rapids-cmake/blob/06a657281cdd83781e49afcdbb39abc491eeab17/rapids-cmake/cpm/versions.json#L26). This PR is intended to unblock local builds and CI builds for other RAPIDS packages until we are able to identify the root cause (which may be due to CMake include path orderings or rapids-cmake). Last time Thrust was updated, [rapids-cmake was updated](https://github.com/rapidsai/rapids-cmake/pull/138) one day before [libcudf was updated](https://github.com/rapidsai/cudf/pull/9912). That may explain why we didn't notice this problem with the 1.15 update. The plan I currently have in mind is: 1. Merge this PR to roll back libcudf to Thrust 1.15 (and revert the patch for Thrust 1.16 [10577](https://github.com/rapidsai/cudf/pull/10577)). This will hopefully unblock CI for cugraph and cuspatial. 2. Try to work out whatever issues with CMake / include paths may exist. 3. Prepare all rapids-cmake repos for Thrust 1.16 compatibility. I've [done this for RMM already](https://github.com/rapidsai/rmm/pull/1011), and I am working on [PR 4675](https://github.com/rapidsai/cuml/pull/4675) to cuML now. I am planning to make the same fixes for `#include`s in cuCollections, raft, cuSpatial, and cuGraph so they will be compatible with Thrust 1.16. 4. Try to upgrade libcudf to Thrust 1.16 again (and re-apply the updated patch). If (2) has been resolved, I hope we won't see any issues in other RAPIDS libraries. 5. 
Upgrade rapids-cmake to Thrust 1.16. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/10586 --- cpp/cmake/thirdparty/get_thrust.cmake | 2 +- cpp/cmake/thrust.patch | 102 +++++++++++++------------- 2 files changed, 50 insertions(+), 54 deletions(-) diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake index 295617c9996..927186d3f49 100644 --- a/cpp/cmake/thirdparty/get_thrust.cmake +++ b/cpp/cmake/thirdparty/get_thrust.cmake @@ -80,6 +80,6 @@ function(find_and_configure_thrust VERSION) endif() endfunction() -set(CUDF_MIN_VERSION_Thrust 1.16.0) +set(CUDF_MIN_VERSION_Thrust 1.15.0) find_and_configure_thrust(${CUDF_MIN_VERSION_Thrust}) diff --git a/cpp/cmake/thrust.patch b/cpp/cmake/thrust.patch index 6f735b955cf..2f9201d8ab4 100644 --- a/cpp/cmake/thrust.patch +++ b/cpp/cmake/thrust.patch @@ -1,39 +1,52 @@ -diff --git a/cub/block/block_merge_sort.cuh b/cub/block/block_merge_sort.cuh -index 4769df36..d86d6342 100644 ---- a/cub/block/block_merge_sort.cuh -+++ b/cub/block/block_merge_sort.cuh -@@ -91,7 +91,7 @@ __device__ __forceinline__ void SerialMerge(KeyT *keys_shared, - KeyT key1 = keys_shared[keys1_beg]; - KeyT key2 = keys_shared[keys2_beg]; +diff --git a/thrust/system/cuda/detail/sort.h b/thrust/system/cuda/detail/sort.h +index 1ffeef0..5e80800 100644 +--- a/thrust/system/cuda/detail/sort.h ++++ b/thrust/system/cuda/detail/sort.h +@@ -108,7 +108,7 @@ namespace __merge_sort { + key_type key2 = keys_shared[keys2_beg]; + -#pragma unroll +#pragma unroll 1 - for (int item = 0; item < ITEMS_PER_THREAD; ++item) - { - bool p = (keys2_beg < keys2_end) && -@@ -383,7 +383,7 @@ public: - // - KeyT max_key = oob_default; - -- #pragma unroll -+ #pragma unroll 1 - for (int item = 1; item < ITEMS_PER_THREAD; ++item) + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + bool p = (keys2_beg < 
keys2_end) && +@@ -311,10 +311,10 @@ namespace __merge_sort { + void stable_odd_even_sort(key_type (&keys)[ITEMS_PER_THREAD], + item_type (&items)[ITEMS_PER_THREAD]) { - if (ITEMS_PER_THREAD * linear_tid + item < valid_items) -@@ -407,7 +407,7 @@ public: - // each thread has sorted keys - // merge sort keys in shared memory - // -- #pragma unroll -+ #pragma unroll 1 - for (int target_merged_threads_number = 2; - target_merged_threads_number <= NUM_THREADS; - target_merged_threads_number *= 2) -diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh -index b188c75f..3f36656f 100644 +-#pragma unroll ++#pragma unroll 1 + for (int i = 0; i < ITEMS_PER_THREAD; ++i) + { +-#pragma unroll ++#pragma unroll 1 + for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) + { + if (compare_op(keys[j + 1], keys[j])) +@@ -350,7 +350,7 @@ namespace __merge_sort { + // each thread has sorted keys_loc + // merge sort keys_loc in shared memory + // +-#pragma unroll ++#pragma unroll 1 + for (int coop = 2; coop <= BLOCK_THREADS; coop *= 2) + { + sync_threadblock(); +@@ -479,7 +479,7 @@ namespace __merge_sort { + // and fill the remainig keys with it + // + key_type max_key = keys_loc[0]; +-#pragma unroll ++#pragma unroll 1 + for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (ITEMS_PER_THREAD * tid + ITEM < num_remaining) +diff a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh +index 41eb1d2..f2893b4 100644 --- a/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/device/dispatch/dispatch_radix_sort.cuh -@@ -736,7 +736,7 @@ struct DeviceRadixSortPolicy +@@ -723,7 +723,7 @@ struct DeviceRadixSortPolicy /// SM60 (GP100) @@ -42,11 +55,11 @@ index b188c75f..3f36656f 100644 { enum { PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 
7 : 5, // 6.9B 32b keys/s (Quadro P100) -diff --git a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh -index e0470ccb..6a0c2ed6 100644 +diff a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh +index f6aee45..dd64301 100644 --- a/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/device/dispatch/dispatch_reduce.cuh -@@ -280,7 +280,7 @@ struct DeviceReducePolicy +@@ -284,7 +284,7 @@ struct DeviceReducePolicy }; /// SM60 @@ -55,11 +68,11 @@ index e0470ccb..6a0c2ed6 100644 { // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items) typedef AgentReducePolicy< -diff --git a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh -index c2d04588..ac2d10e0 100644 +diff a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh +index c0c6d59..937ee31 100644 --- a/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/device/dispatch/dispatch_scan.cuh -@@ -177,7 +177,7 @@ struct DeviceScanPolicy +@@ -178,7 +178,7 @@ struct DeviceScanPolicy }; /// SM600 @@ -68,20 +81,3 @@ index c2d04588..ac2d10e0 100644 { typedef AgentScanPolicy< 128, 15, ///< Threads per block, items per thread -diff --git a/cub/thread/thread_sort.cuh b/cub/thread/thread_sort.cuh -index 5d486789..b42fb5f0 100644 ---- a/cub/thread/thread_sort.cuh -+++ b/cub/thread/thread_sort.cuh -@@ -83,10 +83,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], - { - constexpr bool KEYS_ONLY = std::is_same::value; - -- #pragma unroll -+ #pragma unroll 1 - for (int i = 0; i < ITEMS_PER_THREAD; ++i) - { -- #pragma unroll -+ #pragma unroll 1 - for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) - { - if (compare_op(keys[j + 1], keys[j]))