Skip to content

Commit

Permalink
feat: use latest rust-gpu-tools
Browse files Browse the repository at this point in the history
rust-gpu-tools had a few breaking changes, most notably:

When creating kernels, the way the global work size is specified has changed.
It is no longer the total number of threads, but the number of
local-work-size-sized groups of threads.
  • Loading branch information
vmx committed Jul 9, 2021
1 parent 5196224 commit 8f133be
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 9 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ paired = { version = "0.22.0", optional = true }

# gpu feature
#rust-gpu-tools = { version = "0.3.0", optional = true }
rust-gpu-tools = { git = "https://github.com/filecoin-project/rust-gpu-tools", rev = "2827a11196dd638c9afe5aeb99f6425c0d1a7670", optional = true }
rust-gpu-tools = { git = "https://github.com/filecoin-project/rust-gpu-tools", branch = "master", default-features = false, optional = true }
ff-cl-gen = { version = "0.3.0", optional = true }
fs2 = { version = "0.4.3", optional = true }

Expand Down
4 changes: 2 additions & 2 deletions src/gpu/fft.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ where
// Select the first device for FFT
let device = devices[0];

let src = sources::kernel::<E>(device.brand() == opencl::Brand::Nvidia);
let src = sources::kernel::<E>(device.vendor() == opencl::Vendor::Nvidia);

let program = opencl::Program::from_opencl(&device, &src)?;
let pq_buffer = program.create_buffer::<E::Fr>(1 << MAX_LOG2_RADIX >> 1)?;
Expand Down Expand Up @@ -77,7 +77,7 @@ where

let n = 1u32 << log_n;
let local_work_size = 1 << cmp::min(deg - 1, MAX_LOG2_LOCAL_WORK_SIZE);
let global_work_size = (n >> deg) * local_work_size;
let global_work_size = n >> deg;
let kernel = self.program.create_kernel(
"radix_fft",
global_work_size as usize,
Expand Down
11 changes: 5 additions & 6 deletions src/gpu/multiexp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ where
E: Engine,
{
pub fn create(d: opencl::Device, priority: bool) -> GPUResult<SingleMultiexpKernel<E>> {
let src = sources::kernel::<E>(d.brand() == opencl::Brand::Nvidia);
let src = sources::kernel::<E>(d.vendor() == opencl::Vendor::Nvidia);

let exp_bits = exp_size::<E>() * 8;
let core_count = utils::get_core_count(&d);
Expand Down Expand Up @@ -156,10 +156,9 @@ where
.program
.create_buffer::<<G as CurveAffine>::Projective>(2 * self.core_count)?;

// Make global work size divisible by `LOCAL_WORK_SIZE`
let mut global_work_size = num_windows * num_groups;
global_work_size +=
(LOCAL_WORK_SIZE - (global_work_size % LOCAL_WORK_SIZE)) % LOCAL_WORK_SIZE;
// The global work size follows CUDA's definition and is the number of `LOCAL_WORK_SIZE`
// sized thread groups.
let global_work_size = (num_windows * num_groups + LOCAL_WORK_SIZE - 1) / LOCAL_WORK_SIZE;

let kernel = self.program.create_kernel(
if TypeId::of::<G>() == TypeId::of::<E::G1Affine>() {
Expand All @@ -184,7 +183,7 @@ where
.arg(&(window_size as u32))
.run()?;

let mut results = vec![<G as CurveAffine>::Projective::zero(); num_groups * num_windows];
let mut results = vec![<G as CurveAffine>::Projective::zero(); 2 * self.core_count];
self.program
.read_into_buffer(&result_buffer, 0, &mut results)?;

Expand Down

0 comments on commit 8f133be

Please sign in to comment.