From 8ee9b2eee585d2281cc9d9555b96c0c7c9a38a10 Mon Sep 17 00:00:00 2001 From: nonam3e Date: Mon, 27 May 2024 14:47:46 +0000 Subject: [PATCH 01/14] bit reverse --- icicle/include/api/babybear.h | 11 ++ icicle/include/api/bls12_377.h | 11 ++ icicle/include/api/bls12_381.h | 11 ++ icicle/include/api/bn254.h | 11 ++ icicle/include/api/bw6_761.h | 11 ++ icicle/include/api/grumpkin.h | 11 ++ icicle/include/api/stark252.h | 11 ++ icicle/include/vec_ops/vec_ops.cuh | 19 +++ icicle/src/vec_ops/extern.cu | 9 ++ icicle/src/vec_ops/vec_ops.cu | 86 ++++++++++++ wrappers/rust/icicle-core/src/vec_ops/mod.rs | 123 ++++++++++++++++++ .../rust/icicle-core/src/vec_ops/tests.rs | 43 +++++- .../icicle-bls12-377/src/vec_ops/mod.rs | 2 +- .../icicle-bls12-381/src/vec_ops/mod.rs | 2 +- .../icicle-bn254/src/vec_ops/mod.rs | 2 +- .../icicle-grumpkin/src/vec_ops/mod.rs | 2 +- .../icicle-babybear/src/vec_ops/mod.rs | 54 +++++++- .../icicle-stark252/src/vec_ops/mod.rs | 2 +- 18 files changed, 413 insertions(+), 8 deletions(-) diff --git a/icicle/include/api/babybear.h b/icicle/include/api/babybear.h index ff2cf851e..50102ba58 100644 --- a/icicle/include/api/babybear.h +++ b/icicle/include/api/babybear.h @@ -68,6 +68,17 @@ extern "C" cudaError_t babybear_transpose_matrix_cuda( bool on_device, bool is_async); +extern "C" cudaError_t babybear_bit_reverse_cuda( + const babybear::scalar_t* input, + unsigned n, + vec_ops::BitReverseConfig& config, + babybear::scalar_t* output); + +extern "C" cudaError_t babybear_bit_reverse_inplace_cuda( + babybear::scalar_t* input, + unsigned n, + vec_ops::BitReverseConfig& config); + extern "C" void babybear_generate_scalars(babybear::scalar_t* scalars, int size); extern "C" cudaError_t babybear_scalar_convert_montgomery( diff --git a/icicle/include/api/bls12_377.h b/icicle/include/api/bls12_377.h index 800dc6810..e83c9656a 100644 --- a/icicle/include/api/bls12_377.h +++ b/icicle/include/api/bls12_377.h @@ -116,6 +116,17 @@ extern "C" cudaError_t bls12_377_transpose_matrix_cuda( bool on_device, bool is_async); +extern "C" cudaError_t bls12_377_bit_reverse_cuda( + const bls12_377::scalar_t* input, + unsigned n, + vec_ops::BitReverseConfig& config, + bls12_377::scalar_t* output); + +extern "C" cudaError_t bls12_377_bit_reverse_inplace_cuda( + bls12_377::scalar_t* input, + unsigned n, + vec_ops::BitReverseConfig& config); + extern "C" void bls12_377_generate_scalars(bls12_377::scalar_t* scalars, int size); extern "C" cudaError_t bls12_377_scalar_convert_montgomery( diff --git a/icicle/include/api/bls12_381.h b/icicle/include/api/bls12_381.h index f96c886c5..bcede3914 100644 --- a/icicle/include/api/bls12_381.h +++ b/icicle/include/api/bls12_381.h @@ -116,6 +116,17 @@ extern "C" cudaError_t bls12_381_transpose_matrix_cuda( bool on_device, bool is_async); +extern "C" cudaError_t bls12_381_bit_reverse_cuda( + const bls12_381::scalar_t* input, + unsigned n, + vec_ops::BitReverseConfig& config, + bls12_381::scalar_t* output); + +extern "C" cudaError_t bls12_381_bit_reverse_inplace_cuda( + bls12_381::scalar_t* input, + unsigned n, + vec_ops::BitReverseConfig& config); + extern "C" void bls12_381_generate_scalars(bls12_381::scalar_t* scalars, int size); extern "C" cudaError_t bls12_381_scalar_convert_montgomery( diff --git a/icicle/include/api/bn254.h b/icicle/include/api/bn254.h index 7c8a81b87..8753ea35a 100644 --- a/icicle/include/api/bn254.h +++ b/icicle/include/api/bn254.h @@ -148,6 +148,17 @@ extern "C" cudaError_t bn254_transpose_matrix_cuda( bool on_device, bool is_async); +extern "C" cudaError_t bn254_bit_reverse_cuda( + const bn254::scalar_t* input, + unsigned n, + vec_ops::BitReverseConfig& config, + bn254::scalar_t* output); + +extern "C" cudaError_t bn254_bit_reverse_inplace_cuda( + bn254::scalar_t* input, + unsigned n, + vec_ops::BitReverseConfig& config); + extern "C" void bn254_generate_scalars(bn254::scalar_t* scalars, int size); extern "C" cudaError_t bn254_scalar_convert_montgomery( diff --git a/icicle/include/api/bw6_761.h b/icicle/include/api/bw6_761.h index d10d93d7a..7ebb5e9f1 100644 --- a/icicle/include/api/bw6_761.h +++ b/icicle/include/api/bw6_761.h @@ -116,6 +116,17 @@ extern "C" cudaError_t bw6_761_transpose_matrix_cuda( bool on_device, bool is_async); +extern "C" cudaError_t bw6_761_bit_reverse_cuda( + const bw6_761::scalar_t* input, + unsigned n, + vec_ops::BitReverseConfig& config, + bw6_761::scalar_t* output); + +extern "C" cudaError_t bw6_761_bit_reverse_inplace_cuda( + bw6_761::scalar_t* input, + unsigned n, + vec_ops::BitReverseConfig& config); + extern "C" void bw6_761_generate_scalars(bw6_761::scalar_t* scalars, int size); extern "C" cudaError_t bw6_761_scalar_convert_montgomery( diff --git a/icicle/include/api/grumpkin.h b/icicle/include/api/grumpkin.h index 821bed6af..95d02ddd3 100644 --- a/icicle/include/api/grumpkin.h +++ b/icicle/include/api/grumpkin.h @@ -86,6 +86,17 @@ extern "C" cudaError_t grumpkin_transpose_matrix_cuda( bool on_device, bool is_async); +extern "C" cudaError_t grumpkin_bit_reverse_cuda( + const grumpkin::scalar_t* input, + unsigned n, + vec_ops::BitReverseConfig& config, + grumpkin::scalar_t* output); + +extern "C" cudaError_t grumpkin_bit_reverse_inplace_cuda( + grumpkin::scalar_t* input, + unsigned n, + vec_ops::BitReverseConfig& config); + extern "C" void grumpkin_generate_scalars(grumpkin::scalar_t* scalars, int size); extern "C" cudaError_t grumpkin_scalar_convert_montgomery( diff --git a/icicle/include/api/stark252.h b/icicle/include/api/stark252.h index ff9671241..2b4acc334 100644 --- a/icicle/include/api/stark252.h +++ b/icicle/include/api/stark252.h @@ -31,6 +31,17 @@ extern "C" cudaError_t stark252_transpose_matrix_cuda( bool on_device, bool is_async); +extern "C" cudaError_t stark252_bit_reverse_cuda( + const stark252::scalar_t* input, + unsigned n, + vec_ops::BitReverseConfig& config, + stark252::scalar_t* output); + +extern "C" cudaError_t stark252_bit_reverse_inplace_cuda( + stark252::scalar_t* input, + unsigned n, + vec_ops::BitReverseConfig& config); + extern "C" void stark252_generate_scalars(stark252::scalar_t* scalars, int size); extern "C" cudaError_t stark252_scalar_convert_montgomery( diff --git a/icicle/include/vec_ops/vec_ops.cuh b/icicle/include/vec_ops/vec_ops.cuh index f52ab686c..3135cba0b 100644 --- a/icicle/include/vec_ops/vec_ops.cuh +++ b/icicle/include/vec_ops/vec_ops.cuh @@ -113,6 +113,25 @@ namespace vec_ops { device_context::DeviceContext& ctx, bool on_device, bool is_async); + + struct BitReverseConfig { + device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream. */ + bool is_input_on_device; /**< True if `input` is on device and false if it is not. Default value: false. */ + bool is_output_on_device; /**< True if `output` is on device and false if it is not. Default value: false. */ + bool is_async; /**< Whether to run the vector operations asynchronously. If set to `true`, the function will be + * non-blocking and you'd need to synchronize it explicitly by running + * `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the + * function will block the current CPU thread. */ + }; + static BitReverseConfig DefaultBitReverseConfig(const device_context::DeviceContext& ctx = device_context::get_default_device_context()) { + BitReverseConfig config = { + ctx, // ctx + false, // is_input_on_device + false, // is_output_on_device + false, // is_async + }; + return config; + } } // namespace vec_ops #endif diff --git a/icicle/src/vec_ops/extern.cu b/icicle/src/vec_ops/extern.cu index 00be401a2..c1790f442 100644 --- a/icicle/src/vec_ops/extern.cu +++ b/icicle/src/vec_ops/extern.cu @@ -59,4 +59,13 @@ namespace vec_ops { { return transpose_matrix(input, output, row_size, column_size, ctx, on_device, is_async); } + + extern "C" cudaError_t CONCAT_EXPAND(FIELD, bit_reverse_cuda)(const scalar_t* input, unsigned n, BitReverseConfig& config, scalar_t* output) + { + return bit_reverse(input, n, config, output); + } + extern "C" cudaError_t CONCAT_EXPAND(FIELD, bit_reverse_inplace_cuda)(scalar_t* input, unsigned n, BitReverseConfig& config) + { + return bit_reverse_inplace(input, n, config); + } } // namespace vec_ops \ No newline at end of file diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu index d87dc2a89..ed32aa926 100644 --- a/icicle/src/vec_ops/vec_ops.cu +++ b/icicle/src/vec_ops/vec_ops.cu @@ -54,6 +54,29 @@ namespace vec_ops { if (tid >= row_size * column_size) return; out[(tid % row_size) * column_size + (tid / row_size)] = in[tid]; } + + template + __global__ void bit_reverse_kernel(const E* input, unsigned n, unsigned shift, E* output) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + // Handling arbitrary vector size + if (tid < n) { + int reversed_index = __brev(tid) >> shift; + output[reversed_index] = input[tid]; + } + } + template + __global__ void bit_reverse_inplace_kernel(E* input, unsigned n, unsigned shift) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + // Handling arbitrary vector size + if (tid < n) { + int reversed_index = __brev(tid) >> shift; + if(reversed_index > tid) { + E temp = input[tid]; + input[tid] = input[reversed_index]; + input[reversed_index] = temp; + } + } + } } // namespace template @@ -164,4 +187,67 @@ namespace vec_ops { return CHK_LAST(); } + + template + cudaError_t bit_reverse_inplace(E* input, unsigned size, BitReverseConfig& cfg) { + if (size & (size - 1)) + THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: size must be a power of 2"); + E* d_input; + if (cfg.is_input_on_device) { + d_input = input; + } + else { + // copy input to gpu + CHK_IF_RETURN(cudaMallocAsync(&d_input, sizeof(E) * size, cfg.ctx.stream)); + CHK_IF_RETURN(cudaMemcpyAsync(d_input, input, sizeof(E) * size, cudaMemcpyHostToDevice, cfg.ctx.stream)); + } + unsigned shift = __builtin_clz(size) + 1; + unsigned num_blocks = (size + MAX_THREADS_PER_BLOCK - 1) / MAX_THREADS_PER_BLOCK; + bit_reverse_inplace_kernel<<>>(d_input, size, shift); + if (!cfg.is_input_on_device) { + CHK_IF_RETURN(cudaMemcpyAsync(input, d_input, sizeof(E) * size, cudaMemcpyDeviceToHost, cfg.ctx.stream)); + CHK_IF_RETURN(cudaFreeAsync(d_input, cfg.ctx.stream)); + } + if (!cfg.is_async) + CHK_IF_RETURN(cudaStreamSynchronize(cfg.ctx.stream)); + return CHK_LAST(); + } + + template + cudaError_t bit_reverse(const E* input, unsigned size, BitReverseConfig& cfg, E* output) { + if (size & (size - 1)) + THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: size must be a power of 2"); + const E* d_input; + E* d_alloc_input; + if (cfg.is_input_on_device) { + d_input = input; + } + else { + // copy input to gpu + CHK_IF_RETURN(cudaMallocAsync(&d_alloc_input, sizeof(E) * size, cfg.ctx.stream)); + CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_input, input, sizeof(E) * size, cudaMemcpyHostToDevice, cfg.ctx.stream)); + d_input = d_alloc_input; + } + E* d_output; + if (cfg.is_output_on_device) { + d_output = output; + } + else { + // allocate output on gpu + CHK_IF_RETURN(cudaMallocAsync(&d_output, sizeof(E) * size, cfg.ctx.stream)); + } + unsigned shift = __builtin_clz(size) + 1; + unsigned num_blocks = (size + MAX_THREADS_PER_BLOCK - 1) / MAX_THREADS_PER_BLOCK; + bit_reverse_kernel<<>>(d_input, size, shift, d_output); + if (!cfg.is_input_on_device) { + CHK_IF_RETURN(cudaFreeAsync(d_alloc_input, cfg.ctx.stream)); + } + if (!cfg.is_output_on_device) { + CHK_IF_RETURN(cudaMemcpyAsync(output, d_output, sizeof(E) * size, cudaMemcpyDeviceToHost, cfg.ctx.stream)); + CHK_IF_RETURN(cudaFreeAsync(d_output, cfg.ctx.stream)); + } + if (!cfg.is_async) + CHK_IF_RETURN(cudaStreamSynchronize(cfg.ctx.stream)); + return CHK_LAST(); + } } // namespace vec_ops \ No newline at end of file diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs index 5cba18424..78d38c16a 100644 --- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs @@ -40,6 +40,40 @@ impl<'a> VecOpsConfig<'a> { } } +#[repr(C)] +#[derive(Debug, Clone)] +pub struct BitReverseConfig<'a> { + /// Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext). + pub ctx: DeviceContext<'a>, + + /// True if inputs are on device and false if they're on host. Default value: false. + pub are_inputs_on_device: bool, + + /// If true, output is preserved on device, otherwise on host. Default value: false. + pub are_outputs_on_device: bool, + + /// Whether to run the vector operations asynchronously. If set to `true`, the functions will be non-blocking and you'd need to synchronize + /// it explicitly by running `stream.synchronize()`. If set to false, the functions will block the current CPU thread. + pub is_async: bool, +} + +impl<'a> Default for BitReverseConfig<'a> { + fn default() -> Self { + Self::default_for_device(DEFAULT_DEVICE_ID) + } +} + +impl<'a> BitReverseConfig<'a> { + pub fn default_for_device(device_id: usize) -> Self { + BitReverseConfig { + ctx: DeviceContext::default_for_device(device_id), + are_inputs_on_device: false, + are_outputs_on_device: false, + is_async: false, + } + } +} + #[doc(hidden)] pub trait VecOps { fn add( @@ -72,6 +106,17 @@ pub trait VecOps { on_device: bool, is_async: bool, ) -> IcicleResult<()>; + + fn bit_reverse( + input: &(impl HostOrDeviceSlice + ?Sized), + cfg: &BitReverseConfig, + output: &mut (impl HostOrDeviceSlice + ?Sized), + ) -> IcicleResult<()>; + + fn bit_reverse_inplace( + input: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &BitReverseConfig, + ) -> IcicleResult<()>; } fn check_vec_ops_args<'a, F>( @@ -170,6 +215,29 @@ where <::Config as VecOps>::transpose(input, row_size, column_size, output, ctx, on_device, is_async) } +pub fn bit_reverse( + input: &(impl HostOrDeviceSlice + ?Sized), + cfg: &BitReverseConfig, + output: &mut (impl HostOrDeviceSlice + ?Sized), +) -> IcicleResult<()> +where + F: FieldImpl, + ::Config: VecOps, +{ + <::Config as VecOps>::bit_reverse(input, cfg, output) +} + +pub fn bit_reverse_inplace( + input: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &BitReverseConfig, +) -> IcicleResult<()> +where + F: FieldImpl, + ::Config: VecOps, +{ + <::Config as VecOps>::bit_reverse_inplace(input, cfg) +} + #[macro_export] macro_rules! impl_vec_ops_field { ( @@ -181,6 +249,7 @@ macro_rules! impl_vec_ops_field { mod $field_prefix_ident { use crate::vec_ops::{$field, CudaError, DeviceContext, HostOrDeviceSlice}; use icicle_core::vec_ops::VecOpsConfig; + use icicle_core::vec_ops::BitReverseConfig; extern "C" { #[link_name = concat!($field_prefix, "_add_cuda")] @@ -220,6 +289,21 @@ macro_rules! impl_vec_ops_field { on_device: bool, is_async: bool, ) -> CudaError; + + #[link_name = concat!($field_prefix, "_bit_reverse_cuda")] + pub(crate) fn bit_reverse_cuda( + input: *const $field, + size: u32, + config: *const BitReverseConfig, + output: *mut $field, + ) -> CudaError; + + #[link_name = concat!($field_prefix, "_bit_reverse_inplace_cuda")] + pub(crate) fn bit_reverse_inplace_cuda( + input: *mut $field, + size: u32, + config: *const BitReverseConfig, + ) -> CudaError; } } @@ -300,6 +384,36 @@ macro_rules! impl_vec_ops_field { .wrap() } } + + fn bit_reverse( + input: &(impl HostOrDeviceSlice<$field> + ?Sized), + cfg: &BitReverseConfig, + output: &mut (impl HostOrDeviceSlice<$field> + ?Sized), + ) -> IcicleResult<()> { + unsafe { + $field_prefix_ident::bit_reverse_cuda( + input.as_ptr(), + input.len() as u32, + cfg as *const BitReverseConfig, + output.as_mut_ptr(), + ) + .wrap() + } + } + + fn bit_reverse_inplace( + input: &mut (impl HostOrDeviceSlice<$field> + ?Sized), + cfg: &BitReverseConfig, + ) -> IcicleResult<()> { + unsafe { + $field_prefix_ident::bit_reverse_inplace_cuda( + input.as_mut_ptr(), + input.len() as u32, + cfg as *const BitReverseConfig, + ) + .wrap() + } + } } }; } @@ -313,5 +427,14 @@ macro_rules! impl_vec_add_tests { pub fn test_vec_add_scalars() { check_vec_ops_scalars::<$field>() } + + #[test] + pub fn test_bit_reverse() { + check_bit_reverse::<$field>() + } + #[test] + pub fn test_bit_reverse_inplace() { + check_bit_reverse_inplace::<$field>() + } }; } diff --git a/wrappers/rust/icicle-core/src/vec_ops/tests.rs b/wrappers/rust/icicle-core/src/vec_ops/tests.rs index 41ac263b0..74a4a416a 100644 --- a/wrappers/rust/icicle-core/src/vec_ops/tests.rs +++ b/wrappers/rust/icicle-core/src/vec_ops/tests.rs @@ -1,6 +1,6 @@ use crate::traits::GenerateRandom; -use crate::vec_ops::{add_scalars, mul_scalars, sub_scalars, FieldImpl, VecOps, VecOpsConfig}; -use icicle_cuda_runtime::memory::HostSlice; +use crate::vec_ops::{add_scalars, bit_reverse, bit_reverse_inplace, mul_scalars, sub_scalars, BitReverseConfig, FieldImpl, VecOps, VecOpsConfig}; +use icicle_cuda_runtime::memory::{DeviceVec, HostSlice}; pub fn check_vec_ops_scalars() where @@ -32,3 +32,42 @@ where assert_eq!(a[0], result3[0]); } + +pub fn check_bit_reverse() +where + ::Config: VecOps + GenerateRandom, +{ + const TEST_SIZE: usize = 1 << 20; + let input = F::Config::generate_random(TEST_SIZE); + let input = HostSlice::from_slice(&input); + let mut intermediate_result = DeviceVec::::cuda_malloc(TEST_SIZE).unwrap(); + let mut cfg = BitReverseConfig::default(); + cfg.are_outputs_on_device = true; + bit_reverse(input, &cfg, &mut intermediate_result[..]).unwrap(); + + let mut result = vec![F::one(); TEST_SIZE]; + let result = HostSlice::from_mut_slice(&mut result); + let mut cfg = BitReverseConfig::default(); + cfg.are_outputs_on_device = false; + cfg.are_inputs_on_device = true; + bit_reverse(&intermediate_result[..], &cfg, result).unwrap(); + assert_eq!(input.as_slice(), result.as_slice()); +} + +pub fn check_bit_reverse_inplace() +where + ::Config: VecOps + GenerateRandom, +{ + const TEST_SIZE: usize = 1 << 20; + let input = F::Config::generate_random(TEST_SIZE); + let input = HostSlice::from_slice(&input); + let mut intermediate = DeviceVec::::cuda_malloc(TEST_SIZE).unwrap(); + intermediate.copy_from_host(&input).unwrap(); + let mut cfg = BitReverseConfig::default(); + cfg.are_inputs_on_device = true; + bit_reverse_inplace(&mut intermediate[..], &cfg).unwrap(); + bit_reverse_inplace(&mut intermediate[..], &cfg).unwrap(); + let mut result_host = vec![F::one(); TEST_SIZE]; + intermediate.copy_to_host(HostSlice::from_mut_slice(&mut result_host[..])).unwrap(); + assert_eq!(input.as_slice(), result_host.as_slice()); +} \ No newline at end of file diff --git a/wrappers/rust/icicle-curves/icicle-bls12-377/src/vec_ops/mod.rs b/wrappers/rust/icicle-curves/icicle-bls12-377/src/vec_ops/mod.rs index e41b3f17c..8d01706ab 100644 --- a/wrappers/rust/icicle-curves/icicle-bls12-377/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-curves/icicle-bls12-377/src/vec_ops/mod.rs @@ -5,7 +5,7 @@ use crate::curve::{ScalarCfg, ScalarField}; use icicle_core::error::IcicleResult; use icicle_core::impl_vec_ops_field; use icicle_core::traits::IcicleResultWrap; -use icicle_core::vec_ops::{VecOps, VecOpsConfig}; +use icicle_core::vec_ops::{VecOps, VecOpsConfig, BitReverseConfig}; use icicle_cuda_runtime::device_context::DeviceContext; use icicle_cuda_runtime::error::CudaError; use icicle_cuda_runtime::memory::HostOrDeviceSlice; diff --git a/wrappers/rust/icicle-curves/icicle-bls12-381/src/vec_ops/mod.rs b/wrappers/rust/icicle-curves/icicle-bls12-381/src/vec_ops/mod.rs index 01cff3db7..f543eb263 100644 --- a/wrappers/rust/icicle-curves/icicle-bls12-381/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-curves/icicle-bls12-381/src/vec_ops/mod.rs @@ -3,7 +3,7 @@ use crate::curve::{ScalarCfg, ScalarField}; use icicle_core::error::IcicleResult; use icicle_core::impl_vec_ops_field; use icicle_core::traits::IcicleResultWrap; -use icicle_core::vec_ops::{VecOps, VecOpsConfig}; +use icicle_core::vec_ops::{VecOps, VecOpsConfig, BitReverseConfig}; use icicle_cuda_runtime::device_context::DeviceContext; use icicle_cuda_runtime::error::CudaError; use icicle_cuda_runtime::memory::HostOrDeviceSlice; diff --git a/wrappers/rust/icicle-curves/icicle-bn254/src/vec_ops/mod.rs b/wrappers/rust/icicle-curves/icicle-bn254/src/vec_ops/mod.rs index 9447aa8dc..ad4eaadb2 100644 --- a/wrappers/rust/icicle-curves/icicle-bn254/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-curves/icicle-bn254/src/vec_ops/mod.rs @@ -3,7 +3,7 @@ use crate::curve::{ScalarCfg, ScalarField}; use icicle_core::error::IcicleResult; use icicle_core::impl_vec_ops_field; use icicle_core::traits::IcicleResultWrap; -use icicle_core::vec_ops::{VecOps, VecOpsConfig}; +use icicle_core::vec_ops::{VecOps, VecOpsConfig, BitReverseConfig}; use icicle_cuda_runtime::device_context::DeviceContext; use icicle_cuda_runtime::error::CudaError; use icicle_cuda_runtime::memory::HostOrDeviceSlice; diff --git a/wrappers/rust/icicle-curves/icicle-grumpkin/src/vec_ops/mod.rs b/wrappers/rust/icicle-curves/icicle-grumpkin/src/vec_ops/mod.rs index c8954d864..7b9a42eb5 100644 --- a/wrappers/rust/icicle-curves/icicle-grumpkin/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-curves/icicle-grumpkin/src/vec_ops/mod.rs @@ -3,7 +3,7 @@ use crate::curve::{ScalarCfg, ScalarField}; use icicle_core::error::IcicleResult; use icicle_core::impl_vec_ops_field; use icicle_core::traits::IcicleResultWrap; -use icicle_core::vec_ops::{VecOps, VecOpsConfig}; +use icicle_core::vec_ops::{VecOps, VecOpsConfig, BitReverseConfig}; use icicle_cuda_runtime::device_context::DeviceContext; use icicle_cuda_runtime::error::CudaError; use icicle_cuda_runtime::memory::HostOrDeviceSlice; diff --git a/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs b/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs index 8df5fbfbf..a98ad7b40 100644 --- a/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs @@ -3,7 +3,7 @@ use crate::field::{ExtensionCfg, ExtensionField, ScalarCfg, ScalarField}; use icicle_core::error::IcicleResult; use icicle_core::impl_vec_ops_field; use icicle_core::traits::IcicleResultWrap; -use icicle_core::vec_ops::{VecOps, VecOpsConfig}; +use icicle_core::vec_ops::{VecOps, VecOpsConfig, BitReverseConfig}; use icicle_cuda_runtime::device_context::DeviceContext; use icicle_cuda_runtime::error::CudaError; use icicle_cuda_runtime::memory::HostOrDeviceSlice; @@ -11,6 +11,57 @@ use icicle_cuda_runtime::memory::HostOrDeviceSlice; impl_vec_ops_field!("babybear", babybear, ScalarField, ScalarCfg); impl_vec_ops_field!("babybear_extension", babybear_extension, ExtensionField, ExtensionCfg); +// #[repr(C)] +// #[derive(Debug, Clone)] +// pub struct BitReverseConfig<'a> { +// /// Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext). +// pub ctx: DeviceContext<'a>, + +// /// True if inputs are on device and false if they're on host. Default value: false. +// pub are_inputs_on_device: bool, + +// /// If true, output is preserved on device, otherwise on host. Default value: false. +// pub are_outputs_on_device: bool, + +// /// Whether to run the Keccak asynchronously. If set to `true`, the keccak_hash function will be +// /// non-blocking and you'd need to synchronize it explicitly by running +// /// `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, keccak_hash +// /// function will block the current CPU thread. +// pub is_async: bool, +// } + +// impl<'a> Default for BitReverseConfig<'a> { +// fn default() -> Self { +// Self::default_for_device(DEFAULT_DEVICE_ID) +// } +// } + +// impl<'a> BitReverseConfig<'a> { +// pub fn default_for_device(device_id: usize) -> Self { +// BitReverseConfig { +// ctx: DeviceContext::default_for_device(device_id), +// are_inputs_on_device: false, +// are_outputs_on_device: false, +// is_async: false, +// } +// } +// } + +extern "C" { + pub(crate) fn babybear_bit_reverse_cuda( + input: *const ScalarField, + size: u32, + config: &BitReverseConfig, + output: *mut ScalarField + ) -> CudaError; + + pub(crate) fn babybear_bit_reverse_inplace_cuda( + input: *mut ScalarField, + size: u32, + config: &BitReverseConfig + ) -> CudaError; +} + #[cfg(test)] pub(crate) mod tests { use crate::field::{ExtensionField, ScalarField}; @@ -18,6 +69,7 @@ pub(crate) mod tests { use icicle_core::vec_ops::tests::*; impl_vec_add_tests!(ScalarField); + mod extension { use super::*; diff --git a/wrappers/rust/icicle-fields/icicle-stark252/src/vec_ops/mod.rs b/wrappers/rust/icicle-fields/icicle-stark252/src/vec_ops/mod.rs index 8e7e88465..1d431ab8f 100644 --- a/wrappers/rust/icicle-fields/icicle-stark252/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-fields/icicle-stark252/src/vec_ops/mod.rs @@ -3,7 +3,7 @@ use crate::field::{ScalarCfg, ScalarField}; use icicle_core::error::IcicleResult; use icicle_core::impl_vec_ops_field; use icicle_core::traits::IcicleResultWrap; -use icicle_core::vec_ops::{VecOps, VecOpsConfig}; +use icicle_core::vec_ops::{VecOps, VecOpsConfig, BitReverseConfig}; use icicle_cuda_runtime::device_context::DeviceContext; use icicle_cuda_runtime::error::CudaError; use icicle_cuda_runtime::memory::HostOrDeviceSlice; From 48d91145a7042ddbbb25ade01777ac493be34aba Mon Sep 17 00:00:00 2001 From: nonam3e Date: Mon, 27 May 2024 14:59:57 +0000 Subject: [PATCH 02/14] cleanup --- icicle/include/vec_ops/vec_ops.cuh | 32 ++++++++------- icicle/src/vec_ops/extern.cu | 12 +++--- icicle/src/vec_ops/vec_ops.cu | 39 ++++++++----------- wrappers/rust/icicle-core/src/vec_ops/mod.rs | 2 +- .../rust/icicle-core/src/vec_ops/tests.rs | 17 +++++--- .../icicle-bls12-377/src/vec_ops/mod.rs | 2 +- .../icicle-bls12-381/src/vec_ops/mod.rs | 2 +- .../icicle-bn254/src/vec_ops/mod.rs | 2 +- .../icicle-grumpkin/src/vec_ops/mod.rs | 2 +- .../icicle-babybear/src/vec_ops/mod.rs | 6 +-- .../icicle-stark252/src/vec_ops/mod.rs | 2 +- 11 files changed, 62 insertions(+), 56 deletions(-) diff --git a/icicle/include/vec_ops/vec_ops.cuh b/icicle/include/vec_ops/vec_ops.cuh index 3135cba0b..909c6886b 100644 --- a/icicle/include/vec_ops/vec_ops.cuh +++ b/icicle/include/vec_ops/vec_ops.cuh @@ -115,22 +115,24 @@ namespace vec_ops { bool is_async); struct BitReverseConfig { - device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream. */ - bool is_input_on_device; /**< True if `input` is on device and false if it is not. Default value: false. */ - bool is_output_on_device; /**< True if `output` is on device and false if it is not. Default value: false. */ - bool is_async; /**< Whether to run the vector operations asynchronously. If set to `true`, the function will be - * non-blocking and you'd need to synchronize it explicitly by running - * `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the - * function will block the current CPU thread. */ + device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream. */ + bool is_input_on_device; /**< True if `input` is on device and false if it is not. Default value: false. */ + bool is_output_on_device; /**< True if `output` is on device and false if it is not. Default value: false. */ + bool is_async; /**< Whether to run the vector operations asynchronously. If set to `true`, the function will be + * non-blocking and you'd need to synchronize it explicitly by running + * `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the + * function will block the current CPU thread. */ }; - static BitReverseConfig DefaultBitReverseConfig(const device_context::DeviceContext& ctx = device_context::get_default_device_context()) { - BitReverseConfig config = { - ctx, // ctx - false, // is_input_on_device - false, // is_output_on_device - false, // is_async - }; - return config; + static BitReverseConfig + DefaultBitReverseConfig(const device_context::DeviceContext& ctx = device_context::get_default_device_context()) + { + BitReverseConfig config = { + ctx, // ctx + false, // is_input_on_device + false, // is_output_on_device + false, // is_async + }; + return config; } } // namespace vec_ops diff --git a/icicle/src/vec_ops/extern.cu b/icicle/src/vec_ops/extern.cu index c1790f442..7c9079a46 100644 --- a/icicle/src/vec_ops/extern.cu +++ b/icicle/src/vec_ops/extern.cu @@ -13,7 +13,7 @@ namespace vec_ops { * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ extern "C" cudaError_t - CONCAT_EXPAND(FIELD, mul_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result) + CONCAT_EXPAND(FIELD, mul_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result) { return mul(vec_a, vec_b, n, config, result); } @@ -25,7 +25,7 @@ namespace vec_ops { * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ extern "C" cudaError_t - CONCAT_EXPAND(FIELD, add_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result) + CONCAT_EXPAND(FIELD, add_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result) { return add(vec_a, vec_b, n, config, result); } @@ -37,7 +37,7 @@ namespace vec_ops { * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ extern "C" cudaError_t - CONCAT_EXPAND(FIELD, sub_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result) + CONCAT_EXPAND(FIELD, sub_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result) { return sub(vec_a, vec_b, n, config, result); } @@ -60,11 +60,13 @@ namespace vec_ops { return transpose_matrix(input, output, row_size, column_size, ctx, on_device, is_async); } - extern "C" cudaError_t CONCAT_EXPAND(FIELD, bit_reverse_cuda)(const scalar_t* input, unsigned n, BitReverseConfig& config, scalar_t* output) + extern "C" cudaError_t CONCAT_EXPAND(FIELD, bit_reverse_cuda)( + const scalar_t* input, unsigned n, BitReverseConfig& config, scalar_t* output) { return bit_reverse(input, n, config, output); } - extern "C" cudaError_t CONCAT_EXPAND(FIELD, bit_reverse_inplace_cuda)(scalar_t* input, unsigned n, BitReverseConfig& config) + extern "C" cudaError_t + CONCAT_EXPAND(FIELD, bit_reverse_inplace_cuda)(scalar_t* input, unsigned n, BitReverseConfig& config) { return bit_reverse_inplace(input, n, config); } diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu index ed32aa926..86445a69e 100644 --- a/icicle/src/vec_ops/vec_ops.cu +++ b/icicle/src/vec_ops/vec_ops.cu @@ -56,7 +56,8 @@ namespace vec_ops { } template - __global__ void bit_reverse_kernel(const E* input, unsigned n, unsigned shift, E* output) { + __global__ void bit_reverse_kernel(const E* input, unsigned n, unsigned shift, E* output) + { int tid = blockIdx.x * blockDim.x + threadIdx.x; // Handling arbitrary vector size if (tid < n) { @@ -65,12 +66,13 @@ namespace vec_ops { } } template - __global__ void bit_reverse_inplace_kernel(E* input, unsigned n, unsigned shift) { + __global__ void bit_reverse_inplace_kernel(E* input, unsigned n, unsigned shift) + { int tid = blockIdx.x * blockDim.x + threadIdx.x; // Handling arbitrary vector size if (tid < n) { int reversed_index = __brev(tid) >> shift; - if(reversed_index > tid) { + if (reversed_index > tid) { E temp = input[tid]; input[tid] = input[reversed_index]; input[reversed_index] = temp; @@ -189,14 +191,13 @@ namespace vec_ops { } template - cudaError_t bit_reverse_inplace(E* input, unsigned size, BitReverseConfig& cfg) { - if (size & (size - 1)) - THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: size must be a power of 2"); + cudaError_t bit_reverse_inplace(E* input, unsigned size, BitReverseConfig& cfg) + { + if (size & (size - 1)) THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: size must be a power of 2"); E* d_input; if (cfg.is_input_on_device) { d_input = input; - } - else { + } else { // copy input to gpu CHK_IF_RETURN(cudaMallocAsync(&d_input, sizeof(E) * size, cfg.ctx.stream)); CHK_IF_RETURN(cudaMemcpyAsync(d_input, input, sizeof(E) * size, cudaMemcpyHostToDevice, cfg.ctx.stream)); @@ -208,21 +209,19 @@ namespace vec_ops { CHK_IF_RETURN(cudaMemcpyAsync(input, d_input, sizeof(E) * size, cudaMemcpyDeviceToHost, cfg.ctx.stream)); CHK_IF_RETURN(cudaFreeAsync(d_input, cfg.ctx.stream)); } - if (!cfg.is_async) - CHK_IF_RETURN(cudaStreamSynchronize(cfg.ctx.stream)); + if (!cfg.is_async) CHK_IF_RETURN(cudaStreamSynchronize(cfg.ctx.stream)); return CHK_LAST(); } template - cudaError_t bit_reverse(const E* input, unsigned size, BitReverseConfig& cfg, E* output) { - if (size & (size - 1)) - THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: size must be a power of 2"); + cudaError_t bit_reverse(const E* input, unsigned size, BitReverseConfig& cfg, E* output) + { + if (size & (size - 1)) THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: size must be a power of 2"); const E* d_input; E* d_alloc_input; if (cfg.is_input_on_device) { d_input = input; - } - else { + } else { // copy input to gpu CHK_IF_RETURN(cudaMallocAsync(&d_alloc_input, sizeof(E) * size, cfg.ctx.stream)); CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_input, input, sizeof(E) * size, cudaMemcpyHostToDevice, cfg.ctx.stream)); @@ -231,23 +230,19 @@ namespace vec_ops { E* d_output; if (cfg.is_output_on_device) { d_output = output; - } - else { + } else { // allocate output on gpu CHK_IF_RETURN(cudaMallocAsync(&d_output, sizeof(E) * size, cfg.ctx.stream)); } unsigned shift = __builtin_clz(size) + 1; unsigned num_blocks = (size + MAX_THREADS_PER_BLOCK - 1) / MAX_THREADS_PER_BLOCK; bit_reverse_kernel<<>>(d_input, size, shift, d_output); - if (!cfg.is_input_on_device) { - CHK_IF_RETURN(cudaFreeAsync(d_alloc_input, cfg.ctx.stream)); - } + if (!cfg.is_input_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_input, cfg.ctx.stream)); } if (!cfg.is_output_on_device) { CHK_IF_RETURN(cudaMemcpyAsync(output, d_output, sizeof(E) * size, cudaMemcpyDeviceToHost, cfg.ctx.stream)); CHK_IF_RETURN(cudaFreeAsync(d_output, cfg.ctx.stream)); } - if (!cfg.is_async) - CHK_IF_RETURN(cudaStreamSynchronize(cfg.ctx.stream)); + if (!cfg.is_async) CHK_IF_RETURN(cudaStreamSynchronize(cfg.ctx.stream)); return CHK_LAST(); } } // namespace vec_ops \ No newline at end of file diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs index 78d38c16a..536649d7e 100644 --- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs @@ -248,8 +248,8 @@ macro_rules! impl_vec_ops_field { ) => { mod $field_prefix_ident { use crate::vec_ops::{$field, CudaError, DeviceContext, HostOrDeviceSlice}; - use icicle_core::vec_ops::VecOpsConfig; use icicle_core::vec_ops::BitReverseConfig; + use icicle_core::vec_ops::VecOpsConfig; extern "C" { #[link_name = concat!($field_prefix, "_add_cuda")] diff --git a/wrappers/rust/icicle-core/src/vec_ops/tests.rs b/wrappers/rust/icicle-core/src/vec_ops/tests.rs index 74a4a416a..a1a45671b 100644 --- a/wrappers/rust/icicle-core/src/vec_ops/tests.rs +++ b/wrappers/rust/icicle-core/src/vec_ops/tests.rs @@ -1,5 +1,8 @@ use crate::traits::GenerateRandom; -use crate::vec_ops::{add_scalars, bit_reverse, bit_reverse_inplace, mul_scalars, sub_scalars, BitReverseConfig, FieldImpl, VecOps, VecOpsConfig}; +use crate::vec_ops::{ + add_scalars, bit_reverse, bit_reverse_inplace, mul_scalars, sub_scalars, BitReverseConfig, FieldImpl, VecOps, + VecOpsConfig, +}; use icicle_cuda_runtime::memory::{DeviceVec, HostSlice}; pub fn check_vec_ops_scalars() @@ -62,12 +65,16 @@ where let input = F::Config::generate_random(TEST_SIZE); let input = HostSlice::from_slice(&input); let mut intermediate = DeviceVec::::cuda_malloc(TEST_SIZE).unwrap(); - intermediate.copy_from_host(&input).unwrap(); + intermediate + .copy_from_host(&input) + .unwrap(); let mut cfg = BitReverseConfig::default(); cfg.are_inputs_on_device = true; bit_reverse_inplace(&mut intermediate[..], &cfg).unwrap(); bit_reverse_inplace(&mut intermediate[..], &cfg).unwrap(); - let mut result_host = vec![F::one(); TEST_SIZE]; - intermediate.copy_to_host(HostSlice::from_mut_slice(&mut result_host[..])).unwrap(); + let mut result_host = vec![F::one(); TEST_SIZE]; + intermediate + .copy_to_host(HostSlice::from_mut_slice(&mut result_host[..])) + .unwrap(); assert_eq!(input.as_slice(), result_host.as_slice()); -} \ No newline at end of file +} diff --git a/wrappers/rust/icicle-curves/icicle-bls12-377/src/vec_ops/mod.rs b/wrappers/rust/icicle-curves/icicle-bls12-377/src/vec_ops/mod.rs index 8d01706ab..959f24e60 100644 --- a/wrappers/rust/icicle-curves/icicle-bls12-377/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-curves/icicle-bls12-377/src/vec_ops/mod.rs @@ -5,7 +5,7 @@ use crate::curve::{ScalarCfg, ScalarField}; use icicle_core::error::IcicleResult; use icicle_core::impl_vec_ops_field; use icicle_core::traits::IcicleResultWrap; -use icicle_core::vec_ops::{VecOps, VecOpsConfig, BitReverseConfig}; +use icicle_core::vec_ops::{BitReverseConfig, VecOps, VecOpsConfig}; use icicle_cuda_runtime::device_context::DeviceContext; use icicle_cuda_runtime::error::CudaError; use icicle_cuda_runtime::memory::HostOrDeviceSlice; diff --git a/wrappers/rust/icicle-curves/icicle-bls12-381/src/vec_ops/mod.rs b/wrappers/rust/icicle-curves/icicle-bls12-381/src/vec_ops/mod.rs index f543eb263..9591c15b0 100644 --- a/wrappers/rust/icicle-curves/icicle-bls12-381/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-curves/icicle-bls12-381/src/vec_ops/mod.rs @@ -3,7 +3,7 @@ use crate::curve::{ScalarCfg, ScalarField}; use icicle_core::error::IcicleResult; use icicle_core::impl_vec_ops_field; use icicle_core::traits::IcicleResultWrap; -use icicle_core::vec_ops::{VecOps, VecOpsConfig, BitReverseConfig}; +use icicle_core::vec_ops::{BitReverseConfig, VecOps, VecOpsConfig}; use icicle_cuda_runtime::device_context::DeviceContext; use icicle_cuda_runtime::error::CudaError; use icicle_cuda_runtime::memory::HostOrDeviceSlice; diff --git a/wrappers/rust/icicle-curves/icicle-bn254/src/vec_ops/mod.rs b/wrappers/rust/icicle-curves/icicle-bn254/src/vec_ops/mod.rs index ad4eaadb2..fea979e6b 100644 --- a/wrappers/rust/icicle-curves/icicle-bn254/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-curves/icicle-bn254/src/vec_ops/mod.rs @@ -3,7 +3,7 @@ use crate::curve::{ScalarCfg, ScalarField}; use icicle_core::error::IcicleResult; use icicle_core::impl_vec_ops_field; use icicle_core::traits::IcicleResultWrap; -use icicle_core::vec_ops::{VecOps, VecOpsConfig, BitReverseConfig}; +use icicle_core::vec_ops::{BitReverseConfig, VecOps, VecOpsConfig}; use icicle_cuda_runtime::device_context::DeviceContext; use icicle_cuda_runtime::error::CudaError; use icicle_cuda_runtime::memory::HostOrDeviceSlice; diff --git a/wrappers/rust/icicle-curves/icicle-grumpkin/src/vec_ops/mod.rs b/wrappers/rust/icicle-curves/icicle-grumpkin/src/vec_ops/mod.rs index 7b9a42eb5..0ef913c9b 100644 --- a/wrappers/rust/icicle-curves/icicle-grumpkin/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-curves/icicle-grumpkin/src/vec_ops/mod.rs @@ -3,7 +3,7 @@ use crate::curve::{ScalarCfg, ScalarField}; use icicle_core::error::IcicleResult; use icicle_core::impl_vec_ops_field; use icicle_core::traits::IcicleResultWrap; -use icicle_core::vec_ops::{VecOps, VecOpsConfig, BitReverseConfig}; +use icicle_core::vec_ops::{BitReverseConfig, VecOps, VecOpsConfig}; use icicle_cuda_runtime::device_context::DeviceContext; use icicle_cuda_runtime::error::CudaError; use icicle_cuda_runtime::memory::HostOrDeviceSlice; diff --git a/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs b/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs index a98ad7b40..451f4aeaa 100644 --- a/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs @@ -3,7 +3,7 @@ use crate::field::{ExtensionCfg, ExtensionField, ScalarCfg, ScalarField}; use icicle_core::error::IcicleResult; use icicle_core::impl_vec_ops_field; use icicle_core::traits::IcicleResultWrap; -use icicle_core::vec_ops::{VecOps, VecOpsConfig, BitReverseConfig}; +use icicle_core::vec_ops::{BitReverseConfig, VecOps, VecOpsConfig}; use icicle_cuda_runtime::device_context::DeviceContext; use icicle_cuda_runtime::error::CudaError; use icicle_cuda_runtime::memory::HostOrDeviceSlice; @@ -52,13 +52,13 @@ extern "C" { input: *const ScalarField, size: u32, config: &BitReverseConfig, - output: *mut ScalarField + output: *mut ScalarField, ) -> CudaError; pub(crate) fn babybear_bit_reverse_inplace_cuda( input: *mut ScalarField, size: u32, - config: &BitReverseConfig + config: &BitReverseConfig, ) -> CudaError; } diff --git a/wrappers/rust/icicle-fields/icicle-stark252/src/vec_ops/mod.rs b/wrappers/rust/icicle-fields/icicle-stark252/src/vec_ops/mod.rs index 1d431ab8f..4fcd95205 100644 --- a/wrappers/rust/icicle-fields/icicle-stark252/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-fields/icicle-stark252/src/vec_ops/mod.rs @@ -3,7 +3,7 @@ use crate::field::{ScalarCfg, ScalarField}; use icicle_core::error::IcicleResult; use icicle_core::impl_vec_ops_field; use icicle_core::traits::IcicleResultWrap; -use icicle_core::vec_ops::{VecOps, VecOpsConfig, BitReverseConfig}; +use icicle_core::vec_ops::{BitReverseConfig, VecOps, VecOpsConfig}; use icicle_cuda_runtime::device_context::DeviceContext; use icicle_cuda_runtime::error::CudaError; use icicle_cuda_runtime::memory::HostOrDeviceSlice; From 00deb481594b583fd0bb6ffa1dc1967164679054 Mon Sep 17 00:00:00 2001 From: nonam3e Date: Mon, 27 May 2024 15:04:02 +0000 Subject: [PATCH 03/14] typo --- docs/docs/icicle/rust-bindings/ntt.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/icicle/rust-bindings/ntt.md b/docs/docs/icicle/rust-bindings/ntt.md index 5636c6081..9d0b8f62f 100644 --- a/docs/docs/icicle/rust-bindings/ntt.md +++ b/docs/docs/icicle/rust-bindings/ntt.md @@ -180,7 +180,7 @@ where - **`IcicleResult<()>`**: Will return an error if the operation fails. -### Releaseing the domain +### Releasing the domain The `release_domain` function is responsible for releasing the resources associated with a specific domain in the CUDA device context. From 769f9268e421e46cb66eea53a53c46ecc290543d Mon Sep 17 00:00:00 2001 From: nonam3e Date: Tue, 28 May 2024 08:18:49 +0000 Subject: [PATCH 04/14] merge bit_reverse funcs --- icicle/include/api/babybear.h | 8 +-- icicle/include/api/bls12_377.h | 8 +-- icicle/include/api/bls12_381.h | 8 +-- icicle/include/api/bn254.h | 8 +-- icicle/include/api/bw6_761.h | 8 +-- icicle/include/api/grumpkin.h | 8 +-- icicle/include/api/stark252.h | 8 +-- icicle/src/vec_ops/extern.cu | 10 +-- icicle/src/vec_ops/extern_extension.cu | 6 ++ icicle/src/vec_ops/vec_ops.cu | 31 ++++++---- wrappers/rust/icicle-core/src/vec_ops/mod.rs | 61 ++++++++++++++----- .../rust/icicle-core/src/vec_ops/tests.rs | 10 +-- .../icicle-babybear/src/vec_ops/mod.rs | 50 --------------- 13 files changed, 108 insertions(+), 116 deletions(-) diff --git a/icicle/include/api/babybear.h b/icicle/include/api/babybear.h index 50102ba58..7d06141d3 100644 --- a/icicle/include/api/babybear.h +++ b/icicle/include/api/babybear.h @@ -74,10 +74,10 @@ extern "C" cudaError_t babybear_bit_reverse_cuda( vec_ops::BitReverseConfig& config, babybear::scalar_t* output); -extern "C" cudaError_t babybear_bit_reverse_inplace_cuda( - babybear::scalar_t* input, - unsigned n, - vec_ops::BitReverseConfig& config); +// extern "C" cudaError_t babybear_bit_reverse_inplace_cuda( +// babybear::scalar_t* input, +// unsigned n, +// vec_ops::BitReverseConfig& config); extern "C" void babybear_generate_scalars(babybear::scalar_t* scalars, int size); diff --git a/icicle/include/api/bls12_377.h b/icicle/include/api/bls12_377.h index e83c9656a..2191c4435 100644 --- a/icicle/include/api/bls12_377.h +++ b/icicle/include/api/bls12_377.h @@ -122,10 +122,10 @@ extern "C" cudaError_t bls12_377_bit_reverse_cuda( vec_ops::BitReverseConfig& config, bls12_377::scalar_t* output); -extern "C" cudaError_t bls12_377_bit_reverse_inplace_cuda( - bls12_377::scalar_t* input, - unsigned n, - vec_ops::BitReverseConfig& config); +// extern "C" cudaError_t bls12_377_bit_reverse_inplace_cuda( +// bls12_377::scalar_t* input, +// unsigned n, +// vec_ops::BitReverseConfig& config); extern "C" void bls12_377_generate_scalars(bls12_377::scalar_t* scalars, int size); diff --git a/icicle/include/api/bls12_381.h b/icicle/include/api/bls12_381.h index bcede3914..6e034a4fa 100644 --- a/icicle/include/api/bls12_381.h +++ b/icicle/include/api/bls12_381.h @@ -122,10 +122,10 @@ extern "C" cudaError_t bls12_381_bit_reverse_cuda( vec_ops::BitReverseConfig& config, bls12_381::scalar_t* output); -extern "C" cudaError_t bls12_381_bit_reverse_inplace_cuda( - bls12_381::scalar_t* input, - unsigned n, - vec_ops::BitReverseConfig& config); +// extern "C" cudaError_t bls12_381_bit_reverse_inplace_cuda( +// bls12_381::scalar_t* input, +// unsigned n, +// vec_ops::BitReverseConfig& config); extern "C" void bls12_381_generate_scalars(bls12_381::scalar_t* scalars, int size); diff --git a/icicle/include/api/bn254.h b/icicle/include/api/bn254.h index 8753ea35a..2b76882b6 100644 --- a/icicle/include/api/bn254.h +++ b/icicle/include/api/bn254.h @@ -154,10 +154,10 @@ extern "C" cudaError_t bn254_bit_reverse_cuda( vec_ops::BitReverseConfig& config, bn254::scalar_t* output); -extern "C" cudaError_t bn254_bit_reverse_inplace_cuda( - bn254::scalar_t* input, - unsigned n, - vec_ops::BitReverseConfig& config); +// extern "C" cudaError_t bn254_bit_reverse_inplace_cuda( +// bn254::scalar_t* input, +// unsigned n, +// vec_ops::BitReverseConfig& config); extern "C" void bn254_generate_scalars(bn254::scalar_t* scalars, int size); diff --git a/icicle/include/api/bw6_761.h b/icicle/include/api/bw6_761.h index 7ebb5e9f1..c4f9c0bce 100644 --- a/icicle/include/api/bw6_761.h +++ b/icicle/include/api/bw6_761.h @@ -122,10 +122,10 @@ extern "C" cudaError_t bw6_761_bit_reverse_cuda( vec_ops::BitReverseConfig& config, bw6_761::scalar_t* output); -extern "C" cudaError_t bw6_761_bit_reverse_inplace_cuda( - bw6_761::scalar_t* input, - unsigned n, - vec_ops::BitReverseConfig& config); +// extern "C" cudaError_t bw6_761_bit_reverse_inplace_cuda( +// bw6_761::scalar_t* input, +// unsigned n, +// vec_ops::BitReverseConfig& config); extern "C" void bw6_761_generate_scalars(bw6_761::scalar_t* scalars, int size); diff --git a/icicle/include/api/grumpkin.h b/icicle/include/api/grumpkin.h index 95d02ddd3..22b4c539f 100644 --- a/icicle/include/api/grumpkin.h +++ b/icicle/include/api/grumpkin.h @@ -92,10 +92,10 @@ extern "C" cudaError_t grumpkin_bit_reverse_cuda( vec_ops::BitReverseConfig& config, grumpkin::scalar_t* output); -extern "C" cudaError_t grumpkin_bit_reverse_inplace_cuda( - grumpkin::scalar_t* input, - unsigned n, - vec_ops::BitReverseConfig& config); +// extern "C" cudaError_t grumpkin_bit_reverse_inplace_cuda( +// grumpkin::scalar_t* input, +// unsigned n, +// vec_ops::BitReverseConfig& config); extern "C" void grumpkin_generate_scalars(grumpkin::scalar_t* scalars, int size); diff --git a/icicle/include/api/stark252.h b/icicle/include/api/stark252.h index 2b4acc334..e12d06fb8 100644 --- a/icicle/include/api/stark252.h +++ b/icicle/include/api/stark252.h @@ -37,10 +37,10 @@ extern "C" cudaError_t stark252_bit_reverse_cuda( vec_ops::BitReverseConfig& config, stark252::scalar_t* output); -extern "C" cudaError_t stark252_bit_reverse_inplace_cuda( - stark252::scalar_t* input, - unsigned n, - vec_ops::BitReverseConfig& config); +// extern "C" cudaError_t stark252_bit_reverse_inplace_cuda( +// stark252::scalar_t* input, +// unsigned n, +// vec_ops::BitReverseConfig& config); extern "C" void stark252_generate_scalars(stark252::scalar_t* scalars, int size); diff --git a/icicle/src/vec_ops/extern.cu b/icicle/src/vec_ops/extern.cu index 7c9079a46..d74a9bca7 100644 --- a/icicle/src/vec_ops/extern.cu +++ b/icicle/src/vec_ops/extern.cu @@ -65,9 +65,9 @@ namespace vec_ops { { return bit_reverse(input, n, config, output); } - extern "C" cudaError_t - CONCAT_EXPAND(FIELD, bit_reverse_inplace_cuda)(scalar_t* input, unsigned n, BitReverseConfig& config) - { - return bit_reverse_inplace(input, n, config); - } + // extern "C" cudaError_t + // CONCAT_EXPAND(FIELD, bit_reverse_inplace_cuda)(scalar_t* input, unsigned n, BitReverseConfig& config) + // { + // return bit_reverse_inplace(input, n, config); + // } } // namespace vec_ops \ No newline at end of file diff --git a/icicle/src/vec_ops/extern_extension.cu b/icicle/src/vec_ops/extern_extension.cu index df63ab34a..a91792913 100644 --- a/icicle/src/vec_ops/extern_extension.cu +++ b/icicle/src/vec_ops/extern_extension.cu @@ -56,4 +56,10 @@ namespace vec_ops { { return transpose_matrix(input, output, row_size, column_size, ctx, on_device, is_async); } + + extern "C" cudaError_t CONCAT_EXPAND(FIELD, extension_bit_reverse_cuda)( + const extension_t* input, unsigned n, BitReverseConfig& config, extension_t* output) + { + return bit_reverse(input, n, config, output); + } } // namespace vec_ops \ No newline at end of file diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu index 86445a69e..1a264c6f5 100644 --- a/icicle/src/vec_ops/vec_ops.cu +++ b/icicle/src/vec_ops/vec_ops.cu @@ -217,16 +217,6 @@ namespace vec_ops { cudaError_t bit_reverse(const E* input, unsigned size, BitReverseConfig& cfg, E* output) { if (size & (size - 1)) THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: size must be a power of 2"); - const E* d_input; - E* d_alloc_input; - if (cfg.is_input_on_device) { - d_input = input; - } else { - // copy input to gpu - CHK_IF_RETURN(cudaMallocAsync(&d_alloc_input, sizeof(E) * size, cfg.ctx.stream)); - CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_input, input, sizeof(E) * size, cudaMemcpyHostToDevice, cfg.ctx.stream)); - d_input = d_alloc_input; - } E* d_output; if (cfg.is_output_on_device) { d_output = output; @@ -236,8 +226,25 @@ namespace vec_ops { } unsigned shift = __builtin_clz(size) + 1; unsigned num_blocks = (size + MAX_THREADS_PER_BLOCK - 1) / MAX_THREADS_PER_BLOCK; - bit_reverse_kernel<<>>(d_input, size, shift, d_output); - if (!cfg.is_input_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_input, cfg.ctx.stream)); } + if (input == output) { + if (cfg.is_input_on_device) + CHK_IF_RETURN(cudaMemcpyAsync(d_output, output, sizeof(E) * size, cudaMemcpyHostToDevice, cfg.ctx.stream)); + bit_reverse_inplace_kernel<<>>(d_output, size, shift); + } + else { + const E* d_input; + E* d_alloc_input; + if (cfg.is_input_on_device) { + d_input = input; + } else { + // copy input to gpu + CHK_IF_RETURN(cudaMallocAsync(&d_alloc_input, sizeof(E) * size, cfg.ctx.stream)); + CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_input, input, sizeof(E) * size, cudaMemcpyHostToDevice, cfg.ctx.stream)); + d_input = d_alloc_input; + } + bit_reverse_kernel<<>>(d_input, size, shift, d_output); + if (!cfg.is_input_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_input, cfg.ctx.stream)); } + } if (!cfg.is_output_on_device) { CHK_IF_RETURN(cudaMemcpyAsync(output, d_output, sizeof(E) * size, cudaMemcpyDeviceToHost, cfg.ctx.stream)); CHK_IF_RETURN(cudaFreeAsync(d_output, cfg.ctx.stream)); diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs index 536649d7e..88b1ba474 100644 --- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs @@ -47,10 +47,10 @@ pub struct BitReverseConfig<'a> { pub ctx: DeviceContext<'a>, /// True if inputs are on device and false if they're on host. Default value: false. - pub are_inputs_on_device: bool, + pub is_input_on_device: bool, /// If true, output is preserved on device, otherwise on host. Default value: false. - pub are_outputs_on_device: bool, + pub is_output_on_device: bool, /// Whether to run the vector operations asynchronously. If set to `true`, the functions will be non-blocking and you'd need to synchronize /// it explicitly by running `stream.synchronize()`. If set to false, the functions will block the current CPU thread. @@ -67,8 +67,8 @@ impl<'a> BitReverseConfig<'a> { pub fn default_for_device(device_id: usize) -> Self { BitReverseConfig { ctx: DeviceContext::default_for_device(device_id), - are_inputs_on_device: false, - are_outputs_on_device: false, + is_input_on_device: false, + is_output_on_device: false, is_async: false, } } @@ -156,6 +156,36 @@ fn check_vec_ops_args<'a, F>( res_cfg.is_result_on_device = result.is_on_device(); res_cfg } +fn check_bit_reverse_args<'a, F>( + input: &(impl HostOrDeviceSlice + ?Sized), + cfg: &BitReverseConfig<'a>, + output: &(impl HostOrDeviceSlice + ?Sized), +) -> BitReverseConfig<'a> { + if input.len() & (input.len() - 1) != 0 { + panic!("input length must be a power of 2, input length: {}", input.len()); + } + if input.len() != output.len() { + panic!( + "input and output lengths {}; {} do not match", + input.len(), + output.len() + ); + } + let ctx_device_id = cfg + .ctx + .device_id; + if let Some(device_id) = input.device_id() { + assert_eq!(device_id, ctx_device_id, "Device ids in input and context are different"); + } + if let Some(device_id) = output.device_id() { + assert_eq!(device_id, ctx_device_id, "Device ids in output and context are different"); + } + check_device(ctx_device_id); + let mut res_cfg = cfg.clone(); + res_cfg.is_input_on_device = input.is_on_device(); + res_cfg.is_output_on_device = output.is_on_device(); + res_cfg +} pub fn add_scalars( a: &(impl HostOrDeviceSlice + ?Sized), @@ -224,7 +254,8 @@ where F: FieldImpl, ::Config: VecOps, { - <::Config as VecOps>::bit_reverse(input, cfg, output) + let cfg = check_bit_reverse_args(input, cfg, output); + <::Config as VecOps>::bit_reverse(input, &cfg, output) } pub fn bit_reverse_inplace( @@ -235,7 +266,8 @@ where F: FieldImpl, ::Config: VecOps, { - <::Config as VecOps>::bit_reverse_inplace(input, cfg) + let cfg = check_bit_reverse_args(input, cfg, input); + <::Config as VecOps>::bit_reverse_inplace(input, &cfg) } #[macro_export] @@ -298,12 +330,12 @@ macro_rules! impl_vec_ops_field { output: *mut $field, ) -> CudaError; - #[link_name = concat!($field_prefix, "_bit_reverse_inplace_cuda")] - pub(crate) fn bit_reverse_inplace_cuda( - input: *mut $field, - size: u32, - config: *const BitReverseConfig, - ) -> CudaError; + // #[link_name = concat!($field_prefix, "_bit_reverse_inplace_cuda")] + // pub(crate) fn bit_reverse_inplace_cuda( + // input: *mut $field, + // size: u32, + // config: *const BitReverseConfig, + // ) -> CudaError; } } @@ -406,10 +438,11 @@ macro_rules! impl_vec_ops_field { cfg: &BitReverseConfig, ) -> IcicleResult<()> { unsafe { - $field_prefix_ident::bit_reverse_inplace_cuda( - input.as_mut_ptr(), + $field_prefix_ident::bit_reverse_cuda( + input.as_ptr(), input.len() as u32, cfg as *const BitReverseConfig, + input.as_mut_ptr(), ) .wrap() } diff --git a/wrappers/rust/icicle-core/src/vec_ops/tests.rs b/wrappers/rust/icicle-core/src/vec_ops/tests.rs index a1a45671b..cad95d827 100644 --- a/wrappers/rust/icicle-core/src/vec_ops/tests.rs +++ b/wrappers/rust/icicle-core/src/vec_ops/tests.rs @@ -44,15 +44,12 @@ where let input = F::Config::generate_random(TEST_SIZE); let input = HostSlice::from_slice(&input); let mut intermediate_result = DeviceVec::::cuda_malloc(TEST_SIZE).unwrap(); - let mut cfg = BitReverseConfig::default(); - cfg.are_outputs_on_device = true; + let cfg = BitReverseConfig::default(); bit_reverse(input, &cfg, &mut intermediate_result[..]).unwrap(); let mut result = vec![F::one(); TEST_SIZE]; let result = HostSlice::from_mut_slice(&mut result); - let mut cfg = BitReverseConfig::default(); - cfg.are_outputs_on_device = false; - cfg.are_inputs_on_device = true; + let cfg = BitReverseConfig::default(); bit_reverse(&intermediate_result[..], &cfg, result).unwrap(); assert_eq!(input.as_slice(), result.as_slice()); } @@ -68,8 +65,7 @@ where intermediate .copy_from_host(&input) .unwrap(); - let mut cfg = BitReverseConfig::default(); - cfg.are_inputs_on_device = true; + let cfg = BitReverseConfig::default(); bit_reverse_inplace(&mut intermediate[..], &cfg).unwrap(); bit_reverse_inplace(&mut intermediate[..], &cfg).unwrap(); let mut result_host = vec![F::one(); TEST_SIZE]; diff --git a/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs b/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs index 451f4aeaa..22a19417d 100644 --- a/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs @@ -11,56 +11,6 @@ use icicle_cuda_runtime::memory::HostOrDeviceSlice; impl_vec_ops_field!("babybear", babybear, ScalarField, ScalarCfg); impl_vec_ops_field!("babybear_extension", babybear_extension, ExtensionField, ExtensionCfg); -// #[repr(C)] -// #[derive(Debug, Clone)] -// pub struct BitReverseConfig<'a> { -// /// Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext). -// pub ctx: DeviceContext<'a>, - -// /// True if inputs are on device and false if they're on host. Default value: false. -// pub are_inputs_on_device: bool, - -// /// If true, output is preserved on device, otherwise on host. Default value: false. -// pub are_outputs_on_device: bool, - -// /// Whether to run the Keccak asynchronously. If set to `true`, the keccak_hash function will be -// /// non-blocking and you'd need to synchronize it explicitly by running -// /// `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, keccak_hash -// /// function will block the current CPU thread. -// pub is_async: bool, -// } - -// impl<'a> Default for BitReverseConfig<'a> { -// fn default() -> Self { -// Self::default_for_device(DEFAULT_DEVICE_ID) -// } -// } - -// impl<'a> BitReverseConfig<'a> { -// pub fn default_for_device(device_id: usize) -> Self { -// BitReverseConfig { -// ctx: DeviceContext::default_for_device(device_id), -// are_inputs_on_device: false, -// are_outputs_on_device: false, -// is_async: false, -// } -// } -// } - -extern "C" { - pub(crate) fn babybear_bit_reverse_cuda( - input: *const ScalarField, - size: u32, - config: &BitReverseConfig, - output: *mut ScalarField, - ) -> CudaError; - - pub(crate) fn babybear_bit_reverse_inplace_cuda( - input: *mut ScalarField, - size: u32, - config: &BitReverseConfig, - ) -> CudaError; -} #[cfg(test)] pub(crate) mod tests { From 0ce2fd48d3e05dc9e473ee452005903855a8118e Mon Sep 17 00:00:00 2001 From: nonam3e Date: Tue, 28 May 2024 10:00:10 +0000 Subject: [PATCH 05/14] fmt --- icicle/src/vec_ops/extern.cu | 4 ++-- icicle/src/vec_ops/vec_ops.cu | 5 ++--- wrappers/rust/icicle-core/src/vec_ops/mod.rs | 10 ++++++++-- .../icicle-fields/icicle-babybear/src/vec_ops/mod.rs | 1 - 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/icicle/src/vec_ops/extern.cu b/icicle/src/vec_ops/extern.cu index d74a9bca7..b8aca686a 100644 --- a/icicle/src/vec_ops/extern.cu +++ b/icicle/src/vec_ops/extern.cu @@ -60,8 +60,8 @@ namespace vec_ops { return transpose_matrix(input, output, row_size, column_size, ctx, on_device, is_async); } - extern "C" cudaError_t CONCAT_EXPAND(FIELD, bit_reverse_cuda)( - const scalar_t* input, unsigned n, BitReverseConfig& config, scalar_t* output) + extern "C" cudaError_t + CONCAT_EXPAND(FIELD, bit_reverse_cuda)(const scalar_t* input, unsigned n, BitReverseConfig& config, scalar_t* output) { return bit_reverse(input, n, config, output); } diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu index 1a264c6f5..8ea7232f3 100644 --- a/icicle/src/vec_ops/vec_ops.cu +++ b/icicle/src/vec_ops/vec_ops.cu @@ -227,11 +227,10 @@ namespace vec_ops { unsigned shift = __builtin_clz(size) + 1; unsigned num_blocks = (size + MAX_THREADS_PER_BLOCK - 1) / MAX_THREADS_PER_BLOCK; if (input == output) { - if (cfg.is_input_on_device) + if (cfg.is_input_on_device) CHK_IF_RETURN(cudaMemcpyAsync(d_output, output, sizeof(E) * size, cudaMemcpyHostToDevice, cfg.ctx.stream)); bit_reverse_inplace_kernel<<>>(d_output, size, shift); - } - else { + } else { const E* d_input; E* d_alloc_input; if (cfg.is_input_on_device) { diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs index 88b1ba474..af9e4ddf7 100644 --- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs @@ -175,10 +175,16 @@ fn check_bit_reverse_args<'a, F>( .ctx .device_id; if let Some(device_id) = input.device_id() { - assert_eq!(device_id, ctx_device_id, "Device ids in input and context are different"); + assert_eq!( + device_id, ctx_device_id, + "Device ids in input and context are different" + ); } if let Some(device_id) = output.device_id() { - assert_eq!(device_id, ctx_device_id, "Device ids in output and context are different"); + assert_eq!( + device_id, ctx_device_id, + "Device ids in output and context are different" + ); } check_device(ctx_device_id); let mut res_cfg = cfg.clone(); diff --git a/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs b/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs index 22a19417d..8654d0512 100644 --- a/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs @@ -11,7 +11,6 @@ use icicle_cuda_runtime::memory::HostOrDeviceSlice; impl_vec_ops_field!("babybear", babybear, ScalarField, ScalarCfg); impl_vec_ops_field!("babybear_extension", babybear_extension, ExtensionField, ExtensionCfg); - #[cfg(test)] pub(crate) mod tests { use crate::field::{ExtensionField, ScalarField}; From 984e87a65931fb6ef3fbb154d507ed437b39ddd5 Mon Sep 17 00:00:00 2001 From: ChickenLover Date: Tue, 28 May 2024 17:10:51 +0700 Subject: [PATCH 06/14] fmt --- icicle/src/vec_ops/extern.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/icicle/src/vec_ops/extern.cu b/icicle/src/vec_ops/extern.cu index b8aca686a..aaeeec10a 100644 --- a/icicle/src/vec_ops/extern.cu +++ b/icicle/src/vec_ops/extern.cu @@ -13,7 +13,7 @@ namespace vec_ops { * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ extern "C" cudaError_t - CONCAT_EXPAND(FIELD, mul_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result) + CONCAT_EXPAND(FIELD, mul_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result) { return mul(vec_a, vec_b, n, config, result); } @@ -25,7 +25,7 @@ namespace vec_ops { * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ extern "C" cudaError_t - CONCAT_EXPAND(FIELD, add_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result) + CONCAT_EXPAND(FIELD, add_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result) { return add(vec_a, vec_b, n, config, result); } @@ -37,7 +37,7 @@ namespace vec_ops { * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ extern "C" cudaError_t - CONCAT_EXPAND(FIELD, sub_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result) + CONCAT_EXPAND(FIELD, sub_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result) { return sub(vec_a, vec_b, n, config, result); } @@ -61,7 +61,7 @@ namespace vec_ops { } extern "C" cudaError_t - CONCAT_EXPAND(FIELD, bit_reverse_cuda)(const scalar_t* input, unsigned n, BitReverseConfig& config, scalar_t* output) + CONCAT_EXPAND(FIELD, bit_reverse_cuda)(const scalar_t* input, unsigned n, BitReverseConfig& config, scalar_t* output) { return bit_reverse(input, n, config, output); } From 2b04511d41f60cbd8f78d70e5a1ea7e2c1759ac1 Mon Sep 17 00:00:00 2001 From: nonam3e Date: Tue, 28 May 2024 10:49:48 +0000 Subject: [PATCH 07/14] remove unused func --- icicle/src/vec_ops/vec_ops.cu | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu index 8ea7232f3..e9737ae27 100644 --- a/icicle/src/vec_ops/vec_ops.cu +++ b/icicle/src/vec_ops/vec_ops.cu @@ -190,29 +190,6 @@ namespace vec_ops { return CHK_LAST(); } - template - cudaError_t bit_reverse_inplace(E* input, unsigned size, BitReverseConfig& cfg) - { - if (size & (size - 1)) THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: size must be a power of 2"); - E* d_input; - if (cfg.is_input_on_device) { - d_input = input; - } else { - // copy input to gpu - CHK_IF_RETURN(cudaMallocAsync(&d_input, sizeof(E) * size, cfg.ctx.stream)); - CHK_IF_RETURN(cudaMemcpyAsync(d_input, input, sizeof(E) * size, cudaMemcpyHostToDevice, cfg.ctx.stream)); - } - unsigned shift = __builtin_clz(size) + 1; - unsigned num_blocks = (size + MAX_THREADS_PER_BLOCK - 1) / MAX_THREADS_PER_BLOCK; - bit_reverse_inplace_kernel<<>>(d_input, size, shift); - if (!cfg.is_input_on_device) { - CHK_IF_RETURN(cudaMemcpyAsync(input, d_input, sizeof(E) * size, cudaMemcpyDeviceToHost, cfg.ctx.stream)); - CHK_IF_RETURN(cudaFreeAsync(d_input, cfg.ctx.stream)); - } - if (!cfg.is_async) CHK_IF_RETURN(cudaStreamSynchronize(cfg.ctx.stream)); - return CHK_LAST(); - } - template cudaError_t bit_reverse(const E* input, unsigned size, BitReverseConfig& cfg, E* output) { From c06947ba5e5303874cb2fddf66cf06df8f023fc8 Mon Sep 17 00:00:00 2001 From: nonam3e Date: Tue, 28 May 2024 14:35:18 +0000 Subject: [PATCH 08/14] memory usage optimization --- icicle/src/vec_ops/vec_ops.cu | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu index e9737ae27..b5a415a35 100644 --- a/icicle/src/vec_ops/vec_ops.cu +++ b/icicle/src/vec_ops/vec_ops.cu @@ -194,6 +194,8 @@ namespace vec_ops { cudaError_t bit_reverse(const E* input, unsigned size, BitReverseConfig& cfg, E* output) { if (size & (size - 1)) THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: size must be a power of 2"); + if ((input == output) ^ (cfg.is_input_on_device == cfg.is_output_on_device)) THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: equal devices should have same is_on_device parameters"); + E* d_output; if (cfg.is_output_on_device) { d_output = output; @@ -201,25 +203,17 @@ namespace vec_ops { // allocate output on gpu CHK_IF_RETURN(cudaMallocAsync(&d_output, sizeof(E) * size, cfg.ctx.stream)); } + unsigned shift = __builtin_clz(size) + 1; unsigned num_blocks = (size + MAX_THREADS_PER_BLOCK - 1) / MAX_THREADS_PER_BLOCK; - if (input == output) { - if (cfg.is_input_on_device) - CHK_IF_RETURN(cudaMemcpyAsync(d_output, output, sizeof(E) * size, cudaMemcpyHostToDevice, cfg.ctx.stream)); - bit_reverse_inplace_kernel<<>>(d_output, size, shift); + + if ((input != output) & cfg.is_input_on_device) { + bit_reverse_kernel<<>>(input, size, shift, d_output); } else { - const E* d_input; - E* d_alloc_input; - if (cfg.is_input_on_device) { - d_input = input; - } else { - // copy input to gpu - CHK_IF_RETURN(cudaMallocAsync(&d_alloc_input, sizeof(E) * size, cfg.ctx.stream)); - CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_input, input, sizeof(E) * size, cudaMemcpyHostToDevice, cfg.ctx.stream)); - d_input = d_alloc_input; + if (!cfg.is_input_on_device) { + CHK_IF_RETURN(cudaMemcpyAsync(d_output, input, sizeof(E) * size, cudaMemcpyHostToDevice, cfg.ctx.stream)); } - bit_reverse_kernel<<>>(d_input, size, shift, d_output); - if (!cfg.is_input_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_input, cfg.ctx.stream)); } + bit_reverse_inplace_kernel<<>>(d_output, size, shift); } if (!cfg.is_output_on_device) { CHK_IF_RETURN(cudaMemcpyAsync(output, d_output, sizeof(E) * size, cudaMemcpyDeviceToHost, cfg.ctx.stream)); From 6fd4116199b7839bb32855fe2e7085f7e39d38c7 Mon Sep 17 00:00:00 2001 From: nonam3e <71525212+nonam3e@users.noreply.github.com> Date: Wed, 29 May 2024 10:37:42 +0700 Subject: [PATCH 09/14] Update vec_ops.cu --- icicle/src/vec_ops/vec_ops.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu index b5a415a35..ed6e5050a 100644 --- a/icicle/src/vec_ops/vec_ops.cu +++ b/icicle/src/vec_ops/vec_ops.cu @@ -194,7 +194,7 @@ namespace vec_ops { cudaError_t bit_reverse(const E* input, unsigned size, BitReverseConfig& cfg, E* output) { if (size & (size - 1)) THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: size must be a power of 2"); - if ((input == output) ^ (cfg.is_input_on_device == cfg.is_output_on_device)) THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: equal devices should have same is_on_device parameters"); + if ((input == output) & (cfg.is_input_on_device != cfg.is_output_on_device)) THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: equal devices should have same is_on_device parameters"); E* d_output; if (cfg.is_output_on_device) { @@ -222,4 +222,4 @@ namespace vec_ops { if (!cfg.is_async) CHK_IF_RETURN(cudaStreamSynchronize(cfg.ctx.stream)); return CHK_LAST(); } -} // namespace vec_ops \ No newline at end of file +} // namespace vec_ops From 3604f27daeca38229fb4ba7ca5164577dff731ba Mon Sep 17 00:00:00 2001 From: nonam3e Date: Wed, 29 May 2024 19:42:19 +0700 Subject: [PATCH 10/14] fmt --- examples/c++/best-practice-ntt/example.cu | 63 +++-- examples/c++/msm/example.cu | 4 +- examples/c++/multi-gpu-poseidon/example.cu | 257 +++++++++--------- examples/c++/multiply/example.cu | 6 +- examples/c++/ntt/example.cu | 10 +- examples/c++/pedersen-commitment/example.cu | 109 ++++---- .../c++/polynomial_multiplication/example.cu | 8 +- examples/c++/poseidon/example.cu | 19 +- icicle/include/api/babybear.h | 5 - icicle/include/api/bls12_377.h | 5 - icicle/include/api/bls12_381.h | 5 - icicle/include/api/bn254.h | 5 - icicle/include/api/bw6_761.h | 5 - icicle/include/api/grumpkin.h | 5 - icicle/include/api/stark252.h | 5 - icicle/src/ntt/thread_ntt.cu | 20 +- icicle/src/vec_ops/extern.cu | 5 - icicle/src/vec_ops/vec_ops.cu | 4 +- wrappers/rust/icicle-core/src/vec_ops/mod.rs | 7 - 19 files changed, 259 insertions(+), 288 deletions(-) diff --git a/examples/c++/best-practice-ntt/example.cu b/examples/c++/best-practice-ntt/example.cu index ccb14e53e..341f42a9e 100644 --- a/examples/c++/best-practice-ntt/example.cu +++ b/examples/c++/best-practice-ntt/example.cu @@ -15,20 +15,23 @@ typedef scalar_t E; const unsigned max_log_ntt_size = 27; -void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E * elements ) { +void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E* elements) +{ for (unsigned i = 0; i < ntt_size * nof_ntts; i++) { - elements[i] = E::from(i+1); + elements[i] = E::from(i + 1); } } using FpMilliseconds = std::chrono::duration; #define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now(); -#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count()); +#define END_TIMER(timer, msg) \ + printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count()); -int main(int argc, char** argv) { +int main(int argc, char** argv) +{ cudaDeviceReset(); cudaDeviceProp deviceProperties; - int deviceId=0; + int deviceId = 0; cudaGetDeviceProperties(&deviceProperties, deviceId); std::string gpu_full_name = deviceProperties.name; std::cout << gpu_full_name << std::endl; @@ -38,7 +41,7 @@ int main(int argc, char** argv) { S basic_root = S::omega(max_log_ntt_size); - // change these parameters to match the desired NTT size and batch size + // change these parameters to match the desired NTT size and batch size const unsigned log_ntt_size = 22; const unsigned nof_ntts = 16; @@ -49,7 +52,7 @@ int main(int argc, char** argv) { // Create separate CUDA streams for overlapping data transfers and kernel execution. cudaStream_t stream_compute, stream_h2d, stream_d2h; - cudaStreamCreate(&stream_compute); + cudaStreamCreate(&stream_compute); cudaStreamCreate(&stream_h2d); cudaStreamCreate(&stream_d2h); @@ -68,27 +71,27 @@ int main(int argc, char** argv) { config_compute.are_inputs_on_device = true; config_compute.are_outputs_on_device = true; config_compute.is_async = true; - + std::cout << "Concurrent Download, Upload, and Compute In-place NTT" << std::endl; int nof_blocks = 32; std::cout << "Number of blocks: " << nof_blocks << std::endl; - int block_size = ntt_size*nof_ntts/nof_blocks; - + int block_size = ntt_size * nof_ntts / nof_blocks; + // on-host pinned data - E * h_inp[2]; - E * h_out[2]; + E* h_inp[2]; + E* h_out[2]; for (int i = 0; i < 2; i++) { - cudaHostAlloc((void**)&h_inp[i], sizeof(E)*ntt_size*nof_ntts, cudaHostAllocDefault); - cudaHostAlloc((void**)&h_out[i], sizeof(E)*ntt_size*nof_ntts, cudaHostAllocDefault); + cudaHostAlloc((void**)&h_inp[i], sizeof(E) * ntt_size * nof_ntts, cudaHostAllocDefault); + cudaHostAlloc((void**)&h_out[i], sizeof(E) * ntt_size * nof_ntts, cudaHostAllocDefault); } - + // on-device in-place data // we need two on-device vectors to overlap data transfers with NTT kernel execution - E * d_vec[2]; + E* d_vec[2]; for (int i = 0; i < 2; i++) { - cudaMalloc((void**)&d_vec[i], sizeof(E)*ntt_size*nof_ntts); + cudaMalloc((void**)&d_vec[i], sizeof(E) * ntt_size * nof_ntts); } - + // initialize input data initialize_input(ntt_size, nof_ntts, h_inp[0]); initialize_input(ntt_size, nof_ntts, h_inp[1]); @@ -97,7 +100,7 @@ int main(int argc, char** argv) { cudaEventCreate(&compute_start); cudaEventCreate(&compute_stop); - for ( int run = 0; run < 10; run++ ) { + for (int run = 0; run < 10; run++) { int vec_compute = run % 2; int vec_transfer = (run + 1) % 2; std::cout << "Run: " << run << std::endl; @@ -110,14 +113,18 @@ int main(int argc, char** argv) { // we have to delay upload to device relative to download from device by one block: preserve write after read for (int i = 0; i <= nof_blocks; i++) { if (i < nof_blocks) { - cudaMemcpyAsync(&h_out[vec_transfer][i*block_size], &d_vec[vec_transfer][i*block_size], sizeof(E)*block_size, cudaMemcpyDeviceToHost, stream_d2h); + cudaMemcpyAsync( + &h_out[vec_transfer][i * block_size], &d_vec[vec_transfer][i * block_size], sizeof(E) * block_size, + cudaMemcpyDeviceToHost, stream_d2h); } - if (i>0) { - cudaMemcpyAsync(&d_vec[vec_transfer][(i-1)*block_size], &h_inp[vec_transfer][(i-1)*block_size], sizeof(E)*block_size, cudaMemcpyHostToDevice, stream_h2d); + if (i > 0) { + cudaMemcpyAsync( + &d_vec[vec_transfer][(i - 1) * block_size], &h_inp[vec_transfer][(i - 1) * block_size], + sizeof(E) * block_size, cudaMemcpyHostToDevice, stream_h2d); } // synchronize upload and download at the end of the block to ensure data integrity - cudaStreamSynchronize(stream_d2h); - cudaStreamSynchronize(stream_h2d); + cudaStreamSynchronize(stream_d2h); + cudaStreamSynchronize(stream_h2d); } // synchronize compute stream with the end of the computation cudaEventSynchronize(compute_stop); @@ -126,12 +133,12 @@ int main(int argc, char** argv) { END_TIMER(inplace, "Concurrent In-Place NTT"); std::cout << "NTT time: " << milliseconds << " ms" << std::endl; }; - + // Clean-up for (int i = 0; i < 2; i++) { - cudaFree(d_vec[i]); - cudaFreeHost(h_inp[i]); - cudaFreeHost(h_out[i]); + cudaFree(d_vec[i]); + cudaFreeHost(h_inp[i]); + cudaFreeHost(h_out[i]); } cudaEventDestroy(compute_start); cudaEventDestroy(compute_stop); diff --git a/examples/c++/msm/example.cu b/examples/c++/msm/example.cu index ded07ee93..abdc3f5cb 100644 --- a/examples/c++/msm/example.cu +++ b/examples/c++/msm/example.cu @@ -16,7 +16,7 @@ int main(int argc, char* argv[]) int N = batch_size * msm_size; std::cout << "Part I: use G1 points" << std::endl; - + std::cout << "Generating random inputs on-host" << std::endl; scalar_t* scalars = new scalar_t[N]; affine_t* points = new affine_t[N]; @@ -43,7 +43,7 @@ int main(int argc, char* argv[]) false, // is_async }; config.batch_size = batch_size; - + std::cout << "Running MSM kernel with on-host inputs" << std::endl; cudaStream_t stream = config.ctx.stream; // Execute the MSM kernel diff --git a/examples/c++/multi-gpu-poseidon/example.cu b/examples/c++/multi-gpu-poseidon/example.cu index cd7725019..32f0c920c 100644 --- a/examples/c++/multi-gpu-poseidon/example.cu +++ b/examples/c++/multi-gpu-poseidon/example.cu @@ -9,137 +9,148 @@ using namespace poseidon; using namespace bn254; -void checkCudaError(cudaError_t error) { - if (error != cudaSuccess) { - std::cerr << "CUDA error: " << cudaGetErrorString(error) << std::endl; - // Handle the error, e.g., exit the program or throw an exception. - } +void checkCudaError(cudaError_t error) +{ + if (error != cudaSuccess) { + std::cerr << "CUDA error: " << cudaGetErrorString(error) << std::endl; + // Handle the error, e.g., exit the program or throw an exception. + } } // these global constants go into template calls const int size_col = 11; // this function executes the Poseidon thread -void threadPoseidon(device_context::DeviceContext ctx, unsigned size_partition, scalar_t * layers, scalar_t * column_hashes, PoseidonConstants * constants) { - cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx.device_id)); - if (err_result != cudaSuccess) { - std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl; - return; - } - // CHK_IF_RETURN(); I can't use it in a standard thread function - PoseidonConfig column_config = { - ctx, // ctx - false, // are_inputes_on_device - false, // are_outputs_on_device - false, // input_is_a_state - false, // aligned - false, // loop_state - false, // is_async - }; - cudaError_t err = bn254_poseidon_hash_cuda(layers, column_hashes, (size_t) size_partition, size_col, *constants, column_config); - checkCudaError(err); +void threadPoseidon( + device_context::DeviceContext ctx, + unsigned size_partition, + scalar_t* layers, + scalar_t* column_hashes, + PoseidonConstants* constants) +{ + cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx.device_id)); + if (err_result != cudaSuccess) { + std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl; + return; + } + // CHK_IF_RETURN(); I can't use it in a standard thread function + PoseidonConfig column_config = { + ctx, // ctx + false, // are_inputes_on_device + false, // are_outputs_on_device + false, // input_is_a_state + false, // aligned + false, // loop_state + false, // is_async + }; + cudaError_t err = + bn254_poseidon_hash_cuda(layers, column_hashes, (size_t)size_partition, size_col, *constants, column_config); + checkCudaError(err); } using FpMilliseconds = std::chrono::duration; #define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now(); -#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count()); - - -#define CHECK_ALLOC(ptr) if ((ptr) == nullptr) { \ - std::cerr << "Memory allocation for '" #ptr "' failed." << std::endl; \ - exit(EXIT_FAILURE); \ -} - -int main() { - const unsigned size_row = (1<<30); - const unsigned nof_partitions = 64; - const unsigned size_partition = size_row / nof_partitions; - // layers is allocated only for one partition, need to reuse for different partitions - const uint32_t size_layers = size_col * size_partition; - - nvmlInit(); - unsigned int deviceCount; - nvmlDeviceGetCount(&deviceCount); - std::cout << "Available GPUs: " << deviceCount << std::endl; - - for (unsigned int i = 0; i < deviceCount; ++i) { - nvmlDevice_t device; - nvmlMemory_t memory; - char name[NVML_DEVICE_NAME_BUFFER_SIZE]; - nvmlDeviceGetHandleByIndex(i, &device); - nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE); - nvmlDeviceGetMemoryInfo(device, &memory); - std::cout << "Device ID: " << i << ", Type: " << name << ", Memory Total/Free (MiB) " << memory.total/1024/1024 << "/" << memory.free/1024/1024 << std::endl; - } - - const unsigned memory_partition = sizeof(scalar_t)*(size_col+1)*size_partition/1024/1024; - std::cout << "Required Memory (MiB) " << memory_partition << std::endl; - - //=============================================================================== - // Key: multiple devices are supported by device context - //=============================================================================== - - device_context::DeviceContext ctx0 = device_context::get_default_device_context(); - ctx0.device_id=0; - device_context::DeviceContext ctx1 = device_context::get_default_device_context(); - ctx1.device_id=1; - - std::cout << "Allocate and initialize the memory for layers and hashes" << std::endl; - scalar_t* layers0 = static_cast(malloc(size_layers * sizeof(scalar_t))); - CHECK_ALLOC(layers0); - scalar_t s = scalar_t::zero(); - for (unsigned i = 0; i < size_col*size_partition ; i++) { - layers0[i] = s; - s = s + scalar_t::one(); - } - scalar_t* layers1 = static_cast(malloc(size_layers * sizeof(scalar_t))); - CHECK_ALLOC(layers1); - s = scalar_t::zero() + scalar_t::one(); - for (unsigned i = 0; i < size_col*size_partition ; i++) { - layers1[i] = s; - s = s + scalar_t::one(); - } - - scalar_t* column_hash0 = static_cast(malloc(size_partition * sizeof(scalar_t))); - CHECK_ALLOC(column_hash0); - scalar_t* column_hash1 = static_cast(malloc(size_partition * sizeof(scalar_t))); - CHECK_ALLOC(column_hash1); - - PoseidonConstants column_constants0, column_constants1; - bn254_init_optimized_poseidon_constants_cuda(size_col, ctx0, &column_constants0); - cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx1.device_id)); - if (err_result != cudaSuccess) { - std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl; - return; - } - bn254_init_optimized_poseidon_constants_cuda(size_col, ctx1, &column_constants1); - - std::cout << "Parallel execution of Poseidon threads" << std::endl; - START_TIMER(parallel); - std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0); - std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1, &column_constants1); - - // Wait for the threads to finish - thread0.join(); - thread1.join(); - END_TIMER(parallel,"2 GPUs"); - std::cout << "Output Data from Thread 0: "; - std::cout << column_hash0[0] << std::endl; - std::cout << "Output Data from Thread 1: "; - std::cout << column_hash1[0] << std::endl; - - std::cout << "Sequential execution of Poseidon threads" << std::endl; - START_TIMER(sequential); - std::thread thread2(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0); - thread2.join(); - std::thread thread3(threadPoseidon, ctx0, size_partition, layers1, column_hash1, &column_constants0); - thread3.join(); - END_TIMER(sequential,"1 GPU"); - std::cout << "Output Data from Thread 2: "; - std::cout << column_hash0[0] << std::endl; - std::cout << "Output Data from Thread 3: "; - std::cout << column_hash1[0] << std::endl; - - nvmlShutdown(); - return 0; +#define END_TIMER(timer, msg) \ + printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count()); + +#define CHECK_ALLOC(ptr) \ + if ((ptr) == nullptr) { \ + std::cerr << "Memory allocation for '" #ptr "' failed." << std::endl; \ + exit(EXIT_FAILURE); \ + } + +int main() +{ + const unsigned size_row = (1 << 30); + const unsigned nof_partitions = 64; + const unsigned size_partition = size_row / nof_partitions; + // layers is allocated only for one partition, need to reuse for different partitions + const uint32_t size_layers = size_col * size_partition; + + nvmlInit(); + unsigned int deviceCount; + nvmlDeviceGetCount(&deviceCount); + std::cout << "Available GPUs: " << deviceCount << std::endl; + + for (unsigned int i = 0; i < deviceCount; ++i) { + nvmlDevice_t device; + nvmlMemory_t memory; + char name[NVML_DEVICE_NAME_BUFFER_SIZE]; + nvmlDeviceGetHandleByIndex(i, &device); + nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE); + nvmlDeviceGetMemoryInfo(device, &memory); + std::cout << "Device ID: " << i << ", Type: " << name << ", Memory Total/Free (MiB) " << memory.total / 1024 / 1024 + << "/" << memory.free / 1024 / 1024 << std::endl; + } + + const unsigned memory_partition = sizeof(scalar_t) * (size_col + 1) * size_partition / 1024 / 1024; + std::cout << "Required Memory (MiB) " << memory_partition << std::endl; + + //=============================================================================== + // Key: multiple devices are supported by device context + //=============================================================================== + + device_context::DeviceContext ctx0 = device_context::get_default_device_context(); + ctx0.device_id = 0; + device_context::DeviceContext ctx1 = device_context::get_default_device_context(); + ctx1.device_id = 1; + + std::cout << "Allocate and initialize the memory for layers and hashes" << std::endl; + scalar_t* layers0 = static_cast(malloc(size_layers * sizeof(scalar_t))); + CHECK_ALLOC(layers0); + scalar_t s = scalar_t::zero(); + for (unsigned i = 0; i < size_col * size_partition; i++) { + layers0[i] = s; + s = s + scalar_t::one(); + } + scalar_t* layers1 = static_cast(malloc(size_layers * sizeof(scalar_t))); + CHECK_ALLOC(layers1); + s = scalar_t::zero() + scalar_t::one(); + for (unsigned i = 0; i < size_col * size_partition; i++) { + layers1[i] = s; + s = s + scalar_t::one(); + } + + scalar_t* column_hash0 = static_cast(malloc(size_partition * sizeof(scalar_t))); + CHECK_ALLOC(column_hash0); + scalar_t* column_hash1 = static_cast(malloc(size_partition * sizeof(scalar_t))); + CHECK_ALLOC(column_hash1); + + PoseidonConstants column_constants0, column_constants1; + bn254_init_optimized_poseidon_constants_cuda(size_col, ctx0, &column_constants0); + cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx1.device_id)); + if (err_result != cudaSuccess) { + std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl; + return; + } + bn254_init_optimized_poseidon_constants_cuda(size_col, ctx1, &column_constants1); + + std::cout << "Parallel execution of Poseidon threads" << std::endl; + START_TIMER(parallel); + std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0); + std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1, &column_constants1); + + // Wait for the threads to finish + thread0.join(); + thread1.join(); + END_TIMER(parallel, "2 GPUs"); + std::cout << "Output Data from Thread 0: "; + std::cout << column_hash0[0] << std::endl; + std::cout << "Output Data from Thread 1: "; + std::cout << column_hash1[0] << std::endl; + + std::cout << "Sequential execution of Poseidon threads" << std::endl; + START_TIMER(sequential); + std::thread thread2(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0); + thread2.join(); + std::thread thread3(threadPoseidon, ctx0, size_partition, layers1, column_hash1, &column_constants0); + thread3.join(); + END_TIMER(sequential, "1 GPU"); + std::cout << "Output Data from Thread 2: "; + std::cout << column_hash0[0] << std::endl; + std::cout << "Output Data from Thread 3: "; + std::cout << column_hash1[0] << std::endl; + + nvmlShutdown(); + return 0; } diff --git a/examples/c++/multiply/example.cu b/examples/c++/multiply/example.cu index 0b378f247..77eb7a0b4 100644 --- a/examples/c++/multiply/example.cu +++ b/examples/c++/multiply/example.cu @@ -17,7 +17,7 @@ int vector_mult(T* vec_b, T* vec_a, T* vec_result, size_t n_elments, device_cont config.is_a_on_device = true; config.is_b_on_device = true; config.is_result_on_device = true; - cudaError_t err = bn254_mul_cuda(vec_a, vec_b, n_elments, config, vec_result); + cudaError_t err = bn254_mul_cuda(vec_a, vec_b, n_elments, config, vec_result); if (err != cudaSuccess) { std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl; return 0; @@ -100,7 +100,7 @@ int main(int argc, char** argv) std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl; return 0; } - + std::cout << "Starting warm-up" << std::endl; // Warm-up loop for (int i = 0; i < repetitions; i++) { @@ -151,7 +151,7 @@ int main(int argc, char** argv) // validate multiplication here... // clean up and exit - free(host_in1); + free(host_in1); free(host_in2); free(host_out); cudaFree(device_in1); diff --git a/examples/c++/ntt/example.cu b/examples/c++/ntt/example.cu index 5d98e4758..5e50f0dd7 100644 --- a/examples/c++/ntt/example.cu +++ b/examples/c++/ntt/example.cu @@ -60,8 +60,8 @@ int validate_output(const unsigned ntt_size, const unsigned nof_ntts, E* element using FpMilliseconds = std::chrono::duration; #define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now(); -#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count()); - +#define END_TIMER(timer, msg) \ + printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count()); int main(int argc, char* argv[]) { @@ -89,16 +89,16 @@ int main(int argc, char* argv[]) bn254_initialize_domain(&basic_root, ctx, true); // Create an NTTConfig instance NTTConfig config = default_ntt_config(); - config.ntt_algorithm = NttAlgorithm::MixedRadix; + config.ntt_algorithm = NttAlgorithm::MixedRadix; config.batch_size = nof_ntts; START_TIMER(MixedRadix); cudaError_t err = bn254_ntt_cuda(input, ntt_size, NTTDir::kForward, config, output); END_TIMER(MixedRadix, "MixedRadix NTT"); - + std::cout << "Validating output" << std::endl; validate_output(ntt_size, nof_ntts, output); - config.ntt_algorithm = NttAlgorithm::Radix2; + config.ntt_algorithm = NttAlgorithm::Radix2; START_TIMER(Radix2); err = bn254_ntt_cuda(input, ntt_size, NTTDir::kForward, config, output); END_TIMER(Radix2, "Radix2 NTT"); diff --git a/examples/c++/pedersen-commitment/example.cu b/examples/c++/pedersen-commitment/example.cu index 93f47b2ba..106a47df1 100644 --- a/examples/c++/pedersen-commitment/example.cu +++ b/examples/c++/pedersen-commitment/example.cu @@ -11,49 +11,47 @@ using namespace bn254; typedef point_field_t T; // modular power -T modPow(T base, T exp) { +T modPow(T base, T exp) +{ T r = T::one(); T b = base; T e = exp; while (e != T::zero()) { - // If exp is odd, multiply the base with result - if (T::is_odd(e)) { - r = r * b; - } - // Now exp must be even, divide it by 2 - e =T::div2(e); - b = b * b; + // If exp is odd, multiply the base with result + if (T::is_odd(e)) { r = r * b; } + // Now exp must be even, divide it by 2 + e = T::div2(e); + b = b * b; } return r; } // Check if y2 is a quadratic residue using Euler's Criterion -bool quadratic_residue(T y2) { - return modPow(y2, T::div2(T::zero() - T::one())) == T::one(); -} +bool quadratic_residue(T y2) { return modPow(y2, T::div2(T::zero() - T::one())) == T::one(); } // modular square root adapted from: // https://github.com/ShahjalalShohag/code-library/blob/main/Number%20Theory/Tonelli%20Shanks%20Algorithm.cpp -bool mySQRT(T a, T *result) { +bool mySQRT(T a, T* result) +{ if (a == T::zero()) { *result = T::zero(); return true; } - if (modPow(a, T::div2(T::zero() - T::one())) != T::one() ) { + if (modPow(a, T::div2(T::zero() - T::one())) != T::one()) { return false; // solution does not exist } // TODO: consider special cases - // if (p % 4 == 3) return power(a, (p + 1) / 4, p); - T s = T::zero() - T::one(); // p - 1, - T n = T::one() + T::one(); //2; - T r = T::zero(); + // if (p % 4 == 3) return power(a, (p + 1) / 4, p); + T s = T::zero() - T::one(); // p - 1, + T n = T::one() + T::one(); // 2; + T r = T::zero(); T m; while (T::is_even(s)) { r = r + T::one(); - s = T::div2(s); //s /= 2; + s = T::div2(s); // s /= 2; } // find a non-square mod p - while (modPow(n, T::div2((T::zero() - T::one())) ) != T::zero() - T::one()) { + while (modPow(n, T::div2((T::zero() - T::one()))) != T::zero() - T::one()) { n = n + T::one(); } T x = modPow(a, T::div2(s + T::one())); @@ -61,83 +59,86 @@ bool mySQRT(T a, T *result) { T g = modPow(n, s); for (;; r = m) { T t = b; - for (m = T::zero(); T::lt(m,r) /* m < r*/ && t != T::one(); m = m + T::one()) t = t * t; - if (m == T::zero() ) { + for (m = T::zero(); T::lt(m, r) /* m < r*/ && t != T::one(); m = m + T::one()) + t = t * t; + if (m == T::zero()) { *result = x; return true; } - T gs = modPow(g, modPow(T::one() + T::one(), r - m - T::one()) ); - g = gs * gs ; - x = x * gs ; - b = b * g ; + T gs = modPow(g, modPow(T::one() + T::one(), r - m - T::one())); + g = gs * gs; + x = x * gs; + b = b * g; } } -void point_near_x(T x, affine_t *point) { - const T wb = T { weierstrass_b }; +void point_near_x(T x, affine_t* point) +{ + const T wb = T{weierstrass_b}; T y2; - while (y2 = x*x*x + wb, quadratic_residue(y2) == false) - { + while (y2 = x * x * x + wb, quadratic_residue(y2) == false) { x = x + T::one(); }; T y; bool found = mySQRT(y2, &y); - assert(y*y == y2); + assert(y * y == y2); point->x = x; point->y = y; } static int seed = 0; static HOST_INLINE T rand_host_seed() - { - std::mt19937_64 generator(seed++); - std::uniform_int_distribution distribution; - - T value; - for (unsigned i = 0; i < T::TLC-1 ; i++) +{ + std::mt19937_64 generator(seed++); + std::uniform_int_distribution distribution; + + T value; + for (unsigned i = 0; i < T::TLC - 1; i++) // TODO: use the full range of limbs: for (unsigned i = 0; i < T::TLC ; i++) - value.limbs_storage.limbs[i] = distribution(generator); - // while (lt(Field{get_modulus()}, value)) - // value = value - Field{get_modulus()}; - return value; - } + value.limbs_storage.limbs[i] = distribution(generator); + // while (lt(Field{get_modulus()}, value)) + // value = value - Field{get_modulus()}; + return value; +} using FpMilliseconds = std::chrono::duration; #define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now(); -#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count()); +#define END_TIMER(timer, msg) \ + printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count()); int main(int argc, char** argv) { const unsigned N = pow(2, 10); std::cout << "Commitment vector size: " << N << "+1 for salt (a.k.a blinding factor)" << std::endl; - T* xs = new T[N+1]; - + T* xs = new T[N + 1]; + std::cout << "Generating random points transparently using publicly chosen seed" << std::endl; - std::cout << "Public seed prevents committer from knowing the discrete logs of points used in the commitment" << std::endl; + std::cout << "Public seed prevents committer from knowing the discrete logs of points used in the commitment" + << std::endl; seed = 1234; std::cout << "Using seed: " << seed << std::endl; std::cout << "Generating random field values" << std::endl; START_TIMER(gen); - + for (unsigned i = 0; i < N; i++) { xs[i] = rand_host_seed(); } END_TIMER(gen, "Time to generate field values"); - std::cout << "xs[0]: " << xs[0] << std::endl; - std::cout << "xs[1]: " << xs[1] << std::endl; - + std::cout << "xs[0]: " << xs[0] << std::endl; + std::cout << "xs[1]: " << xs[1] << std::endl; + // affine_t points[N]; - affine_t* points = new affine_t[N+1]; + affine_t* points = new affine_t[N + 1]; std::cout << "Generating point about random field values" << std::endl; START_TIMER(points); - for (unsigned i = 0; i < N+1; i++) { + for (unsigned i = 0; i < N + 1; i++) { point_near_x(xs[i], &points[i]); } END_TIMER(points, "Time to generate points"); - + std::cout << "Generating commitment vector" << std::endl; projective_t result; - scalar_t* scalars = new scalar_t[N+1]; + scalar_t* scalars = new scalar_t[N + 1]; scalar_t::rand_host_many(scalars, N); std::cout << "Generating salt" << std::endl; @@ -146,7 +147,7 @@ int main(int argc, char** argv) std::cout << "Executing MSM" << std::endl; auto config = msm::default_msm_config(); START_TIMER(msm); - bn254_msm_cuda(scalars, points, N+1, config, &result); + bn254_msm_cuda(scalars, points, N + 1, config, &result); END_TIMER(msm, "Time to execute MSM"); std::cout << "Computed commitment: " << result << std::endl; diff --git a/examples/c++/polynomial_multiplication/example.cu b/examples/c++/polynomial_multiplication/example.cu index 7a6e4da19..9c80864e5 100644 --- a/examples/c++/polynomial_multiplication/example.cu +++ b/examples/c++/polynomial_multiplication/example.cu @@ -82,10 +82,10 @@ int main(int argc, char** argv) CHK_IF_RETURN(cudaMallocAsync(&MulGpu, sizeof(test_data) * NTT_SIZE, ntt_config.ctx.stream)); vec_ops::VecOpsConfig config{ ntt_config.ctx, - true, // is_a_on_device - true, // is_b_on_device - true, // is_result_on_device - false // is_async + true, // is_a_on_device + true, // is_b_on_device + true, // is_result_on_device + false // is_async }; CHK_IF_RETURN(bn254_mul_cuda(GpuA, GpuB, NTT_SIZE, config, MulGpu)); diff --git a/examples/c++/poseidon/example.cu b/examples/c++/poseidon/example.cu index 3b2fe517c..171196213 100644 --- a/examples/c++/poseidon/example.cu +++ b/examples/c++/poseidon/example.cu @@ -14,12 +14,13 @@ inline uint32_t tree_index(uint32_t level, uint32_t offset) { return (1 << level // We assume the tree has leaves already set, compute all other levels void build_tree( - const uint32_t tree_height, scalar_t* tree, PoseidonConstants * constants, PoseidonConfig config) + const uint32_t tree_height, scalar_t* tree, PoseidonConstants* constants, PoseidonConfig config) { for (uint32_t level = tree_height - 1; level > 0; level--) { const uint32_t next_level = level - 1; const uint32_t next_level_width = 1 << next_level; - bn254_poseidon_hash_cuda(&tree[tree_index(level, 0)], &tree[tree_index(next_level, 0)], next_level_width, 2, *constants, config); + bn254_poseidon_hash_cuda( + &tree[tree_index(level, 0)], &tree[tree_index(next_level, 0)], next_level_width, 2, *constants, config); } } @@ -37,11 +38,7 @@ uint32_t query_membership(scalar_t query, scalar_t* tree, const uint32_t tree_he } void generate_proof( - uint32_t position, - scalar_t* tree, - const uint32_t tree_height, - uint32_t* proof_lr, - scalar_t* proof_hash) + uint32_t position, scalar_t* tree, const uint32_t tree_height, uint32_t* proof_lr, scalar_t* proof_hash) { uint32_t level_index = position; for (uint32_t level = tree_height - 1; level > 0; level--) { @@ -68,7 +65,7 @@ uint32_t validate_proof( const uint32_t tree_height, const uint32_t* proof_lr, const scalar_t* proof_hash, - PoseidonConstants * constants, + PoseidonConstants* constants, PoseidonConfig config) { scalar_t hashes_in[2], hash_out[1], level_hash; @@ -114,13 +111,13 @@ int main(int argc, char* argv[]) std::cout << "Hashing blocks into tree leaves..." << std::endl; PoseidonConstants constants; bn254_init_optimized_poseidon_constants_cuda(data_arity, ctx, &constants); - PoseidonConfig config = default_poseidon_config(data_arity+1); + PoseidonConfig config = default_poseidon_config(data_arity + 1); bn254_poseidon_hash_cuda(data, &tree[tree_index(leaf_level, 0)], tree_width, 4, constants, config); std::cout << "3. Building Merkle tree" << std::endl; PoseidonConstants tree_constants; bn254_init_optimized_poseidon_constants_cuda(tree_arity, ctx, &tree_constants); - PoseidonConfig tree_config = default_poseidon_config(tree_arity+1); + PoseidonConfig tree_config = default_poseidon_config(tree_arity + 1); build_tree(tree_height, tree, &tree_constants, tree_config); std::cout << "4. Generate membership proof" << std::endl; @@ -142,7 +139,7 @@ int main(int argc, char* argv[]) std::cout << "6. Tamper the hash" << std::endl; const scalar_t tampered_hash = hash + scalar_t::one(); validated = validate_proof(tampered_hash, tree_height, proof_lr, proof_hash, &tree_constants, tree_config); - + std::cout << "7. Invalidate tamper hash membership" << std::endl; std::cout << "Validated: " << validated << std::endl; return 0; diff --git a/icicle/include/api/babybear.h b/icicle/include/api/babybear.h index 7d06141d3..6703f97b4 100644 --- a/icicle/include/api/babybear.h +++ b/icicle/include/api/babybear.h @@ -74,11 +74,6 @@ extern "C" cudaError_t babybear_bit_reverse_cuda( vec_ops::BitReverseConfig& config, babybear::scalar_t* output); -// extern "C" cudaError_t babybear_bit_reverse_inplace_cuda( -// babybear::scalar_t* input, -// unsigned n, -// vec_ops::BitReverseConfig& config); - extern "C" void babybear_generate_scalars(babybear::scalar_t* scalars, int size); extern "C" cudaError_t babybear_scalar_convert_montgomery( diff --git a/icicle/include/api/bls12_377.h b/icicle/include/api/bls12_377.h index 2191c4435..c033103d2 100644 --- a/icicle/include/api/bls12_377.h +++ b/icicle/include/api/bls12_377.h @@ -122,11 +122,6 @@ extern "C" cudaError_t bls12_377_bit_reverse_cuda( vec_ops::BitReverseConfig& config, bls12_377::scalar_t* output); -// extern "C" cudaError_t bls12_377_bit_reverse_inplace_cuda( -// bls12_377::scalar_t* input, -// unsigned n, -// vec_ops::BitReverseConfig& config); - extern "C" void bls12_377_generate_scalars(bls12_377::scalar_t* scalars, int size); extern "C" cudaError_t bls12_377_scalar_convert_montgomery( diff --git a/icicle/include/api/bls12_381.h b/icicle/include/api/bls12_381.h index 6e034a4fa..bdb65d8b3 100644 --- a/icicle/include/api/bls12_381.h +++ b/icicle/include/api/bls12_381.h @@ -122,11 +122,6 @@ extern "C" cudaError_t bls12_381_bit_reverse_cuda( vec_ops::BitReverseConfig& config, bls12_381::scalar_t* output); -// extern "C" cudaError_t bls12_381_bit_reverse_inplace_cuda( -// bls12_381::scalar_t* input, -// unsigned n, -// vec_ops::BitReverseConfig& config); - extern "C" void bls12_381_generate_scalars(bls12_381::scalar_t* scalars, int size); extern "C" cudaError_t bls12_381_scalar_convert_montgomery( diff --git a/icicle/include/api/bn254.h b/icicle/include/api/bn254.h index 2b76882b6..0ad15f593 100644 --- a/icicle/include/api/bn254.h +++ b/icicle/include/api/bn254.h @@ -154,11 +154,6 @@ extern "C" cudaError_t bn254_bit_reverse_cuda( vec_ops::BitReverseConfig& config, bn254::scalar_t* output); -// extern "C" cudaError_t bn254_bit_reverse_inplace_cuda( -// bn254::scalar_t* input, -// unsigned n, -// vec_ops::BitReverseConfig& config); - extern "C" void bn254_generate_scalars(bn254::scalar_t* scalars, int size); extern "C" cudaError_t bn254_scalar_convert_montgomery( diff --git a/icicle/include/api/bw6_761.h b/icicle/include/api/bw6_761.h index c4f9c0bce..957e0bb92 100644 --- a/icicle/include/api/bw6_761.h +++ b/icicle/include/api/bw6_761.h @@ -122,11 +122,6 @@ extern "C" cudaError_t bw6_761_bit_reverse_cuda( vec_ops::BitReverseConfig& config, bw6_761::scalar_t* output); -// extern "C" cudaError_t bw6_761_bit_reverse_inplace_cuda( -// bw6_761::scalar_t* input, -// unsigned n, -// vec_ops::BitReverseConfig& config); - extern "C" void bw6_761_generate_scalars(bw6_761::scalar_t* scalars, int size); extern "C" cudaError_t bw6_761_scalar_convert_montgomery( diff --git a/icicle/include/api/grumpkin.h b/icicle/include/api/grumpkin.h index 22b4c539f..c547807ea 100644 --- a/icicle/include/api/grumpkin.h +++ b/icicle/include/api/grumpkin.h @@ -92,11 +92,6 @@ extern "C" cudaError_t grumpkin_bit_reverse_cuda( vec_ops::BitReverseConfig& config, grumpkin::scalar_t* output); -// extern "C" cudaError_t grumpkin_bit_reverse_inplace_cuda( -// grumpkin::scalar_t* input, -// unsigned n, -// vec_ops::BitReverseConfig& config); - extern "C" void grumpkin_generate_scalars(grumpkin::scalar_t* scalars, int size); extern "C" cudaError_t grumpkin_scalar_convert_montgomery( diff --git a/icicle/include/api/stark252.h b/icicle/include/api/stark252.h index e12d06fb8..c370257e4 100644 --- a/icicle/include/api/stark252.h +++ b/icicle/include/api/stark252.h @@ -37,11 +37,6 @@ extern "C" cudaError_t stark252_bit_reverse_cuda( vec_ops::BitReverseConfig& config, stark252::scalar_t* output); -// extern "C" cudaError_t stark252_bit_reverse_inplace_cuda( -// stark252::scalar_t* input, -// unsigned n, -// vec_ops::BitReverseConfig& config); - extern "C" void stark252_generate_scalars(stark252::scalar_t* scalars, int size); extern "C" cudaError_t stark252_scalar_convert_montgomery( diff --git a/icicle/src/ntt/thread_ntt.cu b/icicle/src/ntt/thread_ntt.cu index 8fd2764df..19321a81b 100644 --- a/icicle/src/ntt/thread_ntt.cu +++ b/icicle/src/ntt/thread_ntt.cu @@ -17,11 +17,11 @@ struct stage_metadata { #define STAGE_SIZES_DATA \ { \ {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 0, 0, 0, 0}, {5, 0, 0, 0, 0}, \ - {6, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 4, 0, 0, 0}, {5, 4, 0, 0, 0}, {5, 5, 0, 0, 0}, {6, 5, 0, 0, 0}, \ - {6, 6, 0, 0, 0}, {4, 5, 4, 0, 0}, {4, 6, 4, 0, 0}, {5, 5, 5, 0, 0}, {6, 4, 6, 0, 0}, {6, 5, 6, 0, 0}, \ - {6, 6, 6, 0, 0}, {6, 5, 4, 4, 0}, {5, 5, 5, 5, 0}, {6, 5, 5, 5, 0}, {6, 5, 5, 6, 0}, {6, 6, 6, 5, 0}, \ - {6, 6, 6, 6, 0}, {5, 5, 5, 5, 5}, {6, 5, 4, 5, 6}, {6, 5, 5, 5, 6}, {6, 5, 6, 5, 6}, {6, 6, 5, 6, 6}, \ - {6, 6, 6, 6, 6}, \ + {6, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 4, 0, 0, 0}, {5, 4, 0, 0, 0}, {5, 5, 0, 0, 0}, {6, 5, 0, 0, 0}, \ + {6, 6, 0, 0, 0}, {4, 5, 4, 0, 0}, {4, 6, 4, 0, 0}, {5, 5, 5, 0, 0}, {6, 4, 6, 0, 0}, {6, 5, 6, 0, 0}, \ + {6, 6, 6, 0, 0}, {6, 5, 4, 4, 0}, {5, 5, 5, 5, 0}, {6, 5, 5, 5, 0}, {6, 5, 5, 6, 0}, {6, 6, 6, 5, 0}, \ + {6, 6, 6, 6, 0}, {5, 5, 5, 5, 5}, {6, 5, 4, 5, 6}, {6, 5, 5, 5, 6}, {6, 5, 6, 5, 6}, {6, 6, 5, 6, 6}, \ + {6, 6, 6, 6, 6}, \ } uint32_t constexpr STAGE_SIZES_HOST[31][5] = STAGE_SIZES_DATA; __device__ constexpr uint32_t STAGE_SIZES_DEVICE[31][5] = STAGE_SIZES_DATA; @@ -33,11 +33,11 @@ uint32_t constexpr STAGE_PREV_SIZES[31] = {0, 0, 0, 0, 0, 0, 0, 0, 4, 5 #define STAGE_SIZES_DATA_FAST_TW \ { \ {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 0, 0, 0, 0}, {5, 0, 0, 0, 0}, \ - {6, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 4, 0, 0, 0}, {5, 4, 0, 0, 0}, {5, 5, 0, 0, 0}, {6, 5, 0, 0, 0}, \ - {6, 6, 0, 0, 0}, {5, 4, 4, 0, 0}, {5, 4, 5, 0, 0}, {5, 5, 5, 0, 0}, {6, 5, 5, 0, 0}, {6, 5, 6, 0, 0}, \ - {6, 6, 6, 0, 0}, {5, 5, 5, 4, 0}, {5, 5, 5, 5, 0}, {6, 5, 5, 5, 0}, {6, 5, 5, 6, 0}, {6, 6, 6, 5, 0}, \ - {6, 6, 6, 6, 0}, {5, 5, 5, 5, 5}, {6, 5, 5, 5, 5}, {6, 5, 5, 5, 6}, {6, 5, 5, 6, 6}, {6, 6, 6, 5, 6}, \ - {6, 6, 6, 6, 6}, \ + {6, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 4, 0, 0, 0}, {5, 4, 0, 0, 0}, {5, 5, 0, 0, 0}, {6, 5, 0, 0, 0}, \ + {6, 6, 0, 0, 0}, {5, 4, 4, 0, 0}, {5, 4, 5, 0, 0}, {5, 5, 5, 0, 0}, {6, 5, 5, 0, 0}, {6, 5, 6, 0, 0}, \ + {6, 6, 6, 0, 0}, {5, 5, 5, 4, 0}, {5, 5, 5, 5, 0}, {6, 5, 5, 5, 0}, {6, 5, 5, 6, 0}, {6, 6, 6, 5, 0}, \ + {6, 6, 6, 6, 0}, {5, 5, 5, 5, 5}, {6, 5, 5, 5, 5}, {6, 5, 5, 5, 6}, {6, 5, 5, 6, 6}, {6, 6, 6, 5, 6}, \ + {6, 6, 6, 6, 6}, \ } uint32_t constexpr STAGE_SIZES_HOST_FT[31][5] = STAGE_SIZES_DATA_FAST_TW; __device__ uint32_t constexpr STAGE_SIZES_DEVICE_FT[31][5] = STAGE_SIZES_DATA_FAST_TW; diff --git a/icicle/src/vec_ops/extern.cu b/icicle/src/vec_ops/extern.cu index aaeeec10a..e9504f782 100644 --- a/icicle/src/vec_ops/extern.cu +++ b/icicle/src/vec_ops/extern.cu @@ -65,9 +65,4 @@ namespace vec_ops { { return bit_reverse(input, n, config, output); } - // extern "C" cudaError_t - // CONCAT_EXPAND(FIELD, bit_reverse_inplace_cuda)(scalar_t* input, unsigned n, BitReverseConfig& config) - // { - // return bit_reverse_inplace(input, n, config); - // } } // namespace vec_ops \ No newline at end of file diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu index ed6e5050a..f3c4ff1b1 100644 --- a/icicle/src/vec_ops/vec_ops.cu +++ b/icicle/src/vec_ops/vec_ops.cu @@ -194,7 +194,9 @@ namespace vec_ops { cudaError_t bit_reverse(const E* input, unsigned size, BitReverseConfig& cfg, E* output) { if (size & (size - 1)) THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: size must be a power of 2"); - if ((input == output) & (cfg.is_input_on_device != cfg.is_output_on_device)) THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: equal devices should have same is_on_device parameters"); + if ((input == output) & (cfg.is_input_on_device != cfg.is_output_on_device)) + THROW_ICICLE_ERR( + IcicleError_t::InvalidArgument, "bit_reverse: equal devices should have same is_on_device parameters"); E* d_output; if (cfg.is_output_on_device) { diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs index af9e4ddf7..a064a0bd7 100644 --- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs @@ -335,13 +335,6 @@ macro_rules! impl_vec_ops_field { config: *const BitReverseConfig, output: *mut $field, ) -> CudaError; - - // #[link_name = concat!($field_prefix, "_bit_reverse_inplace_cuda")] - // pub(crate) fn bit_reverse_inplace_cuda( - // input: *mut $field, - // size: u32, - // config: *const BitReverseConfig, - // ) -> CudaError; } } From cd3afe3cc4af6a78eabf87f3f2e88aeac4d15438 Mon Sep 17 00:00:00 2001 From: nonam3e Date: Wed, 29 May 2024 16:59:52 +0000 Subject: [PATCH 11/14] u64 size + tests --- icicle/include/api/babybear.h | 2 +- icicle/include/api/bls12_377.h | 2 +- icicle/include/api/bls12_381.h | 2 +- icicle/include/api/bn254.h | 2 +- icicle/include/api/bw6_761.h | 2 +- icicle/include/api/grumpkin.h | 2 +- icicle/include/api/stark252.h | 2 +- icicle/src/vec_ops/extern.cu | 2 +- icicle/src/vec_ops/extern_extension.cu | 2 +- icicle/src/vec_ops/vec_ops.cu | 18 ++++---- wrappers/rust/icicle-core/src/vec_ops/mod.rs | 6 +-- .../rust/icicle-core/src/vec_ops/tests.rs | 41 +++++++++++++++---- 12 files changed, 53 insertions(+), 30 deletions(-) diff --git a/icicle/include/api/babybear.h b/icicle/include/api/babybear.h index 6703f97b4..9b0726918 100644 --- a/icicle/include/api/babybear.h +++ b/icicle/include/api/babybear.h @@ -70,7 +70,7 @@ extern "C" cudaError_t babybear_transpose_matrix_cuda( extern "C" cudaError_t babybear_bit_reverse_cuda( const babybear::scalar_t* input, - unsigned n, + uint64_t n, vec_ops::BitReverseConfig& config, babybear::scalar_t* output); diff --git a/icicle/include/api/bls12_377.h b/icicle/include/api/bls12_377.h index c033103d2..d1123c0c3 100644 --- a/icicle/include/api/bls12_377.h +++ b/icicle/include/api/bls12_377.h @@ -118,7 +118,7 @@ extern "C" cudaError_t bls12_377_transpose_matrix_cuda( extern "C" cudaError_t bls12_377_bit_reverse_cuda( const bls12_377::scalar_t* input, - unsigned n, + uint64_t n, vec_ops::BitReverseConfig& config, bls12_377::scalar_t* output); diff --git a/icicle/include/api/bls12_381.h b/icicle/include/api/bls12_381.h index bdb65d8b3..dcca3d17f 100644 --- a/icicle/include/api/bls12_381.h +++ b/icicle/include/api/bls12_381.h @@ -118,7 +118,7 @@ extern "C" cudaError_t bls12_381_transpose_matrix_cuda( extern "C" cudaError_t bls12_381_bit_reverse_cuda( const bls12_381::scalar_t* input, - unsigned n, + uint64_t n, vec_ops::BitReverseConfig& config, bls12_381::scalar_t* output); diff --git a/icicle/include/api/bn254.h b/icicle/include/api/bn254.h index 0ad15f593..9b9e7bb02 100644 --- a/icicle/include/api/bn254.h +++ b/icicle/include/api/bn254.h @@ -150,7 +150,7 @@ extern "C" cudaError_t bn254_transpose_matrix_cuda( extern "C" cudaError_t bn254_bit_reverse_cuda( const bn254::scalar_t* input, - unsigned n, + uint64_t n, vec_ops::BitReverseConfig& config, bn254::scalar_t* output); diff --git a/icicle/include/api/bw6_761.h b/icicle/include/api/bw6_761.h index 957e0bb92..78877da5e 100644 --- a/icicle/include/api/bw6_761.h +++ b/icicle/include/api/bw6_761.h @@ -118,7 +118,7 @@ extern "C" cudaError_t bw6_761_transpose_matrix_cuda( extern "C" cudaError_t bw6_761_bit_reverse_cuda( const bw6_761::scalar_t* input, - unsigned n, + uint64_t n, vec_ops::BitReverseConfig& config, bw6_761::scalar_t* output); diff --git a/icicle/include/api/grumpkin.h b/icicle/include/api/grumpkin.h index c547807ea..c72be91a8 100644 --- a/icicle/include/api/grumpkin.h +++ b/icicle/include/api/grumpkin.h @@ -88,7 +88,7 @@ extern "C" cudaError_t grumpkin_transpose_matrix_cuda( extern "C" cudaError_t grumpkin_bit_reverse_cuda( const grumpkin::scalar_t* input, - unsigned n, + uint64_t n, vec_ops::BitReverseConfig& config, grumpkin::scalar_t* output); diff --git a/icicle/include/api/stark252.h b/icicle/include/api/stark252.h index c370257e4..c04702a1a 100644 --- a/icicle/include/api/stark252.h +++ b/icicle/include/api/stark252.h @@ -33,7 +33,7 @@ extern "C" cudaError_t stark252_transpose_matrix_cuda( extern "C" cudaError_t stark252_bit_reverse_cuda( const stark252::scalar_t* input, - unsigned n, + uint64_t n, vec_ops::BitReverseConfig& config, stark252::scalar_t* output); diff --git a/icicle/src/vec_ops/extern.cu b/icicle/src/vec_ops/extern.cu index e9504f782..fc1d1ac65 100644 --- a/icicle/src/vec_ops/extern.cu +++ b/icicle/src/vec_ops/extern.cu @@ -61,7 +61,7 @@ namespace vec_ops { } extern "C" cudaError_t - CONCAT_EXPAND(FIELD, bit_reverse_cuda)(const scalar_t* input, unsigned n, BitReverseConfig& config, scalar_t* output) + CONCAT_EXPAND(FIELD, bit_reverse_cuda)(const scalar_t* input, uint64_t n, BitReverseConfig& config, scalar_t* output) { return bit_reverse(input, n, config, output); } diff --git a/icicle/src/vec_ops/extern_extension.cu b/icicle/src/vec_ops/extern_extension.cu index a91792913..12927aec7 100644 --- a/icicle/src/vec_ops/extern_extension.cu +++ b/icicle/src/vec_ops/extern_extension.cu @@ -58,7 +58,7 @@ namespace vec_ops { } extern "C" cudaError_t CONCAT_EXPAND(FIELD, extension_bit_reverse_cuda)( - const extension_t* input, unsigned n, BitReverseConfig& config, extension_t* output) + const extension_t* input, uint64_t n, BitReverseConfig& config, extension_t* output) { return bit_reverse(input, n, config, output); } diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu index f3c4ff1b1..64fe2d256 100644 --- a/icicle/src/vec_ops/vec_ops.cu +++ b/icicle/src/vec_ops/vec_ops.cu @@ -56,22 +56,22 @@ namespace vec_ops { } template - __global__ void bit_reverse_kernel(const E* input, unsigned n, unsigned shift, E* output) + __global__ void bit_reverse_kernel(const E* input, uint64_t n, unsigned shift, E* output) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; + uint64_t tid = blockIdx.x * blockDim.x + threadIdx.x; // Handling arbitrary vector size if (tid < n) { - int reversed_index = __brev(tid) >> shift; + int reversed_index = __brevll(tid) >> shift; output[reversed_index] = input[tid]; } } template - __global__ void bit_reverse_inplace_kernel(E* input, unsigned n, unsigned shift) + __global__ void bit_reverse_inplace_kernel(E* input, uint64_t n, unsigned shift) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; + uint64_t tid = blockIdx.x * blockDim.x + threadIdx.x; // Handling arbitrary vector size if (tid < n) { - int reversed_index = __brev(tid) >> shift; + int reversed_index = __brevll(tid) >> shift; if (reversed_index > tid) { E temp = input[tid]; input[tid] = input[reversed_index]; @@ -191,7 +191,7 @@ namespace vec_ops { } template - cudaError_t bit_reverse(const E* input, unsigned size, BitReverseConfig& cfg, E* output) + cudaError_t bit_reverse(const E* input, uint64_t size, BitReverseConfig& cfg, E* output) { if (size & (size - 1)) THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: size must be a power of 2"); if ((input == output) & (cfg.is_input_on_device != cfg.is_output_on_device)) @@ -206,8 +206,8 @@ namespace vec_ops { CHK_IF_RETURN(cudaMallocAsync(&d_output, sizeof(E) * size, cfg.ctx.stream)); } - unsigned shift = __builtin_clz(size) + 1; - unsigned num_blocks = (size + MAX_THREADS_PER_BLOCK - 1) / MAX_THREADS_PER_BLOCK; + uint64_t shift = __builtin_clzll(size) + 1; + uint64_t num_blocks = (size + MAX_THREADS_PER_BLOCK - 1) / MAX_THREADS_PER_BLOCK; if ((input != output) & cfg.is_input_on_device) { bit_reverse_kernel<<>>(input, size, shift, d_output); diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs index a064a0bd7..900f41f46 100644 --- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs @@ -331,7 +331,7 @@ macro_rules! impl_vec_ops_field { #[link_name = concat!($field_prefix, "_bit_reverse_cuda")] pub(crate) fn bit_reverse_cuda( input: *const $field, - size: u32, + size: u64, config: *const BitReverseConfig, output: *mut $field, ) -> CudaError; @@ -424,7 +424,7 @@ macro_rules! impl_vec_ops_field { unsafe { $field_prefix_ident::bit_reverse_cuda( input.as_ptr(), - input.len() as u32, + input.len() as u64, cfg as *const BitReverseConfig, output.as_mut_ptr(), ) @@ -439,7 +439,7 @@ macro_rules! impl_vec_ops_field { unsafe { $field_prefix_ident::bit_reverse_cuda( input.as_ptr(), - input.len() as u32, + input.len() as u64, cfg as *const BitReverseConfig, input.as_mut_ptr(), ) diff --git a/wrappers/rust/icicle-core/src/vec_ops/tests.rs b/wrappers/rust/icicle-core/src/vec_ops/tests.rs index cad95d827..1b6cd46d1 100644 --- a/wrappers/rust/icicle-core/src/vec_ops/tests.rs +++ b/wrappers/rust/icicle-core/src/vec_ops/tests.rs @@ -40,17 +40,28 @@ pub fn check_bit_reverse() where ::Config: VecOps + GenerateRandom, { - const TEST_SIZE: usize = 1 << 20; - let input = F::Config::generate_random(TEST_SIZE); - let input = HostSlice::from_slice(&input); - let mut intermediate_result = DeviceVec::::cuda_malloc(TEST_SIZE).unwrap(); + const LOG_SIZE: u32 = 20; + const TEST_SIZE: usize = 1 << LOG_SIZE; + let input_vec = F::Config::generate_random(TEST_SIZE); + let input = HostSlice::from_slice(&input_vec); + let mut intermediate = DeviceVec::::cuda_malloc(TEST_SIZE).unwrap(); let cfg = BitReverseConfig::default(); - bit_reverse(input, &cfg, &mut intermediate_result[..]).unwrap(); + bit_reverse(input, &cfg, &mut intermediate[..]).unwrap(); + + let mut intermediate_host = vec![F::one(); TEST_SIZE]; + intermediate + .copy_to_host(HostSlice::from_mut_slice(&mut intermediate_host[..])) + .unwrap(); + let index_reverser = |i: usize| i.reverse_bits() >> (usize::BITS - LOG_SIZE); + intermediate_host + .iter() + .enumerate() + .for_each(|(i, val)| assert_eq!(val, &input_vec[index_reverser(i)])); let mut result = vec![F::one(); TEST_SIZE]; let result = HostSlice::from_mut_slice(&mut result); let cfg = BitReverseConfig::default(); - bit_reverse(&intermediate_result[..], &cfg, result).unwrap(); + bit_reverse(&intermediate[..], &cfg, result).unwrap(); assert_eq!(input.as_slice(), result.as_slice()); } @@ -58,15 +69,27 @@ pub fn check_bit_reverse_inplace() where ::Config: VecOps + GenerateRandom, { - const TEST_SIZE: usize = 1 << 20; - let input = F::Config::generate_random(TEST_SIZE); - let input = HostSlice::from_slice(&input); + const LOG_SIZE: u32 = 20; + const TEST_SIZE: usize = 1 << LOG_SIZE; + let input_vec = F::Config::generate_random(TEST_SIZE); + let input = HostSlice::from_slice(&input_vec); let mut intermediate = DeviceVec::::cuda_malloc(TEST_SIZE).unwrap(); intermediate .copy_from_host(&input) .unwrap(); let cfg = BitReverseConfig::default(); bit_reverse_inplace(&mut intermediate[..], &cfg).unwrap(); + + let mut intermediate_host = vec![F::one(); TEST_SIZE]; + intermediate + .copy_to_host(HostSlice::from_mut_slice(&mut intermediate_host[..])) + .unwrap(); + let index_reverser = |i: usize| i.reverse_bits() >> (usize::BITS - LOG_SIZE); + intermediate_host + .iter() + .enumerate() + .for_each(|(i, val)| assert_eq!(val, &input_vec[index_reverser(i)])); + bit_reverse_inplace(&mut intermediate[..], &cfg).unwrap(); let mut result_host = vec![F::one(); TEST_SIZE]; intermediate From 5e6e6e84bbe50ee3a72ff1bb439f8f68658068aa Mon Sep 17 00:00:00 2001 From: ChickenLover Date: Thu, 30 May 2024 00:04:10 +0700 Subject: [PATCH 12/14] fmt --- icicle/src/ntt/thread_ntt.cu | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/icicle/src/ntt/thread_ntt.cu b/icicle/src/ntt/thread_ntt.cu index 19321a81b..8fd2764df 100644 --- a/icicle/src/ntt/thread_ntt.cu +++ b/icicle/src/ntt/thread_ntt.cu @@ -17,11 +17,11 @@ struct stage_metadata { #define STAGE_SIZES_DATA \ { \ {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 0, 0, 0, 0}, {5, 0, 0, 0, 0}, \ - {6, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 4, 0, 0, 0}, {5, 4, 0, 0, 0}, {5, 5, 0, 0, 0}, {6, 5, 0, 0, 0}, \ - {6, 6, 0, 0, 0}, {4, 5, 4, 0, 0}, {4, 6, 4, 0, 0}, {5, 5, 5, 0, 0}, {6, 4, 6, 0, 0}, {6, 5, 6, 0, 0}, \ - {6, 6, 6, 0, 0}, {6, 5, 4, 4, 0}, {5, 5, 5, 5, 0}, {6, 5, 5, 5, 0}, {6, 5, 5, 6, 0}, {6, 6, 6, 5, 0}, \ - {6, 6, 6, 6, 0}, {5, 5, 5, 5, 5}, {6, 5, 4, 5, 6}, {6, 5, 5, 5, 6}, {6, 5, 6, 5, 6}, {6, 6, 5, 6, 6}, \ - {6, 6, 6, 6, 6}, \ + {6, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 4, 0, 0, 0}, {5, 4, 0, 0, 0}, {5, 5, 0, 0, 0}, {6, 5, 0, 0, 0}, \ + {6, 6, 0, 0, 0}, {4, 5, 4, 0, 0}, {4, 6, 4, 0, 0}, {5, 5, 5, 0, 0}, {6, 4, 6, 0, 0}, {6, 5, 6, 0, 0}, \ + {6, 6, 6, 0, 0}, {6, 5, 4, 4, 0}, {5, 5, 5, 5, 0}, {6, 5, 5, 5, 0}, {6, 5, 5, 6, 0}, {6, 6, 6, 5, 0}, \ + {6, 6, 6, 6, 0}, {5, 5, 5, 5, 5}, {6, 5, 4, 5, 6}, {6, 5, 5, 5, 6}, {6, 5, 6, 5, 6}, {6, 6, 5, 6, 6}, \ + {6, 6, 6, 6, 6}, \ } uint32_t constexpr STAGE_SIZES_HOST[31][5] = STAGE_SIZES_DATA; __device__ constexpr uint32_t STAGE_SIZES_DEVICE[31][5] = STAGE_SIZES_DATA; @@ -33,11 +33,11 @@ uint32_t constexpr STAGE_PREV_SIZES[31] = {0, 0, 0, 0, 0, 0, 0, 0, 4, 5 #define STAGE_SIZES_DATA_FAST_TW \ { \ {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 0, 0, 0, 0}, {5, 0, 0, 0, 0}, \ - {6, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 4, 0, 0, 0}, {5, 4, 0, 0, 0}, {5, 5, 0, 0, 0}, {6, 5, 0, 0, 0}, \ - {6, 6, 0, 0, 0}, {5, 4, 4, 0, 0}, {5, 4, 5, 0, 0}, {5, 5, 5, 0, 0}, {6, 5, 5, 0, 0}, {6, 5, 6, 0, 0}, \ - {6, 6, 6, 0, 0}, {5, 5, 5, 4, 0}, {5, 5, 5, 5, 0}, {6, 5, 5, 5, 0}, {6, 5, 5, 6, 0}, {6, 6, 6, 5, 0}, \ - {6, 6, 6, 6, 0}, {5, 5, 5, 5, 5}, {6, 5, 5, 5, 5}, {6, 5, 5, 5, 6}, {6, 5, 5, 6, 6}, {6, 6, 6, 5, 6}, \ - {6, 6, 6, 6, 6}, \ + {6, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 4, 0, 0, 0}, {5, 4, 0, 0, 0}, {5, 5, 0, 0, 0}, {6, 5, 0, 0, 0}, \ + {6, 6, 0, 0, 0}, {5, 4, 4, 0, 0}, {5, 4, 5, 0, 0}, {5, 5, 5, 0, 0}, {6, 5, 5, 0, 0}, {6, 5, 6, 0, 0}, \ + {6, 6, 6, 0, 0}, {5, 5, 5, 4, 0}, {5, 5, 5, 5, 0}, {6, 5, 5, 5, 0}, {6, 5, 5, 6, 0}, {6, 6, 6, 5, 0}, \ + {6, 6, 6, 6, 0}, {5, 5, 5, 5, 5}, {6, 5, 5, 5, 5}, {6, 5, 5, 5, 6}, {6, 5, 5, 6, 6}, {6, 6, 6, 5, 6}, \ + {6, 6, 6, 6, 6}, \ } uint32_t constexpr STAGE_SIZES_HOST_FT[31][5] = STAGE_SIZES_DATA_FAST_TW; __device__ uint32_t constexpr STAGE_SIZES_DEVICE_FT[31][5] = STAGE_SIZES_DATA_FAST_TW; From 8c4ec1be25d0af6e7d20f2266000239220edbe8f Mon Sep 17 00:00:00 2001 From: nonam3e Date: Thu, 30 May 2024 00:29:38 +0700 Subject: [PATCH 13/14] tid --- icicle/src/vec_ops/vec_ops.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu index 64fe2d256..f7bf7479e 100644 --- a/icicle/src/vec_ops/vec_ops.cu +++ b/icicle/src/vec_ops/vec_ops.cu @@ -58,7 +58,7 @@ namespace vec_ops { template __global__ void bit_reverse_kernel(const E* input, uint64_t n, unsigned shift, E* output) { - uint64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + uint64_t tid = uint64_t(blockIdx.x) * blockDim.x + threadIdx.x; // Handling arbitrary vector size if (tid < n) { int reversed_index = __brevll(tid) >> shift; @@ -68,7 +68,7 @@ namespace vec_ops { template __global__ void bit_reverse_inplace_kernel(E* input, uint64_t n, unsigned shift) { - uint64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + uint64_t tid = uint64_t(blockIdx.x) * blockDim.x + threadIdx.x; // Handling arbitrary vector size if (tid < n) { int reversed_index = __brevll(tid) >> shift; From 69f91e5e8f57047512bc54daafb25c08b4185913 Mon Sep 17 00:00:00 2001 From: ChickenLover Date: Sun, 2 Jun 2024 10:03:01 +0300 Subject: [PATCH 14/14] disable windows test --- .github/workflows/rust.yml | 48 +++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index c486bddb5..cc80fda9d 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -79,27 +79,27 @@ jobs: cargo test --release --verbose -- --ignored cargo test --release --verbose - build-windows: - name: Build on Windows - runs-on: windows-2022 - needs: check-changed-files - steps: - - name: Checkout Repo - uses: actions/checkout@v4 - - name: Download and Install Cuda - if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true' - id: cuda-toolkit - uses: Jimver/cuda-toolkit@v0.2.11 - with: - cuda: '12.0.0' - method: 'network' - # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html - sub-packages: '["cudart", "nvcc", "thrust", "visual_studio_integration"]' - - name: Build targets - working-directory: ./wrappers/rust - if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true' - env: - CUDA_PATH: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }} - CUDA_ARCH: 50 # Using CUDA_ARCH=50 env variable since the CI machines have no GPUs - # Building from the root workspace will build all members of the workspace by default - run: cargo build --release --verbose + # build-windows: + # name: Build on Windows + # runs-on: windows-2022 + # needs: check-changed-files + # steps: + # - name: Checkout Repo + # uses: actions/checkout@v4 + # - name: Download and Install Cuda + # if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true' + # id: cuda-toolkit + # uses: Jimver/cuda-toolkit@v0.2.11 + # with: + # cuda: '12.0.0' + # method: 'network' + # # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html + # sub-packages: '["cudart", "nvcc", "thrust", "visual_studio_integration"]' + # - name: Build targets + # working-directory: ./wrappers/rust + # if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true' + # env: + # CUDA_PATH: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }} + # CUDA_ARCH: 50 # Using CUDA_ARCH=50 env variable since the CI machines have no GPUs + # # Building from the root workspace will build all members of the workspace by default + # run: cargo build --release --verbose