From 8ee9b2eee585d2281cc9d9555b96c0c7c9a38a10 Mon Sep 17 00:00:00 2001
From: nonam3e <timur@ingonyama.com>
Date: Mon, 27 May 2024 14:47:46 +0000
Subject: [PATCH 01/14] bit reverse

---
 icicle/include/api/babybear.h                 |  11 ++
 icicle/include/api/bls12_377.h                |  11 ++
 icicle/include/api/bls12_381.h                |  11 ++
 icicle/include/api/bn254.h                    |  11 ++
 icicle/include/api/bw6_761.h                  |  11 ++
 icicle/include/api/grumpkin.h                 |  11 ++
 icicle/include/api/stark252.h                 |  11 ++
 icicle/include/vec_ops/vec_ops.cuh            |  19 +++
 icicle/src/vec_ops/extern.cu                  |   9 ++
 icicle/src/vec_ops/vec_ops.cu                 |  86 ++++++++++++
 wrappers/rust/icicle-core/src/vec_ops/mod.rs  | 123 ++++++++++++++++++
 .../rust/icicle-core/src/vec_ops/tests.rs     |  43 +++++-
 .../icicle-bls12-377/src/vec_ops/mod.rs       |   2 +-
 .../icicle-bls12-381/src/vec_ops/mod.rs       |   2 +-
 .../icicle-bn254/src/vec_ops/mod.rs           |   2 +-
 .../icicle-grumpkin/src/vec_ops/mod.rs        |   2 +-
 .../icicle-babybear/src/vec_ops/mod.rs        |  54 +++++++-
 .../icicle-stark252/src/vec_ops/mod.rs        |   2 +-
 18 files changed, 413 insertions(+), 8 deletions(-)

diff --git a/icicle/include/api/babybear.h b/icicle/include/api/babybear.h
index ff2cf851e..50102ba58 100644
--- a/icicle/include/api/babybear.h
+++ b/icicle/include/api/babybear.h
@@ -68,6 +68,17 @@ extern "C" cudaError_t babybear_transpose_matrix_cuda(
   bool on_device,
   bool is_async);
 
+extern "C" cudaError_t babybear_bit_reverse_cuda(
+  const babybear::scalar_t* input,
+  unsigned n,
+  vec_ops::BitReverseConfig& config,
+  babybear::scalar_t* output);
+
+extern "C" cudaError_t babybear_bit_reverse_inplace_cuda(
+  babybear::scalar_t* input,
+  unsigned n,
+  vec_ops::BitReverseConfig& config);
+
 extern "C" void babybear_generate_scalars(babybear::scalar_t* scalars, int size);
 
 extern "C" cudaError_t babybear_scalar_convert_montgomery(
diff --git a/icicle/include/api/bls12_377.h b/icicle/include/api/bls12_377.h
index 800dc6810..e83c9656a 100644
--- a/icicle/include/api/bls12_377.h
+++ b/icicle/include/api/bls12_377.h
@@ -116,6 +116,17 @@ extern "C" cudaError_t bls12_377_transpose_matrix_cuda(
   bool on_device,
   bool is_async);
 
+extern "C" cudaError_t bls12_377_bit_reverse_cuda(
+  const bls12_377::scalar_t* input,
+  unsigned n,
+  vec_ops::BitReverseConfig& config,
+  bls12_377::scalar_t* output);
+
+extern "C" cudaError_t bls12_377_bit_reverse_inplace_cuda(
+  bls12_377::scalar_t* input,
+  unsigned n,
+  vec_ops::BitReverseConfig& config);
+
 extern "C" void bls12_377_generate_scalars(bls12_377::scalar_t* scalars, int size);
 
 extern "C" cudaError_t bls12_377_scalar_convert_montgomery(
diff --git a/icicle/include/api/bls12_381.h b/icicle/include/api/bls12_381.h
index f96c886c5..bcede3914 100644
--- a/icicle/include/api/bls12_381.h
+++ b/icicle/include/api/bls12_381.h
@@ -116,6 +116,17 @@ extern "C" cudaError_t bls12_381_transpose_matrix_cuda(
   bool on_device,
   bool is_async);
 
+extern "C" cudaError_t bls12_381_bit_reverse_cuda(
+  const bls12_381::scalar_t* input,
+  unsigned n,
+  vec_ops::BitReverseConfig& config,
+  bls12_381::scalar_t* output);
+
+extern "C" cudaError_t bls12_381_bit_reverse_inplace_cuda(
+  bls12_381::scalar_t* input,
+  unsigned n,
+  vec_ops::BitReverseConfig& config);
+
 extern "C" void bls12_381_generate_scalars(bls12_381::scalar_t* scalars, int size);
 
 extern "C" cudaError_t bls12_381_scalar_convert_montgomery(
diff --git a/icicle/include/api/bn254.h b/icicle/include/api/bn254.h
index 7c8a81b87..8753ea35a 100644
--- a/icicle/include/api/bn254.h
+++ b/icicle/include/api/bn254.h
@@ -148,6 +148,17 @@ extern "C" cudaError_t bn254_transpose_matrix_cuda(
   bool on_device,
   bool is_async);
 
+extern "C" cudaError_t bn254_bit_reverse_cuda(
+  const bn254::scalar_t* input,
+  unsigned n,
+  vec_ops::BitReverseConfig& config,
+  bn254::scalar_t* output);
+
+extern "C" cudaError_t bn254_bit_reverse_inplace_cuda(
+  bn254::scalar_t* input,
+  unsigned n,
+  vec_ops::BitReverseConfig& config);
+
 extern "C" void bn254_generate_scalars(bn254::scalar_t* scalars, int size);
 
 extern "C" cudaError_t bn254_scalar_convert_montgomery(
diff --git a/icicle/include/api/bw6_761.h b/icicle/include/api/bw6_761.h
index d10d93d7a..7ebb5e9f1 100644
--- a/icicle/include/api/bw6_761.h
+++ b/icicle/include/api/bw6_761.h
@@ -116,6 +116,17 @@ extern "C" cudaError_t bw6_761_transpose_matrix_cuda(
   bool on_device,
   bool is_async);
 
+extern "C" cudaError_t bw6_761_bit_reverse_cuda(
+  const bw6_761::scalar_t* input,
+  unsigned n,
+  vec_ops::BitReverseConfig& config,
+  bw6_761::scalar_t* output);
+
+extern "C" cudaError_t bw6_761_bit_reverse_inplace_cuda(
+  bw6_761::scalar_t* input,
+  unsigned n,
+  vec_ops::BitReverseConfig& config);
+
 extern "C" void bw6_761_generate_scalars(bw6_761::scalar_t* scalars, int size);
 
 extern "C" cudaError_t bw6_761_scalar_convert_montgomery(
diff --git a/icicle/include/api/grumpkin.h b/icicle/include/api/grumpkin.h
index 821bed6af..95d02ddd3 100644
--- a/icicle/include/api/grumpkin.h
+++ b/icicle/include/api/grumpkin.h
@@ -86,6 +86,17 @@ extern "C" cudaError_t grumpkin_transpose_matrix_cuda(
   bool on_device,
   bool is_async);
 
+extern "C" cudaError_t grumpkin_bit_reverse_cuda(
+  const grumpkin::scalar_t* input,
+  unsigned n,
+  vec_ops::BitReverseConfig& config,
+  grumpkin::scalar_t* output);
+
+extern "C" cudaError_t grumpkin_bit_reverse_inplace_cuda(
+  grumpkin::scalar_t* input,
+  unsigned n,
+  vec_ops::BitReverseConfig& config);
+
 extern "C" void grumpkin_generate_scalars(grumpkin::scalar_t* scalars, int size);
 
 extern "C" cudaError_t grumpkin_scalar_convert_montgomery(
diff --git a/icicle/include/api/stark252.h b/icicle/include/api/stark252.h
index ff9671241..2b4acc334 100644
--- a/icicle/include/api/stark252.h
+++ b/icicle/include/api/stark252.h
@@ -31,6 +31,17 @@ extern "C" cudaError_t stark252_transpose_matrix_cuda(
   bool on_device,
   bool is_async);
 
+extern "C" cudaError_t stark252_bit_reverse_cuda(
+  const stark252::scalar_t* input,
+  unsigned n,
+  vec_ops::BitReverseConfig& config,
+  stark252::scalar_t* output);
+
+extern "C" cudaError_t stark252_bit_reverse_inplace_cuda(
+  stark252::scalar_t* input,
+  unsigned n,
+  vec_ops::BitReverseConfig& config);
+
 extern "C" void stark252_generate_scalars(stark252::scalar_t* scalars, int size);
 
 extern "C" cudaError_t stark252_scalar_convert_montgomery(
diff --git a/icicle/include/vec_ops/vec_ops.cuh b/icicle/include/vec_ops/vec_ops.cuh
index f52ab686c..3135cba0b 100644
--- a/icicle/include/vec_ops/vec_ops.cuh
+++ b/icicle/include/vec_ops/vec_ops.cuh
@@ -113,6 +113,25 @@ namespace vec_ops {
     device_context::DeviceContext& ctx,
     bool on_device,
     bool is_async);
+
+  struct BitReverseConfig {
+      device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream. */
+      bool is_input_on_device; /**< True if `input` is on device and false if it is not. Default value: false. */
+      bool is_output_on_device; /**< True if `output` is on device and false if it is not. Default value: false. */
+      bool is_async; /**< Whether to run the vector operations asynchronously. If set to `true`, the function will be
+                  *   non-blocking and you'd need to synchronize it explicitly by running
+                  *   `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the
+                  *   function will block the current CPU thread. */
+  };
+  static BitReverseConfig DefaultBitReverseConfig(const device_context::DeviceContext& ctx = device_context::get_default_device_context()) {
+      BitReverseConfig config = {
+          ctx,   // ctx
+          false, // is_input_on_device
+          false, // is_output_on_device
+          false, // is_async
+      };
+      return config;
+  }
 } // namespace vec_ops
 
 #endif
diff --git a/icicle/src/vec_ops/extern.cu b/icicle/src/vec_ops/extern.cu
index 00be401a2..c1790f442 100644
--- a/icicle/src/vec_ops/extern.cu
+++ b/icicle/src/vec_ops/extern.cu
@@ -59,4 +59,13 @@ namespace vec_ops {
   {
     return transpose_matrix<scalar_t>(input, output, row_size, column_size, ctx, on_device, is_async);
   }
+
+  extern "C" cudaError_t CONCAT_EXPAND(FIELD, bit_reverse_cuda)(const scalar_t* input, unsigned n, BitReverseConfig& config, scalar_t* output)
+  {
+    return bit_reverse<scalar_t>(input, n, config, output);
+  }
+  extern "C" cudaError_t CONCAT_EXPAND(FIELD, bit_reverse_inplace_cuda)(scalar_t* input, unsigned n, BitReverseConfig& config)
+  {
+    return bit_reverse_inplace<scalar_t>(input, n, config);
+  }
 } // namespace vec_ops
\ No newline at end of file
diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu
index d87dc2a89..ed32aa926 100644
--- a/icicle/src/vec_ops/vec_ops.cu
+++ b/icicle/src/vec_ops/vec_ops.cu
@@ -54,6 +54,29 @@ namespace vec_ops {
       if (tid >= row_size * column_size) return;
       out[(tid % row_size) * column_size + (tid / row_size)] = in[tid];
     }
+
+    template <typename E>
+    __global__ void bit_reverse_kernel(const E* input, unsigned n, unsigned shift, E* output) {
+      int tid = blockIdx.x * blockDim.x + threadIdx.x;
+      // Handling arbitrary vector size
+      if (tid < n) {
+        int reversed_index = __brev(tid) >> shift;
+        output[reversed_index] = input[tid];
+      }
+    }
+    template <typename E>
+    __global__ void bit_reverse_inplace_kernel(E* input, unsigned n, unsigned shift) {
+      int tid = blockIdx.x * blockDim.x + threadIdx.x;
+      // Handling arbitrary vector size
+      if (tid < n) {
+        int reversed_index = __brev(tid) >> shift;
+        if(reversed_index > tid) {
+          E temp = input[tid];
+          input[tid] = input[reversed_index];
+          input[reversed_index] = temp;
+        }
+      }
+    }
   } // namespace
 
   template <typename E, void (*Kernel)(const E*, const E*, int, E*)>
@@ -164,4 +187,67 @@ namespace vec_ops {
 
     return CHK_LAST();
   }
+
+  template <typename E>
+  cudaError_t bit_reverse_inplace(E* input, unsigned size, BitReverseConfig& cfg) {
+    if (size & (size - 1))
+      THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: size must be a power of 2");
+    E* d_input;
+    if (cfg.is_input_on_device) {
+      d_input = input;
+    }
+    else {
+      // copy input to gpu
+      CHK_IF_RETURN(cudaMallocAsync(&d_input, sizeof(E) * size, cfg.ctx.stream));
+      CHK_IF_RETURN(cudaMemcpyAsync(d_input, input, sizeof(E) * size, cudaMemcpyHostToDevice, cfg.ctx.stream));
+    }
+    unsigned shift = __builtin_clz(size) + 1;
+    unsigned num_blocks = (size + MAX_THREADS_PER_BLOCK - 1) / MAX_THREADS_PER_BLOCK;
+    bit_reverse_inplace_kernel<<<num_blocks, MAX_THREADS_PER_BLOCK, 0, cfg.ctx.stream>>>(d_input, size, shift);
+    if (!cfg.is_input_on_device) {
+      CHK_IF_RETURN(cudaMemcpyAsync(input, d_input, sizeof(E) * size, cudaMemcpyDeviceToHost, cfg.ctx.stream));
+      CHK_IF_RETURN(cudaFreeAsync(d_input, cfg.ctx.stream));
+    }
+    if (!cfg.is_async) 
+      CHK_IF_RETURN(cudaStreamSynchronize(cfg.ctx.stream));
+    return CHK_LAST();
+  }
+
+  template <typename E>
+  cudaError_t bit_reverse(const E* input, unsigned size, BitReverseConfig& cfg, E* output) {
+    if (size & (size - 1))
+      THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: size must be a power of 2");
+    const E* d_input;
+    E* d_alloc_input;
+    if (cfg.is_input_on_device) {
+      d_input = input;
+    }
+    else {
+      // copy input to gpu
+      CHK_IF_RETURN(cudaMallocAsync(&d_alloc_input, sizeof(E) * size, cfg.ctx.stream));
+      CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_input, input, sizeof(E) * size, cudaMemcpyHostToDevice, cfg.ctx.stream));
+      d_input = d_alloc_input;
+    }
+    E* d_output;
+    if (cfg.is_output_on_device) {
+      d_output = output;
+    }
+    else {
+      // allocate output on gpu
+      CHK_IF_RETURN(cudaMallocAsync(&d_output, sizeof(E) * size, cfg.ctx.stream));
+    }
+    unsigned shift = __builtin_clz(size) + 1;
+    unsigned num_blocks = (size + MAX_THREADS_PER_BLOCK - 1) / MAX_THREADS_PER_BLOCK;
+    bit_reverse_kernel<<<num_blocks, MAX_THREADS_PER_BLOCK, 0, cfg.ctx.stream>>>(d_input, size, shift, d_output);
+    if (!cfg.is_input_on_device) {
+      CHK_IF_RETURN(cudaFreeAsync(d_alloc_input, cfg.ctx.stream));
+    }
+    if (!cfg.is_output_on_device) {
+      CHK_IF_RETURN(cudaMemcpyAsync(output, d_output, sizeof(E) * size, cudaMemcpyDeviceToHost, cfg.ctx.stream));
+      CHK_IF_RETURN(cudaFreeAsync(d_output, cfg.ctx.stream));
+    }
+    if (!cfg.is_async) 
+      CHK_IF_RETURN(cudaStreamSynchronize(cfg.ctx.stream));
+    return CHK_LAST();
+  }
 } // namespace vec_ops
\ No newline at end of file
diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs
index 5cba18424..78d38c16a 100644
--- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs
+++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs
@@ -40,6 +40,40 @@ impl<'a> VecOpsConfig<'a> {
     }
 }
 
+#[repr(C)]
+#[derive(Debug, Clone)]
+pub struct BitReverseConfig<'a> {
+    /// Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext).
+    pub ctx: DeviceContext<'a>,
+
+    /// True if inputs are on device and false if they're on host. Default value: false.
+    pub are_inputs_on_device: bool,
+
+    /// If true, output is preserved on device, otherwise on host. Default value: false.
+    pub are_outputs_on_device: bool,
+
+    /// Whether to run the vector operations asynchronously. If set to `true`, the functions will be non-blocking and you'd need to synchronize
+    /// it explicitly by running `stream.synchronize()`. If set to false, the functions will block the current CPU thread.
+    pub is_async: bool,
+}
+
+impl<'a> Default for BitReverseConfig<'a> {
+    fn default() -> Self {
+        Self::default_for_device(DEFAULT_DEVICE_ID)
+    }
+}
+
+impl<'a> BitReverseConfig<'a> {
+    pub fn default_for_device(device_id: usize) -> Self {
+        BitReverseConfig {
+            ctx: DeviceContext::default_for_device(device_id),
+            are_inputs_on_device: false,
+            are_outputs_on_device: false,
+            is_async: false,
+        }
+    }
+}
+
 #[doc(hidden)]
 pub trait VecOps<F> {
     fn add(
@@ -72,6 +106,17 @@ pub trait VecOps<F> {
         on_device: bool,
         is_async: bool,
     ) -> IcicleResult<()>;
+
+    fn bit_reverse(
+        input: &(impl HostOrDeviceSlice<F> + ?Sized),
+        cfg: &BitReverseConfig,
+        output: &mut (impl HostOrDeviceSlice<F> + ?Sized),
+    ) -> IcicleResult<()>;
+
+    fn bit_reverse_inplace(
+        input: &mut (impl HostOrDeviceSlice<F> + ?Sized),
+        cfg: &BitReverseConfig,
+    ) -> IcicleResult<()>;
 }
 
 fn check_vec_ops_args<'a, F>(
@@ -170,6 +215,29 @@ where
     <<F as FieldImpl>::Config as VecOps<F>>::transpose(input, row_size, column_size, output, ctx, on_device, is_async)
 }
 
+pub fn bit_reverse<F>(
+    input: &(impl HostOrDeviceSlice<F> + ?Sized),
+    cfg: &BitReverseConfig,
+    output: &mut (impl HostOrDeviceSlice<F> + ?Sized),
+) -> IcicleResult<()>
+where
+    F: FieldImpl,
+    <F as FieldImpl>::Config: VecOps<F>,
+{
+    <<F as FieldImpl>::Config as VecOps<F>>::bit_reverse(input, cfg, output)
+}
+
+pub fn bit_reverse_inplace<F>(
+    input: &mut (impl HostOrDeviceSlice<F> + ?Sized),
+    cfg: &BitReverseConfig,
+) -> IcicleResult<()>
+where
+    F: FieldImpl,
+    <F as FieldImpl>::Config: VecOps<F>,
+{
+    <<F as FieldImpl>::Config as VecOps<F>>::bit_reverse_inplace(input, cfg)
+}
+
 #[macro_export]
 macro_rules! impl_vec_ops_field {
     (
@@ -181,6 +249,7 @@ macro_rules! impl_vec_ops_field {
         mod $field_prefix_ident {
             use crate::vec_ops::{$field, CudaError, DeviceContext, HostOrDeviceSlice};
             use icicle_core::vec_ops::VecOpsConfig;
+            use icicle_core::vec_ops::BitReverseConfig;
 
             extern "C" {
                 #[link_name = concat!($field_prefix, "_add_cuda")]
@@ -220,6 +289,21 @@ macro_rules! impl_vec_ops_field {
                     on_device: bool,
                     is_async: bool,
                 ) -> CudaError;
+
+                #[link_name = concat!($field_prefix, "_bit_reverse_cuda")]
+                pub(crate) fn bit_reverse_cuda(
+                    input: *const $field,
+                    size: u32,
+                    config: *const BitReverseConfig,
+                    output: *mut $field,
+                ) -> CudaError;
+
+                #[link_name = concat!($field_prefix, "_bit_reverse_inplace_cuda")]
+                pub(crate) fn bit_reverse_inplace_cuda(
+                    input: *mut $field,
+                    size: u32,
+                    config: *const BitReverseConfig,
+                ) -> CudaError;
             }
         }
 
@@ -300,6 +384,36 @@ macro_rules! impl_vec_ops_field {
                     .wrap()
                 }
             }
+
+            fn bit_reverse(
+                input: &(impl HostOrDeviceSlice<$field> + ?Sized),
+                cfg: &BitReverseConfig,
+                output: &mut (impl HostOrDeviceSlice<$field> + ?Sized),
+            ) -> IcicleResult<()> {
+                unsafe {
+                    $field_prefix_ident::bit_reverse_cuda(
+                        input.as_ptr(),
+                        input.len() as u32,
+                        cfg as *const BitReverseConfig,
+                        output.as_mut_ptr(),
+                    )
+                    .wrap()
+                }
+            }
+
+            fn bit_reverse_inplace(
+                input: &mut (impl HostOrDeviceSlice<$field> + ?Sized),
+                cfg: &BitReverseConfig,
+            ) -> IcicleResult<()> {
+                unsafe {
+                    $field_prefix_ident::bit_reverse_inplace_cuda(
+                        input.as_mut_ptr(),
+                        input.len() as u32,
+                        cfg as *const BitReverseConfig,
+                    )
+                    .wrap()
+                }
+            }
         }
     };
 }
@@ -313,5 +427,14 @@ macro_rules! impl_vec_add_tests {
         pub fn test_vec_add_scalars() {
             check_vec_ops_scalars::<$field>()
         }
+
+        #[test]
+        pub fn test_bit_reverse() {
+            check_bit_reverse::<$field>()
+        }
+        #[test]
+        pub fn test_bit_reverse_inplace() {
+            check_bit_reverse_inplace::<$field>()
+        }
     };
 }
diff --git a/wrappers/rust/icicle-core/src/vec_ops/tests.rs b/wrappers/rust/icicle-core/src/vec_ops/tests.rs
index 41ac263b0..74a4a416a 100644
--- a/wrappers/rust/icicle-core/src/vec_ops/tests.rs
+++ b/wrappers/rust/icicle-core/src/vec_ops/tests.rs
@@ -1,6 +1,6 @@
 use crate::traits::GenerateRandom;
-use crate::vec_ops::{add_scalars, mul_scalars, sub_scalars, FieldImpl, VecOps, VecOpsConfig};
-use icicle_cuda_runtime::memory::HostSlice;
+use crate::vec_ops::{add_scalars, bit_reverse, bit_reverse_inplace, mul_scalars, sub_scalars, BitReverseConfig, FieldImpl, VecOps, VecOpsConfig};
+use icicle_cuda_runtime::memory::{DeviceVec, HostSlice};
 
 pub fn check_vec_ops_scalars<F: FieldImpl>()
 where
@@ -32,3 +32,42 @@ where
 
     assert_eq!(a[0], result3[0]);
 }
+
+pub fn check_bit_reverse<F: FieldImpl>()
+where
+    <F as FieldImpl>::Config: VecOps<F> + GenerateRandom<F>,
+{
+    const TEST_SIZE: usize = 1 << 20;
+    let input = F::Config::generate_random(TEST_SIZE);
+    let input = HostSlice::from_slice(&input);
+    let mut intermediate_result = DeviceVec::<F>::cuda_malloc(TEST_SIZE).unwrap();
+    let mut cfg = BitReverseConfig::default();
+    cfg.are_outputs_on_device = true;
+    bit_reverse(input, &cfg, &mut intermediate_result[..]).unwrap();
+
+    let mut result = vec![F::one(); TEST_SIZE];
+    let result = HostSlice::from_mut_slice(&mut result);
+    let mut cfg = BitReverseConfig::default();
+    cfg.are_outputs_on_device = false;
+    cfg.are_inputs_on_device = true;
+    bit_reverse(&intermediate_result[..], &cfg, result).unwrap();
+    assert_eq!(input.as_slice(), result.as_slice());
+}
+
+pub fn check_bit_reverse_inplace<F: FieldImpl>()
+where
+    <F as FieldImpl>::Config: VecOps<F> + GenerateRandom<F>,
+{
+    const TEST_SIZE: usize = 1 << 20;
+    let input = F::Config::generate_random(TEST_SIZE);
+    let input = HostSlice::from_slice(&input);
+    let mut intermediate = DeviceVec::<F>::cuda_malloc(TEST_SIZE).unwrap();
+    intermediate.copy_from_host(&input).unwrap();
+    let mut cfg = BitReverseConfig::default();
+    cfg.are_inputs_on_device = true;
+    bit_reverse_inplace(&mut intermediate[..], &cfg).unwrap();
+    bit_reverse_inplace(&mut intermediate[..], &cfg).unwrap();
+    let mut result_host = vec![F::one(); TEST_SIZE]; 
+    intermediate.copy_to_host(HostSlice::from_mut_slice(&mut result_host[..])).unwrap();
+    assert_eq!(input.as_slice(), result_host.as_slice());
+}
\ No newline at end of file
diff --git a/wrappers/rust/icicle-curves/icicle-bls12-377/src/vec_ops/mod.rs b/wrappers/rust/icicle-curves/icicle-bls12-377/src/vec_ops/mod.rs
index e41b3f17c..8d01706ab 100644
--- a/wrappers/rust/icicle-curves/icicle-bls12-377/src/vec_ops/mod.rs
+++ b/wrappers/rust/icicle-curves/icicle-bls12-377/src/vec_ops/mod.rs
@@ -5,7 +5,7 @@ use crate::curve::{ScalarCfg, ScalarField};
 use icicle_core::error::IcicleResult;
 use icicle_core::impl_vec_ops_field;
 use icicle_core::traits::IcicleResultWrap;
-use icicle_core::vec_ops::{VecOps, VecOpsConfig};
+use icicle_core::vec_ops::{VecOps, VecOpsConfig, BitReverseConfig};
 use icicle_cuda_runtime::device_context::DeviceContext;
 use icicle_cuda_runtime::error::CudaError;
 use icicle_cuda_runtime::memory::HostOrDeviceSlice;
diff --git a/wrappers/rust/icicle-curves/icicle-bls12-381/src/vec_ops/mod.rs b/wrappers/rust/icicle-curves/icicle-bls12-381/src/vec_ops/mod.rs
index 01cff3db7..f543eb263 100644
--- a/wrappers/rust/icicle-curves/icicle-bls12-381/src/vec_ops/mod.rs
+++ b/wrappers/rust/icicle-curves/icicle-bls12-381/src/vec_ops/mod.rs
@@ -3,7 +3,7 @@ use crate::curve::{ScalarCfg, ScalarField};
 use icicle_core::error::IcicleResult;
 use icicle_core::impl_vec_ops_field;
 use icicle_core::traits::IcicleResultWrap;
-use icicle_core::vec_ops::{VecOps, VecOpsConfig};
+use icicle_core::vec_ops::{VecOps, VecOpsConfig, BitReverseConfig};
 use icicle_cuda_runtime::device_context::DeviceContext;
 use icicle_cuda_runtime::error::CudaError;
 use icicle_cuda_runtime::memory::HostOrDeviceSlice;
diff --git a/wrappers/rust/icicle-curves/icicle-bn254/src/vec_ops/mod.rs b/wrappers/rust/icicle-curves/icicle-bn254/src/vec_ops/mod.rs
index 9447aa8dc..ad4eaadb2 100644
--- a/wrappers/rust/icicle-curves/icicle-bn254/src/vec_ops/mod.rs
+++ b/wrappers/rust/icicle-curves/icicle-bn254/src/vec_ops/mod.rs
@@ -3,7 +3,7 @@ use crate::curve::{ScalarCfg, ScalarField};
 use icicle_core::error::IcicleResult;
 use icicle_core::impl_vec_ops_field;
 use icicle_core::traits::IcicleResultWrap;
-use icicle_core::vec_ops::{VecOps, VecOpsConfig};
+use icicle_core::vec_ops::{VecOps, VecOpsConfig, BitReverseConfig};
 use icicle_cuda_runtime::device_context::DeviceContext;
 use icicle_cuda_runtime::error::CudaError;
 use icicle_cuda_runtime::memory::HostOrDeviceSlice;
diff --git a/wrappers/rust/icicle-curves/icicle-grumpkin/src/vec_ops/mod.rs b/wrappers/rust/icicle-curves/icicle-grumpkin/src/vec_ops/mod.rs
index c8954d864..7b9a42eb5 100644
--- a/wrappers/rust/icicle-curves/icicle-grumpkin/src/vec_ops/mod.rs
+++ b/wrappers/rust/icicle-curves/icicle-grumpkin/src/vec_ops/mod.rs
@@ -3,7 +3,7 @@ use crate::curve::{ScalarCfg, ScalarField};
 use icicle_core::error::IcicleResult;
 use icicle_core::impl_vec_ops_field;
 use icicle_core::traits::IcicleResultWrap;
-use icicle_core::vec_ops::{VecOps, VecOpsConfig};
+use icicle_core::vec_ops::{VecOps, VecOpsConfig, BitReverseConfig};
 use icicle_cuda_runtime::device_context::DeviceContext;
 use icicle_cuda_runtime::error::CudaError;
 use icicle_cuda_runtime::memory::HostOrDeviceSlice;
diff --git a/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs b/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs
index 8df5fbfbf..a98ad7b40 100644
--- a/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs
+++ b/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs
@@ -3,7 +3,7 @@ use crate::field::{ExtensionCfg, ExtensionField, ScalarCfg, ScalarField};
 use icicle_core::error::IcicleResult;
 use icicle_core::impl_vec_ops_field;
 use icicle_core::traits::IcicleResultWrap;
-use icicle_core::vec_ops::{VecOps, VecOpsConfig};
+use icicle_core::vec_ops::{VecOps, VecOpsConfig, BitReverseConfig};
 use icicle_cuda_runtime::device_context::DeviceContext;
 use icicle_cuda_runtime::error::CudaError;
 use icicle_cuda_runtime::memory::HostOrDeviceSlice;
@@ -11,6 +11,57 @@ use icicle_cuda_runtime::memory::HostOrDeviceSlice;
 impl_vec_ops_field!("babybear", babybear, ScalarField, ScalarCfg);
 impl_vec_ops_field!("babybear_extension", babybear_extension, ExtensionField, ExtensionCfg);
 
+// #[repr(C)]
+// #[derive(Debug, Clone)]
+// pub struct BitReverseConfig<'a> {
+//     /// Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext).
+//     pub ctx: DeviceContext<'a>,
+
+//     /// True if inputs are on device and false if they're on host. Default value: false.
+//     pub are_inputs_on_device: bool,
+
+//     /// If true, output is preserved on device, otherwise on host. Default value: false.
+//     pub are_outputs_on_device: bool,
+
+//     /// Whether to run the Keccak asynchronously. If set to `true`, the keccak_hash function will be
+//     /// non-blocking and you'd need to synchronize it explicitly by running
+//     /// `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, keccak_hash
+//     /// function will block the current CPU thread.
+//     pub is_async: bool,
+// }
+
+// impl<'a> Default for BitReverseConfig<'a> {
+//     fn default() -> Self {
+//         Self::default_for_device(DEFAULT_DEVICE_ID)
+//     }
+// }
+
+// impl<'a> BitReverseConfig<'a> {
+//     pub fn default_for_device(device_id: usize) -> Self {
+//         BitReverseConfig {
+//             ctx: DeviceContext::default_for_device(device_id),
+//             are_inputs_on_device: false,
+//             are_outputs_on_device: false,
+//             is_async: false,
+//         }
+//     }
+// }
+
+extern "C" {
+    pub(crate) fn babybear_bit_reverse_cuda(
+        input: *const ScalarField,
+        size: u32,
+        config: &BitReverseConfig,
+        output: *mut ScalarField
+    ) -> CudaError;
+
+    pub(crate) fn babybear_bit_reverse_inplace_cuda(
+        input: *mut ScalarField,
+        size: u32,
+        config: &BitReverseConfig
+    ) -> CudaError;
+}
+
 #[cfg(test)]
 pub(crate) mod tests {
     use crate::field::{ExtensionField, ScalarField};
@@ -18,6 +69,7 @@ pub(crate) mod tests {
     use icicle_core::vec_ops::tests::*;
 
     impl_vec_add_tests!(ScalarField);
+
     mod extension {
         use super::*;
 
diff --git a/wrappers/rust/icicle-fields/icicle-stark252/src/vec_ops/mod.rs b/wrappers/rust/icicle-fields/icicle-stark252/src/vec_ops/mod.rs
index 8e7e88465..1d431ab8f 100644
--- a/wrappers/rust/icicle-fields/icicle-stark252/src/vec_ops/mod.rs
+++ b/wrappers/rust/icicle-fields/icicle-stark252/src/vec_ops/mod.rs
@@ -3,7 +3,7 @@ use crate::field::{ScalarCfg, ScalarField};
 use icicle_core::error::IcicleResult;
 use icicle_core::impl_vec_ops_field;
 use icicle_core::traits::IcicleResultWrap;
-use icicle_core::vec_ops::{VecOps, VecOpsConfig};
+use icicle_core::vec_ops::{VecOps, VecOpsConfig, BitReverseConfig};
 use icicle_cuda_runtime::device_context::DeviceContext;
 use icicle_cuda_runtime::error::CudaError;
 use icicle_cuda_runtime::memory::HostOrDeviceSlice;

From 48d91145a7042ddbbb25ade01777ac493be34aba Mon Sep 17 00:00:00 2001
From: nonam3e <timur@ingonyama.com>
Date: Mon, 27 May 2024 14:59:57 +0000
Subject: [PATCH 02/14] cleanup

---
 icicle/include/vec_ops/vec_ops.cuh            | 32 ++++++++-------
 icicle/src/vec_ops/extern.cu                  | 12 +++---
 icicle/src/vec_ops/vec_ops.cu                 | 39 ++++++++-----------
 wrappers/rust/icicle-core/src/vec_ops/mod.rs  |  2 +-
 .../rust/icicle-core/src/vec_ops/tests.rs     | 17 +++++---
 .../icicle-bls12-377/src/vec_ops/mod.rs       |  2 +-
 .../icicle-bls12-381/src/vec_ops/mod.rs       |  2 +-
 .../icicle-bn254/src/vec_ops/mod.rs           |  2 +-
 .../icicle-grumpkin/src/vec_ops/mod.rs        |  2 +-
 .../icicle-babybear/src/vec_ops/mod.rs        |  6 +--
 .../icicle-stark252/src/vec_ops/mod.rs        |  2 +-
 11 files changed, 62 insertions(+), 56 deletions(-)

diff --git a/icicle/include/vec_ops/vec_ops.cuh b/icicle/include/vec_ops/vec_ops.cuh
index 3135cba0b..909c6886b 100644
--- a/icicle/include/vec_ops/vec_ops.cuh
+++ b/icicle/include/vec_ops/vec_ops.cuh
@@ -115,22 +115,24 @@ namespace vec_ops {
     bool is_async);
 
   struct BitReverseConfig {
-      device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream. */
-      bool is_input_on_device; /**< True if `input` is on device and false if it is not. Default value: false. */
-      bool is_output_on_device; /**< True if `output` is on device and false if it is not. Default value: false. */
-      bool is_async; /**< Whether to run the vector operations asynchronously. If set to `true`, the function will be
-                  *   non-blocking and you'd need to synchronize it explicitly by running
-                  *   `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the
-                  *   function will block the current CPU thread. */
+    device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream. */
+    bool is_input_on_device;  /**< True if `input` is on device and false if it is not. Default value: false. */
+    bool is_output_on_device; /**< True if `output` is on device and false if it is not. Default value: false. */
+    bool is_async; /**< Whether to run the vector operations asynchronously. If set to `true`, the function will be
+                    *   non-blocking and you'd need to synchronize it explicitly by running
+                    *   `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the
+                    *   function will block the current CPU thread. */
   };
-  static BitReverseConfig DefaultBitReverseConfig(const device_context::DeviceContext& ctx = device_context::get_default_device_context()) {
-      BitReverseConfig config = {
-          ctx,   // ctx
-          false, // is_input_on_device
-          false, // is_output_on_device
-          false, // is_async
-      };
-      return config;
+  static BitReverseConfig
+  DefaultBitReverseConfig(const device_context::DeviceContext& ctx = device_context::get_default_device_context())
+  {
+    BitReverseConfig config = {
+      ctx,   // ctx
+      false, // is_input_on_device
+      false, // is_output_on_device
+      false, // is_async
+    };
+    return config;
   }
 } // namespace vec_ops
 
diff --git a/icicle/src/vec_ops/extern.cu b/icicle/src/vec_ops/extern.cu
index c1790f442..7c9079a46 100644
--- a/icicle/src/vec_ops/extern.cu
+++ b/icicle/src/vec_ops/extern.cu
@@ -13,7 +13,7 @@ namespace vec_ops {
    * @return `cudaSuccess` if the execution was successful and an error code otherwise.
    */
   extern "C" cudaError_t
-  CONCAT_EXPAND(FIELD, mul_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result)
+    CONCAT_EXPAND(FIELD, mul_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result)
   {
     return mul<scalar_t>(vec_a, vec_b, n, config, result);
   }
@@ -25,7 +25,7 @@ namespace vec_ops {
    * @return `cudaSuccess` if the execution was successful and an error code otherwise.
    */
   extern "C" cudaError_t
-  CONCAT_EXPAND(FIELD, add_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result)
+    CONCAT_EXPAND(FIELD, add_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result)
   {
     return add<scalar_t>(vec_a, vec_b, n, config, result);
   }
@@ -37,7 +37,7 @@ namespace vec_ops {
    * @return `cudaSuccess` if the execution was successful and an error code otherwise.
    */
   extern "C" cudaError_t
-  CONCAT_EXPAND(FIELD, sub_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result)
+    CONCAT_EXPAND(FIELD, sub_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result)
   {
     return sub<scalar_t>(vec_a, vec_b, n, config, result);
   }
@@ -60,11 +60,13 @@ namespace vec_ops {
     return transpose_matrix<scalar_t>(input, output, row_size, column_size, ctx, on_device, is_async);
   }
 
-  extern "C" cudaError_t CONCAT_EXPAND(FIELD, bit_reverse_cuda)(const scalar_t* input, unsigned n, BitReverseConfig& config, scalar_t* output)
+  extern "C" cudaError_t CONCAT_EXPAND(FIELD, bit_reverse_cuda)(
+    const scalar_t* input, unsigned n, BitReverseConfig& config, scalar_t* output)
   {
     return bit_reverse<scalar_t>(input, n, config, output);
   }
-  extern "C" cudaError_t CONCAT_EXPAND(FIELD, bit_reverse_inplace_cuda)(scalar_t* input, unsigned n, BitReverseConfig& config)
+  extern "C" cudaError_t
+    CONCAT_EXPAND(FIELD, bit_reverse_inplace_cuda)(scalar_t* input, unsigned n, BitReverseConfig& config)
   {
     return bit_reverse_inplace<scalar_t>(input, n, config);
   }
diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu
index ed32aa926..86445a69e 100644
--- a/icicle/src/vec_ops/vec_ops.cu
+++ b/icicle/src/vec_ops/vec_ops.cu
@@ -56,7 +56,8 @@ namespace vec_ops {
     }
 
     template <typename E>
-    __global__ void bit_reverse_kernel(const E* input, unsigned n, unsigned shift, E* output) {
+    __global__ void bit_reverse_kernel(const E* input, unsigned n, unsigned shift, E* output)
+    {
       int tid = blockIdx.x * blockDim.x + threadIdx.x;
       // Handling arbitrary vector size
       if (tid < n) {
@@ -65,12 +66,13 @@ namespace vec_ops {
       }
     }
     template <typename E>
-    __global__ void bit_reverse_inplace_kernel(E* input, unsigned n, unsigned shift) {
+    __global__ void bit_reverse_inplace_kernel(E* input, unsigned n, unsigned shift)
+    {
       int tid = blockIdx.x * blockDim.x + threadIdx.x;
       // Handling arbitrary vector size
       if (tid < n) {
         int reversed_index = __brev(tid) >> shift;
-        if(reversed_index > tid) {
+        if (reversed_index > tid) {
           E temp = input[tid];
           input[tid] = input[reversed_index];
           input[reversed_index] = temp;
@@ -189,14 +191,13 @@ namespace vec_ops {
   }
 
   template <typename E>
-  cudaError_t bit_reverse_inplace(E* input, unsigned size, BitReverseConfig& cfg) {
-    if (size & (size - 1))
-      THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: size must be a power of 2");
+  cudaError_t bit_reverse_inplace(E* input, unsigned size, BitReverseConfig& cfg)
+  {
+    if (size & (size - 1)) THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: size must be a power of 2");
     E* d_input;
     if (cfg.is_input_on_device) {
       d_input = input;
-    }
-    else {
+    } else {
       // copy input to gpu
       CHK_IF_RETURN(cudaMallocAsync(&d_input, sizeof(E) * size, cfg.ctx.stream));
       CHK_IF_RETURN(cudaMemcpyAsync(d_input, input, sizeof(E) * size, cudaMemcpyHostToDevice, cfg.ctx.stream));
@@ -208,21 +209,19 @@ namespace vec_ops {
       CHK_IF_RETURN(cudaMemcpyAsync(input, d_input, sizeof(E) * size, cudaMemcpyDeviceToHost, cfg.ctx.stream));
       CHK_IF_RETURN(cudaFreeAsync(d_input, cfg.ctx.stream));
     }
-    if (!cfg.is_async) 
-      CHK_IF_RETURN(cudaStreamSynchronize(cfg.ctx.stream));
+    if (!cfg.is_async) CHK_IF_RETURN(cudaStreamSynchronize(cfg.ctx.stream));
     return CHK_LAST();
   }
 
   template <typename E>
-  cudaError_t bit_reverse(const E* input, unsigned size, BitReverseConfig& cfg, E* output) {
-    if (size & (size - 1))
-      THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: size must be a power of 2");
+  cudaError_t bit_reverse(const E* input, unsigned size, BitReverseConfig& cfg, E* output)
+  {
+    if (size & (size - 1)) THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: size must be a power of 2");
     const E* d_input;
     E* d_alloc_input;
     if (cfg.is_input_on_device) {
       d_input = input;
-    }
-    else {
+    } else {
       // copy input to gpu
       CHK_IF_RETURN(cudaMallocAsync(&d_alloc_input, sizeof(E) * size, cfg.ctx.stream));
       CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_input, input, sizeof(E) * size, cudaMemcpyHostToDevice, cfg.ctx.stream));
@@ -231,23 +230,19 @@ namespace vec_ops {
     E* d_output;
     if (cfg.is_output_on_device) {
       d_output = output;
-    }
-    else {
+    } else {
       // allocate output on gpu
       CHK_IF_RETURN(cudaMallocAsync(&d_output, sizeof(E) * size, cfg.ctx.stream));
     }
     unsigned shift = __builtin_clz(size) + 1;
     unsigned num_blocks = (size + MAX_THREADS_PER_BLOCK - 1) / MAX_THREADS_PER_BLOCK;
     bit_reverse_kernel<<<num_blocks, MAX_THREADS_PER_BLOCK, 0, cfg.ctx.stream>>>(d_input, size, shift, d_output);
-    if (!cfg.is_input_on_device) {
-      CHK_IF_RETURN(cudaFreeAsync(d_alloc_input, cfg.ctx.stream));
-    }
+    if (!cfg.is_input_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_input, cfg.ctx.stream)); }
     if (!cfg.is_output_on_device) {
       CHK_IF_RETURN(cudaMemcpyAsync(output, d_output, sizeof(E) * size, cudaMemcpyDeviceToHost, cfg.ctx.stream));
       CHK_IF_RETURN(cudaFreeAsync(d_output, cfg.ctx.stream));
     }
-    if (!cfg.is_async) 
-      CHK_IF_RETURN(cudaStreamSynchronize(cfg.ctx.stream));
+    if (!cfg.is_async) CHK_IF_RETURN(cudaStreamSynchronize(cfg.ctx.stream));
     return CHK_LAST();
   }
 } // namespace vec_ops
\ No newline at end of file
diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs
index 78d38c16a..536649d7e 100644
--- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs
+++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs
@@ -248,8 +248,8 @@ macro_rules! impl_vec_ops_field {
     ) => {
         mod $field_prefix_ident {
             use crate::vec_ops::{$field, CudaError, DeviceContext, HostOrDeviceSlice};
-            use icicle_core::vec_ops::VecOpsConfig;
             use icicle_core::vec_ops::BitReverseConfig;
+            use icicle_core::vec_ops::VecOpsConfig;
 
             extern "C" {
                 #[link_name = concat!($field_prefix, "_add_cuda")]
diff --git a/wrappers/rust/icicle-core/src/vec_ops/tests.rs b/wrappers/rust/icicle-core/src/vec_ops/tests.rs
index 74a4a416a..a1a45671b 100644
--- a/wrappers/rust/icicle-core/src/vec_ops/tests.rs
+++ b/wrappers/rust/icicle-core/src/vec_ops/tests.rs
@@ -1,5 +1,8 @@
 use crate::traits::GenerateRandom;
-use crate::vec_ops::{add_scalars, bit_reverse, bit_reverse_inplace, mul_scalars, sub_scalars, BitReverseConfig, FieldImpl, VecOps, VecOpsConfig};
+use crate::vec_ops::{
+    add_scalars, bit_reverse, bit_reverse_inplace, mul_scalars, sub_scalars, BitReverseConfig, FieldImpl, VecOps,
+    VecOpsConfig,
+};
 use icicle_cuda_runtime::memory::{DeviceVec, HostSlice};
 
 pub fn check_vec_ops_scalars<F: FieldImpl>()
@@ -62,12 +65,16 @@ where
     let input = F::Config::generate_random(TEST_SIZE);
     let input = HostSlice::from_slice(&input);
     let mut intermediate = DeviceVec::<F>::cuda_malloc(TEST_SIZE).unwrap();
-    intermediate.copy_from_host(&input).unwrap();
+    intermediate
+        .copy_from_host(&input)
+        .unwrap();
     let mut cfg = BitReverseConfig::default();
     cfg.are_inputs_on_device = true;
     bit_reverse_inplace(&mut intermediate[..], &cfg).unwrap();
     bit_reverse_inplace(&mut intermediate[..], &cfg).unwrap();
-    let mut result_host = vec![F::one(); TEST_SIZE]; 
-    intermediate.copy_to_host(HostSlice::from_mut_slice(&mut result_host[..])).unwrap();
+    let mut result_host = vec![F::one(); TEST_SIZE];
+    intermediate
+        .copy_to_host(HostSlice::from_mut_slice(&mut result_host[..]))
+        .unwrap();
     assert_eq!(input.as_slice(), result_host.as_slice());
-}
\ No newline at end of file
+}
diff --git a/wrappers/rust/icicle-curves/icicle-bls12-377/src/vec_ops/mod.rs b/wrappers/rust/icicle-curves/icicle-bls12-377/src/vec_ops/mod.rs
index 8d01706ab..959f24e60 100644
--- a/wrappers/rust/icicle-curves/icicle-bls12-377/src/vec_ops/mod.rs
+++ b/wrappers/rust/icicle-curves/icicle-bls12-377/src/vec_ops/mod.rs
@@ -5,7 +5,7 @@ use crate::curve::{ScalarCfg, ScalarField};
 use icicle_core::error::IcicleResult;
 use icicle_core::impl_vec_ops_field;
 use icicle_core::traits::IcicleResultWrap;
-use icicle_core::vec_ops::{VecOps, VecOpsConfig, BitReverseConfig};
+use icicle_core::vec_ops::{BitReverseConfig, VecOps, VecOpsConfig};
 use icicle_cuda_runtime::device_context::DeviceContext;
 use icicle_cuda_runtime::error::CudaError;
 use icicle_cuda_runtime::memory::HostOrDeviceSlice;
diff --git a/wrappers/rust/icicle-curves/icicle-bls12-381/src/vec_ops/mod.rs b/wrappers/rust/icicle-curves/icicle-bls12-381/src/vec_ops/mod.rs
index f543eb263..9591c15b0 100644
--- a/wrappers/rust/icicle-curves/icicle-bls12-381/src/vec_ops/mod.rs
+++ b/wrappers/rust/icicle-curves/icicle-bls12-381/src/vec_ops/mod.rs
@@ -3,7 +3,7 @@ use crate::curve::{ScalarCfg, ScalarField};
 use icicle_core::error::IcicleResult;
 use icicle_core::impl_vec_ops_field;
 use icicle_core::traits::IcicleResultWrap;
-use icicle_core::vec_ops::{VecOps, VecOpsConfig, BitReverseConfig};
+use icicle_core::vec_ops::{BitReverseConfig, VecOps, VecOpsConfig};
 use icicle_cuda_runtime::device_context::DeviceContext;
 use icicle_cuda_runtime::error::CudaError;
 use icicle_cuda_runtime::memory::HostOrDeviceSlice;
diff --git a/wrappers/rust/icicle-curves/icicle-bn254/src/vec_ops/mod.rs b/wrappers/rust/icicle-curves/icicle-bn254/src/vec_ops/mod.rs
index ad4eaadb2..fea979e6b 100644
--- a/wrappers/rust/icicle-curves/icicle-bn254/src/vec_ops/mod.rs
+++ b/wrappers/rust/icicle-curves/icicle-bn254/src/vec_ops/mod.rs
@@ -3,7 +3,7 @@ use crate::curve::{ScalarCfg, ScalarField};
 use icicle_core::error::IcicleResult;
 use icicle_core::impl_vec_ops_field;
 use icicle_core::traits::IcicleResultWrap;
-use icicle_core::vec_ops::{VecOps, VecOpsConfig, BitReverseConfig};
+use icicle_core::vec_ops::{BitReverseConfig, VecOps, VecOpsConfig};
 use icicle_cuda_runtime::device_context::DeviceContext;
 use icicle_cuda_runtime::error::CudaError;
 use icicle_cuda_runtime::memory::HostOrDeviceSlice;
diff --git a/wrappers/rust/icicle-curves/icicle-grumpkin/src/vec_ops/mod.rs b/wrappers/rust/icicle-curves/icicle-grumpkin/src/vec_ops/mod.rs
index 7b9a42eb5..0ef913c9b 100644
--- a/wrappers/rust/icicle-curves/icicle-grumpkin/src/vec_ops/mod.rs
+++ b/wrappers/rust/icicle-curves/icicle-grumpkin/src/vec_ops/mod.rs
@@ -3,7 +3,7 @@ use crate::curve::{ScalarCfg, ScalarField};
 use icicle_core::error::IcicleResult;
 use icicle_core::impl_vec_ops_field;
 use icicle_core::traits::IcicleResultWrap;
-use icicle_core::vec_ops::{VecOps, VecOpsConfig, BitReverseConfig};
+use icicle_core::vec_ops::{BitReverseConfig, VecOps, VecOpsConfig};
 use icicle_cuda_runtime::device_context::DeviceContext;
 use icicle_cuda_runtime::error::CudaError;
 use icicle_cuda_runtime::memory::HostOrDeviceSlice;
diff --git a/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs b/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs
index a98ad7b40..451f4aeaa 100644
--- a/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs
+++ b/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs
@@ -3,7 +3,7 @@ use crate::field::{ExtensionCfg, ExtensionField, ScalarCfg, ScalarField};
 use icicle_core::error::IcicleResult;
 use icicle_core::impl_vec_ops_field;
 use icicle_core::traits::IcicleResultWrap;
-use icicle_core::vec_ops::{VecOps, VecOpsConfig, BitReverseConfig};
+use icicle_core::vec_ops::{BitReverseConfig, VecOps, VecOpsConfig};
 use icicle_cuda_runtime::device_context::DeviceContext;
 use icicle_cuda_runtime::error::CudaError;
 use icicle_cuda_runtime::memory::HostOrDeviceSlice;
@@ -52,13 +52,13 @@ extern "C" {
         input: *const ScalarField,
         size: u32,
         config: &BitReverseConfig,
-        output: *mut ScalarField
+        output: *mut ScalarField,
     ) -> CudaError;
 
     pub(crate) fn babybear_bit_reverse_inplace_cuda(
         input: *mut ScalarField,
         size: u32,
-        config: &BitReverseConfig
+        config: &BitReverseConfig,
     ) -> CudaError;
 }
 
diff --git a/wrappers/rust/icicle-fields/icicle-stark252/src/vec_ops/mod.rs b/wrappers/rust/icicle-fields/icicle-stark252/src/vec_ops/mod.rs
index 1d431ab8f..4fcd95205 100644
--- a/wrappers/rust/icicle-fields/icicle-stark252/src/vec_ops/mod.rs
+++ b/wrappers/rust/icicle-fields/icicle-stark252/src/vec_ops/mod.rs
@@ -3,7 +3,7 @@ use crate::field::{ScalarCfg, ScalarField};
 use icicle_core::error::IcicleResult;
 use icicle_core::impl_vec_ops_field;
 use icicle_core::traits::IcicleResultWrap;
-use icicle_core::vec_ops::{VecOps, VecOpsConfig, BitReverseConfig};
+use icicle_core::vec_ops::{BitReverseConfig, VecOps, VecOpsConfig};
 use icicle_cuda_runtime::device_context::DeviceContext;
 use icicle_cuda_runtime::error::CudaError;
 use icicle_cuda_runtime::memory::HostOrDeviceSlice;

From 00deb481594b583fd0bb6ffa1dc1967164679054 Mon Sep 17 00:00:00 2001
From: nonam3e <timur@ingonyama.com>
Date: Mon, 27 May 2024 15:04:02 +0000
Subject: [PATCH 03/14] typo

---
 docs/docs/icicle/rust-bindings/ntt.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/docs/icicle/rust-bindings/ntt.md b/docs/docs/icicle/rust-bindings/ntt.md
index 5636c6081..9d0b8f62f 100644
--- a/docs/docs/icicle/rust-bindings/ntt.md
+++ b/docs/docs/icicle/rust-bindings/ntt.md
@@ -180,7 +180,7 @@ where
 
 - **`IcicleResult<()>`**: Will return an error if the operation fails.
 
-### Releaseing the domain
+### Releasing the domain
 
 The `release_domain` function is responsible for releasing the resources associated with a specific domain in the CUDA device context.
 

From 769f9268e421e46cb66eea53a53c46ecc290543d Mon Sep 17 00:00:00 2001
From: nonam3e <timur@ingonyama.com>
Date: Tue, 28 May 2024 08:18:49 +0000
Subject: [PATCH 04/14] merge bit_reverse funcs

---
 icicle/include/api/babybear.h                 |  8 +--
 icicle/include/api/bls12_377.h                |  8 +--
 icicle/include/api/bls12_381.h                |  8 +--
 icicle/include/api/bn254.h                    |  8 +--
 icicle/include/api/bw6_761.h                  |  8 +--
 icicle/include/api/grumpkin.h                 |  8 +--
 icicle/include/api/stark252.h                 |  8 +--
 icicle/src/vec_ops/extern.cu                  | 10 +--
 icicle/src/vec_ops/extern_extension.cu        |  6 ++
 icicle/src/vec_ops/vec_ops.cu                 | 31 ++++++----
 wrappers/rust/icicle-core/src/vec_ops/mod.rs  | 61 ++++++++++++++-----
 .../rust/icicle-core/src/vec_ops/tests.rs     | 10 +--
 .../icicle-babybear/src/vec_ops/mod.rs        | 50 ---------------
 13 files changed, 108 insertions(+), 116 deletions(-)

diff --git a/icicle/include/api/babybear.h b/icicle/include/api/babybear.h
index 50102ba58..7d06141d3 100644
--- a/icicle/include/api/babybear.h
+++ b/icicle/include/api/babybear.h
@@ -74,10 +74,10 @@ extern "C" cudaError_t babybear_bit_reverse_cuda(
   vec_ops::BitReverseConfig& config,
   babybear::scalar_t* output);
 
-extern "C" cudaError_t babybear_bit_reverse_inplace_cuda(
-  babybear::scalar_t* input,
-  unsigned n,
-  vec_ops::BitReverseConfig& config);
+// extern "C" cudaError_t babybear_bit_reverse_inplace_cuda(
+//   babybear::scalar_t* input,
+//   unsigned n,
+//   vec_ops::BitReverseConfig& config);
 
 extern "C" void babybear_generate_scalars(babybear::scalar_t* scalars, int size);
 
diff --git a/icicle/include/api/bls12_377.h b/icicle/include/api/bls12_377.h
index e83c9656a..2191c4435 100644
--- a/icicle/include/api/bls12_377.h
+++ b/icicle/include/api/bls12_377.h
@@ -122,10 +122,10 @@ extern "C" cudaError_t bls12_377_bit_reverse_cuda(
   vec_ops::BitReverseConfig& config,
   bls12_377::scalar_t* output);
 
-extern "C" cudaError_t bls12_377_bit_reverse_inplace_cuda(
-  bls12_377::scalar_t* input,
-  unsigned n,
-  vec_ops::BitReverseConfig& config);
+// extern "C" cudaError_t bls12_377_bit_reverse_inplace_cuda(
+//   bls12_377::scalar_t* input,
+//   unsigned n,
+//   vec_ops::BitReverseConfig& config);
 
 extern "C" void bls12_377_generate_scalars(bls12_377::scalar_t* scalars, int size);
 
diff --git a/icicle/include/api/bls12_381.h b/icicle/include/api/bls12_381.h
index bcede3914..6e034a4fa 100644
--- a/icicle/include/api/bls12_381.h
+++ b/icicle/include/api/bls12_381.h
@@ -122,10 +122,10 @@ extern "C" cudaError_t bls12_381_bit_reverse_cuda(
   vec_ops::BitReverseConfig& config,
   bls12_381::scalar_t* output);
 
-extern "C" cudaError_t bls12_381_bit_reverse_inplace_cuda(
-  bls12_381::scalar_t* input,
-  unsigned n,
-  vec_ops::BitReverseConfig& config);
+// extern "C" cudaError_t bls12_381_bit_reverse_inplace_cuda(
+//   bls12_381::scalar_t* input,
+//   unsigned n,
+//   vec_ops::BitReverseConfig& config);
 
 extern "C" void bls12_381_generate_scalars(bls12_381::scalar_t* scalars, int size);
 
diff --git a/icicle/include/api/bn254.h b/icicle/include/api/bn254.h
index 8753ea35a..2b76882b6 100644
--- a/icicle/include/api/bn254.h
+++ b/icicle/include/api/bn254.h
@@ -154,10 +154,10 @@ extern "C" cudaError_t bn254_bit_reverse_cuda(
   vec_ops::BitReverseConfig& config,
   bn254::scalar_t* output);
 
-extern "C" cudaError_t bn254_bit_reverse_inplace_cuda(
-  bn254::scalar_t* input,
-  unsigned n,
-  vec_ops::BitReverseConfig& config);
+// extern "C" cudaError_t bn254_bit_reverse_inplace_cuda(
+//   bn254::scalar_t* input,
+//   unsigned n,
+//   vec_ops::BitReverseConfig& config);
 
 extern "C" void bn254_generate_scalars(bn254::scalar_t* scalars, int size);
 
diff --git a/icicle/include/api/bw6_761.h b/icicle/include/api/bw6_761.h
index 7ebb5e9f1..c4f9c0bce 100644
--- a/icicle/include/api/bw6_761.h
+++ b/icicle/include/api/bw6_761.h
@@ -122,10 +122,10 @@ extern "C" cudaError_t bw6_761_bit_reverse_cuda(
   vec_ops::BitReverseConfig& config,
   bw6_761::scalar_t* output);
 
-extern "C" cudaError_t bw6_761_bit_reverse_inplace_cuda(
-  bw6_761::scalar_t* input,
-  unsigned n,
-  vec_ops::BitReverseConfig& config);
+// extern "C" cudaError_t bw6_761_bit_reverse_inplace_cuda(
+//   bw6_761::scalar_t* input,
+//   unsigned n,
+//   vec_ops::BitReverseConfig& config);
 
 extern "C" void bw6_761_generate_scalars(bw6_761::scalar_t* scalars, int size);
 
diff --git a/icicle/include/api/grumpkin.h b/icicle/include/api/grumpkin.h
index 95d02ddd3..22b4c539f 100644
--- a/icicle/include/api/grumpkin.h
+++ b/icicle/include/api/grumpkin.h
@@ -92,10 +92,10 @@ extern "C" cudaError_t grumpkin_bit_reverse_cuda(
   vec_ops::BitReverseConfig& config,
   grumpkin::scalar_t* output);
 
-extern "C" cudaError_t grumpkin_bit_reverse_inplace_cuda(
-  grumpkin::scalar_t* input,
-  unsigned n,
-  vec_ops::BitReverseConfig& config);
+// extern "C" cudaError_t grumpkin_bit_reverse_inplace_cuda(
+//   grumpkin::scalar_t* input,
+//   unsigned n,
+//   vec_ops::BitReverseConfig& config);
 
 extern "C" void grumpkin_generate_scalars(grumpkin::scalar_t* scalars, int size);
 
diff --git a/icicle/include/api/stark252.h b/icicle/include/api/stark252.h
index 2b4acc334..e12d06fb8 100644
--- a/icicle/include/api/stark252.h
+++ b/icicle/include/api/stark252.h
@@ -37,10 +37,10 @@ extern "C" cudaError_t stark252_bit_reverse_cuda(
   vec_ops::BitReverseConfig& config,
   stark252::scalar_t* output);
 
-extern "C" cudaError_t stark252_bit_reverse_inplace_cuda(
-  stark252::scalar_t* input,
-  unsigned n,
-  vec_ops::BitReverseConfig& config);
+// extern "C" cudaError_t stark252_bit_reverse_inplace_cuda(
+//   stark252::scalar_t* input,
+//   unsigned n,
+//   vec_ops::BitReverseConfig& config);
 
 extern "C" void stark252_generate_scalars(stark252::scalar_t* scalars, int size);
 
diff --git a/icicle/src/vec_ops/extern.cu b/icicle/src/vec_ops/extern.cu
index 7c9079a46..d74a9bca7 100644
--- a/icicle/src/vec_ops/extern.cu
+++ b/icicle/src/vec_ops/extern.cu
@@ -65,9 +65,9 @@ namespace vec_ops {
   {
     return bit_reverse<scalar_t>(input, n, config, output);
   }
-  extern "C" cudaError_t
-    CONCAT_EXPAND(FIELD, bit_reverse_inplace_cuda)(scalar_t* input, unsigned n, BitReverseConfig& config)
-  {
-    return bit_reverse_inplace<scalar_t>(input, n, config);
-  }
+  // extern "C" cudaError_t
+  //   CONCAT_EXPAND(FIELD, bit_reverse_inplace_cuda)(scalar_t* input, unsigned n, BitReverseConfig& config)
+  // {
+  //   return bit_reverse_inplace<scalar_t>(input, n, config);
+  // }
 } // namespace vec_ops
\ No newline at end of file
diff --git a/icicle/src/vec_ops/extern_extension.cu b/icicle/src/vec_ops/extern_extension.cu
index df63ab34a..a91792913 100644
--- a/icicle/src/vec_ops/extern_extension.cu
+++ b/icicle/src/vec_ops/extern_extension.cu
@@ -56,4 +56,10 @@ namespace vec_ops {
   {
     return transpose_matrix<extension_t>(input, output, row_size, column_size, ctx, on_device, is_async);
   }
+
+  extern "C" cudaError_t CONCAT_EXPAND(FIELD, extension_bit_reverse_cuda)(
+    const extension_t* input, unsigned n, BitReverseConfig& config, extension_t* output)
+  {
+    return bit_reverse<extension_t>(input, n, config, output);
+  }
 } // namespace vec_ops
\ No newline at end of file
diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu
index 86445a69e..1a264c6f5 100644
--- a/icicle/src/vec_ops/vec_ops.cu
+++ b/icicle/src/vec_ops/vec_ops.cu
@@ -217,16 +217,6 @@ namespace vec_ops {
   cudaError_t bit_reverse(const E* input, unsigned size, BitReverseConfig& cfg, E* output)
   {
     if (size & (size - 1)) THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: size must be a power of 2");
-    const E* d_input;
-    E* d_alloc_input;
-    if (cfg.is_input_on_device) {
-      d_input = input;
-    } else {
-      // copy input to gpu
-      CHK_IF_RETURN(cudaMallocAsync(&d_alloc_input, sizeof(E) * size, cfg.ctx.stream));
-      CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_input, input, sizeof(E) * size, cudaMemcpyHostToDevice, cfg.ctx.stream));
-      d_input = d_alloc_input;
-    }
     E* d_output;
     if (cfg.is_output_on_device) {
       d_output = output;
@@ -236,8 +226,25 @@ namespace vec_ops {
     }
     unsigned shift = __builtin_clz(size) + 1;
     unsigned num_blocks = (size + MAX_THREADS_PER_BLOCK - 1) / MAX_THREADS_PER_BLOCK;
-    bit_reverse_kernel<<<num_blocks, MAX_THREADS_PER_BLOCK, 0, cfg.ctx.stream>>>(d_input, size, shift, d_output);
-    if (!cfg.is_input_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_input, cfg.ctx.stream)); }
+    if (input == output) {
+      if (cfg.is_input_on_device) 
+        CHK_IF_RETURN(cudaMemcpyAsync(d_output, output, sizeof(E) * size, cudaMemcpyHostToDevice, cfg.ctx.stream));
+      bit_reverse_inplace_kernel<<<num_blocks, MAX_THREADS_PER_BLOCK, 0, cfg.ctx.stream>>>(d_output, size, shift);
+    }
+    else {
+      const E* d_input;
+      E* d_alloc_input;
+      if (cfg.is_input_on_device) {
+        d_input = input;
+      } else {
+        // copy input to gpu
+        CHK_IF_RETURN(cudaMallocAsync(&d_alloc_input, sizeof(E) * size, cfg.ctx.stream));
+        CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_input, input, sizeof(E) * size, cudaMemcpyHostToDevice, cfg.ctx.stream));
+        d_input = d_alloc_input;
+      }
+      bit_reverse_kernel<<<num_blocks, MAX_THREADS_PER_BLOCK, 0, cfg.ctx.stream>>>(d_input, size, shift, d_output);
+      if (!cfg.is_input_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_input, cfg.ctx.stream)); }
+    }
     if (!cfg.is_output_on_device) {
       CHK_IF_RETURN(cudaMemcpyAsync(output, d_output, sizeof(E) * size, cudaMemcpyDeviceToHost, cfg.ctx.stream));
       CHK_IF_RETURN(cudaFreeAsync(d_output, cfg.ctx.stream));
diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs
index 536649d7e..88b1ba474 100644
--- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs
+++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs
@@ -47,10 +47,10 @@ pub struct BitReverseConfig<'a> {
     pub ctx: DeviceContext<'a>,
 
     /// True if inputs are on device and false if they're on host. Default value: false.
-    pub are_inputs_on_device: bool,
+    pub is_input_on_device: bool,
 
     /// If true, output is preserved on device, otherwise on host. Default value: false.
-    pub are_outputs_on_device: bool,
+    pub is_output_on_device: bool,
 
     /// Whether to run the vector operations asynchronously. If set to `true`, the functions will be non-blocking and you'd need to synchronize
     /// it explicitly by running `stream.synchronize()`. If set to false, the functions will block the current CPU thread.
@@ -67,8 +67,8 @@ impl<'a> BitReverseConfig<'a> {
     pub fn default_for_device(device_id: usize) -> Self {
         BitReverseConfig {
             ctx: DeviceContext::default_for_device(device_id),
-            are_inputs_on_device: false,
-            are_outputs_on_device: false,
+            is_input_on_device: false,
+            is_output_on_device: false,
             is_async: false,
         }
     }
@@ -156,6 +156,36 @@ fn check_vec_ops_args<'a, F>(
     res_cfg.is_result_on_device = result.is_on_device();
     res_cfg
 }
+fn check_bit_reverse_args<'a, F>(
+    input: &(impl HostOrDeviceSlice<F> + ?Sized),
+    cfg: &BitReverseConfig<'a>,
+    output: &(impl HostOrDeviceSlice<F> + ?Sized),
+) -> BitReverseConfig<'a> {
+    if input.len() & (input.len() - 1) != 0 {
+        panic!("input length must be a power of 2, input length: {}", input.len());
+    }
+    if input.len() != output.len() {
+        panic!(
+            "input and output lengths {}; {} do not match",
+            input.len(),
+            output.len()
+        );
+    }
+    let ctx_device_id = cfg
+        .ctx
+        .device_id;
+    if let Some(device_id) = input.device_id() {
+        assert_eq!(device_id, ctx_device_id, "Device ids in input and context are different");
+    }
+    if let Some(device_id) = output.device_id() {
+        assert_eq!(device_id, ctx_device_id, "Device ids in output and context are different");
+    }
+    check_device(ctx_device_id);
+    let mut res_cfg = cfg.clone();
+    res_cfg.is_input_on_device = input.is_on_device();
+    res_cfg.is_output_on_device = output.is_on_device();
+    res_cfg
+}
 
 pub fn add_scalars<F>(
     a: &(impl HostOrDeviceSlice<F> + ?Sized),
@@ -224,7 +254,8 @@ where
     F: FieldImpl,
     <F as FieldImpl>::Config: VecOps<F>,
 {
-    <<F as FieldImpl>::Config as VecOps<F>>::bit_reverse(input, cfg, output)
+    let cfg = check_bit_reverse_args(input, cfg, output);
+    <<F as FieldImpl>::Config as VecOps<F>>::bit_reverse(input, &cfg, output)
 }
 
 pub fn bit_reverse_inplace<F>(
@@ -235,7 +266,8 @@ where
     F: FieldImpl,
     <F as FieldImpl>::Config: VecOps<F>,
 {
-    <<F as FieldImpl>::Config as VecOps<F>>::bit_reverse_inplace(input, cfg)
+    let cfg = check_bit_reverse_args(input, cfg, input);
+    <<F as FieldImpl>::Config as VecOps<F>>::bit_reverse_inplace(input, &cfg)
 }
 
 #[macro_export]
@@ -298,12 +330,12 @@ macro_rules! impl_vec_ops_field {
                     output: *mut $field,
                 ) -> CudaError;
 
-                #[link_name = concat!($field_prefix, "_bit_reverse_inplace_cuda")]
-                pub(crate) fn bit_reverse_inplace_cuda(
-                    input: *mut $field,
-                    size: u32,
-                    config: *const BitReverseConfig,
-                ) -> CudaError;
+                // #[link_name = concat!($field_prefix, "_bit_reverse_inplace_cuda")]
+                // pub(crate) fn bit_reverse_inplace_cuda(
+                //     input: *mut $field,
+                //     size: u32,
+                //     config: *const BitReverseConfig,
+                // ) -> CudaError;
             }
         }
 
@@ -406,10 +438,11 @@ macro_rules! impl_vec_ops_field {
                 cfg: &BitReverseConfig,
             ) -> IcicleResult<()> {
                 unsafe {
-                    $field_prefix_ident::bit_reverse_inplace_cuda(
-                        input.as_mut_ptr(),
+                    $field_prefix_ident::bit_reverse_cuda(
+                        input.as_ptr(),
                         input.len() as u32,
                         cfg as *const BitReverseConfig,
+                        input.as_mut_ptr(),
                     )
                     .wrap()
                 }
diff --git a/wrappers/rust/icicle-core/src/vec_ops/tests.rs b/wrappers/rust/icicle-core/src/vec_ops/tests.rs
index a1a45671b..cad95d827 100644
--- a/wrappers/rust/icicle-core/src/vec_ops/tests.rs
+++ b/wrappers/rust/icicle-core/src/vec_ops/tests.rs
@@ -44,15 +44,12 @@ where
     let input = F::Config::generate_random(TEST_SIZE);
     let input = HostSlice::from_slice(&input);
     let mut intermediate_result = DeviceVec::<F>::cuda_malloc(TEST_SIZE).unwrap();
-    let mut cfg = BitReverseConfig::default();
-    cfg.are_outputs_on_device = true;
+    let cfg = BitReverseConfig::default();
     bit_reverse(input, &cfg, &mut intermediate_result[..]).unwrap();
 
     let mut result = vec![F::one(); TEST_SIZE];
     let result = HostSlice::from_mut_slice(&mut result);
-    let mut cfg = BitReverseConfig::default();
-    cfg.are_outputs_on_device = false;
-    cfg.are_inputs_on_device = true;
+    let cfg = BitReverseConfig::default();
     bit_reverse(&intermediate_result[..], &cfg, result).unwrap();
     assert_eq!(input.as_slice(), result.as_slice());
 }
@@ -68,8 +65,7 @@ where
     intermediate
         .copy_from_host(&input)
         .unwrap();
-    let mut cfg = BitReverseConfig::default();
-    cfg.are_inputs_on_device = true;
+    let cfg = BitReverseConfig::default();
     bit_reverse_inplace(&mut intermediate[..], &cfg).unwrap();
     bit_reverse_inplace(&mut intermediate[..], &cfg).unwrap();
     let mut result_host = vec![F::one(); TEST_SIZE];
diff --git a/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs b/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs
index 451f4aeaa..22a19417d 100644
--- a/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs
+++ b/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs
@@ -11,56 +11,6 @@ use icicle_cuda_runtime::memory::HostOrDeviceSlice;
 impl_vec_ops_field!("babybear", babybear, ScalarField, ScalarCfg);
 impl_vec_ops_field!("babybear_extension", babybear_extension, ExtensionField, ExtensionCfg);
 
-// #[repr(C)]
-// #[derive(Debug, Clone)]
-// pub struct BitReverseConfig<'a> {
-//     /// Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext).
-//     pub ctx: DeviceContext<'a>,
-
-//     /// True if inputs are on device and false if they're on host. Default value: false.
-//     pub are_inputs_on_device: bool,
-
-//     /// If true, output is preserved on device, otherwise on host. Default value: false.
-//     pub are_outputs_on_device: bool,
-
-//     /// Whether to run the Keccak asynchronously. If set to `true`, the keccak_hash function will be
-//     /// non-blocking and you'd need to synchronize it explicitly by running
-//     /// `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, keccak_hash
-//     /// function will block the current CPU thread.
-//     pub is_async: bool,
-// }
-
-// impl<'a> Default for BitReverseConfig<'a> {
-//     fn default() -> Self {
-//         Self::default_for_device(DEFAULT_DEVICE_ID)
-//     }
-// }
-
-// impl<'a> BitReverseConfig<'a> {
-//     pub fn default_for_device(device_id: usize) -> Self {
-//         BitReverseConfig {
-//             ctx: DeviceContext::default_for_device(device_id),
-//             are_inputs_on_device: false,
-//             are_outputs_on_device: false,
-//             is_async: false,
-//         }
-//     }
-// }
-
-extern "C" {
-    pub(crate) fn babybear_bit_reverse_cuda(
-        input: *const ScalarField,
-        size: u32,
-        config: &BitReverseConfig,
-        output: *mut ScalarField,
-    ) -> CudaError;
-
-    pub(crate) fn babybear_bit_reverse_inplace_cuda(
-        input: *mut ScalarField,
-        size: u32,
-        config: &BitReverseConfig,
-    ) -> CudaError;
-}
 
 #[cfg(test)]
 pub(crate) mod tests {

From 0ce2fd48d3e05dc9e473ee452005903855a8118e Mon Sep 17 00:00:00 2001
From: nonam3e <timur@ingonyama.com>
Date: Tue, 28 May 2024 10:00:10 +0000
Subject: [PATCH 05/14] fmt

---
 icicle/src/vec_ops/extern.cu                           |  4 ++--
 icicle/src/vec_ops/vec_ops.cu                          |  5 ++---
 wrappers/rust/icicle-core/src/vec_ops/mod.rs           | 10 ++++++++--
 .../icicle-fields/icicle-babybear/src/vec_ops/mod.rs   |  1 -
 4 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/icicle/src/vec_ops/extern.cu b/icicle/src/vec_ops/extern.cu
index d74a9bca7..b8aca686a 100644
--- a/icicle/src/vec_ops/extern.cu
+++ b/icicle/src/vec_ops/extern.cu
@@ -60,8 +60,8 @@ namespace vec_ops {
     return transpose_matrix<scalar_t>(input, output, row_size, column_size, ctx, on_device, is_async);
   }
 
-  extern "C" cudaError_t CONCAT_EXPAND(FIELD, bit_reverse_cuda)(
-    const scalar_t* input, unsigned n, BitReverseConfig& config, scalar_t* output)
+  extern "C" cudaError_t
+    CONCAT_EXPAND(FIELD, bit_reverse_cuda)(const scalar_t* input, unsigned n, BitReverseConfig& config, scalar_t* output)
   {
     return bit_reverse<scalar_t>(input, n, config, output);
   }
diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu
index 1a264c6f5..8ea7232f3 100644
--- a/icicle/src/vec_ops/vec_ops.cu
+++ b/icicle/src/vec_ops/vec_ops.cu
@@ -227,11 +227,10 @@ namespace vec_ops {
     unsigned shift = __builtin_clz(size) + 1;
     unsigned num_blocks = (size + MAX_THREADS_PER_BLOCK - 1) / MAX_THREADS_PER_BLOCK;
     if (input == output) {
-      if (cfg.is_input_on_device) 
+      if (cfg.is_input_on_device)
         CHK_IF_RETURN(cudaMemcpyAsync(d_output, output, sizeof(E) * size, cudaMemcpyHostToDevice, cfg.ctx.stream));
       bit_reverse_inplace_kernel<<<num_blocks, MAX_THREADS_PER_BLOCK, 0, cfg.ctx.stream>>>(d_output, size, shift);
-    }
-    else {
+    } else {
       const E* d_input;
       E* d_alloc_input;
       if (cfg.is_input_on_device) {
diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs
index 88b1ba474..af9e4ddf7 100644
--- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs
+++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs
@@ -175,10 +175,16 @@ fn check_bit_reverse_args<'a, F>(
         .ctx
         .device_id;
     if let Some(device_id) = input.device_id() {
-        assert_eq!(device_id, ctx_device_id, "Device ids in input and context are different");
+        assert_eq!(
+            device_id, ctx_device_id,
+            "Device ids in input and context are different"
+        );
     }
     if let Some(device_id) = output.device_id() {
-        assert_eq!(device_id, ctx_device_id, "Device ids in output and context are different");
+        assert_eq!(
+            device_id, ctx_device_id,
+            "Device ids in output and context are different"
+        );
     }
     check_device(ctx_device_id);
     let mut res_cfg = cfg.clone();
diff --git a/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs b/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs
index 22a19417d..8654d0512 100644
--- a/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs
+++ b/wrappers/rust/icicle-fields/icicle-babybear/src/vec_ops/mod.rs
@@ -11,7 +11,6 @@ use icicle_cuda_runtime::memory::HostOrDeviceSlice;
 impl_vec_ops_field!("babybear", babybear, ScalarField, ScalarCfg);
 impl_vec_ops_field!("babybear_extension", babybear_extension, ExtensionField, ExtensionCfg);
 
-
 #[cfg(test)]
 pub(crate) mod tests {
     use crate::field::{ExtensionField, ScalarField};

From 984e87a65931fb6ef3fbb154d507ed437b39ddd5 Mon Sep 17 00:00:00 2001
From: ChickenLover <romangg81@gmail.com>
Date: Tue, 28 May 2024 17:10:51 +0700
Subject: [PATCH 06/14] fmt

---
 icicle/src/vec_ops/extern.cu | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/icicle/src/vec_ops/extern.cu b/icicle/src/vec_ops/extern.cu
index b8aca686a..aaeeec10a 100644
--- a/icicle/src/vec_ops/extern.cu
+++ b/icicle/src/vec_ops/extern.cu
@@ -13,7 +13,7 @@ namespace vec_ops {
    * @return `cudaSuccess` if the execution was successful and an error code otherwise.
    */
   extern "C" cudaError_t
-    CONCAT_EXPAND(FIELD, mul_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result)
+  CONCAT_EXPAND(FIELD, mul_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result)
   {
     return mul<scalar_t>(vec_a, vec_b, n, config, result);
   }
@@ -25,7 +25,7 @@ namespace vec_ops {
    * @return `cudaSuccess` if the execution was successful and an error code otherwise.
    */
   extern "C" cudaError_t
-    CONCAT_EXPAND(FIELD, add_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result)
+  CONCAT_EXPAND(FIELD, add_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result)
   {
     return add<scalar_t>(vec_a, vec_b, n, config, result);
   }
@@ -37,7 +37,7 @@ namespace vec_ops {
    * @return `cudaSuccess` if the execution was successful and an error code otherwise.
    */
   extern "C" cudaError_t
-    CONCAT_EXPAND(FIELD, sub_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result)
+  CONCAT_EXPAND(FIELD, sub_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result)
   {
     return sub<scalar_t>(vec_a, vec_b, n, config, result);
   }
@@ -61,7 +61,7 @@ namespace vec_ops {
   }
 
   extern "C" cudaError_t
-    CONCAT_EXPAND(FIELD, bit_reverse_cuda)(const scalar_t* input, unsigned n, BitReverseConfig& config, scalar_t* output)
+  CONCAT_EXPAND(FIELD, bit_reverse_cuda)(const scalar_t* input, unsigned n, BitReverseConfig& config, scalar_t* output)
   {
     return bit_reverse<scalar_t>(input, n, config, output);
   }

From 2b04511d41f60cbd8f78d70e5a1ea7e2c1759ac1 Mon Sep 17 00:00:00 2001
From: nonam3e <timur@ingonyama.com>
Date: Tue, 28 May 2024 10:49:48 +0000
Subject: [PATCH 07/14] remove unused func

---
 icicle/src/vec_ops/vec_ops.cu | 23 -----------------------
 1 file changed, 23 deletions(-)

diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu
index 8ea7232f3..e9737ae27 100644
--- a/icicle/src/vec_ops/vec_ops.cu
+++ b/icicle/src/vec_ops/vec_ops.cu
@@ -190,29 +190,6 @@ namespace vec_ops {
     return CHK_LAST();
   }
 
-  template <typename E>
-  cudaError_t bit_reverse_inplace(E* input, unsigned size, BitReverseConfig& cfg)
-  {
-    if (size & (size - 1)) THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: size must be a power of 2");
-    E* d_input;
-    if (cfg.is_input_on_device) {
-      d_input = input;
-    } else {
-      // copy input to gpu
-      CHK_IF_RETURN(cudaMallocAsync(&d_input, sizeof(E) * size, cfg.ctx.stream));
-      CHK_IF_RETURN(cudaMemcpyAsync(d_input, input, sizeof(E) * size, cudaMemcpyHostToDevice, cfg.ctx.stream));
-    }
-    unsigned shift = __builtin_clz(size) + 1;
-    unsigned num_blocks = (size + MAX_THREADS_PER_BLOCK - 1) / MAX_THREADS_PER_BLOCK;
-    bit_reverse_inplace_kernel<<<num_blocks, MAX_THREADS_PER_BLOCK, 0, cfg.ctx.stream>>>(d_input, size, shift);
-    if (!cfg.is_input_on_device) {
-      CHK_IF_RETURN(cudaMemcpyAsync(input, d_input, sizeof(E) * size, cudaMemcpyDeviceToHost, cfg.ctx.stream));
-      CHK_IF_RETURN(cudaFreeAsync(d_input, cfg.ctx.stream));
-    }
-    if (!cfg.is_async) CHK_IF_RETURN(cudaStreamSynchronize(cfg.ctx.stream));
-    return CHK_LAST();
-  }
-
   template <typename E>
   cudaError_t bit_reverse(const E* input, unsigned size, BitReverseConfig& cfg, E* output)
   {

From c06947ba5e5303874cb2fddf66cf06df8f023fc8 Mon Sep 17 00:00:00 2001
From: nonam3e <timur@ingonyama.com>
Date: Tue, 28 May 2024 14:35:18 +0000
Subject: [PATCH 08/14] memory usage optimization

---
 icicle/src/vec_ops/vec_ops.cu | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu
index e9737ae27..b5a415a35 100644
--- a/icicle/src/vec_ops/vec_ops.cu
+++ b/icicle/src/vec_ops/vec_ops.cu
@@ -194,6 +194,8 @@ namespace vec_ops {
   cudaError_t bit_reverse(const E* input, unsigned size, BitReverseConfig& cfg, E* output)
   {
     if (size & (size - 1)) THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: size must be a power of 2");
+    if ((input == output) ^ (cfg.is_input_on_device == cfg.is_output_on_device)) THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: equal devices should have same is_on_device parameters");
+
     E* d_output;
     if (cfg.is_output_on_device) {
       d_output = output;
@@ -201,25 +203,17 @@ namespace vec_ops {
       // allocate output on gpu
       CHK_IF_RETURN(cudaMallocAsync(&d_output, sizeof(E) * size, cfg.ctx.stream));
     }
+
     unsigned shift = __builtin_clz(size) + 1;
     unsigned num_blocks = (size + MAX_THREADS_PER_BLOCK - 1) / MAX_THREADS_PER_BLOCK;
-    if (input == output) {
-      if (cfg.is_input_on_device)
-        CHK_IF_RETURN(cudaMemcpyAsync(d_output, output, sizeof(E) * size, cudaMemcpyHostToDevice, cfg.ctx.stream));
-      bit_reverse_inplace_kernel<<<num_blocks, MAX_THREADS_PER_BLOCK, 0, cfg.ctx.stream>>>(d_output, size, shift);
+
+    if ((input != output) & cfg.is_input_on_device) {
+      bit_reverse_kernel<<<num_blocks, MAX_THREADS_PER_BLOCK, 0, cfg.ctx.stream>>>(input, size, shift, d_output);
     } else {
-      const E* d_input;
-      E* d_alloc_input;
-      if (cfg.is_input_on_device) {
-        d_input = input;
-      } else {
-        // copy input to gpu
-        CHK_IF_RETURN(cudaMallocAsync(&d_alloc_input, sizeof(E) * size, cfg.ctx.stream));
-        CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_input, input, sizeof(E) * size, cudaMemcpyHostToDevice, cfg.ctx.stream));
-        d_input = d_alloc_input;
+      if (!cfg.is_input_on_device) {
+        CHK_IF_RETURN(cudaMemcpyAsync(d_output, input, sizeof(E) * size, cudaMemcpyHostToDevice, cfg.ctx.stream));
       }
-      bit_reverse_kernel<<<num_blocks, MAX_THREADS_PER_BLOCK, 0, cfg.ctx.stream>>>(d_input, size, shift, d_output);
-      if (!cfg.is_input_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_input, cfg.ctx.stream)); }
+      bit_reverse_inplace_kernel<<<num_blocks, MAX_THREADS_PER_BLOCK, 0, cfg.ctx.stream>>>(d_output, size, shift);
     }
     if (!cfg.is_output_on_device) {
       CHK_IF_RETURN(cudaMemcpyAsync(output, d_output, sizeof(E) * size, cudaMemcpyDeviceToHost, cfg.ctx.stream));

From 6fd4116199b7839bb32855fe2e7085f7e39d38c7 Mon Sep 17 00:00:00 2001
From: nonam3e <71525212+nonam3e@users.noreply.github.com>
Date: Wed, 29 May 2024 10:37:42 +0700
Subject: [PATCH 09/14] Update vec_ops.cu

---
 icicle/src/vec_ops/vec_ops.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu
index b5a415a35..ed6e5050a 100644
--- a/icicle/src/vec_ops/vec_ops.cu
+++ b/icicle/src/vec_ops/vec_ops.cu
@@ -194,7 +194,7 @@ namespace vec_ops {
   cudaError_t bit_reverse(const E* input, unsigned size, BitReverseConfig& cfg, E* output)
   {
     if (size & (size - 1)) THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: size must be a power of 2");
-    if ((input == output) ^ (cfg.is_input_on_device == cfg.is_output_on_device)) THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: equal devices should have same is_on_device parameters");
+    if ((input == output) & (cfg.is_input_on_device != cfg.is_output_on_device)) THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: equal devices should have same is_on_device parameters");
 
     E* d_output;
     if (cfg.is_output_on_device) {
@@ -222,4 +222,4 @@ namespace vec_ops {
     if (!cfg.is_async) CHK_IF_RETURN(cudaStreamSynchronize(cfg.ctx.stream));
     return CHK_LAST();
   }
-} // namespace vec_ops
\ No newline at end of file
+} // namespace vec_ops

From 3604f27daeca38229fb4ba7ca5164577dff731ba Mon Sep 17 00:00:00 2001
From: nonam3e <timur@ingonyama.com>
Date: Wed, 29 May 2024 19:42:19 +0700
Subject: [PATCH 10/14] fmt

---
 examples/c++/best-practice-ntt/example.cu     |  63 +++--
 examples/c++/msm/example.cu                   |   4 +-
 examples/c++/multi-gpu-poseidon/example.cu    | 257 +++++++++---------
 examples/c++/multiply/example.cu              |   6 +-
 examples/c++/ntt/example.cu                   |  10 +-
 examples/c++/pedersen-commitment/example.cu   | 109 ++++----
 .../c++/polynomial_multiplication/example.cu  |   8 +-
 examples/c++/poseidon/example.cu              |  19 +-
 icicle/include/api/babybear.h                 |   5 -
 icicle/include/api/bls12_377.h                |   5 -
 icicle/include/api/bls12_381.h                |   5 -
 icicle/include/api/bn254.h                    |   5 -
 icicle/include/api/bw6_761.h                  |   5 -
 icicle/include/api/grumpkin.h                 |   5 -
 icicle/include/api/stark252.h                 |   5 -
 icicle/src/ntt/thread_ntt.cu                  |  20 +-
 icicle/src/vec_ops/extern.cu                  |   5 -
 icicle/src/vec_ops/vec_ops.cu                 |   4 +-
 wrappers/rust/icicle-core/src/vec_ops/mod.rs  |   7 -
 19 files changed, 259 insertions(+), 288 deletions(-)

diff --git a/examples/c++/best-practice-ntt/example.cu b/examples/c++/best-practice-ntt/example.cu
index ccb14e53e..341f42a9e 100644
--- a/examples/c++/best-practice-ntt/example.cu
+++ b/examples/c++/best-practice-ntt/example.cu
@@ -15,20 +15,23 @@ typedef scalar_t E;
 
 const unsigned max_log_ntt_size = 27;
 
-void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E * elements ) {
+void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E* elements)
+{
   for (unsigned i = 0; i < ntt_size * nof_ntts; i++) {
-    elements[i] = E::from(i+1);
+    elements[i] = E::from(i + 1);
   }
 }
 
 using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
 #define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
-#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
+#define END_TIMER(timer, msg)                                                                                          \
+  printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
 
-int main(int argc, char** argv) {
+int main(int argc, char** argv)
+{
   cudaDeviceReset();
   cudaDeviceProp deviceProperties;
-  int deviceId=0;
+  int deviceId = 0;
   cudaGetDeviceProperties(&deviceProperties, deviceId);
   std::string gpu_full_name = deviceProperties.name;
   std::cout << gpu_full_name << std::endl;
@@ -38,7 +41,7 @@ int main(int argc, char** argv) {
 
   S basic_root = S::omega(max_log_ntt_size);
 
-  // change these parameters to match the desired NTT size and batch size  
+  // change these parameters to match the desired NTT size and batch size
   const unsigned log_ntt_size = 22;
   const unsigned nof_ntts = 16;
 
@@ -49,7 +52,7 @@ int main(int argc, char** argv) {
 
   // Create separate CUDA streams for overlapping data transfers and kernel execution.
   cudaStream_t stream_compute, stream_h2d, stream_d2h;
-  cudaStreamCreate(&stream_compute); 
+  cudaStreamCreate(&stream_compute);
   cudaStreamCreate(&stream_h2d);
   cudaStreamCreate(&stream_d2h);
 
@@ -68,27 +71,27 @@ int main(int argc, char** argv) {
   config_compute.are_inputs_on_device = true;
   config_compute.are_outputs_on_device = true;
   config_compute.is_async = true;
-  
+
   std::cout << "Concurrent Download, Upload, and Compute In-place NTT" << std::endl;
   int nof_blocks = 32;
   std::cout << "Number of blocks: " << nof_blocks << std::endl;
-  int block_size = ntt_size*nof_ntts/nof_blocks;
-  
+  int block_size = ntt_size * nof_ntts / nof_blocks;
+
   // on-host pinned data
-  E * h_inp[2];
-  E * h_out[2];
+  E* h_inp[2];
+  E* h_out[2];
   for (int i = 0; i < 2; i++) {
-    cudaHostAlloc((void**)&h_inp[i], sizeof(E)*ntt_size*nof_ntts, cudaHostAllocDefault);
-    cudaHostAlloc((void**)&h_out[i], sizeof(E)*ntt_size*nof_ntts, cudaHostAllocDefault);
+    cudaHostAlloc((void**)&h_inp[i], sizeof(E) * ntt_size * nof_ntts, cudaHostAllocDefault);
+    cudaHostAlloc((void**)&h_out[i], sizeof(E) * ntt_size * nof_ntts, cudaHostAllocDefault);
   }
-  
+
   // on-device in-place data
   // we need two on-device vectors to overlap data transfers with NTT kernel execution
-  E * d_vec[2];
+  E* d_vec[2];
   for (int i = 0; i < 2; i++) {
-    cudaMalloc((void**)&d_vec[i], sizeof(E)*ntt_size*nof_ntts);
+    cudaMalloc((void**)&d_vec[i], sizeof(E) * ntt_size * nof_ntts);
   }
-  
+
   // initialize input data
   initialize_input(ntt_size, nof_ntts, h_inp[0]);
   initialize_input(ntt_size, nof_ntts, h_inp[1]);
@@ -97,7 +100,7 @@ int main(int argc, char** argv) {
   cudaEventCreate(&compute_start);
   cudaEventCreate(&compute_stop);
 
-  for ( int run = 0; run < 10; run++ ) {  
+  for (int run = 0; run < 10; run++) {
     int vec_compute = run % 2;
     int vec_transfer = (run + 1) % 2;
     std::cout << "Run: " << run << std::endl;
@@ -110,14 +113,18 @@ int main(int argc, char** argv) {
     // we have to delay upload to device relative to download from device by one block: preserve write after read
     for (int i = 0; i <= nof_blocks; i++) {
       if (i < nof_blocks) {
-        cudaMemcpyAsync(&h_out[vec_transfer][i*block_size], &d_vec[vec_transfer][i*block_size], sizeof(E)*block_size, cudaMemcpyDeviceToHost, stream_d2h);    
+        cudaMemcpyAsync(
+          &h_out[vec_transfer][i * block_size], &d_vec[vec_transfer][i * block_size], sizeof(E) * block_size,
+          cudaMemcpyDeviceToHost, stream_d2h);
       }
-      if (i>0) {
-        cudaMemcpyAsync(&d_vec[vec_transfer][(i-1)*block_size], &h_inp[vec_transfer][(i-1)*block_size], sizeof(E)*block_size, cudaMemcpyHostToDevice, stream_h2d);
+      if (i > 0) {
+        cudaMemcpyAsync(
+          &d_vec[vec_transfer][(i - 1) * block_size], &h_inp[vec_transfer][(i - 1) * block_size],
+          sizeof(E) * block_size, cudaMemcpyHostToDevice, stream_h2d);
       }
       // synchronize upload and download at the end of the block to ensure data integrity
-      cudaStreamSynchronize(stream_d2h); 
-      cudaStreamSynchronize(stream_h2d); 
+      cudaStreamSynchronize(stream_d2h);
+      cudaStreamSynchronize(stream_h2d);
     }
     // synchronize compute stream with the end of the computation
     cudaEventSynchronize(compute_stop);
@@ -126,12 +133,12 @@ int main(int argc, char** argv) {
     END_TIMER(inplace, "Concurrent In-Place  NTT");
     std::cout << "NTT time: " << milliseconds << " ms" << std::endl;
   };
-  
+
   // Clean-up
   for (int i = 0; i < 2; i++) {
-    cudaFree(d_vec[i]); 
-    cudaFreeHost(h_inp[i]); 
-    cudaFreeHost(h_out[i]); 
+    cudaFree(d_vec[i]);
+    cudaFreeHost(h_inp[i]);
+    cudaFreeHost(h_out[i]);
   }
   cudaEventDestroy(compute_start);
   cudaEventDestroy(compute_stop);
diff --git a/examples/c++/msm/example.cu b/examples/c++/msm/example.cu
index ded07ee93..abdc3f5cb 100644
--- a/examples/c++/msm/example.cu
+++ b/examples/c++/msm/example.cu
@@ -16,7 +16,7 @@ int main(int argc, char* argv[])
   int N = batch_size * msm_size;
 
   std::cout << "Part I: use G1 points" << std::endl;
-  
+
   std::cout << "Generating random inputs on-host" << std::endl;
   scalar_t* scalars = new scalar_t[N];
   affine_t* points = new affine_t[N];
@@ -43,7 +43,7 @@ int main(int argc, char* argv[])
     false, // is_async
   };
   config.batch_size = batch_size;
-  
+
   std::cout << "Running MSM kernel with on-host inputs" << std::endl;
   cudaStream_t stream = config.ctx.stream;
   // Execute the MSM kernel
diff --git a/examples/c++/multi-gpu-poseidon/example.cu b/examples/c++/multi-gpu-poseidon/example.cu
index cd7725019..32f0c920c 100644
--- a/examples/c++/multi-gpu-poseidon/example.cu
+++ b/examples/c++/multi-gpu-poseidon/example.cu
@@ -9,137 +9,148 @@
 using namespace poseidon;
 using namespace bn254;
 
-void checkCudaError(cudaError_t error) {
-    if (error != cudaSuccess) {
-        std::cerr << "CUDA error: " << cudaGetErrorString(error) << std::endl;
-        // Handle the error, e.g., exit the program or throw an exception.
-    }
+void checkCudaError(cudaError_t error)
+{
+  if (error != cudaSuccess) {
+    std::cerr << "CUDA error: " << cudaGetErrorString(error) << std::endl;
+    // Handle the error, e.g., exit the program or throw an exception.
+  }
 }
 
 // these global constants go into template calls
 const int size_col = 11;
 
 // this function executes the Poseidon thread
-void threadPoseidon(device_context::DeviceContext ctx, unsigned size_partition, scalar_t * layers, scalar_t * column_hashes, PoseidonConstants<scalar_t> * constants) {
-    cudaError_t err_result =  CHK_STICKY(cudaSetDevice(ctx.device_id));
-    if (err_result != cudaSuccess) {
-        std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
-        return; 
-    }
-    // CHK_IF_RETURN(); I can't use it in a standard thread function
-    PoseidonConfig column_config = {
-        ctx,   // ctx
-        false, // are_inputes_on_device
-        false, // are_outputs_on_device
-        false, // input_is_a_state
-        false, // aligned
-        false, // loop_state
-        false, // is_async
-        };
-    cudaError_t err = bn254_poseidon_hash_cuda(layers, column_hashes, (size_t) size_partition, size_col, *constants, column_config);
-    checkCudaError(err);
+void threadPoseidon(
+  device_context::DeviceContext ctx,
+  unsigned size_partition,
+  scalar_t* layers,
+  scalar_t* column_hashes,
+  PoseidonConstants<scalar_t>* constants)
+{
+  cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx.device_id));
+  if (err_result != cudaSuccess) {
+    std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
+    return;
+  }
+  // CHK_IF_RETURN(); I can't use it in a standard thread function
+  PoseidonConfig column_config = {
+    ctx,   // ctx
+    false, // are_inputes_on_device
+    false, // are_outputs_on_device
+    false, // input_is_a_state
+    false, // aligned
+    false, // loop_state
+    false, // is_async
+  };
+  cudaError_t err =
+    bn254_poseidon_hash_cuda(layers, column_hashes, (size_t)size_partition, size_col, *constants, column_config);
+  checkCudaError(err);
 }
 
 using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
 #define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
-#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
-
-
-#define CHECK_ALLOC(ptr) if ((ptr) == nullptr) { \
-    std::cerr << "Memory allocation for '" #ptr "' failed." << std::endl; \
-    exit(EXIT_FAILURE); \
-}
-
-int main() {
-    const unsigned size_row = (1<<30);
-    const unsigned nof_partitions = 64;
-    const unsigned size_partition = size_row / nof_partitions;
-    // layers is allocated only for one partition, need to reuse for different partitions
-    const uint32_t size_layers = size_col * size_partition;
-    
-    nvmlInit();
-    unsigned int deviceCount;
-    nvmlDeviceGetCount(&deviceCount);
-    std::cout << "Available GPUs: " << deviceCount << std::endl;
-
-    for (unsigned int i = 0; i < deviceCount; ++i) {
-        nvmlDevice_t device;
-        nvmlMemory_t memory;
-        char name[NVML_DEVICE_NAME_BUFFER_SIZE];
-        nvmlDeviceGetHandleByIndex(i, &device);
-        nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
-        nvmlDeviceGetMemoryInfo(device, &memory);
-        std::cout << "Device ID: " << i << ", Type: " << name << ", Memory Total/Free (MiB) " << memory.total/1024/1024 << "/"  << memory.free/1024/1024 << std::endl;
-    }
-
-    const unsigned memory_partition = sizeof(scalar_t)*(size_col+1)*size_partition/1024/1024;
-    std::cout << "Required Memory (MiB) " << memory_partition << std::endl;
-
-    //===============================================================================
-    // Key: multiple devices are supported by device context
-    //===============================================================================
-
-    device_context::DeviceContext ctx0 = device_context::get_default_device_context();
-    ctx0.device_id=0;
-    device_context::DeviceContext ctx1 = device_context::get_default_device_context();
-    ctx1.device_id=1;
-    
-    std::cout << "Allocate and initialize the memory for layers and hashes" << std::endl;
-    scalar_t* layers0 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
-    CHECK_ALLOC(layers0);
-    scalar_t s = scalar_t::zero();
-    for (unsigned i = 0; i < size_col*size_partition ; i++) {
-        layers0[i] = s;
-        s = s + scalar_t::one();
-    }
-    scalar_t* layers1 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
-    CHECK_ALLOC(layers1);
-    s = scalar_t::zero() + scalar_t::one();
-    for (unsigned i = 0; i < size_col*size_partition ; i++) {
-        layers1[i] = s;
-        s = s + scalar_t::one();
-    }
-
-    scalar_t* column_hash0 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
-    CHECK_ALLOC(column_hash0);
-    scalar_t* column_hash1 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
-    CHECK_ALLOC(column_hash1);
-
-    PoseidonConstants<scalar_t> column_constants0, column_constants1;
-    bn254_init_optimized_poseidon_constants_cuda(size_col, ctx0, &column_constants0);
-    cudaError_t err_result =  CHK_STICKY(cudaSetDevice(ctx1.device_id));
-    if (err_result != cudaSuccess) {
-        std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
-        return; 
-    }
-    bn254_init_optimized_poseidon_constants_cuda(size_col, ctx1, &column_constants1);
-
-    std::cout << "Parallel execution of Poseidon threads" << std::endl;
-    START_TIMER(parallel);
-    std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
-    std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1, &column_constants1);
-
-    // Wait for the threads to finish
-    thread0.join();
-    thread1.join();
-    END_TIMER(parallel,"2 GPUs");
-    std::cout << "Output Data from Thread 0: ";
-    std::cout << column_hash0[0] << std::endl;
-    std::cout << "Output Data from Thread 1: ";
-    std::cout << column_hash1[0] << std::endl;
-
-    std::cout << "Sequential execution of Poseidon threads" << std::endl;
-    START_TIMER(sequential);
-    std::thread thread2(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
-    thread2.join();
-    std::thread thread3(threadPoseidon, ctx0, size_partition, layers1, column_hash1, &column_constants0);
-    thread3.join();
-    END_TIMER(sequential,"1 GPU");
-    std::cout << "Output Data from Thread 2: ";
-    std::cout << column_hash0[0] << std::endl;
-    std::cout << "Output Data from Thread 3: ";
-    std::cout << column_hash1[0] << std::endl;
-
-    nvmlShutdown();
-    return 0;
+#define END_TIMER(timer, msg)                                                                                          \
+  printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
+
+#define CHECK_ALLOC(ptr)                                                                                               \
+  if ((ptr) == nullptr) {                                                                                              \
+    std::cerr << "Memory allocation for '" #ptr "' failed." << std::endl;                                              \
+    exit(EXIT_FAILURE);                                                                                                \
+  }
+
+int main()
+{
+  const unsigned size_row = (1 << 30);
+  const unsigned nof_partitions = 64;
+  const unsigned size_partition = size_row / nof_partitions;
+  // layers is allocated only for one partition, need to reuse for different partitions
+  const uint32_t size_layers = size_col * size_partition;
+
+  nvmlInit();
+  unsigned int deviceCount;
+  nvmlDeviceGetCount(&deviceCount);
+  std::cout << "Available GPUs: " << deviceCount << std::endl;
+
+  for (unsigned int i = 0; i < deviceCount; ++i) {
+    nvmlDevice_t device;
+    nvmlMemory_t memory;
+    char name[NVML_DEVICE_NAME_BUFFER_SIZE];
+    nvmlDeviceGetHandleByIndex(i, &device);
+    nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
+    nvmlDeviceGetMemoryInfo(device, &memory);
+    std::cout << "Device ID: " << i << ", Type: " << name << ", Memory Total/Free (MiB) " << memory.total / 1024 / 1024
+              << "/" << memory.free / 1024 / 1024 << std::endl;
+  }
+
+  const unsigned memory_partition = sizeof(scalar_t) * (size_col + 1) * size_partition / 1024 / 1024;
+  std::cout << "Required Memory (MiB) " << memory_partition << std::endl;
+
+  //===============================================================================
+  // Key: multiple devices are supported by device context
+  //===============================================================================
+
+  device_context::DeviceContext ctx0 = device_context::get_default_device_context();
+  ctx0.device_id = 0;
+  device_context::DeviceContext ctx1 = device_context::get_default_device_context();
+  ctx1.device_id = 1;
+
+  std::cout << "Allocate and initialize the memory for layers and hashes" << std::endl;
+  scalar_t* layers0 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
+  CHECK_ALLOC(layers0);
+  scalar_t s = scalar_t::zero();
+  for (unsigned i = 0; i < size_col * size_partition; i++) {
+    layers0[i] = s;
+    s = s + scalar_t::one();
+  }
+  scalar_t* layers1 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
+  CHECK_ALLOC(layers1);
+  s = scalar_t::zero() + scalar_t::one();
+  for (unsigned i = 0; i < size_col * size_partition; i++) {
+    layers1[i] = s;
+    s = s + scalar_t::one();
+  }
+
+  scalar_t* column_hash0 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
+  CHECK_ALLOC(column_hash0);
+  scalar_t* column_hash1 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
+  CHECK_ALLOC(column_hash1);
+
+  PoseidonConstants<scalar_t> column_constants0, column_constants1;
+  bn254_init_optimized_poseidon_constants_cuda(size_col, ctx0, &column_constants0);
+  cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx1.device_id));
+  if (err_result != cudaSuccess) {
+    std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
+    return;
+  }
+  bn254_init_optimized_poseidon_constants_cuda(size_col, ctx1, &column_constants1);
+
+  std::cout << "Parallel execution of Poseidon threads" << std::endl;
+  START_TIMER(parallel);
+  std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
+  std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1, &column_constants1);
+
+  // Wait for the threads to finish
+  thread0.join();
+  thread1.join();
+  END_TIMER(parallel, "2 GPUs");
+  std::cout << "Output Data from Thread 0: ";
+  std::cout << column_hash0[0] << std::endl;
+  std::cout << "Output Data from Thread 1: ";
+  std::cout << column_hash1[0] << std::endl;
+
+  std::cout << "Sequential execution of Poseidon threads" << std::endl;
+  START_TIMER(sequential);
+  std::thread thread2(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
+  thread2.join();
+  std::thread thread3(threadPoseidon, ctx0, size_partition, layers1, column_hash1, &column_constants0);
+  thread3.join();
+  END_TIMER(sequential, "1 GPU");
+  std::cout << "Output Data from Thread 2: ";
+  std::cout << column_hash0[0] << std::endl;
+  std::cout << "Output Data from Thread 3: ";
+  std::cout << column_hash1[0] << std::endl;
+
+  nvmlShutdown();
+  return 0;
 }
diff --git a/examples/c++/multiply/example.cu b/examples/c++/multiply/example.cu
index 0b378f247..77eb7a0b4 100644
--- a/examples/c++/multiply/example.cu
+++ b/examples/c++/multiply/example.cu
@@ -17,7 +17,7 @@ int vector_mult(T* vec_b, T* vec_a, T* vec_result, size_t n_elments, device_cont
   config.is_a_on_device = true;
   config.is_b_on_device = true;
   config.is_result_on_device = true;
-  cudaError_t err =  bn254_mul_cuda(vec_a, vec_b, n_elments, config, vec_result);
+  cudaError_t err = bn254_mul_cuda(vec_a, vec_b, n_elments, config, vec_result);
   if (err != cudaSuccess) {
     std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
     return 0;
@@ -100,7 +100,7 @@ int main(int argc, char** argv)
     std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
     return 0;
   }
-  
+
   std::cout << "Starting warm-up" << std::endl;
   // Warm-up loop
   for (int i = 0; i < repetitions; i++) {
@@ -151,7 +151,7 @@ int main(int argc, char** argv)
   // validate multiplication here...
 
   // clean up and exit
-  free(host_in1); 
+  free(host_in1);
   free(host_in2);
   free(host_out);
   cudaFree(device_in1);
diff --git a/examples/c++/ntt/example.cu b/examples/c++/ntt/example.cu
index 5d98e4758..5e50f0dd7 100644
--- a/examples/c++/ntt/example.cu
+++ b/examples/c++/ntt/example.cu
@@ -60,8 +60,8 @@ int validate_output(const unsigned ntt_size, const unsigned nof_ntts, E* element
 
 using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
 #define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
-#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
-
+#define END_TIMER(timer, msg)                                                                                          \
+  printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
 
 int main(int argc, char* argv[])
 {
@@ -89,16 +89,16 @@ int main(int argc, char* argv[])
   bn254_initialize_domain(&basic_root, ctx, true);
   // Create an NTTConfig instance
   NTTConfig<S> config = default_ntt_config<S>();
-  config.ntt_algorithm = NttAlgorithm::MixedRadix; 
+  config.ntt_algorithm = NttAlgorithm::MixedRadix;
   config.batch_size = nof_ntts;
   START_TIMER(MixedRadix);
   cudaError_t err = bn254_ntt_cuda(input, ntt_size, NTTDir::kForward, config, output);
   END_TIMER(MixedRadix, "MixedRadix NTT");
-  
+
   std::cout << "Validating output" << std::endl;
   validate_output(ntt_size, nof_ntts, output);
 
-  config.ntt_algorithm = NttAlgorithm::Radix2; 
+  config.ntt_algorithm = NttAlgorithm::Radix2;
   START_TIMER(Radix2);
   err = bn254_ntt_cuda(input, ntt_size, NTTDir::kForward, config, output);
   END_TIMER(Radix2, "Radix2 NTT");
diff --git a/examples/c++/pedersen-commitment/example.cu b/examples/c++/pedersen-commitment/example.cu
index 93f47b2ba..106a47df1 100644
--- a/examples/c++/pedersen-commitment/example.cu
+++ b/examples/c++/pedersen-commitment/example.cu
@@ -11,49 +11,47 @@ using namespace bn254;
 typedef point_field_t T;
 
 // modular power
-T modPow(T base, T exp) {
+T modPow(T base, T exp)
+{
   T r = T::one();
   T b = base;
   T e = exp;
   while (e != T::zero()) {
-      // If exp is odd, multiply the base with result
-      if (T::is_odd(e)) {
-          r = r * b;
-      }
-      // Now exp must be even, divide it by 2
-      e =T::div2(e);
-      b = b * b;
+    // If exp is odd, multiply the base with result
+    if (T::is_odd(e)) { r = r * b; }
+    // Now exp must be even, divide it by 2
+    e = T::div2(e);
+    b = b * b;
   }
   return r;
 }
 
 // Check if y2 is a quadratic residue using Euler's Criterion
-bool quadratic_residue(T y2) {
-  return modPow(y2, T::div2(T::zero() - T::one())) == T::one();
-}
+bool quadratic_residue(T y2) { return modPow(y2, T::div2(T::zero() - T::one())) == T::one(); }
 
 // modular square root adapted from:
 // https://github.com/ShahjalalShohag/code-library/blob/main/Number%20Theory/Tonelli%20Shanks%20Algorithm.cpp
-bool mySQRT(T a, T *result) {
+bool mySQRT(T a, T* result)
+{
   if (a == T::zero()) {
     *result = T::zero();
     return true;
   }
-  if (modPow(a, T::div2(T::zero() - T::one())) != T::one() ) {
+  if (modPow(a, T::div2(T::zero() - T::one())) != T::one()) {
     return false; // solution does not exist
   }
   // TODO: consider special cases
-  // if (p % 4 == 3) return power(a, (p + 1) / 4, p); 
-  T s = T::zero() - T::one(); // p - 1, 
-  T n = T::one() + T::one(); //2;
-  T r = T::zero(); 
+  // if (p % 4 == 3) return power(a, (p + 1) / 4, p);
+  T s = T::zero() - T::one(); // p - 1,
+  T n = T::one() + T::one();  // 2;
+  T r = T::zero();
   T m;
   while (T::is_even(s)) {
     r = r + T::one();
-    s = T::div2(s); //s /= 2;
+    s = T::div2(s); // s /= 2;
   }
   // find a non-square mod p
-  while (modPow(n, T::div2((T::zero() - T::one())) ) != T::zero() - T::one()) {
+  while (modPow(n, T::div2((T::zero() - T::one()))) != T::zero() - T::one()) {
     n = n + T::one();
   }
   T x = modPow(a, T::div2(s + T::one()));
@@ -61,83 +59,86 @@ bool mySQRT(T a, T *result) {
   T g = modPow(n, s);
   for (;; r = m) {
     T t = b;
-    for (m = T::zero(); T::lt(m,r) /* m < r*/ && t != T::one(); m = m + T::one()) t =  t * t;
-    if (m == T::zero() ) {
+    for (m = T::zero(); T::lt(m, r) /* m < r*/ && t != T::one(); m = m + T::one())
+      t = t * t;
+    if (m == T::zero()) {
       *result = x;
       return true;
     }
-    T gs = modPow(g, modPow(T::one() + T::one(), r - m - T::one()) );
-    g = gs * gs ;
-    x = x * gs ;
-    b =  b * g ;
+    T gs = modPow(g, modPow(T::one() + T::one(), r - m - T::one()));
+    g = gs * gs;
+    x = x * gs;
+    b = b * g;
   }
 }
 
-void point_near_x(T x, affine_t *point) {
-  const T wb = T { weierstrass_b };
+void point_near_x(T x, affine_t* point)
+{
+  const T wb = T{weierstrass_b};
   T y2;
-  while (y2 = x*x*x + wb, quadratic_residue(y2) == false)
-  {
+  while (y2 = x * x * x + wb, quadratic_residue(y2) == false) {
     x = x + T::one();
   };
   T y;
   bool found = mySQRT(y2, &y);
-  assert(y*y == y2);
+  assert(y * y == y2);
   point->x = x;
   point->y = y;
 }
 
 static int seed = 0;
 static HOST_INLINE T rand_host_seed()
-  {
-    std::mt19937_64 generator(seed++);
-    std::uniform_int_distribution<unsigned> distribution;
-    
-    T value;
-    for (unsigned i = 0; i <  T::TLC-1 ; i++)
+{
+  std::mt19937_64 generator(seed++);
+  std::uniform_int_distribution<unsigned> distribution;
+
+  T value;
+  for (unsigned i = 0; i < T::TLC - 1; i++)
     // TODO: use the full range of limbs: for (unsigned i = 0; i <  T::TLC ; i++)
-      value.limbs_storage.limbs[i] = distribution(generator);
-    // while (lt(Field{get_modulus()}, value))
-    //   value = value - Field{get_modulus()};
-    return value;
-  }
+    value.limbs_storage.limbs[i] = distribution(generator);
+  // while (lt(Field{get_modulus()}, value))
+  //   value = value - Field{get_modulus()};
+  return value;
+}
 
 using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
 #define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
-#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
+#define END_TIMER(timer, msg)                                                                                          \
+  printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
 
 int main(int argc, char** argv)
 {
   const unsigned N = pow(2, 10);
   std::cout << "Commitment vector size: " << N << "+1 for salt (a.k.a blinding factor)" << std::endl;
-  T* xs = new T[N+1];
-  
+  T* xs = new T[N + 1];
+
   std::cout << "Generating random points transparently using publicly chosen seed" << std::endl;
-  std::cout << "Public seed prevents committer from knowing the discrete logs of points used in the commitment" << std::endl;
+  std::cout << "Public seed prevents committer from knowing the discrete logs of points used in the commitment"
+            << std::endl;
   seed = 1234;
   std::cout << "Using seed: " << seed << std::endl;
   std::cout << "Generating random field values" << std::endl;
   START_TIMER(gen);
-  
+
   for (unsigned i = 0; i < N; i++) {
     xs[i] = rand_host_seed();
   }
   END_TIMER(gen, "Time to generate field values");
-  std::cout << "xs[0]: " << xs[0]  << std::endl;
-  std::cout << "xs[1]: " << xs[1]  << std::endl;
-  
+  std::cout << "xs[0]: " << xs[0] << std::endl;
+  std::cout << "xs[1]: " << xs[1] << std::endl;
+
   // affine_t points[N];
-  affine_t* points = new affine_t[N+1];
+  affine_t* points = new affine_t[N + 1];
   std::cout << "Generating point about random field values" << std::endl;
   START_TIMER(points);
-  for (unsigned i = 0; i < N+1; i++) {
+  for (unsigned i = 0; i < N + 1; i++) {
     point_near_x(xs[i], &points[i]);
   }
   END_TIMER(points, "Time to generate points");
-  
+
   std::cout << "Generating commitment vector" << std::endl;
   projective_t result;
-  scalar_t* scalars = new scalar_t[N+1];
+  scalar_t* scalars = new scalar_t[N + 1];
   scalar_t::rand_host_many(scalars, N);
 
   std::cout << "Generating salt" << std::endl;
@@ -146,7 +147,7 @@ int main(int argc, char** argv)
   std::cout << "Executing MSM" << std::endl;
   auto config = msm::default_msm_config();
   START_TIMER(msm);
-  bn254_msm_cuda(scalars, points, N+1, config, &result);
+  bn254_msm_cuda(scalars, points, N + 1, config, &result);
   END_TIMER(msm, "Time to execute MSM");
 
   std::cout << "Computed commitment: " << result << std::endl;
diff --git a/examples/c++/polynomial_multiplication/example.cu b/examples/c++/polynomial_multiplication/example.cu
index 7a6e4da19..9c80864e5 100644
--- a/examples/c++/polynomial_multiplication/example.cu
+++ b/examples/c++/polynomial_multiplication/example.cu
@@ -82,10 +82,10 @@ int main(int argc, char** argv)
       CHK_IF_RETURN(cudaMallocAsync(&MulGpu, sizeof(test_data) * NTT_SIZE, ntt_config.ctx.stream));
       vec_ops::VecOpsConfig config{
         ntt_config.ctx,
-        true,  // is_a_on_device
-        true,  // is_b_on_device
-        true,  // is_result_on_device
-        false  // is_async
+        true, // is_a_on_device
+        true, // is_b_on_device
+        true, // is_result_on_device
+        false // is_async
       };
       CHK_IF_RETURN(bn254_mul_cuda(GpuA, GpuB, NTT_SIZE, config, MulGpu));
 
diff --git a/examples/c++/poseidon/example.cu b/examples/c++/poseidon/example.cu
index 3b2fe517c..171196213 100644
--- a/examples/c++/poseidon/example.cu
+++ b/examples/c++/poseidon/example.cu
@@ -14,12 +14,13 @@ inline uint32_t tree_index(uint32_t level, uint32_t offset) { return (1 << level
 
 // We assume the tree has leaves already set, compute all other levels
 void build_tree(
-  const uint32_t tree_height, scalar_t* tree, PoseidonConstants<scalar_t> * constants, PoseidonConfig config)
+  const uint32_t tree_height, scalar_t* tree, PoseidonConstants<scalar_t>* constants, PoseidonConfig config)
 {
   for (uint32_t level = tree_height - 1; level > 0; level--) {
     const uint32_t next_level = level - 1;
     const uint32_t next_level_width = 1 << next_level;
-    bn254_poseidon_hash_cuda(&tree[tree_index(level, 0)], &tree[tree_index(next_level, 0)], next_level_width, 2, *constants, config);
+    bn254_poseidon_hash_cuda(
+      &tree[tree_index(level, 0)], &tree[tree_index(next_level, 0)], next_level_width, 2, *constants, config);
   }
 }
 
@@ -37,11 +38,7 @@ uint32_t query_membership(scalar_t query, scalar_t* tree, const uint32_t tree_he
 }
 
 void generate_proof(
-  uint32_t position,
-  scalar_t* tree,
-  const uint32_t tree_height,
-  uint32_t* proof_lr,
-  scalar_t* proof_hash)
+  uint32_t position, scalar_t* tree, const uint32_t tree_height, uint32_t* proof_lr, scalar_t* proof_hash)
 {
   uint32_t level_index = position;
   for (uint32_t level = tree_height - 1; level > 0; level--) {
@@ -68,7 +65,7 @@ uint32_t validate_proof(
   const uint32_t tree_height,
   const uint32_t* proof_lr,
   const scalar_t* proof_hash,
-  PoseidonConstants<scalar_t> * constants,
+  PoseidonConstants<scalar_t>* constants,
   PoseidonConfig config)
 {
   scalar_t hashes_in[2], hash_out[1], level_hash;
@@ -114,13 +111,13 @@ int main(int argc, char* argv[])
   std::cout << "Hashing blocks into tree leaves..." << std::endl;
   PoseidonConstants<scalar_t> constants;
   bn254_init_optimized_poseidon_constants_cuda(data_arity, ctx, &constants);
-  PoseidonConfig config = default_poseidon_config(data_arity+1); 
+  PoseidonConfig config = default_poseidon_config(data_arity + 1);
   bn254_poseidon_hash_cuda(data, &tree[tree_index(leaf_level, 0)], tree_width, 4, constants, config);
 
   std::cout << "3. Building Merkle tree" << std::endl;
   PoseidonConstants<scalar_t> tree_constants;
   bn254_init_optimized_poseidon_constants_cuda(tree_arity, ctx, &tree_constants);
-  PoseidonConfig tree_config = default_poseidon_config(tree_arity+1);
+  PoseidonConfig tree_config = default_poseidon_config(tree_arity + 1);
   build_tree(tree_height, tree, &tree_constants, tree_config);
 
   std::cout << "4. Generate membership proof" << std::endl;
@@ -142,7 +139,7 @@ int main(int argc, char* argv[])
   std::cout << "6. Tamper the hash" << std::endl;
   const scalar_t tampered_hash = hash + scalar_t::one();
   validated = validate_proof(tampered_hash, tree_height, proof_lr, proof_hash, &tree_constants, tree_config);
-  
+
   std::cout << "7. Invalidate tamper hash membership" << std::endl;
   std::cout << "Validated: " << validated << std::endl;
   return 0;
diff --git a/icicle/include/api/babybear.h b/icicle/include/api/babybear.h
index 7d06141d3..6703f97b4 100644
--- a/icicle/include/api/babybear.h
+++ b/icicle/include/api/babybear.h
@@ -74,11 +74,6 @@ extern "C" cudaError_t babybear_bit_reverse_cuda(
   vec_ops::BitReverseConfig& config,
   babybear::scalar_t* output);
 
-// extern "C" cudaError_t babybear_bit_reverse_inplace_cuda(
-//   babybear::scalar_t* input,
-//   unsigned n,
-//   vec_ops::BitReverseConfig& config);
-
 extern "C" void babybear_generate_scalars(babybear::scalar_t* scalars, int size);
 
 extern "C" cudaError_t babybear_scalar_convert_montgomery(
diff --git a/icicle/include/api/bls12_377.h b/icicle/include/api/bls12_377.h
index 2191c4435..c033103d2 100644
--- a/icicle/include/api/bls12_377.h
+++ b/icicle/include/api/bls12_377.h
@@ -122,11 +122,6 @@ extern "C" cudaError_t bls12_377_bit_reverse_cuda(
   vec_ops::BitReverseConfig& config,
   bls12_377::scalar_t* output);
 
-// extern "C" cudaError_t bls12_377_bit_reverse_inplace_cuda(
-//   bls12_377::scalar_t* input,
-//   unsigned n,
-//   vec_ops::BitReverseConfig& config);
-
 extern "C" void bls12_377_generate_scalars(bls12_377::scalar_t* scalars, int size);
 
 extern "C" cudaError_t bls12_377_scalar_convert_montgomery(
diff --git a/icicle/include/api/bls12_381.h b/icicle/include/api/bls12_381.h
index 6e034a4fa..bdb65d8b3 100644
--- a/icicle/include/api/bls12_381.h
+++ b/icicle/include/api/bls12_381.h
@@ -122,11 +122,6 @@ extern "C" cudaError_t bls12_381_bit_reverse_cuda(
   vec_ops::BitReverseConfig& config,
   bls12_381::scalar_t* output);
 
-// extern "C" cudaError_t bls12_381_bit_reverse_inplace_cuda(
-//   bls12_381::scalar_t* input,
-//   unsigned n,
-//   vec_ops::BitReverseConfig& config);
-
 extern "C" void bls12_381_generate_scalars(bls12_381::scalar_t* scalars, int size);
 
 extern "C" cudaError_t bls12_381_scalar_convert_montgomery(
diff --git a/icicle/include/api/bn254.h b/icicle/include/api/bn254.h
index 2b76882b6..0ad15f593 100644
--- a/icicle/include/api/bn254.h
+++ b/icicle/include/api/bn254.h
@@ -154,11 +154,6 @@ extern "C" cudaError_t bn254_bit_reverse_cuda(
   vec_ops::BitReverseConfig& config,
   bn254::scalar_t* output);
 
-// extern "C" cudaError_t bn254_bit_reverse_inplace_cuda(
-//   bn254::scalar_t* input,
-//   unsigned n,
-//   vec_ops::BitReverseConfig& config);
-
 extern "C" void bn254_generate_scalars(bn254::scalar_t* scalars, int size);
 
 extern "C" cudaError_t bn254_scalar_convert_montgomery(
diff --git a/icicle/include/api/bw6_761.h b/icicle/include/api/bw6_761.h
index c4f9c0bce..957e0bb92 100644
--- a/icicle/include/api/bw6_761.h
+++ b/icicle/include/api/bw6_761.h
@@ -122,11 +122,6 @@ extern "C" cudaError_t bw6_761_bit_reverse_cuda(
   vec_ops::BitReverseConfig& config,
   bw6_761::scalar_t* output);
 
-// extern "C" cudaError_t bw6_761_bit_reverse_inplace_cuda(
-//   bw6_761::scalar_t* input,
-//   unsigned n,
-//   vec_ops::BitReverseConfig& config);
-
 extern "C" void bw6_761_generate_scalars(bw6_761::scalar_t* scalars, int size);
 
 extern "C" cudaError_t bw6_761_scalar_convert_montgomery(
diff --git a/icicle/include/api/grumpkin.h b/icicle/include/api/grumpkin.h
index 22b4c539f..c547807ea 100644
--- a/icicle/include/api/grumpkin.h
+++ b/icicle/include/api/grumpkin.h
@@ -92,11 +92,6 @@ extern "C" cudaError_t grumpkin_bit_reverse_cuda(
   vec_ops::BitReverseConfig& config,
   grumpkin::scalar_t* output);
 
-// extern "C" cudaError_t grumpkin_bit_reverse_inplace_cuda(
-//   grumpkin::scalar_t* input,
-//   unsigned n,
-//   vec_ops::BitReverseConfig& config);
-
 extern "C" void grumpkin_generate_scalars(grumpkin::scalar_t* scalars, int size);
 
 extern "C" cudaError_t grumpkin_scalar_convert_montgomery(
diff --git a/icicle/include/api/stark252.h b/icicle/include/api/stark252.h
index e12d06fb8..c370257e4 100644
--- a/icicle/include/api/stark252.h
+++ b/icicle/include/api/stark252.h
@@ -37,11 +37,6 @@ extern "C" cudaError_t stark252_bit_reverse_cuda(
   vec_ops::BitReverseConfig& config,
   stark252::scalar_t* output);
 
-// extern "C" cudaError_t stark252_bit_reverse_inplace_cuda(
-//   stark252::scalar_t* input,
-//   unsigned n,
-//   vec_ops::BitReverseConfig& config);
-
 extern "C" void stark252_generate_scalars(stark252::scalar_t* scalars, int size);
 
 extern "C" cudaError_t stark252_scalar_convert_montgomery(
diff --git a/icicle/src/ntt/thread_ntt.cu b/icicle/src/ntt/thread_ntt.cu
index 8fd2764df..19321a81b 100644
--- a/icicle/src/ntt/thread_ntt.cu
+++ b/icicle/src/ntt/thread_ntt.cu
@@ -17,11 +17,11 @@ struct stage_metadata {
 #define STAGE_SIZES_DATA                                                                                               \
   {                                                                                                                    \
     {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 0, 0, 0, 0}, {5, 0, 0, 0, 0},              \
-      {6, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 4, 0, 0, 0}, {5, 4, 0, 0, 0}, {5, 5, 0, 0, 0}, {6, 5, 0, 0, 0},            \
-      {6, 6, 0, 0, 0}, {4, 5, 4, 0, 0}, {4, 6, 4, 0, 0}, {5, 5, 5, 0, 0}, {6, 4, 6, 0, 0}, {6, 5, 6, 0, 0},            \
-      {6, 6, 6, 0, 0}, {6, 5, 4, 4, 0}, {5, 5, 5, 5, 0}, {6, 5, 5, 5, 0}, {6, 5, 5, 6, 0}, {6, 6, 6, 5, 0},            \
-      {6, 6, 6, 6, 0}, {5, 5, 5, 5, 5}, {6, 5, 4, 5, 6}, {6, 5, 5, 5, 6}, {6, 5, 6, 5, 6}, {6, 6, 5, 6, 6},            \
-      {6, 6, 6, 6, 6},                                                                                                 \
+    {6, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 4, 0, 0, 0}, {5, 4, 0, 0, 0}, {5, 5, 0, 0, 0}, {6, 5, 0, 0, 0},              \
+    {6, 6, 0, 0, 0}, {4, 5, 4, 0, 0}, {4, 6, 4, 0, 0}, {5, 5, 5, 0, 0}, {6, 4, 6, 0, 0}, {6, 5, 6, 0, 0},              \
+    {6, 6, 6, 0, 0}, {6, 5, 4, 4, 0}, {5, 5, 5, 5, 0}, {6, 5, 5, 5, 0}, {6, 5, 5, 6, 0}, {6, 6, 6, 5, 0},              \
+    {6, 6, 6, 6, 0}, {5, 5, 5, 5, 5}, {6, 5, 4, 5, 6}, {6, 5, 5, 5, 6}, {6, 5, 6, 5, 6}, {6, 6, 5, 6, 6},              \
+    {6, 6, 6, 6, 6},                                                                                                   \
   }
 uint32_t constexpr STAGE_SIZES_HOST[31][5] = STAGE_SIZES_DATA;
 __device__ constexpr uint32_t STAGE_SIZES_DEVICE[31][5] = STAGE_SIZES_DATA;
@@ -33,11 +33,11 @@ uint32_t constexpr STAGE_PREV_SIZES[31] = {0,  0,  0,  0,  0,  0,  0,  0,  4,  5
 #define STAGE_SIZES_DATA_FAST_TW                                                                                       \
   {                                                                                                                    \
     {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 0, 0, 0, 0}, {5, 0, 0, 0, 0},              \
-      {6, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 4, 0, 0, 0}, {5, 4, 0, 0, 0}, {5, 5, 0, 0, 0}, {6, 5, 0, 0, 0},            \
-      {6, 6, 0, 0, 0}, {5, 4, 4, 0, 0}, {5, 4, 5, 0, 0}, {5, 5, 5, 0, 0}, {6, 5, 5, 0, 0}, {6, 5, 6, 0, 0},            \
-      {6, 6, 6, 0, 0}, {5, 5, 5, 4, 0}, {5, 5, 5, 5, 0}, {6, 5, 5, 5, 0}, {6, 5, 5, 6, 0}, {6, 6, 6, 5, 0},            \
-      {6, 6, 6, 6, 0}, {5, 5, 5, 5, 5}, {6, 5, 5, 5, 5}, {6, 5, 5, 5, 6}, {6, 5, 5, 6, 6}, {6, 6, 6, 5, 6},            \
-      {6, 6, 6, 6, 6},                                                                                                 \
+    {6, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 4, 0, 0, 0}, {5, 4, 0, 0, 0}, {5, 5, 0, 0, 0}, {6, 5, 0, 0, 0},              \
+    {6, 6, 0, 0, 0}, {5, 4, 4, 0, 0}, {5, 4, 5, 0, 0}, {5, 5, 5, 0, 0}, {6, 5, 5, 0, 0}, {6, 5, 6, 0, 0},              \
+    {6, 6, 6, 0, 0}, {5, 5, 5, 4, 0}, {5, 5, 5, 5, 0}, {6, 5, 5, 5, 0}, {6, 5, 5, 6, 0}, {6, 6, 6, 5, 0},              \
+    {6, 6, 6, 6, 0}, {5, 5, 5, 5, 5}, {6, 5, 5, 5, 5}, {6, 5, 5, 5, 6}, {6, 5, 5, 6, 6}, {6, 6, 6, 5, 6},              \
+    {6, 6, 6, 6, 6},                                                                                                   \
   }
 uint32_t constexpr STAGE_SIZES_HOST_FT[31][5] = STAGE_SIZES_DATA_FAST_TW;
 __device__ uint32_t constexpr STAGE_SIZES_DEVICE_FT[31][5] = STAGE_SIZES_DATA_FAST_TW;
diff --git a/icicle/src/vec_ops/extern.cu b/icicle/src/vec_ops/extern.cu
index aaeeec10a..e9504f782 100644
--- a/icicle/src/vec_ops/extern.cu
+++ b/icicle/src/vec_ops/extern.cu
@@ -65,9 +65,4 @@ namespace vec_ops {
   {
     return bit_reverse<scalar_t>(input, n, config, output);
   }
-  // extern "C" cudaError_t
-  //   CONCAT_EXPAND(FIELD, bit_reverse_inplace_cuda)(scalar_t* input, unsigned n, BitReverseConfig& config)
-  // {
-  //   return bit_reverse_inplace<scalar_t>(input, n, config);
-  // }
 } // namespace vec_ops
\ No newline at end of file
diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu
index ed6e5050a..f3c4ff1b1 100644
--- a/icicle/src/vec_ops/vec_ops.cu
+++ b/icicle/src/vec_ops/vec_ops.cu
@@ -194,7 +194,9 @@ namespace vec_ops {
   cudaError_t bit_reverse(const E* input, unsigned size, BitReverseConfig& cfg, E* output)
   {
     if (size & (size - 1)) THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: size must be a power of 2");
-    if ((input == output) & (cfg.is_input_on_device != cfg.is_output_on_device)) THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: equal devices should have same is_on_device parameters");
+    if ((input == output) & (cfg.is_input_on_device != cfg.is_output_on_device))
+      THROW_ICICLE_ERR(
+        IcicleError_t::InvalidArgument, "bit_reverse: equal devices should have same is_on_device parameters");
 
     E* d_output;
     if (cfg.is_output_on_device) {
diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs
index af9e4ddf7..a064a0bd7 100644
--- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs
+++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs
@@ -335,13 +335,6 @@ macro_rules! impl_vec_ops_field {
                     config: *const BitReverseConfig,
                     output: *mut $field,
                 ) -> CudaError;
-
-                // #[link_name = concat!($field_prefix, "_bit_reverse_inplace_cuda")]
-                // pub(crate) fn bit_reverse_inplace_cuda(
-                //     input: *mut $field,
-                //     size: u32,
-                //     config: *const BitReverseConfig,
-                // ) -> CudaError;
             }
         }
 

From cd3afe3cc4af6a78eabf87f3f2e88aeac4d15438 Mon Sep 17 00:00:00 2001
From: nonam3e <timur@ingonyama.com>
Date: Wed, 29 May 2024 16:59:52 +0000
Subject: [PATCH 11/14] u64 size + tests

---
 icicle/include/api/babybear.h                 |  2 +-
 icicle/include/api/bls12_377.h                |  2 +-
 icicle/include/api/bls12_381.h                |  2 +-
 icicle/include/api/bn254.h                    |  2 +-
 icicle/include/api/bw6_761.h                  |  2 +-
 icicle/include/api/grumpkin.h                 |  2 +-
 icicle/include/api/stark252.h                 |  2 +-
 icicle/src/vec_ops/extern.cu                  |  2 +-
 icicle/src/vec_ops/extern_extension.cu        |  2 +-
 icicle/src/vec_ops/vec_ops.cu                 | 18 ++++----
 wrappers/rust/icicle-core/src/vec_ops/mod.rs  |  6 +--
 .../rust/icicle-core/src/vec_ops/tests.rs     | 41 +++++++++++++++----
 12 files changed, 53 insertions(+), 30 deletions(-)

diff --git a/icicle/include/api/babybear.h b/icicle/include/api/babybear.h
index 6703f97b4..9b0726918 100644
--- a/icicle/include/api/babybear.h
+++ b/icicle/include/api/babybear.h
@@ -70,7 +70,7 @@ extern "C" cudaError_t babybear_transpose_matrix_cuda(
 
 extern "C" cudaError_t babybear_bit_reverse_cuda(
   const babybear::scalar_t* input,
-  unsigned n,
+  uint64_t n,
   vec_ops::BitReverseConfig& config,
   babybear::scalar_t* output);
 
diff --git a/icicle/include/api/bls12_377.h b/icicle/include/api/bls12_377.h
index c033103d2..d1123c0c3 100644
--- a/icicle/include/api/bls12_377.h
+++ b/icicle/include/api/bls12_377.h
@@ -118,7 +118,7 @@ extern "C" cudaError_t bls12_377_transpose_matrix_cuda(
 
 extern "C" cudaError_t bls12_377_bit_reverse_cuda(
   const bls12_377::scalar_t* input,
-  unsigned n,
+  uint64_t n,
   vec_ops::BitReverseConfig& config,
   bls12_377::scalar_t* output);
 
diff --git a/icicle/include/api/bls12_381.h b/icicle/include/api/bls12_381.h
index bdb65d8b3..dcca3d17f 100644
--- a/icicle/include/api/bls12_381.h
+++ b/icicle/include/api/bls12_381.h
@@ -118,7 +118,7 @@ extern "C" cudaError_t bls12_381_transpose_matrix_cuda(
 
 extern "C" cudaError_t bls12_381_bit_reverse_cuda(
   const bls12_381::scalar_t* input,
-  unsigned n,
+  uint64_t n,
   vec_ops::BitReverseConfig& config,
   bls12_381::scalar_t* output);
 
diff --git a/icicle/include/api/bn254.h b/icicle/include/api/bn254.h
index 0ad15f593..9b9e7bb02 100644
--- a/icicle/include/api/bn254.h
+++ b/icicle/include/api/bn254.h
@@ -150,7 +150,7 @@ extern "C" cudaError_t bn254_transpose_matrix_cuda(
 
 extern "C" cudaError_t bn254_bit_reverse_cuda(
   const bn254::scalar_t* input,
-  unsigned n,
+  uint64_t n,
   vec_ops::BitReverseConfig& config,
   bn254::scalar_t* output);
 
diff --git a/icicle/include/api/bw6_761.h b/icicle/include/api/bw6_761.h
index 957e0bb92..78877da5e 100644
--- a/icicle/include/api/bw6_761.h
+++ b/icicle/include/api/bw6_761.h
@@ -118,7 +118,7 @@ extern "C" cudaError_t bw6_761_transpose_matrix_cuda(
 
 extern "C" cudaError_t bw6_761_bit_reverse_cuda(
   const bw6_761::scalar_t* input,
-  unsigned n,
+  uint64_t n,
   vec_ops::BitReverseConfig& config,
   bw6_761::scalar_t* output);
 
diff --git a/icicle/include/api/grumpkin.h b/icicle/include/api/grumpkin.h
index c547807ea..c72be91a8 100644
--- a/icicle/include/api/grumpkin.h
+++ b/icicle/include/api/grumpkin.h
@@ -88,7 +88,7 @@ extern "C" cudaError_t grumpkin_transpose_matrix_cuda(
 
 extern "C" cudaError_t grumpkin_bit_reverse_cuda(
   const grumpkin::scalar_t* input,
-  unsigned n,
+  uint64_t n,
   vec_ops::BitReverseConfig& config,
   grumpkin::scalar_t* output);
 
diff --git a/icicle/include/api/stark252.h b/icicle/include/api/stark252.h
index c370257e4..c04702a1a 100644
--- a/icicle/include/api/stark252.h
+++ b/icicle/include/api/stark252.h
@@ -33,7 +33,7 @@ extern "C" cudaError_t stark252_transpose_matrix_cuda(
 
 extern "C" cudaError_t stark252_bit_reverse_cuda(
   const stark252::scalar_t* input,
-  unsigned n,
+  uint64_t n,
   vec_ops::BitReverseConfig& config,
   stark252::scalar_t* output);
 
diff --git a/icicle/src/vec_ops/extern.cu b/icicle/src/vec_ops/extern.cu
index e9504f782..fc1d1ac65 100644
--- a/icicle/src/vec_ops/extern.cu
+++ b/icicle/src/vec_ops/extern.cu
@@ -61,7 +61,7 @@ namespace vec_ops {
   }
 
   extern "C" cudaError_t
-  CONCAT_EXPAND(FIELD, bit_reverse_cuda)(const scalar_t* input, unsigned n, BitReverseConfig& config, scalar_t* output)
+  CONCAT_EXPAND(FIELD, bit_reverse_cuda)(const scalar_t* input, uint64_t n, BitReverseConfig& config, scalar_t* output)
   {
     return bit_reverse<scalar_t>(input, n, config, output);
   }
diff --git a/icicle/src/vec_ops/extern_extension.cu b/icicle/src/vec_ops/extern_extension.cu
index a91792913..12927aec7 100644
--- a/icicle/src/vec_ops/extern_extension.cu
+++ b/icicle/src/vec_ops/extern_extension.cu
@@ -58,7 +58,7 @@ namespace vec_ops {
   }
 
   extern "C" cudaError_t CONCAT_EXPAND(FIELD, extension_bit_reverse_cuda)(
-    const extension_t* input, unsigned n, BitReverseConfig& config, extension_t* output)
+    const extension_t* input, uint64_t n, BitReverseConfig& config, extension_t* output)
   {
     return bit_reverse<extension_t>(input, n, config, output);
   }
diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu
index f3c4ff1b1..64fe2d256 100644
--- a/icicle/src/vec_ops/vec_ops.cu
+++ b/icicle/src/vec_ops/vec_ops.cu
@@ -56,22 +56,22 @@ namespace vec_ops {
     }
 
     template <typename E>
-    __global__ void bit_reverse_kernel(const E* input, unsigned n, unsigned shift, E* output)
+    __global__ void bit_reverse_kernel(const E* input, uint64_t n, unsigned shift, E* output)
     {
-      int tid = blockIdx.x * blockDim.x + threadIdx.x;
+      uint64_t tid = blockIdx.x * blockDim.x + threadIdx.x;
       // Handling arbitrary vector size
       if (tid < n) {
-        int reversed_index = __brev(tid) >> shift;
+        int reversed_index = __brevll(tid) >> shift;
         output[reversed_index] = input[tid];
       }
     }
     template <typename E>
-    __global__ void bit_reverse_inplace_kernel(E* input, unsigned n, unsigned shift)
+    __global__ void bit_reverse_inplace_kernel(E* input, uint64_t n, unsigned shift)
     {
-      int tid = blockIdx.x * blockDim.x + threadIdx.x;
+      uint64_t tid = blockIdx.x * blockDim.x + threadIdx.x;
       // Handling arbitrary vector size
       if (tid < n) {
-        int reversed_index = __brev(tid) >> shift;
+        int reversed_index = __brevll(tid) >> shift;
         if (reversed_index > tid) {
           E temp = input[tid];
           input[tid] = input[reversed_index];
@@ -191,7 +191,7 @@ namespace vec_ops {
   }
 
   template <typename E>
-  cudaError_t bit_reverse(const E* input, unsigned size, BitReverseConfig& cfg, E* output)
+  cudaError_t bit_reverse(const E* input, uint64_t size, BitReverseConfig& cfg, E* output)
   {
     if (size & (size - 1)) THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: size must be a power of 2");
     if ((input == output) & (cfg.is_input_on_device != cfg.is_output_on_device))
@@ -206,8 +206,8 @@ namespace vec_ops {
       CHK_IF_RETURN(cudaMallocAsync(&d_output, sizeof(E) * size, cfg.ctx.stream));
     }
 
-    unsigned shift = __builtin_clz(size) + 1;
-    unsigned num_blocks = (size + MAX_THREADS_PER_BLOCK - 1) / MAX_THREADS_PER_BLOCK;
+    uint64_t shift = __builtin_clzll(size) + 1;
+    uint64_t num_blocks = (size + MAX_THREADS_PER_BLOCK - 1) / MAX_THREADS_PER_BLOCK;
 
     if ((input != output) & cfg.is_input_on_device) {
       bit_reverse_kernel<<<num_blocks, MAX_THREADS_PER_BLOCK, 0, cfg.ctx.stream>>>(input, size, shift, d_output);
diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs
index a064a0bd7..900f41f46 100644
--- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs
+++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs
@@ -331,7 +331,7 @@ macro_rules! impl_vec_ops_field {
                 #[link_name = concat!($field_prefix, "_bit_reverse_cuda")]
                 pub(crate) fn bit_reverse_cuda(
                     input: *const $field,
-                    size: u32,
+                    size: u64,
                     config: *const BitReverseConfig,
                     output: *mut $field,
                 ) -> CudaError;
@@ -424,7 +424,7 @@ macro_rules! impl_vec_ops_field {
                 unsafe {
                     $field_prefix_ident::bit_reverse_cuda(
                         input.as_ptr(),
-                        input.len() as u32,
+                        input.len() as u64,
                         cfg as *const BitReverseConfig,
                         output.as_mut_ptr(),
                     )
@@ -439,7 +439,7 @@ macro_rules! impl_vec_ops_field {
                 unsafe {
                     $field_prefix_ident::bit_reverse_cuda(
                         input.as_ptr(),
-                        input.len() as u32,
+                        input.len() as u64,
                         cfg as *const BitReverseConfig,
                         input.as_mut_ptr(),
                     )
diff --git a/wrappers/rust/icicle-core/src/vec_ops/tests.rs b/wrappers/rust/icicle-core/src/vec_ops/tests.rs
index cad95d827..1b6cd46d1 100644
--- a/wrappers/rust/icicle-core/src/vec_ops/tests.rs
+++ b/wrappers/rust/icicle-core/src/vec_ops/tests.rs
@@ -40,17 +40,28 @@ pub fn check_bit_reverse<F: FieldImpl>()
 where
     <F as FieldImpl>::Config: VecOps<F> + GenerateRandom<F>,
 {
-    const TEST_SIZE: usize = 1 << 20;
-    let input = F::Config::generate_random(TEST_SIZE);
-    let input = HostSlice::from_slice(&input);
-    let mut intermediate_result = DeviceVec::<F>::cuda_malloc(TEST_SIZE).unwrap();
+    const LOG_SIZE: u32 = 20;
+    const TEST_SIZE: usize = 1 << LOG_SIZE;
+    let input_vec = F::Config::generate_random(TEST_SIZE);
+    let input = HostSlice::from_slice(&input_vec);
+    let mut intermediate = DeviceVec::<F>::cuda_malloc(TEST_SIZE).unwrap();
     let cfg = BitReverseConfig::default();
-    bit_reverse(input, &cfg, &mut intermediate_result[..]).unwrap();
+    bit_reverse(input, &cfg, &mut intermediate[..]).unwrap();
+
+    let mut intermediate_host = vec![F::one(); TEST_SIZE];
+    intermediate
+        .copy_to_host(HostSlice::from_mut_slice(&mut intermediate_host[..]))
+        .unwrap();
+    let index_reverser = |i: usize| i.reverse_bits() >> (usize::BITS - LOG_SIZE);
+    intermediate_host
+        .iter()
+        .enumerate()
+        .for_each(|(i, val)| assert_eq!(val, &input_vec[index_reverser(i)]));
 
     let mut result = vec![F::one(); TEST_SIZE];
     let result = HostSlice::from_mut_slice(&mut result);
     let cfg = BitReverseConfig::default();
-    bit_reverse(&intermediate_result[..], &cfg, result).unwrap();
+    bit_reverse(&intermediate[..], &cfg, result).unwrap();
     assert_eq!(input.as_slice(), result.as_slice());
 }
 
@@ -58,15 +69,27 @@ pub fn check_bit_reverse_inplace<F: FieldImpl>()
 where
     <F as FieldImpl>::Config: VecOps<F> + GenerateRandom<F>,
 {
-    const TEST_SIZE: usize = 1 << 20;
-    let input = F::Config::generate_random(TEST_SIZE);
-    let input = HostSlice::from_slice(&input);
+    const LOG_SIZE: u32 = 20;
+    const TEST_SIZE: usize = 1 << LOG_SIZE;
+    let input_vec = F::Config::generate_random(TEST_SIZE);
+    let input = HostSlice::from_slice(&input_vec);
     let mut intermediate = DeviceVec::<F>::cuda_malloc(TEST_SIZE).unwrap();
     intermediate
         .copy_from_host(&input)
         .unwrap();
     let cfg = BitReverseConfig::default();
     bit_reverse_inplace(&mut intermediate[..], &cfg).unwrap();
+
+    let mut intermediate_host = vec![F::one(); TEST_SIZE];
+    intermediate
+        .copy_to_host(HostSlice::from_mut_slice(&mut intermediate_host[..]))
+        .unwrap();
+    let index_reverser = |i: usize| i.reverse_bits() >> (usize::BITS - LOG_SIZE);
+    intermediate_host
+        .iter()
+        .enumerate()
+        .for_each(|(i, val)| assert_eq!(val, &input_vec[index_reverser(i)]));
+
     bit_reverse_inplace(&mut intermediate[..], &cfg).unwrap();
     let mut result_host = vec![F::one(); TEST_SIZE];
     intermediate

From 5e6e6e84bbe50ee3a72ff1bb439f8f68658068aa Mon Sep 17 00:00:00 2001
From: ChickenLover <romangg81@gmail.com>
Date: Thu, 30 May 2024 00:04:10 +0700
Subject: [PATCH 12/14] fmt

---
 icicle/src/ntt/thread_ntt.cu | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/icicle/src/ntt/thread_ntt.cu b/icicle/src/ntt/thread_ntt.cu
index 19321a81b..8fd2764df 100644
--- a/icicle/src/ntt/thread_ntt.cu
+++ b/icicle/src/ntt/thread_ntt.cu
@@ -17,11 +17,11 @@ struct stage_metadata {
 #define STAGE_SIZES_DATA                                                                                               \
   {                                                                                                                    \
     {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 0, 0, 0, 0}, {5, 0, 0, 0, 0},              \
-    {6, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 4, 0, 0, 0}, {5, 4, 0, 0, 0}, {5, 5, 0, 0, 0}, {6, 5, 0, 0, 0},              \
-    {6, 6, 0, 0, 0}, {4, 5, 4, 0, 0}, {4, 6, 4, 0, 0}, {5, 5, 5, 0, 0}, {6, 4, 6, 0, 0}, {6, 5, 6, 0, 0},              \
-    {6, 6, 6, 0, 0}, {6, 5, 4, 4, 0}, {5, 5, 5, 5, 0}, {6, 5, 5, 5, 0}, {6, 5, 5, 6, 0}, {6, 6, 6, 5, 0},              \
-    {6, 6, 6, 6, 0}, {5, 5, 5, 5, 5}, {6, 5, 4, 5, 6}, {6, 5, 5, 5, 6}, {6, 5, 6, 5, 6}, {6, 6, 5, 6, 6},              \
-    {6, 6, 6, 6, 6},                                                                                                   \
+      {6, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 4, 0, 0, 0}, {5, 4, 0, 0, 0}, {5, 5, 0, 0, 0}, {6, 5, 0, 0, 0},            \
+      {6, 6, 0, 0, 0}, {4, 5, 4, 0, 0}, {4, 6, 4, 0, 0}, {5, 5, 5, 0, 0}, {6, 4, 6, 0, 0}, {6, 5, 6, 0, 0},            \
+      {6, 6, 6, 0, 0}, {6, 5, 4, 4, 0}, {5, 5, 5, 5, 0}, {6, 5, 5, 5, 0}, {6, 5, 5, 6, 0}, {6, 6, 6, 5, 0},            \
+      {6, 6, 6, 6, 0}, {5, 5, 5, 5, 5}, {6, 5, 4, 5, 6}, {6, 5, 5, 5, 6}, {6, 5, 6, 5, 6}, {6, 6, 5, 6, 6},            \
+      {6, 6, 6, 6, 6},                                                                                                 \
   }
 uint32_t constexpr STAGE_SIZES_HOST[31][5] = STAGE_SIZES_DATA;
 __device__ constexpr uint32_t STAGE_SIZES_DEVICE[31][5] = STAGE_SIZES_DATA;
@@ -33,11 +33,11 @@ uint32_t constexpr STAGE_PREV_SIZES[31] = {0,  0,  0,  0,  0,  0,  0,  0,  4,  5
 #define STAGE_SIZES_DATA_FAST_TW                                                                                       \
   {                                                                                                                    \
     {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 0, 0, 0, 0}, {5, 0, 0, 0, 0},              \
-    {6, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 4, 0, 0, 0}, {5, 4, 0, 0, 0}, {5, 5, 0, 0, 0}, {6, 5, 0, 0, 0},              \
-    {6, 6, 0, 0, 0}, {5, 4, 4, 0, 0}, {5, 4, 5, 0, 0}, {5, 5, 5, 0, 0}, {6, 5, 5, 0, 0}, {6, 5, 6, 0, 0},              \
-    {6, 6, 6, 0, 0}, {5, 5, 5, 4, 0}, {5, 5, 5, 5, 0}, {6, 5, 5, 5, 0}, {6, 5, 5, 6, 0}, {6, 6, 6, 5, 0},              \
-    {6, 6, 6, 6, 0}, {5, 5, 5, 5, 5}, {6, 5, 5, 5, 5}, {6, 5, 5, 5, 6}, {6, 5, 5, 6, 6}, {6, 6, 6, 5, 6},              \
-    {6, 6, 6, 6, 6},                                                                                                   \
+      {6, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 4, 0, 0, 0}, {5, 4, 0, 0, 0}, {5, 5, 0, 0, 0}, {6, 5, 0, 0, 0},            \
+      {6, 6, 0, 0, 0}, {5, 4, 4, 0, 0}, {5, 4, 5, 0, 0}, {5, 5, 5, 0, 0}, {6, 5, 5, 0, 0}, {6, 5, 6, 0, 0},            \
+      {6, 6, 6, 0, 0}, {5, 5, 5, 4, 0}, {5, 5, 5, 5, 0}, {6, 5, 5, 5, 0}, {6, 5, 5, 6, 0}, {6, 6, 6, 5, 0},            \
+      {6, 6, 6, 6, 0}, {5, 5, 5, 5, 5}, {6, 5, 5, 5, 5}, {6, 5, 5, 5, 6}, {6, 5, 5, 6, 6}, {6, 6, 6, 5, 6},            \
+      {6, 6, 6, 6, 6},                                                                                                 \
   }
 uint32_t constexpr STAGE_SIZES_HOST_FT[31][5] = STAGE_SIZES_DATA_FAST_TW;
 __device__ uint32_t constexpr STAGE_SIZES_DEVICE_FT[31][5] = STAGE_SIZES_DATA_FAST_TW;

From 8c4ec1be25d0af6e7d20f2266000239220edbe8f Mon Sep 17 00:00:00 2001
From: nonam3e <timur@ingonyama.com>
Date: Thu, 30 May 2024 00:29:38 +0700
Subject: [PATCH 13/14] tid

---
 icicle/src/vec_ops/vec_ops.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu
index 64fe2d256..f7bf7479e 100644
--- a/icicle/src/vec_ops/vec_ops.cu
+++ b/icicle/src/vec_ops/vec_ops.cu
@@ -58,7 +58,7 @@ namespace vec_ops {
     template <typename E>
     __global__ void bit_reverse_kernel(const E* input, uint64_t n, unsigned shift, E* output)
     {
-      uint64_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+      uint64_t tid = uint64_t(blockIdx.x) * blockDim.x + threadIdx.x;
       // Handling arbitrary vector size
       if (tid < n) {
         int reversed_index = __brevll(tid) >> shift;
@@ -68,7 +68,7 @@ namespace vec_ops {
     template <typename E>
     __global__ void bit_reverse_inplace_kernel(E* input, uint64_t n, unsigned shift)
     {
-      uint64_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+      uint64_t tid = uint64_t(blockIdx.x) * blockDim.x + threadIdx.x;
       // Handling arbitrary vector size
       if (tid < n) {
         int reversed_index = __brevll(tid) >> shift;

From 69f91e5e8f57047512bc54daafb25c08b4185913 Mon Sep 17 00:00:00 2001
From: ChickenLover <romangg81@gmail.com>
Date: Sun, 2 Jun 2024 10:03:01 +0300
Subject: [PATCH 14/14] disable windows test

---
 .github/workflows/rust.yml | 48 +++++++++++++++++++-------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index c486bddb5..cc80fda9d 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -79,27 +79,27 @@ jobs:
         cargo test --release --verbose -- --ignored
         cargo test --release --verbose
 
-  build-windows:
-    name: Build on Windows
-    runs-on: windows-2022
-    needs: check-changed-files
-    steps:     
-    - name: Checkout Repo
-      uses: actions/checkout@v4
-    - name: Download and Install Cuda
-      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-      id: cuda-toolkit
-      uses: Jimver/cuda-toolkit@v0.2.11
-      with:
-        cuda: '12.0.0'
-        method: 'network'
-        # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
-        sub-packages: '["cudart", "nvcc", "thrust", "visual_studio_integration"]'
-    - name: Build targets
-      working-directory: ./wrappers/rust
-      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-      env:
-        CUDA_PATH: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}
-        CUDA_ARCH: 50 # Using CUDA_ARCH=50 env variable since the CI machines have no GPUs
-      # Building from the root workspace will build all members of the workspace by default
-      run: cargo build --release --verbose
+  # build-windows:
+  #   name: Build on Windows
+  #   runs-on: windows-2022
+  #   needs: check-changed-files
+  #   steps:     
+  #   - name: Checkout Repo
+  #     uses: actions/checkout@v4
+  #   - name: Download and Install Cuda
+  #     if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+  #     id: cuda-toolkit
+  #     uses: Jimver/cuda-toolkit@v0.2.11
+  #     with:
+  #       cuda: '12.0.0'
+  #       method: 'network'
+  #       # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
+  #       sub-packages: '["cudart", "nvcc", "thrust", "visual_studio_integration"]'
+  #   - name: Build targets
+  #     working-directory: ./wrappers/rust
+  #     if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+  #     env:
+  #       CUDA_PATH: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}
+  #       CUDA_ARCH: 50 # Using CUDA_ARCH=50 env variable since the CI machines have no GPUs
+  #     # Building from the root workspace will build all members of the workspace by default
+  #     run: cargo build --release --verbose