diff --git a/.github/ISSUE_TEMPLATE/bug_issue.md b/.github/ISSUE_TEMPLATE/bug_issue.md
index 818d00e14..60b439855 100644
--- a/.github/ISSUE_TEMPLATE/bug_issue.md
+++ b/.github/ISSUE_TEMPLATE/bug_issue.md
@@ -2,7 +2,7 @@
 name: ":bug: Bug Report"
 about: Create a bug report to help us improve the repo
 title: "[BUG]: "
-labels: bug
+labels: type:bug
 ---
 
 ## Description
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
index 02a4be3ec..de824ce46 100644
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -2,7 +2,7 @@
 name: ":sparkles: Feature Request"
 about: Request the inclusion of a new feature or functionality
 title: "[FEAT]: "
-labels: enhancement
+labels: type:feature
 ---
 
 ## Description
diff --git a/.github/workflows/dev-pr.yml b/.github/workflows/dev-pr.yml
deleted file mode 100644
index d418ba3d9..000000000
--- a/.github/workflows/dev-pr.yml
+++ /dev/null
@@ -1,101 +0,0 @@
-name: Dev PR
-
-on:
-  pull_request:
-    branches:
-      - dev
-
-env:
-  CARGO_TERM_COLOR: always
-  ARCH_TYPE: sm_70
-  LD_LIBRARY_PATH: $GITHUB_WORKSPACE/goicicle
-
-jobs:
-  build-rust-linux:
-    name: Build Rust on Linux
-    runs-on: [self-hosted, Linux, X64, icicle]
-    steps:
-    - name: Checkout Repo
-      uses: actions/checkout@v3
-    - name: Build Rust
-      run: cargo build --release --verbose
-
-  test-rust-linux:
-    name: Test Rust on Linux
-    needs: build-rust-linux
-    runs-on: [self-hosted, Linux, X64, icicle]
-    steps:
-    - name: Checkout Repo
-      uses: actions/checkout@v3
-    - name: Run Rust Tests
-      run: cargo test --release --verbose -- --test-threads=1
-
-  formatting-rust:
-    name: Check Rust Code Formatting
-    runs-on: ubuntu-22.04
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v3
-    - name: Check rustfmt
-      run: if [[ $(cargo fmt --check) ]]; then echo "Please run cargo fmt"; exit 1; fi
-    # - name: Check clippy
-    #   run: cargo clippy --no-deps --all-features --all-targets
-
-  build-rust-windows:
-    name: Build Rust on Windows
-    runs-on: windows-2022
-    steps:     
-    - name: Checkout Repo
-      uses: actions/checkout@v3
-    - name: Download and Install Cuda
-      uses: Jimver/cuda-toolkit@v0.2.11
-      with:
-        cuda: '12.0.0'
-        method: 'network'
-        # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
-        sub-packages: '["cudart", "nvcc", "thrust"]'
-    - name: Build Rust Targets
-      run: cargo build --release --verbose
-
-  test-golang-linux:
-    name: Test Golang on Linux
-    runs-on: [self-hosted, Linux, X64, icicle]
-    steps:
-    - name: Checkout Repo
-      uses: actions/checkout@v3
-    - name: Build CUDA libs
-      run: make libbn254.so
-      working-directory: ./goicicle
-    - name: Run Golang Tests
-      run: |
-        export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$(pwd)/goicicle
-        go test ./goicicle/curves/bn254 -count=1
-      # TODO: Fix tests for bls12377
-      # TODO: Fix tests for bls12381
-      # run: go test ./goicicle/curves/bn254 ./goicicle/curves/bls12377 ./goicicle/curves/bls12381 -count=1
-
-  formatting-golang:
-    name: Check Golang Code Formatting
-    runs-on: ubuntu-22.04
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v3
-    - name: Check gofmt
-      run: if [[ $(go list ./... | xargs go fmt) ]]; then echo "Please run go fmt"; exit 1; fi
-
-  # TODO: Add once Golang make file supports building for Windows
-  # build-golang-windows:
-  #   name: Build Golang on Windows
-  #   runs-on: windows-2022
-  #   steps:     
-  #   - name: Checkout Repo
-  #     uses: actions/checkout@v3
-  #   - name: Download and Install Cuda
-  #     uses: Jimver/cuda-toolkit@v0.2.11
-  #     with:
-  #       cuda: '12.0.0'
-  #       method: 'network'
-  #       # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
-  #       sub-packages: '["cudart", "nvcc", "thrust"]'
-  #   - name: Build cpp libs
-  #     run: cd goicicle && make all
diff --git a/.github/workflows/main-build.yml b/.github/workflows/main-build.yml
index c40959a42..fe2e37ae9 100644
--- a/.github/workflows/main-build.yml
+++ b/.github/workflows/main-build.yml
@@ -5,6 +5,10 @@ on:
     branches:
       - main
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
 env:
   CARGO_TERM_COLOR: always
   ARCH_TYPE: native
@@ -59,9 +63,11 @@ jobs:
         cuda: '12.0.0'
         method: 'network'
         # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
-        sub-packages: '["cudart", "nvcc", "thrust"]'
+        sub-packages: '["cudart", "nvcc", "thrust", "visual_studio_integration"]'
     - name: Build Rust Targets
       if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      env:
+        CUDA_PATH: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}
       run: cargo build --release --verbose
 
   build-golang-linux:
diff --git a/.github/workflows/main-format.yml b/.github/workflows/main-format.yml
index 4f0c4d2b2..646d0221f 100644
--- a/.github/workflows/main-format.yml
+++ b/.github/workflows/main-format.yml
@@ -5,6 +5,10 @@ on:
     branches:
       - main
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
 jobs:
   formatting-rust:
     name: Check Rust Code Formatting
@@ -33,6 +37,5 @@ jobs:
     - name: Checkout
       uses: actions/checkout@v3
     - name: Check clang-format
-      run: |
-        if [[ $(find ./ -path ./icicle/build -prune -iname *.h -or -iname *.cuh -or -iname *.cu -or -iname *.c -or -iname *.cpp | xargs clang-format --dry-run -ferror-limit=1 -style=file 2>&1) ]]; then echo "Please run clang-format"; exit 1; fi
-
+      # Prune build dirs, then capture clang-format's diagnostics (emitted on stderr) so any finding fails the job.
+      run: unformatted_files=$(find ./ \( -path ./icicle/build -o -path ./target \) -prune -o \( -iname '*.h' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.c' -o -iname '*.cpp' \) -print | xargs clang-format --dry-run -ferror-limit=1 -style=file 2>&1); if [[ $unformatted_files ]]; then echo "$unformatted_files"; echo "Please run clang-format"; exit 1; fi
diff --git a/.github/workflows/main-test.yml b/.github/workflows/main-test.yml
index 6b13fe21a..4bc2e2c63 100644
--- a/.github/workflows/main-test.yml
+++ b/.github/workflows/main-test.yml
@@ -5,6 +5,10 @@ on:
     branches:
       - main
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
 env:
   CARGO_TERM_COLOR: always
   ARCH_TYPE: native
@@ -57,7 +61,7 @@ jobs:
       if: needs.check-changed-files.outputs.cpp_cuda == 'true'
       run: |
         mkdir -p build
-        cmake -S . -B build
+        cmake -DBUILD_TESTS=ON -S . -B build
         cmake --build build
     - name: Run C++ Tests
       working-directory: ./icicle/build
diff --git a/README.md b/README.md
index f9a55a416..4d95f18b5 100644
--- a/README.md
+++ b/README.md
@@ -5,13 +5,15 @@
 ![image (4)](https://user-images.githubusercontent.com/2446179/223707486-ed8eb5ab-0616-4601-8557-12050df8ccf7.png)
 
 
-<div align="center">
-
-![Build status](https://github.com/ingonyama-zk/icicle/actions/workflows/main-build.yml/badge.svg)
-![Discord server](https://img.shields.io/discord/1063033227788423299?label=Discord&logo=Discord&logoColor=%23&style=plastic)
-![Follow us on twitter](https://img.shields.io/twitter/follow/Ingo_zk?style=social)
-
-</div>
+<p align="center">
+  <img src="https://github.com/ingonyama-zk/icicle/actions/workflows/main-build.yml/badge.svg" alt="Build status">
+  <a href="https://discord.gg/EVVXTdt6DF">
+    <img src="https://img.shields.io/discord/1063033227788423299?logo=discord" alt="Chat with us on Discord">
+  </a>
+  <a href="https://twitter.com/intent/follow?screen_name=Ingo_zk">
+    <img src="https://img.shields.io/twitter/follow/Ingo_zk?style=social&logo=twitter" alt="Follow us on Twitter">
+  </a>
+</p>
 
 ## Background
 
@@ -34,6 +36,7 @@ ICICLE is a CUDA implementation of general functions widely used in ZKP. ICICLE
     - [BLS12-381]
     - [BLS12-377]
     - [BN254]
+    - [BW6-761](./icicle/curves/bw6_761/supported_operations.cu)
 
 ## Build and usage
 
@@ -43,6 +46,10 @@ ICICLE is a CUDA implementation of general functions widely used in ZKP. ICICLE
 - [NVCC] (version 12.0 or newer)
 - cmake 3.18 and above
 - follow [these instructions](https://github.com/ingonyama-zk/icicle/tree/main/icicle#prerequisites-on-ubuntu)
+- Any Nvidia GPU
+
+If you don't have access to an Nvidia GPU, check out [google-colab](#google-colab). If you require more compute power and are looking to build or do research with ICICLE, refer to our [grant program][GRANT_PROGRAM].
+
 
 ### Steps
 
@@ -59,6 +66,14 @@ nvcc -o build/<binary_name> ./icicle/curves/index.cu -lib -arch=native
 
 We are using [googletest] library for testing. To build and run [the test suite](./icicle/README.md) for finite field and elliptic curve arithmetic, run from the `icicle` folder:
 
+For testing, ensure the `BUILD_TESTS` option is enabled in cmake. If not, toggle it on by adding `-DBUILD_TESTS=ON` in the cmake configuration command:
+
+```sh
+cmake -S . -B build -DBUILD_TESTS=ON
+```
+
+Proceed with the following commands:
+
 ```sh
 mkdir -p build
 cmake -S . -B build
@@ -68,6 +83,7 @@ cd build && ctest
 
 NOTE: If you are using cmake versions < 3.24 add `-DCUDA_ARCH=<target_cumpute_arch>` to the command `cmake -S . -B build`
 
+
 ### Rust Bindings
 
 For convenience, we also provide rust bindings to the ICICLE library for the following primitives:
@@ -82,7 +98,7 @@ For convenience, we also provide rust bindings to the ICICLE library for the fol
 - Scalar Vector Multiplication
 - Point Vector Multiplication
 
-A custom [build script][B_SCRIPT] is used to compile and link the ICICLE library. The environement variable `ARCH_TYPE` is used to determine which GPU type the library should be compiled for and it defaults to `native` when it is not set allowing the compiler to detect the installed GPU type.
+A custom [build script][B_SCRIPT] is used to compile and link the ICICLE library. The environment variable `ARCH_TYPE` is used to determine which GPU type the library should be compiled for and it defaults to `native` when it is not set allowing the compiler to detect the installed GPU type.
 
 > NOTE: A GPU must be detectable and therefore installed if the `ARCH_TYPE` is not set.
 
@@ -115,20 +131,21 @@ Create a JSON file with the curve parameters. The curve is defined by the follow
 - ``curve_name`` - e.g. ``bls12_381``.
 - ``modulus_p`` - scalar field modulus (in decimal).
 - ``bit_count_p`` - number of bits needed to represent `` modulus_p`` .
-- ``limb_p`` - number of bytes needed to represent `` modulus_p``  (rounded).
-- ``ntt_size`` - log of the maximal size subgroup of the scalar field.    
+- ``limb_p`` - number of (32-bit) limbs needed to represent `` modulus_p`` (rounded up).
+- ``ntt_size`` - log of the maximal size subgroup of the scalar field.
 - ``modulus_q`` - base field modulus (in decimal).
 - ``bit_count_q`` - number of bits needed to represent `` modulus_q`` .
-- ``limb_q`` number of bytes needed to represent `` modulus_p``  (rounded).
-- ``weierstrass_b`` - Weierstrauss constant of the curve. 
-- ``weierstrass_b_g2_re`` - Weierstrauss real constant of the g2 curve. 
-- ``weierstrass_b_g2_im`` - Weierstrauss imaginary constant of the g2 curve. 
-- ``gen_x`` - x-value of a generator element for the curve. 
-- ``gen_y`` - y-value of a generator element for the curve.
-- ``gen_x_re`` - real x-value of a generator element for the g2 curve. 
-- ``gen_x_im`` - imaginary x-value of a generator element for the g2 curve. 
-- ``gen_y_re`` - real y-value of a generator element for the g2 curve. 
-- ``gen_y_im`` - imaginary y-value of a generator element for the g2 curve. 
+- ``limb_q`` - number of (32-bit) limbs needed to represent `` modulus_q`` (rounded up).
+- ``weierstrass_b`` - `b` of the curve in Weierstrass form.
+- ``weierstrass_b_g2_re`` - real part of the `b` value of the g2 curve in Weierstrass form.
+- ``weierstrass_b_g2_im`` - imaginary part of the `b` value of the g2 curve in Weierstrass form.
+- ``gen_x`` - `x` coordinate of a generator element for the curve.
+- ``gen_y`` - `y` coordinate of a generator element for the curve.
+- ``gen_x_re`` - real part of the `x` coordinate of a generator element for the g2 curve.
+- ``gen_x_im`` - imaginary part of the `x` coordinate of a generator element for the g2 curve.
+- ``gen_y_re`` - real part of the `y` coordinate of a generator element for the g2 curve.
+- ``gen_y_im`` - imaginary part of the `y` coordinate of a generator element for the g2 curve.
+- ``nonresidue`` - nonresidue, or `i^2`, or `u^2` - square of the element that generates the quadratic extension field of the base field.
 
 Here's an example for BLS12-381.
 ```
@@ -142,14 +159,15 @@ Here's an example for BLS12-381.
     "bit_count_q" : 381,
     "limb_q" : 12,
     "weierstrass_b" : 4,
-    "weierstrass_b_g2_re":4,
-    "weierstrass_b_g2_im":4,
+    "weierstrass_b_g2_re" : 4,
+    "weierstrass_b_g2_im" : 4,
     "gen_x" : 3685416753713387016781088315183077757961620795782546409894578378688607592378376318836054947676345821548104185464507,
     "gen_y" : 1339506544944476473020471379941921221584933875938349620426543736416511423956333506472724655353366534992391756441569,
     "gen_x_re" : 352701069587466618187139116011060144890029952792775240219908644239793785735715026873347600343865175952761926303160,
     "gen_x_im" : 3059144344244213709971259814753781636986470325476647558659373206291635324768958432433509563104347017837885763365758,
     "gen_y_re" : 1985150602287291935568054521177171638300868978215655730859378665066344726373823718423869104263333984641494340347905,
-    "gen_y_im" : 927553665492332455747201965776037880757740193453592970025027978793976877002675564980949289727957565575433344219582
+    "gen_y_im" : 927553665492332455747201965776037880757740193453592970025027978793976877002675564980949289727957565575433344219582,
+    "nonresidue" : -1
 }
 ```
 
@@ -167,17 +185,27 @@ The script does the following:
 - Creates a file with the curve name in ``src/curves`` with the relevant objects for the curve. 
 - Creates a test file with the curve name in ``src``. 
 
+Also, files from ``./icicle/curves/<curve_name>/supported_operations.cu`` should be added individually to the ``add_library`` section of [``./icicle/CMakeLists.txt``][CMAKELISTS].
+
 Testing the new curve could be done by running the tests in ``tests_curve_name`` (e.g. ``tests_bls12_381``).
 
 ## Docker
 
-We offer a simple Docker container so you can simply run ICICLE without settig everything up locally.
+We offer a simple Docker container so you can simply run ICICLE without setting everything up locally.
 
 ```
 docker build -t <name_of_your_choice> .
 docker run --gpus all -it <name_of_your_choice> /bin/bash
 ```
 
+## Google Colab
+
+[Colab](https://colab.google/) is a hosted Jupyter Notebook service that requires no setup to use and provides free access to computing resources, including GPUs!
+
+You can easily run ICICLE in Google Colab on a free GPU instance. This is a great option for those who want to get started with ICICLE instantly without any local setup or GPU.
+
+Follow this [guide][GOOGLE_COLAB_ICICLE] for more details.
+
 ## Contributions
 
 Join our [Discord Server][DISCORD] and find us on the icicle channel. We will be happy to work together to support your use case and talk features, bugs and design.
@@ -190,12 +218,19 @@ If you are changing code, please make sure to change your [git hooks path][HOOKS
 git config core.hooksPath ./scripts/hooks
 ```
 
+In case `clang-format` is missing on your system, you can install it using the following command:
+
+```sh
+sudo apt install clang-format
+```
+
 This will ensure our custom hooks are run and will make it easier to follow our coding guidelines.
 
 ### Hall of Fame
 
-- [Robik](https://github.com/robik75), for his on-going support and mentorship
+- [Robik](https://github.com/robik75), for his ongoing support and mentorship
 - [liuxiao](https://github.com/liuxiaobleach), for being a top notch bug smasher
+- [gkigiermo](https://github.com/gkigiermo), for making it intuitive to use ICICLE in Google Colab.
 
 ## Help & Support
 
@@ -212,6 +247,7 @@ See [LICENSE-MIT][LMIT] for details.
 [BLS12-381]: ./icicle/curves/bls12_381/supported_operations.cu
 [BLS12-377]: ./icicle/curves/bls12_377/supported_operations.cu
 [BN254]: ./icicle/curves/bn254/supported_operations.cu
+[BW6-671]: ./icicle/curves/bw6_761/supported_operations.cu
 [NVCC]: https://docs.nvidia.com/cuda/#installation-guides
 [CRV_TEMPLATE]: ./icicle/curves/curve_template/
 [CRV_CONFIG]: ./icicle/curves/index.cu
@@ -222,5 +258,8 @@ See [LICENSE-MIT][LMIT] for details.
 [googletest]: https://github.com/google/googletest/
 [HOOKS_DOCS]: https://git-scm.com/docs/githooks
 [HOOKS_PATH]: ./scripts/hooks/
+[CMAKELISTS]: https://github.com/ingonyama-zk/icicle/blob/f0e6b465611227b858ec4590f4de5432e892748d/icicle/CMakeLists.txt#L28
+[GOOGLE_COLAB_ICICLE]: https://github.com/gkigiermo/rust-cuda-colab
+[GRANT_PROGRAM]: https://docs.google.com/forms/d/e/1FAIpQLSc967TnNwxZZ4akejcSi4KOUmGrEc68ZZV-FHLfo8KnP1wbpg/viewform
 
 <!-- End Links -->
diff --git a/curve_parameters/bls12_377.json b/curve_parameters/bls12_377.json
index 22814b746..ae896f1fd 100644
--- a/curve_parameters/bls12_377.json
+++ b/curve_parameters/bls12_377.json
@@ -3,7 +3,7 @@
     "modulus_p" : 8444461749428370424248824938781546531375899335154063827935233455917409239041,
     "bit_count_p" : 253,
     "limb_p" :  8,
-    "ntt_size" : 32,
+    "ntt_size" : 47,
     "modulus_q" : 258664426012969094010652733694893533536393512754914660539884262666720468348340822774968888139573360124440321458177,
     "bit_count_q" : 377,
     "limb_q" : 12,
@@ -16,5 +16,6 @@
     "g2_gen_x_re" : 233578398248691099356572568220835526895379068987715365179118596935057653620464273615301663571204657964920925606294,
     "g2_gen_x_im" : 140913150380207355837477652521042157274541796891053068589147167627541651775299824604154852141315666357241556069118,
     "g2_gen_y_re" : 63160294768292073209381361943935198908131692476676907196754037919244929611450776219210369229519898517858833747423,
-    "g2_gen_y_im" : 149157405641012693445398062341192467754805999074082136895788947234480009303640899064710353187729182149407503257491
+    "g2_gen_y_im" : 149157405641012693445398062341192467754805999074082136895788947234480009303640899064710353187729182149407503257491,
+    "nonresidue" : -5
 }
\ No newline at end of file
diff --git a/curve_parameters/bls12_381.json b/curve_parameters/bls12_381.json
index f7557bab1..ddbce2931 100644
--- a/curve_parameters/bls12_381.json
+++ b/curve_parameters/bls12_381.json
@@ -16,5 +16,6 @@
     "g2_gen_x_re" : 352701069587466618187139116011060144890029952792775240219908644239793785735715026873347600343865175952761926303160,
     "g2_gen_x_im" : 3059144344244213709971259814753781636986470325476647558659373206291635324768958432433509563104347017837885763365758,
     "g2_gen_y_re" : 1985150602287291935568054521177171638300868978215655730859378665066344726373823718423869104263333984641494340347905,
-    "g2_gen_y_im" : 927553665492332455747201965776037880757740193453592970025027978793976877002675564980949289727957565575433344219582
+    "g2_gen_y_im" : 927553665492332455747201965776037880757740193453592970025027978793976877002675564980949289727957565575433344219582,
+    "nonresidue" : -1
 }
\ No newline at end of file
diff --git a/curve_parameters/bn254.json b/curve_parameters/bn254.json
index 4fcaa16d0..766e0a416 100644
--- a/curve_parameters/bn254.json
+++ b/curve_parameters/bn254.json
@@ -16,5 +16,6 @@
     "g2_gen_x_re" : 10857046999023057135944570762232829481370756359578518086990519993285655852781,
     "g2_gen_x_im" : 11559732032986387107991004021392285783925812861821192530917403151452391805634,
     "g2_gen_y_re" : 8495653923123431417604973247489272438418190587263600148770280649306958101930,
-    "g2_gen_y_im" : 4082367875863433681332203403145435568316851327593401208105741076214120093531
+    "g2_gen_y_im" : 4082367875863433681332203403145435568316851327593401208105741076214120093531,
+    "nonresidue" : -1
 }
\ No newline at end of file
diff --git a/curve_parameters/bw6-761.json b/curve_parameters/bw6-761.json
new file mode 100644
index 000000000..71fdb1690
--- /dev/null
+++ b/curve_parameters/bw6-761.json
@@ -0,0 +1,21 @@
+{
+    "curve_name" : "bw6_761",
+    "modulus_p" : 258664426012969094010652733694893533536393512754914660539884262666720468348340822774968888139573360124440321458177,
+    "bit_count_p" : 377,
+    "limb_p" :  12,
+    "ntt_size" : 46,
+    "modulus_q" : 6891450384315732539396789682275657542479668912536150109513790160209623422243491736087683183289411687640864567753786613451161759120554247759349511699125301598951605099378508850372543631423596795951899700429969112842764913119068299,
+    "bit_count_q" : 761,
+    "limb_q" : 24,
+    "root_of_unity" : 32863578547254505029601261939868325669770508939375122462904745766352256812585773382134936404344547323199885654433,
+    "weierstrass_b" : 6891450384315732539396789682275657542479668912536150109513790160209623422243491736087683183289411687640864567753786613451161759120554247759349511699125301598951605099378508850372543631423596795951899700429969112842764913119068298,
+    "weierstrass_b_g2_re" : 4,
+    "weierstrass_b_g2_im" : 0,
+    "g1_gen_x" : 6238772257594679368032145693622812838779005809760824733138787810501188623461307351759238099287535516224314149266511977132140828635950940021790489507611754366317801811090811367945064510304504157188661901055903167026722666149426237,
+    "g1_gen_y" : 2101735126520897423911504562215834951148127555913367997162789335052900271653517958562461315794228241561913734371411178226936527683203879553093934185950470971848972085321797958124416462268292467002957525517188485984766314758624099,
+    "g2_gen_x_re" : 6445332910596979336035888152774071626898886139774101364933948236926875073754470830732273879639675437155036544153105017729592600560631678554299562762294743927912429096636156401171909259073181112518725201388196280039960074422214428,
+    "g2_gen_x_im" : 1,
+    "g2_gen_y_re" : 562923658089539719386922163444547387757586534741080263946953401595155211934630598999300396317104182598044793758153214972605680357108252243146746187917218885078195819486220416605630144001533548163105316661692978285266378674355041,
+    "g2_gen_y_im" : 1,
+    "nonresidue" : -1
+}
diff --git a/curve_parameters/new_curve_script.py b/curve_parameters/new_curve_script.py
index ea8c4ac13..5220a3553 100644
--- a/curve_parameters/new_curve_script.py
+++ b/curve_parameters/new_curve_script.py
@@ -17,7 +17,7 @@ def to_hex(val: int, length):
     n = 8
     chunks = [x[i:i+n] for i in range(0, len(x), n)][::-1]
     s = ""
-    for c in chunks:
+    for c in chunks[:length // n]:
         s += f'0x{c}, '
         
     return s[:-2]
@@ -30,15 +30,15 @@ def compute_values(modulus, modulus_bit_count, limbs):
     modulus_2 = to_hex(modulus*2,limb_size)
     modulus_4 = to_hex(modulus*4,limb_size)
     modulus_wide = to_hex(modulus,limb_size*2)
-    modulus_squared = to_hex(modulus*modulus,limb_size)
-    modulus_squared_2 = to_hex(modulus*modulus*2,limb_size)
-    modulus_squared_4 = to_hex(modulus*modulus*4,limb_size)
+    modulus_squared = to_hex(modulus*modulus,limb_size*2)
+    modulus_squared_2 = to_hex(modulus*modulus*2,limb_size*2)
+    modulus_squared_4 = to_hex(modulus*modulus*4,limb_size*2)
     m_raw = int(math.floor(int(pow(2,2*modulus_bit_count) // modulus)))
     m = to_hex(m_raw,limb_size)
     one = to_hex(1,limb_size)
     zero = to_hex(0,limb_size)
-    montgomery_r = to_hex((2 ** bit_size) % modulus, limb_size)
-    montgomery_r_inv = to_hex(((modulus+1)//2)**bit_size % modulus, limb_size)
+    montgomery_r = to_hex(pow(2,bit_size,modulus),limb_size)
+    montgomery_r_inv = to_hex(pow(2,-bit_size,modulus),limb_size)
 
     return (
         modulus_,
@@ -56,7 +56,7 @@ def compute_values(modulus, modulus_bit_count, limbs):
     )
 
 
-def get_fq_params(modulus, modulus_bit_count, limbs, g1_gen_x, g1_gen_y, g2_gen_x_re, g2_gen_x_im, g2_gen_y_re, g2_gen_y_im):
+def get_fq_params(modulus, modulus_bit_count, limbs, nonresidue):
     (
         modulus,
         modulus_2,
@@ -73,6 +73,8 @@ def get_fq_params(modulus, modulus_bit_count, limbs, g1_gen_x, g1_gen_y, g2_gen_
     ) = compute_values(modulus, modulus_bit_count, limbs)
 
     limb_size = 8*limbs
+    nonresidue_is_negative = str(nonresidue < 0).lower()
+    nonresidue = abs(nonresidue)
     return {
         'fq_modulus': modulus,
         'fq_modulus_2': modulus_2,
@@ -86,12 +88,8 @@ def get_fq_params(modulus, modulus_bit_count, limbs, g1_gen_x, g1_gen_y, g2_gen_
         'fq_zero': zero,
         'fq_montgomery_r': montgomery_r,
         'fq_montgomery_r_inv': montgomery_r_inv,
-        'fq_gen_x': to_hex(g1_gen_x, limb_size),
-        'fq_gen_y': to_hex(g1_gen_y, limb_size),
-        'fq_gen_x_re': to_hex(g2_gen_x_re, limb_size),
-        'fq_gen_x_im': to_hex(g2_gen_x_im, limb_size),
-        'fq_gen_y_re': to_hex(g2_gen_y_re, limb_size),
-        'fq_gen_y_im': to_hex(g2_gen_y_im, limb_size)
+        'nonresidue': nonresidue,
+        'nonresidue_is_negative': nonresidue_is_negative
     }
 
 
@@ -151,6 +149,18 @@ def get_fp_params(modulus, modulus_bit_count, limbs, root_of_unity, size=0):
     }
 
 
+def get_generators(g1_gen_x, g1_gen_y, g2_gen_x_re, g2_gen_x_im, g2_gen_y_re, g2_gen_y_im, size):
+    # Build the 'fq_gen_*' entries: each generator coordinate is formatted with to_hex(value, size); the caller passes size = 8*limb_q.
+    return {
+        'fq_gen_x': to_hex(g1_gen_x, size),
+        'fq_gen_y': to_hex(g1_gen_y, size),
+        'fq_gen_x_re': to_hex(g2_gen_x_re, size),
+        'fq_gen_x_im': to_hex(g2_gen_x_im, size),
+        'fq_gen_y_re': to_hex(g2_gen_y_re, size),
+        'fq_gen_y_im': to_hex(g2_gen_y_im, size)
+    }
+
+
 def get_weier_params(weierstrass_b, weierstrass_b_g2_re, weierstrass_b_g2_im, size):
     
     return {
@@ -171,6 +181,7 @@ def get_params(config):
     bit_count_q = config["bit_count_q"] 
     limb_q = config["limb_q"]
     root_of_unity = config["root_of_unity"]
+    nonresidue = config["nonresidue"]
     if root_of_unity == modulus_p:
         sys.exit("Invalid root_of_unity value; please update in curve parameters")
 
@@ -194,13 +205,15 @@ def get_params(config):
     }
     
     fp_params = get_fp_params(modulus_p, bit_count_p, limb_p, root_of_unity, ntt_size)
-    fq_params = get_fq_params(modulus_q, bit_count_q, limb_q, g1_gen_x, g1_gen_y, g2_generator_x_re, g2_generator_x_im, g2_generator_y_re, g2_generator_y_im)
+    fq_params = get_fq_params(modulus_q, bit_count_q, limb_q, nonresidue)
+    generators = get_generators(g1_gen_x, g1_gen_y, g2_generator_x_re, g2_generator_x_im, g2_generator_y_re, g2_generator_y_im, 8*limb_q)
     weier_params = get_weier_params(weierstrass_b, weierstrass_b_g2_re, weierstrass_b_g2_im, 8*limb_q)
 
     return {
         **params,
         **fp_params,
         **fq_params,
+        **generators,
         **weier_params
     }
 
diff --git a/goicicle/Makefile b/goicicle/Makefile
index 0e11c9112..a12860cc0 100644
--- a/goicicle/Makefile
+++ b/goicicle/Makefile
@@ -5,20 +5,25 @@ LDFLAGS = -shared
 FEATURES = -DG2_DEFINED
 
 TARGET_BN254 = libbn254.so
+TARGET_BW6761 = libbw6761.so
 TARGET_BLS12_381 = libbls12_381.so
 TARGET_BLS12_377 = libbls12_377.so
 
-VPATH = ../icicle/curves/bn254:../icicle/curves/bls12_377:../icicle/curves/bls12_381
+VPATH = ../icicle/curves/bn254:../icicle/curves/bls12_377:../icicle/curves/bls12_381:../icicle/curves/bw6_761
 
 SRCS_BN254 = lde.cu msm.cu projective.cu ve_mod_mult.cu
+SRCS_BW6761 = lde.cu msm.cu projective.cu ve_mod_mult.cu
 SRCS_BLS12_381 = lde.cu msm.cu projective.cu ve_mod_mult.cu poseidon.cu
 SRCS_BLS12_377 = lde.cu msm.cu projective.cu ve_mod_mult.cu
 
-all: $(TARGET_BN254) $(TARGET_BLS12_381) $(TARGET_BLS12_377)
+all: $(TARGET_BN254) $(TARGET_BLS12_381) $(TARGET_BLS12_377) $(TARGET_BW6761)
 
 $(TARGET_BN254): 
 	$(NVCC) $(FEATURES) $(CFLAGS) $(LDFLAGS) $(addprefix ../icicle/curves/bn254/, $(SRCS_BN254)) -o $@
 
+$(TARGET_BW6761): 
+	$(NVCC) $(FEATURES) $(CFLAGS) $(LDFLAGS) $(addprefix ../icicle/curves/bw6_761/, $(SRCS_BW6761)) -o $@
+
 $(TARGET_BLS12_381):
 	$(NVCC) $(FEATURES) $(CFLAGS) $(LDFLAGS) $(addprefix ../icicle/curves/bls12_381/, $(SRCS_BLS12_381)) -o $@
 
@@ -26,4 +31,4 @@ $(TARGET_BLS12_377):
 	$(NVCC) $(FEATURES) $(CFLAGS) $(LDFLAGS) $(addprefix ../icicle/curves/bls12_377/, $(SRCS_BLS12_377)) -o $@
 
 clean:
-	rm -f $(TARGET_BN254) $(TARGET_BLS12_381) $(TARGET_BLS12_377)
+	rm -f $(TARGET_BN254) $(TARGET_BLS12_381) $(TARGET_BLS12_377) $(TARGET_BW6761)
diff --git a/goicicle/README.md b/goicicle/README.md
index 82995ecb8..623b3953a 100644
--- a/goicicle/README.md
+++ b/goicicle/README.md
@@ -11,13 +11,13 @@ To compile the CUDA files, you will need:
 
 ## Structure of the Makefile
 
-The Makefile is designed to compile CUDA files for three curves: BN254, BLS12_381, and BLS12_377. The source files are located in the `icicle/curves/` directory.
+The Makefile is designed to compile CUDA files for four curves: BN254, BLS12_381, BLS12_377 and BW6_761. The source files are located in the `icicle/curves/` directory.
 
 ## Compiling CUDA Code
 
 1. Navigate to the directory containing the Makefile in your terminal.
-2. To compile all curve libraries, use the `make all` command. This will create three shared libraries: `libbn254.so`, `libbls12_381.so`, and `libbls12_377.so`.
-3. If you want to compile a specific curve, you can do so by specifying the target. For example, to compile only the BN254 curve, use `make libbn254.so`. Replace `libbn254.so` with `libbls12_381.so` or `libbls12_377.so` to compile those curves instead.
+2. To compile all curve libraries, use the `make all` command. This will create four shared libraries: `libbn254.so`, `libbls12_381.so`, `libbls12_377.so` and `libbw6761.so`.
+3. If you want to compile a specific curve, you can do so by specifying the target. For example, to compile only the BN254 curve, use `make libbn254.so`. Replace `libbn254.so` with `libbls12_381.so`, `libbls12_377.so` or `libbw6761.so` to compile those curves instead.
 
 The resulting `.so` files are the compiled shared libraries for each curve.
 
@@ -25,13 +25,13 @@ The resulting `.so` files are the compiled shared libraries for each curve.
 
 The shared libraries produced from the CUDA code compilation are used to bind Golang to ICICLE's CUDA code.
 
-1. These shared libraries (`libbn254.so`, `libbls12_381.so`, `libbls12_377.so`) can be imported in your Go project to leverage the GPU accelerated functionalities provided by ICICLE. 
+1. These shared libraries (`libbn254.so`, `libbls12_381.so`, `libbls12_377.so`, `libbw6761.so`) can be imported in your Go project to leverage the GPU accelerated functionalities provided by ICICLE.
 
 2. In your Go project, you can use `cgo` to link these shared libraries. Here's a basic example on how you can use `cgo` to link these libraries:
 
 ```go
 /*
-#cgo LDFLAGS: -L/path/to/shared/libs -lbn254 -lbls12_381 -lbls12_377
+#cgo LDFLAGS: -L/path/to/shared/libs -lbn254 -lbls12_381 -lbls12_377 -lbw6761
 #include "icicle.h" // make sure you use the correct header file(s)
 */
 import "C"
@@ -46,7 +46,7 @@ Replace `/path/to/shared/libs` with the actual path where the shared libraries a
 
 ## Cleaning up
 
-If you want to remove the compiled files, you can use the `make clean` command. This will remove the `libbn254.so`, `libbls12_381.so`, and `libbls12_377.so` files.
+If you want to remove the compiled files, you can use the `make clean` command. This will remove the `libbn254.so`, `libbls12_381.so`, `libbls12_377.so` and `libbw6761.so` files.
 
 ## Common issues
 
diff --git a/goicicle/curves/bls12377/g2.go b/goicicle/curves/bls12377/g2.go
index 92ca068ca..837354719 100644
--- a/goicicle/curves/bls12377/g2.go
+++ b/goicicle/curves/bls12377/g2.go
@@ -84,17 +84,6 @@ func (f *G2Element) ToBytesLe() []byte {
 	return bytes
 }
 
-func (p *G2PointAffine) ToProjective() G2Point {
-	return G2Point{
-		X: p.X,
-		Y: p.Y,
-		Z: ExtentionField{
-			A0: G2Element{1, 0, 0, 0},
-			A1: G2Element{0, 0, 0, 0},
-		},
-	}
-}
-
 func (p *G2PointAffine) FromProjective(projective *G2Point) *G2PointAffine {
 	out := (*C.BLS12_377_g2_affine_t)(unsafe.Pointer(p))
 	in := (*C.BLS12_377_g2_projective_t)(unsafe.Pointer(projective))
diff --git a/goicicle/curves/bls12377/g2_test.go b/goicicle/curves/bls12377/g2_test.go
index b1cc5fd13..f6652c9e9 100644
--- a/goicicle/curves/bls12377/g2_test.go
+++ b/goicicle/curves/bls12377/g2_test.go
@@ -71,7 +71,8 @@ func TestG2ShouldConvertToProjective(t *testing.T) {
 	var pointAffine G2PointAffine
 	pointAffine.FromProjective(&pointProjective)
 
-	proj := pointAffine.ToProjective()
+	var proj G2Point
+	proj.FromAffine(&pointAffine)
 
 	assert.True(t, proj.IsOnCurve())
 	assert.True(t, pointProjective.Eq(&proj))
diff --git a/goicicle/curves/bls12377/msm_test.go b/goicicle/curves/bls12377/msm_test.go
index f8c53ff3e..6382c755c 100644
--- a/goicicle/curves/bls12377/msm_test.go
+++ b/goicicle/curves/bls12377/msm_test.go
@@ -179,7 +179,7 @@ func BenchmarkCommit(b *testing.B) {
 				e := Commit(out_d, scalars_d, points_d, msmSize, 10)
 
 				if e != 0 {
-					panic("Error occured")
+					panic("Error occurred")
 				}
 			}
 		})
@@ -226,7 +226,7 @@ func BenchmarkMSM(b *testing.B) {
 				_, e := Msm(out, points, scalars, 0)
 
 				if e != nil {
-					panic("Error occured")
+					panic("Error occurred")
 				}
 			}
 		})
@@ -288,7 +288,7 @@ func BenchmarkMsmG2BLS12_377(b *testing.B) {
 				_, e := MsmG2(out, points, scalars, 0)
 
 				if e != nil {
-					panic("Error occured")
+					panic("Error occurred")
 				}
 			}
 		})
diff --git a/goicicle/curves/bls12381/g2.go b/goicicle/curves/bls12381/g2.go
index cb62eb814..3ba78cb7d 100644
--- a/goicicle/curves/bls12381/g2.go
+++ b/goicicle/curves/bls12381/g2.go
@@ -84,17 +84,6 @@ func (f *G2Element) ToBytesLe() []byte {
 	return bytes
 }
 
-func (p *G2PointAffine) ToProjective() G2Point {
-	return G2Point{
-		X: p.X,
-		Y: p.Y,
-		Z: ExtentionField{
-			A0: G2Element{1, 0, 0, 0},
-			A1: G2Element{0, 0, 0, 0},
-		},
-	}
-}
-
 func (p *G2PointAffine) FromProjective(projective *G2Point) *G2PointAffine {
 	out := (*C.BLS12_381_g2_affine_t)(unsafe.Pointer(p))
 	in := (*C.BLS12_381_g2_projective_t)(unsafe.Pointer(projective))
diff --git a/goicicle/curves/bls12381/g2_test.go b/goicicle/curves/bls12381/g2_test.go
index c3b3035d1..38311b458 100644
--- a/goicicle/curves/bls12381/g2_test.go
+++ b/goicicle/curves/bls12381/g2_test.go
@@ -71,7 +71,8 @@ func TestG2ShouldConvertToProjective(t *testing.T) {
 	var pointAffine G2PointAffine
 	pointAffine.FromProjective(&pointProjective)
 
-	proj := pointAffine.ToProjective()
+	var proj G2Point
+	proj.FromAffine(&pointAffine)
 
 	assert.True(t, proj.IsOnCurve())
 	assert.True(t, pointProjective.Eq(&proj))
diff --git a/goicicle/curves/bls12381/msm_test.go b/goicicle/curves/bls12381/msm_test.go
index 6a12db32d..15e1f0971 100644
--- a/goicicle/curves/bls12381/msm_test.go
+++ b/goicicle/curves/bls12381/msm_test.go
@@ -179,7 +179,7 @@ func BenchmarkCommit(b *testing.B) {
 				e := Commit(out_d, scalars_d, points_d, msmSize, 10)
 
 				if e != 0 {
-					panic("Error occured")
+					panic("Error occurred")
 				}
 			}
 		})
@@ -226,7 +226,7 @@ func BenchmarkMSM(b *testing.B) {
 				_, e := Msm(out, points, scalars, 0)
 
 				if e != nil {
-					panic("Error occured")
+					panic("Error occurred")
 				}
 			}
 		})
@@ -288,7 +288,7 @@ func BenchmarkMsmG2BLS12_381(b *testing.B) {
 				_, e := MsmG2(out, points, scalars, 0)
 
 				if e != nil {
-					panic("Error occured")
+					panic("Error occurred")
 				}
 			}
 		})
diff --git a/goicicle/curves/bn254/g2.go b/goicicle/curves/bn254/g2.go
index 2a54b493d..c48fdea63 100644
--- a/goicicle/curves/bn254/g2.go
+++ b/goicicle/curves/bn254/g2.go
@@ -84,17 +84,6 @@ func (f *G2Element) ToBytesLe() []byte {
 	return bytes
 }
 
-func (p *G2PointAffine) ToProjective() G2Point {
-	return G2Point{
-		X: p.X,
-		Y: p.Y,
-		Z: ExtentionField{
-			A0: G2Element{1, 0, 0, 0},
-			A1: G2Element{0, 0, 0, 0},
-		},
-	}
-}
-
 func (p *G2PointAffine) FromProjective(projective *G2Point) *G2PointAffine {
 	out := (*C.BN254_g2_affine_t)(unsafe.Pointer(p))
 	in := (*C.BN254_g2_projective_t)(unsafe.Pointer(projective))
diff --git a/goicicle/curves/bn254/g2_test.go b/goicicle/curves/bn254/g2_test.go
index c0198cd35..a19538714 100644
--- a/goicicle/curves/bn254/g2_test.go
+++ b/goicicle/curves/bn254/g2_test.go
@@ -71,7 +71,8 @@ func TestG2ShouldConvertToProjective(t *testing.T) {
 	var pointAffine G2PointAffine
 	pointAffine.FromProjective(&pointProjective)
 
-	proj := pointAffine.ToProjective()
+	var proj G2Point
+	proj.FromAffine(&pointAffine)
 
 	assert.True(t, proj.IsOnCurve())
 	assert.True(t, pointProjective.Eq(&proj))
diff --git a/goicicle/curves/bn254/msm_test.go b/goicicle/curves/bn254/msm_test.go
index 73cb41ab6..c8f04346e 100644
--- a/goicicle/curves/bn254/msm_test.go
+++ b/goicicle/curves/bn254/msm_test.go
@@ -179,7 +179,7 @@ func BenchmarkCommit(b *testing.B) {
 				e := Commit(out_d, scalars_d, points_d, msmSize, 10)
 
 				if e != 0 {
-					panic("Error occured")
+					panic("Error occurred")
 				}
 			}
 		})
@@ -226,7 +226,7 @@ func BenchmarkMSM(b *testing.B) {
 				_, e := Msm(out, points, scalars, 0)
 
 				if e != nil {
-					panic("Error occured")
+					panic("Error occurred")
 				}
 			}
 		})
@@ -288,7 +288,7 @@ func BenchmarkMsmG2BN254(b *testing.B) {
 				_, e := MsmG2(out, points, scalars, 0)
 
 				if e != nil {
-					panic("Error occured")
+					panic("Error occurred")
 				}
 			}
 		})
diff --git a/goicicle/curves/bw6761/g1.go b/goicicle/curves/bw6761/g1.go
new file mode 100644
index 000000000..4b69ba05e
--- /dev/null
+++ b/goicicle/curves/bw6761/g1.go
@@ -0,0 +1,328 @@
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by Ingonyama DO NOT EDIT
+
+package bw6761
+
+import (
+	"unsafe"
+
+	"encoding/binary"
+)
+
+// #cgo CFLAGS: -I./include/
+// #cgo CFLAGS: -I/usr/local/cuda/include
+// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbw6761
+// #include "projective.h"
+// #include "ve_mod_mult.h"
+import "C"
+
+const (
+	SCALAR_SIZE = 12 // uint32 limbs per scalar field element
+	BASE_SIZE   = 24 // uint32 limbs per base field element
+)
+
+// G1ScalarField stores a scalar field element as SCALAR_SIZE uint32 limbs.
+type G1ScalarField struct{ S [SCALAR_SIZE]uint32 }
+
+// G1BaseField stores a base field element as BASE_SIZE uint32 limbs.
+type G1BaseField struct{ S [BASE_SIZE]uint32 }
+
+/*
+ * BaseField Constructors
+ */
+
+func (f *G1BaseField) SetZero() *G1BaseField {
+	var S [BASE_SIZE]uint32
+	f.S = S
+
+	return f
+}
+
+func (f *G1BaseField) SetOne() *G1BaseField {
+	var S [BASE_SIZE]uint32
+
+	S[0] = 1
+
+	f.S = S
+	return f
+}
+
+// FromAffine fills p by passing affine to the C routine
+// projective_from_affine_bw6_761 and returns p. The C status code is ignored.
+func (p *G1ProjectivePoint) FromAffine(affine *G1PointAffine) *G1ProjectivePoint {
+	dst := (*C.BW6761_projective_t)(unsafe.Pointer(p))
+	src := (*C.BW6761_affine_t)(unsafe.Pointer(affine))
+	C.projective_from_affine_bw6_761(dst, src)
+	return p
+}
+
+// FromLimbs stores a copy of limbs in f and returns f.
+func (f *G1BaseField) FromLimbs(limbs [BASE_SIZE]uint32) *G1BaseField {
+	f.S = limbs // array assignment copies all BASE_SIZE words
+	return f
+}
+
+/*
+ * BaseField methods
+ */
+
+// Limbs returns the element's limbs as an array value, i.e. a copy that the
+// caller may modify without affecting f.
+func (f *G1BaseField) Limbs() [BASE_SIZE]uint32 { return f.S }
+
+// ToBytesLe returns the limbs in index order, each encoded as 4 little-endian bytes.
+func (f *G1BaseField) ToBytesLe() []byte {
+	out := make([]byte, BASE_SIZE*4)
+	for i := range f.S {
+		binary.LittleEndian.PutUint32(out[4*i:4*i+4], f.S[i])
+	}
+	return out
+}
+
+/*
+ * ScalarField methods
+ */
+
+// Random overwrites p via the C routine random_scalar_bw6_761 and returns p.
+// The int status returned by the C call is discarded.
+func (p *G1ScalarField) Random() *G1ScalarField {
+	C.random_scalar_bw6_761((*C.BW6761_scalar_t)(unsafe.Pointer(p)))
+	return p
+}
+
+func (f *G1ScalarField) SetZero() *G1ScalarField {
+	var S [SCALAR_SIZE]uint32
+	f.S = S
+
+	return f
+}
+
+func (f *G1ScalarField) SetOne() *G1ScalarField {
+	var S [SCALAR_SIZE]uint32
+	S[0] = 1
+	f.S = S
+
+	return f
+}
+
+// Eq reports whether a and b hold identical limbs.
+//
+// The comparison is limb-by-limb on the stored words; no modular reduction is
+// applied by this method before comparing.
+func (a *G1ScalarField) Eq(b *G1ScalarField) bool {
+	// Go arrays are comparable, so == checks every limb.
+	return a.S == b.S
+}
+
+/*
+ * ScalarField methods
+ */
+
+// Limbs returns the scalar's limbs as an array value, i.e. a copy that the
+// caller may modify without affecting f.
+func (f *G1ScalarField) Limbs() [SCALAR_SIZE]uint32 { return f.S }
+
+// ToBytesLe returns the limbs in index order, each encoded as 4 little-endian bytes.
+func (f *G1ScalarField) ToBytesLe() []byte {
+	out := make([]byte, SCALAR_SIZE*4)
+	for i := range f.S {
+		binary.LittleEndian.PutUint32(out[4*i:4*i+4], f.S[i])
+	}
+	return out
+}
+
+/*
+ * PointBW6761
+ */
+
+type G1ProjectivePoint struct {
+	X, Y, Z G1BaseField
+}
+
+// SetZero overwrites f with the coordinates X = 0, Y = 1, Z = 0 and returns f
+// so calls can be chained.
+//
+// "1" here is the value produced by G1BaseField.SetOne (limb 0 set to 1, all
+// other limbs zero), and "0" is the all-zero limb array.
+func (f *G1ProjectivePoint) SetZero() *G1ProjectivePoint {
+	// Build the two distinct coordinate values once.
+	var zero, one G1BaseField
+	zero.SetZero()
+	one.SetOne()
+
+	// Assign the three coordinates in a single statement.
+	f.X, f.Y, f.Z = zero, one, zero
+
+	return f
+}
+
+// Eq reports whether p and pCompare are equal according to the C routine
+// eq_bw6_761.
+func (p *G1ProjectivePoint) Eq(pCompare *G1ProjectivePoint) bool {
+	// Reinterpret both Go structs as the C projective type. This relies on
+	// G1ProjectivePoint and C.BW6761_projective_t sharing a memory layout,
+	// which Go cannot check for us.
+	lhs := (*C.BW6761_projective_t)(unsafe.Pointer(p))
+	rhs := (*C.BW6761_projective_t)(unsafe.Pointer(pCompare))
+
+	// Plain cgo call; the C function is expected not to retain either pointer.
+	equal := C.eq_bw6_761(lhs, rhs)
+	return bool(equal)
+}
+
+// IsOnCurve returns the result of the C check projective_is_on_curve_bw6_761
+// applied to p.
+func (p *G1ProjectivePoint) IsOnCurve() bool {
+	cPoint := (*C.BW6761_projective_t)(unsafe.Pointer(p))
+	return bool(C.projective_is_on_curve_bw6_761(cPoint))
+}
+
+// Random overwrites p via the C routine random_projective_bw6_761 and returns
+// p. The int status returned by the C call is discarded.
+func (p *G1ProjectivePoint) Random() *G1ProjectivePoint {
+	C.random_projective_bw6_761((*C.BW6761_projective_t)(unsafe.Pointer(p)))
+	return p
+}
+
+// StripZ returns a new G1PointAffine holding copies of p.X and p.Y; Z is
+// simply dropped and the coordinates are not rescaled.
+func (p *G1ProjectivePoint) StripZ() *G1PointAffine {
+	affine := G1PointAffine{X: p.X, Y: p.Y}
+	return &affine
+}
+
+func (p *G1ProjectivePoint) FromLimbs(x, y, z *[]uint32) *G1ProjectivePoint {
+	var _x G1BaseField
+	var _y G1BaseField
+	var _z G1BaseField
+
+	_x.FromLimbs(GetFixedLimbs(x))
+	_y.FromLimbs(GetFixedLimbs(y))
+	_z.FromLimbs(GetFixedLimbs(z))
+
+	p.X = _x
+	p.Y = _y
+	p.Z = _z
+
+	return p
+}
+
+/*
+ * PointAffineNoInfinityBW6761
+ */
+
+type G1PointAffine struct {
+	X, Y G1BaseField
+}
+
+// FromProjective fills p by passing projective to the C routine
+// projective_to_affine_bw6_761 and returns p. The C status code is ignored.
+func (p *G1PointAffine) FromProjective(projective *G1ProjectivePoint) *G1PointAffine {
+	src := (*C.BW6761_projective_t)(unsafe.Pointer(projective))
+	dst := (*C.BW6761_affine_t)(unsafe.Pointer(p))
+	C.projective_to_affine_bw6_761(dst, src)
+	return p
+}
+
+func (p *G1PointAffine) ToProjective() *G1ProjectivePoint {
+	var Z G1BaseField
+	Z.SetOne()
+
+	return &G1ProjectivePoint{
+		X: p.X,
+		Y: p.Y,
+		Z: Z,
+	}
+}
+
+func (p *G1PointAffine) FromLimbs(X, Y *[]uint32) *G1PointAffine {
+	var _x G1BaseField
+	var _y G1BaseField
+
+	_x.FromLimbs(GetFixedLimbs(X))
+	_y.FromLimbs(GetFixedLimbs(Y))
+
+	p.X = _x
+	p.Y = _y
+
+	return p
+}
+
+/*
+ * Multiplication
+ */
+
+// MultiplyVec passes points a and scalars b (equal length, else panic) to C vec_mod_mult_point_bw6_761 on deviceID; empty input is a no-op. Result presumably lands in a - TODO confirm.
+func MultiplyVec(a []G1ProjectivePoint, b []G1ScalarField, deviceID int) {
+	if len(a) != len(b) {
+		panic("a and b have different lengths")
+	}
+	if len(a) == 0 {
+		return // nothing to multiply; &a[0] below would panic on an empty slice
+	}
+	pointsC := (*C.BW6761_projective_t)(unsafe.Pointer(&a[0]))
+	scalarsC := (*C.BW6761_scalar_t)(unsafe.Pointer(&b[0]))
+	C.vec_mod_mult_point_bw6_761(pointsC, scalarsC, C.size_t(len(a)), C.size_t(deviceID))
+}
+
+// MultiplyScalar passes scalar vectors a and b (equal length, else panic) to C vec_mod_mult_scalar_bw6_761 on deviceID; empty input is a no-op. Result presumably lands in a - TODO confirm.
+func MultiplyScalar(a []G1ScalarField, b []G1ScalarField, deviceID int) {
+	if len(a) != len(b) {
+		panic("a and b have different lengths")
+	}
+	if len(a) == 0 {
+		return // nothing to multiply; &a[0] below would panic on an empty slice
+	}
+	aC := (*C.BW6761_scalar_t)(unsafe.Pointer(&a[0]))
+	bC := (*C.BW6761_scalar_t)(unsafe.Pointer(&b[0]))
+	C.vec_mod_mult_scalar_bw6_761(aC, bC, C.size_t(len(a)), C.size_t(deviceID))
+}
+
+// Multiply a matrix by a scalar:
+//
+//	`a` - flattenned matrix;
+//	`b` - vector to multiply `a` by;
+func MultiplyMatrix(a []G1ScalarField, b []G1ScalarField, deviceID int) {
+	c := make([]G1ScalarField, len(b))
+	for i := range c {
+		var p G1ScalarField
+		p.SetZero()
+
+		c[i] = p
+	}
+
+	aC := (*C.BW6761_scalar_t)(unsafe.Pointer(&a[0]))
+	bC := (*C.BW6761_scalar_t)(unsafe.Pointer(&b[0]))
+	cC := (*C.BW6761_scalar_t)(unsafe.Pointer(&c[0]))
+	deviceIdC := C.size_t(deviceID)
+	nElementsC := C.size_t(len(a))
+
+	C.matrix_vec_mod_mult_bw6_761(aC, bC, cC, nElementsC, deviceIdC)
+}
+
+/*
+ * Utils
+ */
+
+func GetFixedLimbs(slice *[]uint32) [BASE_SIZE]uint32 {
+	if len(*slice) <= BASE_SIZE {
+		limbs := [BASE_SIZE]uint32{}
+		copy(limbs[:len(*slice)], *slice)
+		return limbs
+	}
+
+	panic("slice has too many elements")
+}
diff --git a/goicicle/curves/bw6761/g1_test.go b/goicicle/curves/bw6761/g1_test.go
new file mode 100644
index 000000000..b530c7a69
--- /dev/null
+++ b/goicicle/curves/bw6761/g1_test.go
@@ -0,0 +1,212 @@
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by Ingonyama DO NOT EDIT
+
+package bw6761
+
+import (
+	"encoding/binary"
+	"fmt"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// generateUint32Array returns length zeros when isZero is true, otherwise 1, 2, ..., length.
+func generateUint32Array(length int, isZero bool) []uint32 {
+	values := make([]uint32, length) // zero-filled by make
+	if isZero {
+		return values
+	}
+	for idx := range values {
+		values[idx] = uint32(idx + 1)
+	}
+	return values
+}
+
+// TestNewFieldBW6761One checks that SetOne yields limb 0 = 1 and zeros elsewhere.
+func TestNewFieldBW6761One(t *testing.T) {
+	var oneField G1BaseField
+	oneField.SetOne()
+
+	// testify expects (expected, actual); the original order mislabelled failures.
+	assert.Equal(t, [24]uint32{0x1}, oneField.S)
+}
+
+// TestNewFieldBW6761Zero checks that SetZero leaves all 24 limbs at zero.
+func TestNewFieldBW6761Zero(t *testing.T) {
+	var zeroField G1BaseField
+	zeroField.SetZero()
+
+	// testify expects (expected, actual); the original order mislabelled failures.
+	assert.Equal(t, [24]uint32{}, zeroField.S)
+}
+
+// TestFieldBW6761ToBytesLe compares ToBytesLe on a random X coordinate with a locally computed little-endian encoding.
+func TestFieldBW6761ToBytesLe(t *testing.T) {
+	var p G1ProjectivePoint
+	p.Random()
+	expected := make([]byte, len(p.X.S)*4) // each uint32 takes 4 bytes
+	for i, v := range p.X.S {
+		binary.LittleEndian.PutUint32(expected[i*4:], v)
+	}
+	got := p.X.ToBytesLe()
+	assert.Equal(t, expected, got) // (expected, actual) order, as testify requires
+	assert.Len(t, got, 96)         // 24 limbs * 4 bytes
+}
+
+// TestNewPointBW6761Zero checks that SetZero yields X = 0, Y = 1, Z = 0.
+func TestNewPointBW6761Zero(t *testing.T) {
+	var pointZero G1ProjectivePoint
+	pointZero.SetZero()
+
+	var baseOne, zeroSanity G1BaseField
+	baseOne.SetOne()
+	zeroSanity.SetZero()
+
+	// testify expects (expected, actual); the original order mislabelled failures.
+	assert.Equal(t, zeroSanity, pointZero.X)
+	assert.Equal(t, baseOne, pointZero.Y)
+	assert.Equal(t, zeroSanity, pointZero.Z)
+}
+
+// TestFromProjectiveToAffine round-trips a random point projective -> affine
+// -> projective and checks both ends are on the curve and equal.
+func TestFromProjectiveToAffine(t *testing.T) {
+	fmt.Print() // this prevents the test from hanging. TODO: figure out why
+
+	var original, roundTripped G1ProjectivePoint
+	var viaAffine G1PointAffine
+	original.Random()
+	viaAffine.FromProjective(&original)
+	roundTripped.FromAffine(&viaAffine)
+
+	assert.True(t, original.IsOnCurve())
+	assert.True(t, roundTripped.IsOnCurve())
+	assert.True(t, original.Eq(&roundTripped))
+}
+
+// TestBW6761Eq checks Eq is true for a point and itself, false for two separately generated points.
+func TestBW6761Eq(t *testing.T) {
+	var p1, p2 G1ProjectivePoint
+	p1.Random()
+	p2.Random()
+
+	assert.True(t, p1.Eq(&p1))
+	assert.False(t, p1.Eq(&p2))
+}
+
+// TestBW6761StripZ checks that StripZ keeps X and Y unchanged in a G1PointAffine.
+func TestBW6761StripZ(t *testing.T) {
+	var source G1ProjectivePoint
+	source.Random()
+	stripped := source.StripZ()
+
+	assert.IsType(t, G1PointAffine{}, *stripped)
+	assert.Equal(t, source.X, stripped.X)
+	assert.Equal(t, source.Y, stripped.Y)
+}
+
+// TestPointBW6761fromLimbs rebuilds a random point from its coordinate limbs
+// and checks the result equals the original.
+func TestPointBW6761fromLimbs(t *testing.T) {
+	var p G1ProjectivePoint
+	p.Random()
+
+	// Limbs returns arrays; FromLimbs takes pointers to slices.
+	x, y, z := p.X.Limbs(), p.Y.Limbs(), p.Z.Limbs()
+	xSlice, ySlice, zSlice := x[:], y[:], z[:]
+
+	var pFromLimbs G1ProjectivePoint
+	pFromLimbs.FromLimbs(&xSlice, &ySlice, &zSlice)
+
+	// testify expects (expected, actual); the original order mislabelled
+	// failures.
+	assert.Equal(t, p, pFromLimbs)
+}
+
+// TestNewPointAffineNoInfinityBW6761Zero checks the zero value of G1PointAffine has X = Y = 0.
+func TestNewPointAffineNoInfinityBW6761Zero(t *testing.T) {
+	var zeroP G1PointAffine
+	var zeroSanity G1BaseField
+	zeroSanity.SetZero()
+	// testify expects (expected, actual); the original order mislabelled failures.
+	assert.Equal(t, zeroSanity, zeroP.X)
+	assert.Equal(t, zeroSanity, zeroP.Y)
+}
+
+// TestPointAffineNoInfinityBW6761FromLimbs checks that G1PointAffine.FromLimbs
+// stores the same coordinates as building each G1BaseField directly with
+// G1BaseField.FromLimbs.
+func TestPointAffineNoInfinityBW6761FromLimbs(t *testing.T) {
+	// Input limbs: the same 24-word pattern for both coordinates.
+	x := [24]uint32{1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8}
+	y := [24]uint32{1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8}
+
+	// Expected value: each coordinate built via G1BaseField.FromLimbs.
+	var xBase G1BaseField
+	var yBase G1BaseField
+	xBase.FromLimbs(x)
+	yBase.FromLimbs(y)
+	expected := G1PointAffine{X: xBase, Y: yBase}
+
+	// Value under test: built from slices over the same arrays.
+	xSlice := x[:]
+	ySlice := y[:]
+
+	var result G1PointAffine
+	result.FromLimbs(&xSlice, &ySlice)
+
+	// Both construction paths must agree.
+	assert.Equal(t, expected, result)
+}
+
+// TestGetFixedLimbs covers zero-padding of short input, exact-size input,
+// empty input, and the panic on input longer than 24 limbs.
+func TestGetFixedLimbs(t *testing.T) {
+	// assert.Equal takes (expected, actual); the original calls had them swapped.
+	t.Run("case of valid input of length less than 8", func(t *testing.T) {
+		slice := []uint32{1, 2, 3, 4, 5, 6, 7}
+		expected := [24]uint32{1, 2, 3, 4, 5, 6, 7, 0}
+
+		assert.Equal(t, expected, GetFixedLimbs(&slice))
+	})
+
+	t.Run("case of valid input of length 24", func(t *testing.T) {
+		slice := generateUint32Array(24, false)
+		expected := [24]uint32(generateUint32Array(24, false))
+
+		assert.Equal(t, expected, GetFixedLimbs(&slice))
+	})
+
+	t.Run("case of empty input", func(t *testing.T) {
+		slice := []uint32{}
+		expected := [24]uint32(generateUint32Array(24, true))
+
+		assert.Equal(t, expected, GetFixedLimbs(&slice))
+	})
+
+	t.Run("case of input length greater than 24", func(t *testing.T) {
+		slice := generateUint32Array(25, false)
+
+		defer func() {
+			if r := recover(); r == nil {
+				t.Errorf("the code did not panic")
+			}
+		}()
+
+		GetFixedLimbs(&slice)
+	})
+}
diff --git a/goicicle/curves/bw6761/g2.go b/goicicle/curves/bw6761/g2.go
new file mode 100644
index 000000000..6cbc7d2a7
--- /dev/null
+++ b/goicicle/curves/bw6761/g2.go
@@ -0,0 +1,98 @@
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by Ingonyama DO NOT EDIT
+
+package bw6761
+
+import (
+	"encoding/binary"
+	"unsafe"
+)
+
+// #cgo CFLAGS: -I./include/
+// #cgo CFLAGS: -I/usr/local/cuda/include
+// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbw6761
+// #include "projective.h"
+// #include "ve_mod_mult.h"
+import "C"
+
+// G2 extension field
+
+// G2Element is one G2 coordinate stored as twelve 64-bit words.
+type G2Element [12]uint64
+
+// G2PointAffine is a G2 point with two coordinates (X, Y).
+type G2PointAffine struct{ X, Y G2Element }
+
+// G2Point is a G2 point with three coordinates (X, Y, Z); its methods
+// reinterpret it in place as C.BW6761_g2_projective_t.
+type G2Point struct{ X, Y, Z G2Element }
+
+// Random overwrites p via the C routine random_g2_projective_bw6_761 and
+// returns p. The int status returned by the C call is discarded.
+func (p *G2Point) Random() *G2Point {
+	C.random_g2_projective_bw6_761((*C.BW6761_g2_projective_t)(unsafe.Pointer(p)))
+	return p
+}
+
+// FromAffine fills p by passing affine to the C routine
+// g2_projective_from_affine_bw6_761 and returns p. The C status is ignored.
+func (p *G2Point) FromAffine(affine *G2PointAffine) *G2Point {
+	dst := (*C.BW6761_g2_projective_t)(unsafe.Pointer(p))
+	src := (*C.BW6761_g2_affine_t)(unsafe.Pointer(affine))
+	C.g2_projective_from_affine_bw6_761(dst, src)
+	return p
+}
+
+// Eq reports whether p and pCompare are equal according to the C routine
+// eq_g2_bw6_761.
+func (p *G2Point) Eq(pCompare *G2Point) bool {
+	// Reinterpret both Go structs as the C G2 projective type. This relies on
+	// G2Point and C.BW6761_g2_projective_t sharing a memory layout, which Go
+	// cannot check for us.
+	lhs := (*C.BW6761_g2_projective_t)(unsafe.Pointer(p))
+	rhs := (*C.BW6761_g2_projective_t)(unsafe.Pointer(pCompare))
+
+	// Plain cgo call; the C function is expected not to retain either pointer.
+	equal := C.eq_g2_bw6_761(lhs, rhs)
+	return bool(equal)
+}
+
+func (f *G2Element) ToBytesLe() []byte {
+	var bytes []byte
+	for _, val := range f {
+		buf := make([]byte, 8) // 8 bytes because uint64 is 64-bit
+		binary.LittleEndian.PutUint64(buf, val)
+		bytes = append(bytes, buf...)
+	}
+	return bytes
+}
+
+// FromProjective fills p by passing projective to the C routine
+// g2_projective_to_affine_bw6_761 and returns p. The C status is ignored.
+func (p *G2PointAffine) FromProjective(projective *G2Point) *G2PointAffine {
+	dst := (*C.BW6761_g2_affine_t)(unsafe.Pointer(p))
+	src := (*C.BW6761_g2_projective_t)(unsafe.Pointer(projective))
+	C.g2_projective_to_affine_bw6_761(dst, src)
+	return p
+}
+
+// IsOnCurve returns the result of the C check
+// g2_projective_is_on_curve_bw6_761 applied to p.
+func (p *G2Point) IsOnCurve() bool {
+	// No copy is made: p's memory is reinterpreted as the C struct in place.
+	cPoint := (*C.BW6761_g2_projective_t)(unsafe.Pointer(p))
+	return bool(C.g2_projective_is_on_curve_bw6_761(cPoint))
+}
diff --git a/goicicle/curves/bw6761/g2_test.go b/goicicle/curves/bw6761/g2_test.go
new file mode 100644
index 000000000..bc7ebf845
--- /dev/null
+++ b/goicicle/curves/bw6761/g2_test.go
@@ -0,0 +1,83 @@
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by Ingonyama DO NOT EDIT
+
+package bw6761
+
+import (
+	"fmt"
+	"testing"
+	"unsafe"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestG2Eqg2 checks that a G2 point produced by Random compares equal to itself.
+func TestG2Eqg2(t *testing.T) {
+	var sample G2Point
+	sample.Random()
+
+	assert.True(t, sample.Eq(&sample))
+}
+
+// TestG2FromProjectiveToAffine round-trips a G2 point projective -> affine ->
+// projective and checks both ends are on the curve and equal.
+func TestG2FromProjectiveToAffine(t *testing.T) {
+	fmt.Print() // this prevents the test from hanging. TODO: figure out why
+
+	var original, roundTripped G2Point
+	var viaAffine G2PointAffine
+	original.Random()
+	viaAffine.FromProjective(&original)
+	roundTripped.FromAffine(&viaAffine)
+
+	assert.True(t, original.IsOnCurve())
+	assert.True(t, roundTripped.IsOnCurve())
+	assert.True(t, original.Eq(&roundTripped))
+}
+
+// TestG2Eqg2NotEqual checks that two G2 points produced by separate Random
+// calls compare unequal.
+func TestG2Eqg2NotEqual(t *testing.T) {
+	var first, second G2Point
+	first.Random()
+	second.Random()
+
+	assert.False(t, first.Eq(&second))
+}
+
+// TestG2ToBytes checks ToBytesLe emits exactly as many bytes as a G2Element occupies.
+func TestG2ToBytes(t *testing.T) {
+	var point G2Point
+	point.Random()
+
+	// assert.Len labels expected/actual correctly; the original Equal call had them swapped.
+	assert.Len(t, point.X.ToBytesLe(), int(unsafe.Sizeof(G2Element{})))
+}
+
+// TestG2ShouldConvertToProjective converts a G2 point to affine and back,
+// then checks the result is on the curve and equals the original.
+func TestG2ShouldConvertToProjective(t *testing.T) {
+	fmt.Print() // this prevents the test from hanging. TODO: figure out why
+
+	var original, rebuilt G2Point
+	var viaAffine G2PointAffine
+	original.Random()
+	viaAffine.FromProjective(&original)
+	rebuilt.FromAffine(&viaAffine)
+
+	assert.True(t, rebuilt.IsOnCurve())
+	assert.True(t, original.Eq(&rebuilt))
+}
diff --git a/goicicle/curves/bw6761/include/msm.h b/goicicle/curves/bw6761/include/msm.h
new file mode 100644
index 000000000..03901c147
--- /dev/null
+++ b/goicicle/curves/bw6761/include/msm.h
@@ -0,0 +1,101 @@
+
+	// Copyright 2023 Ingonyama
+	//
+	// Licensed under the Apache License, Version 2.0 (the "License");
+	// you may not use this file except in compliance with the License.
+	// You may obtain a copy of the License at
+	//
+	//     http://www.apache.org/licenses/LICENSE-2.0
+	//
+	// Unless required by applicable law or agreed to in writing, software
+	// distributed under the License is distributed on an "AS IS" BASIS,
+	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	// See the License for the specific language governing permissions and
+	// limitations under the License.
+	
+// Code generated by Ingonyama DO NOT EDIT
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <stdbool.h>
+// msm.h
+
+#ifndef _BW6761_MSM_H
+#define _BW6761_MSM_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Incomplete (opaque) declarations of the BW6761 point and scalar structs
+typedef struct BW6761_projective_t BW6761_projective_t;
+typedef struct BW6761_g2_projective_t BW6761_g2_projective_t;
+typedef struct BW6761_affine_t BW6761_affine_t;
+typedef struct BW6761_g2_affine_t BW6761_g2_affine_t;
+typedef struct BW6761_scalar_t BW6761_scalar_t;
+typedef cudaStream_t CudaStream_t;
+
+int msm_cuda_bw6_761(
+  BW6761_projective_t* out, BW6761_affine_t* points, BW6761_scalar_t* scalars, size_t count, size_t device_id);
+
+int msm_batch_cuda_bw6_761(
+  BW6761_projective_t* out,
+  BW6761_affine_t* points,
+  BW6761_scalar_t* scalars,
+  size_t batch_size,
+  size_t msm_size,
+  size_t device_id);
+
+int commit_cuda_bw6_761(
+  BW6761_projective_t* d_out,
+  BW6761_scalar_t* d_scalars,
+  BW6761_affine_t* d_points,
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id);
+
+int commit_batch_cuda_bw6_761(
+  BW6761_projective_t* d_out,
+  BW6761_scalar_t* d_scalars,
+  BW6761_affine_t* d_points,
+  size_t count,
+  size_t batch_size,
+  size_t device_id);
+
+int msm_g2_cuda_bw6_761(
+  BW6761_g2_projective_t* out,
+  BW6761_g2_affine_t* points,
+  BW6761_scalar_t* scalars,
+  size_t count,
+  size_t device_id);
+
+int msm_batch_g2_cuda_bw6_761(
+  BW6761_g2_projective_t* out,
+  BW6761_g2_affine_t* points,
+  BW6761_scalar_t* scalars,
+  size_t batch_size,
+  size_t msm_size,
+  size_t device_id);
+
+int commit_g2_cuda_bw6_761(
+  BW6761_g2_projective_t* d_out,
+  BW6761_scalar_t* d_scalars,
+  BW6761_g2_affine_t* d_points,
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id);
+
+int commit_batch_g2_cuda_bw6_761(
+  BW6761_g2_projective_t* d_out,
+  BW6761_scalar_t* d_scalars,
+  BW6761_g2_affine_t* d_points,
+  size_t count,
+  size_t batch_size,
+  size_t device_id,
+  cudaStream_t stream);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _BW6761_MSM_H */
diff --git a/goicicle/curves/bw6761/include/ntt.h b/goicicle/curves/bw6761/include/ntt.h
new file mode 100644
index 000000000..61f371427
--- /dev/null
+++ b/goicicle/curves/bw6761/include/ntt.h
@@ -0,0 +1,198 @@
+
+	// Copyright 2023 Ingonyama
+	//
+	// Licensed under the Apache License, Version 2.0 (the "License");
+	// you may not use this file except in compliance with the License.
+	// You may obtain a copy of the License at
+	//
+	//     http://www.apache.org/licenses/LICENSE-2.0
+	//
+	// Unless required by applicable law or agreed to in writing, software
+	// distributed under the License is distributed on an "AS IS" BASIS,
+	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	// See the License for the specific language governing permissions and
+	// limitations under the License.
+	
+// Code generated by Ingonyama DO NOT EDIT
+
+#include <cuda.h>
+#include <stdbool.h>
+// ntt.h
+
+#ifndef _BW6761_NTT_H
+#define _BW6761_NTT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Incomplete (opaque) declarations of the BW6761 point and scalar structs
+typedef struct BW6761_projective_t BW6761_projective_t;
+typedef struct BW6761_affine_t BW6761_affine_t;
+typedef struct BW6761_scalar_t BW6761_scalar_t;
+
+typedef struct BW6761_g2_projective_t BW6761_g2_projective_t;
+typedef struct BW6761_g2_affine_t BW6761_g2_affine_t;
+
+int ntt_cuda_bw6_761(BW6761_scalar_t* arr, uint32_t n, bool inverse, size_t device_id);
+int ntt_batch_cuda_bw6_761(
+  BW6761_scalar_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);
+
+int ecntt_cuda_bw6_761(BW6761_projective_t* arr, uint32_t n, bool inverse, size_t device_id);
+int ecntt_batch_cuda_bw6_761(
+  BW6761_projective_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);
+
+BW6761_scalar_t* 
+build_domain_cuda_bw6_761(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id, size_t stream);
+
+int interpolate_scalars_cuda_bw6_761(
+  BW6761_scalar_t* d_out,
+  BW6761_scalar_t* d_evaluations,
+  BW6761_scalar_t* d_domain,
+  unsigned n,
+  unsigned device_id,
+  size_t stream);
+int interpolate_scalars_batch_cuda_bw6_761(
+  BW6761_scalar_t* d_out,
+  BW6761_scalar_t* d_evaluations,
+  BW6761_scalar_t* d_domain,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id,
+  size_t stream);
+int interpolate_points_cuda_bw6_761(
+  BW6761_projective_t* d_out,
+  BW6761_projective_t* d_evaluations,
+  BW6761_scalar_t* d_domain,
+  unsigned n,
+  size_t device_id,
+  size_t stream);
+int interpolate_points_batch_cuda_bw6_761(
+  BW6761_projective_t* d_out,
+  BW6761_projective_t* d_evaluations,
+  BW6761_scalar_t* d_domain,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id,
+  size_t stream);
+int interpolate_scalars_on_coset_cuda_bw6_761(
+  BW6761_scalar_t* d_out,
+  BW6761_scalar_t* d_evaluations,
+  BW6761_scalar_t* d_domain,
+  unsigned n,
+  BW6761_scalar_t* coset_powers,
+  size_t device_id,
+  size_t stream);
+int interpolate_scalars_batch_on_coset_cuda_bw6_761(
+  BW6761_scalar_t* d_out,
+  BW6761_scalar_t* d_evaluations,
+  BW6761_scalar_t* d_domain,
+  unsigned n,
+  unsigned batch_size,
+  BW6761_scalar_t* coset_powers,
+  size_t device_id,
+  size_t stream);
+
+int evaluate_scalars_cuda_bw6_761(
+  BW6761_scalar_t* d_out,
+  BW6761_scalar_t* d_coefficients,
+  BW6761_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned device_id,
+  size_t stream);
+int evaluate_scalars_batch_cuda_bw6_761(
+  BW6761_scalar_t* d_out,
+  BW6761_scalar_t* d_coefficients,
+  BW6761_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id,
+  size_t stream);
+int evaluate_points_cuda_bw6_761(
+  BW6761_projective_t* d_out,
+  BW6761_projective_t* d_coefficients,
+  BW6761_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  size_t device_id,
+  size_t stream);
+int evaluate_points_batch_cuda_bw6_761(
+  BW6761_projective_t* d_out,
+  BW6761_projective_t* d_coefficients,
+  BW6761_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id,
+  size_t stream);
+int evaluate_scalars_on_coset_cuda_bw6_761(
+  BW6761_scalar_t* d_out,
+  BW6761_scalar_t* d_coefficients,
+  BW6761_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  BW6761_scalar_t* coset_powers,
+  unsigned device_id,
+  size_t stream);
+int evaluate_scalars_on_coset_batch_cuda_bw6_761(
+  BW6761_scalar_t* d_out,
+  BW6761_scalar_t* d_coefficients,
+  BW6761_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  BW6761_scalar_t* coset_powers,
+  size_t device_id,
+  size_t stream);
+int evaluate_points_on_coset_cuda_bw6_761(
+  BW6761_projective_t* d_out,
+  BW6761_projective_t* d_coefficients,
+  BW6761_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  BW6761_scalar_t* coset_powers,
+  size_t device_id,
+  size_t stream);
+int evaluate_points_on_coset_batch_cuda_bw6_761(
+  BW6761_projective_t* d_out,
+  BW6761_projective_t* d_coefficients,
+  BW6761_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  BW6761_scalar_t* coset_powers,
+  size_t device_id,
+  size_t stream);
+
+int reverse_order_scalars_cuda_bw6_761(BW6761_scalar_t* arr, int n, size_t device_id, size_t stream);
+int reverse_order_scalars_batch_cuda_bw6_761(
+  BW6761_scalar_t* arr, int n, int batch_size, size_t device_id, size_t stream);
+int reverse_order_points_cuda_bw6_761(BW6761_projective_t* arr, int n, size_t device_id, size_t stream);
+int reverse_order_points_batch_cuda_bw6_761(
+  BW6761_projective_t* arr, int n, int batch_size, size_t device_id, size_t stream);
+int add_scalars_cuda_bw6_761(
+  BW6761_scalar_t* d_out, BW6761_scalar_t* d_in1, BW6761_scalar_t* d_in2, unsigned n, size_t stream);
+int sub_scalars_cuda_bw6_761(
+  BW6761_scalar_t* d_out, BW6761_scalar_t* d_in1, BW6761_scalar_t* d_in2, unsigned n, size_t stream);
+int to_montgomery_scalars_cuda_bw6_761(BW6761_scalar_t* d_inout, unsigned n, size_t stream);
+int from_montgomery_scalars_cuda_bw6_761(BW6761_scalar_t* d_inout, unsigned n, size_t stream);
+
+// points g1
+int to_montgomery_proj_points_cuda_bw6_761(BW6761_projective_t* d_inout, unsigned n, size_t stream);
+int from_montgomery_proj_points_cuda_bw6_761(BW6761_projective_t* d_inout, unsigned n, size_t stream);
+int to_montgomery_aff_points_cuda_bw6_761(BW6761_affine_t* d_inout, unsigned n, size_t stream);
+int from_montgomery_aff_points_cuda_bw6_761(BW6761_affine_t* d_inout, unsigned n, size_t stream);
+
+// points g2
+int to_montgomery_proj_points_g2_cuda_bw6_761(BW6761_g2_projective_t* d_inout, unsigned n, size_t stream);
+int from_montgomery_proj_points_g2_cuda_bw6_761(BW6761_g2_projective_t* d_inout, unsigned n, size_t stream);
+int to_montgomery_aff_points_g2_cuda_bw6_761(BW6761_g2_affine_t* d_inout, unsigned n, size_t stream);
+int from_montgomery_aff_points_g2_cuda_bw6_761(BW6761_g2_affine_t* d_inout, unsigned n, size_t stream);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _BW6761_NTT_H */
diff --git a/goicicle/curves/bw6761/include/projective.h b/goicicle/curves/bw6761/include/projective.h
new file mode 100644
index 000000000..74f347d24
--- /dev/null
+++ b/goicicle/curves/bw6761/include/projective.h
@@ -0,0 +1,50 @@
+
+	// Copyright 2023 Ingonyama
+	//
+	// Licensed under the Apache License, Version 2.0 (the "License");
+	// you may not use this file except in compliance with the License.
+	// You may obtain a copy of the License at
+	//
+	//     http://www.apache.org/licenses/LICENSE-2.0
+	//
+	// Unless required by applicable law or agreed to in writing, software
+	// distributed under the License is distributed on an "AS IS" BASIS,
+	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	// See the License for the specific language governing permissions and
+	// limitations under the License.
+	
+// Code generated by Ingonyama DO NOT EDIT
+
+#include <cuda.h>
+#include <stdbool.h>
+// projective.h
+
+// Include guard, matching the other BW6-761 headers, so that including this
+// file more than once does not repeat the typedefs and prototypes.
+#ifndef _BW6761_PROJECTIVE_H
+#define _BW6761_PROJECTIVE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Forward declarations only: the struct layouts are not defined in this
+// header, so callers handle these types through pointers.
+typedef struct BW6761_projective_t BW6761_projective_t;
+typedef struct BW6761_g2_projective_t BW6761_g2_projective_t;
+typedef struct BW6761_affine_t BW6761_affine_t;
+typedef struct BW6761_g2_affine_t BW6761_g2_affine_t;
+typedef struct BW6761_scalar_t BW6761_scalar_t;
+
+// G1 helpers.
+bool projective_is_on_curve_bw6_761(BW6761_projective_t* point1);
+
+int random_scalar_bw6_761(BW6761_scalar_t* out);
+int random_projective_bw6_761(BW6761_projective_t* out);
+BW6761_projective_t* projective_zero_bw6_761();
+int projective_to_affine_bw6_761(BW6761_affine_t* out, BW6761_projective_t* point1);
+int projective_from_affine_bw6_761(BW6761_projective_t* out, BW6761_affine_t* point1);
+
+// G2 helpers.
+int random_g2_projective_bw6_761(BW6761_g2_projective_t* out);
+int g2_projective_to_affine_bw6_761(BW6761_g2_affine_t* out, BW6761_g2_projective_t* point1);
+int g2_projective_from_affine_bw6_761(BW6761_g2_projective_t* out, BW6761_g2_affine_t* point1);
+bool g2_projective_is_on_curve_bw6_761(BW6761_g2_projective_t* point1);
+
+// Point equality for G1 and G2 projective points.
+bool eq_bw6_761(BW6761_projective_t* point1, BW6761_projective_t* point2);
+bool eq_g2_bw6_761(BW6761_g2_projective_t* point1, BW6761_g2_projective_t* point2);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _BW6761_PROJECTIVE_H */
diff --git a/goicicle/curves/bw6761/include/ve_mod_mult.h b/goicicle/curves/bw6761/include/ve_mod_mult.h
new file mode 100644
index 000000000..fbc2b5a8d
--- /dev/null
+++ b/goicicle/curves/bw6761/include/ve_mod_mult.h
@@ -0,0 +1,49 @@
+
+	// Copyright 2023 Ingonyama
+	//
+	// Licensed under the Apache License, Version 2.0 (the "License");
+	// you may not use this file except in compliance with the License.
+	// You may obtain a copy of the License at
+	//
+	//     http://www.apache.org/licenses/LICENSE-2.0
+	//
+	// Unless required by applicable law or agreed to in writing, software
+	// distributed under the License is distributed on an "AS IS" BASIS,
+	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	// See the License for the specific language governing permissions and
+	// limitations under the License.
+	
+// Code generated by Ingonyama DO NOT EDIT
+
+#include <stdbool.h>
+#include <stddef.h> // size_t
+#include <stdint.h> // int32_t
+#include <cuda.h>
+// ve_mod_mult.h
+
+#ifndef _BW6761_VEC_MULT_H
+#define _BW6761_VEC_MULT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Forward declarations only: the struct layouts are not defined in this
+// header, so callers handle these types through pointers.
+typedef struct BW6761_projective_t BW6761_projective_t;
+typedef struct BW6761_scalar_t BW6761_scalar_t;
+
+// Element-wise vector operations. Each function takes the element count and
+// a device id and returns an int32_t status code.
+int32_t vec_mod_mult_point_bw6_761(
+  BW6761_projective_t* inout, BW6761_scalar_t* scalar_vec, size_t n_elements, size_t device_id);
+int32_t vec_mod_mult_scalar_bw6_761(
+  BW6761_scalar_t* inout, BW6761_scalar_t* scalar_vec, size_t n_elements, size_t device_id);
+int32_t vec_mod_mult_device_scalar_bw6_761(
+  BW6761_scalar_t* inout, BW6761_scalar_t* scalar_vec, size_t n_elements, size_t device_id);
+int32_t matrix_vec_mod_mult_bw6_761(
+  BW6761_scalar_t* matrix_flattened,
+  BW6761_scalar_t* input,
+  BW6761_scalar_t* output,
+  size_t n_elements,
+  size_t device_id);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _BW6761_VEC_MULT_H */
diff --git a/goicicle/curves/bw6761/msm.go b/goicicle/curves/bw6761/msm.go
new file mode 100644
index 000000000..c0a39ffcd
--- /dev/null
+++ b/goicicle/curves/bw6761/msm.go
@@ -0,0 +1,209 @@
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by Ingonyama DO NOT EDIT
+
+package bw6761
+
+import (
+	"errors"
+	"fmt"
+	"unsafe"
+)
+
+// #cgo CFLAGS: -I./include/
+// #cgo CFLAGS: -I/usr/local/cuda/include
+// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbw6761
+// #include "msm.h"
+import "C"
+
+// Msm wraps the C function msm_cuda_bw6_761, writing the result into out.
+//
+// points and scalars must be non-empty and of equal length, and out must be
+// non-nil; otherwise an error is returned before any C call is made. A
+// non-zero status from the C call is also reported as an error. On success
+// the same out pointer is returned.
+func Msm(out *G1ProjectivePoint, points []G1PointAffine, scalars []G1ScalarField, device_id int) (*G1ProjectivePoint, error) {
+	if len(points) != len(scalars) {
+		return nil, errors.New("error on: len(points) != len(scalars)")
+	}
+
+	// Taking &points[0] on an empty slice would panic, so reject it up front.
+	if len(points) == 0 {
+		return nil, errors.New("points or scalars is empty")
+	}
+
+	// A nil out would be handed to C as a NULL destination.
+	if out == nil {
+		return nil, errors.New("out is nil")
+	}
+
+	pointsC := (*C.BW6761_affine_t)(unsafe.Pointer(&points[0]))
+	scalarsC := (*C.BW6761_scalar_t)(unsafe.Pointer(&scalars[0]))
+	outC := (*C.BW6761_projective_t)(unsafe.Pointer(out))
+	ret := C.msm_cuda_bw6_761(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id))
+
+	if ret != 0 {
+		return nil, fmt.Errorf("msm_cuda_bw6_761 returned error code: %d", ret)
+	}
+
+	return out, nil
+}
+
+// MsmG2 wraps the C function msm_g2_cuda_bw6_761, writing the result into
+// out.
+//
+// points and scalars must be non-empty and of equal length, and out must be
+// non-nil; otherwise an error is returned before any C call is made. A
+// non-zero status from the C call is also reported as an error. On success
+// the same out pointer is returned.
+func MsmG2(out *G2Point, points []G2PointAffine, scalars []G1ScalarField, device_id int) (*G2Point, error) {
+	if len(points) != len(scalars) {
+		return nil, errors.New("error on: len(points) != len(scalars)")
+	}
+
+	// Taking &points[0] on an empty slice would panic, so reject it up front.
+	if len(points) == 0 {
+		return nil, errors.New("points or scalars is empty")
+	}
+
+	// A nil out would be handed to C as a NULL destination.
+	if out == nil {
+		return nil, errors.New("out is nil")
+	}
+
+	pointsC := (*C.BW6761_g2_affine_t)(unsafe.Pointer(&points[0]))
+	scalarsC := (*C.BW6761_scalar_t)(unsafe.Pointer(&scalars[0]))
+	outC := (*C.BW6761_g2_projective_t)(unsafe.Pointer(out))
+
+	ret := C.msm_g2_cuda_bw6_761(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id))
+
+	if ret != 0 {
+		return nil, fmt.Errorf("msm_g2_cuda_bw6_761 returned error code: %d", ret)
+	}
+
+	return out, nil
+}
+
+// MsmBatch wraps msm_batch_cuda_bw6_761 and returns batchSize G1 results.
+//
+// The inputs are validated in this order: nil pointers, mismatched lengths,
+// empty slices, non-positive batchSize. The size of each individual MSM
+// passed to C is len(*points) / batchSize (integer division). A non-zero
+// status from the C call is returned as an error.
+func MsmBatch(points *[]G1PointAffine, scalars *[]G1ScalarField, batchSize, deviceId int) ([]G1ProjectivePoint, error) {
+	switch {
+	case points == nil || scalars == nil:
+		return nil, errors.New("points or scalars is nil")
+	case len(*points) != len(*scalars):
+		return nil, errors.New("error on: len(points) != len(scalars)")
+	case len(*points) == 0 || len(*scalars) == 0:
+		return nil, errors.New("points or scalars is empty")
+	case batchSize <= 0:
+		return nil, errors.New("error on: batchSize must be greater than zero")
+	}
+
+	// Every output slot starts out as the zero point.
+	results := make([]G1ProjectivePoint, batchSize)
+	for idx := range results {
+		var zeroPoint G1ProjectivePoint
+		zeroPoint.SetZero()
+		results[idx] = zeroPoint
+	}
+
+	status := C.msm_batch_cuda_bw6_761(
+		(*C.BW6761_projective_t)(unsafe.Pointer(&results[0])),
+		(*C.BW6761_affine_t)(unsafe.Pointer(&(*points)[0])),
+		(*C.BW6761_scalar_t)(unsafe.Pointer(&(*scalars)[0])),
+		C.size_t(batchSize),
+		C.size_t(len(*points)/batchSize),
+		C.size_t(deviceId),
+	)
+	if status != 0 {
+		return nil, fmt.Errorf("msm_batch_cuda_bw6_761 returned error code: %d", status)
+	}
+
+	return results, nil
+}
+
+// MsmG2Batch wraps msm_batch_g2_cuda_bw6_761 and returns batchSize G2
+// results.
+//
+// The inputs are validated in this order: nil pointers, mismatched lengths,
+// empty slices, non-positive batchSize. The size of each individual MSM
+// passed to C is len(*points) / batchSize (integer division). A non-zero
+// status from the C call is returned as an error.
+func MsmG2Batch(points *[]G2PointAffine, scalars *[]G1ScalarField, batchSize, deviceId int) ([]G2Point, error) {
+	// Check for nil pointers
+	if points == nil || scalars == nil {
+		return nil, errors.New("points or scalars is nil")
+	}
+
+	if len(*points) != len(*scalars) {
+		return nil, errors.New("error on: len(points) != len(scalars)")
+	}
+
+	// Check for empty slices (lengths are already known to be equal)
+	if len(*points) == 0 {
+		return nil, errors.New("points or scalars is empty")
+	}
+
+	// Check for zero or negative batchSize
+	if batchSize <= 0 {
+		return nil, errors.New("error on: batchSize must be greater than zero")
+	}
+
+	out := make([]G2Point, batchSize)
+
+	outC := (*C.BW6761_g2_projective_t)(unsafe.Pointer(&out[0]))
+	pointsC := (*C.BW6761_g2_affine_t)(unsafe.Pointer(&(*points)[0]))
+	scalarsC := (*C.BW6761_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
+	msmSizeC := C.size_t(len(*points) / batchSize)
+	deviceIdC := C.size_t(deviceId)
+	batchSizeC := C.size_t(batchSize)
+
+	ret := C.msm_batch_g2_cuda_bw6_761(outC, pointsC, scalarsC, batchSizeC, msmSizeC, deviceIdC)
+	if ret != 0 {
+		// Name the function that was actually called (the G2 variant).
+		return nil, fmt.Errorf("msm_batch_g2_cuda_bw6_761 returned error code: %d", ret)
+	}
+
+	return out, nil
+}
+
+// Commit forwards the given pointers to commit_cuda_bw6_761 together with
+// count and bucketFactor; the final C argument is fixed to 0.
+//
+// Returns 0 when the C call reports 0 and -1 for any other status.
+func Commit(d_out, d_scalars, d_points unsafe.Pointer, count, bucketFactor int) int {
+	status := C.commit_cuda_bw6_761(
+		(*C.BW6761_projective_t)(d_out),
+		(*C.BW6761_scalar_t)(d_scalars),
+		(*C.BW6761_affine_t)(d_points),
+		(C.size_t)(count),
+		C.uint(bucketFactor),
+		0,
+	)
+	if status == 0 {
+		return 0
+	}
+
+	return -1
+}
+
+// CommitG2 forwards the given pointers to commit_g2_cuda_bw6_761 together
+// with count and bucketFactor; the final C argument is fixed to 0.
+//
+// Returns 0 when the C call reports 0 and -1 for any other status.
+func CommitG2(d_out, d_scalars, d_points unsafe.Pointer, count, bucketFactor int) int {
+	status := C.commit_g2_cuda_bw6_761(
+		(*C.BW6761_g2_projective_t)(d_out),
+		(*C.BW6761_scalar_t)(d_scalars),
+		(*C.BW6761_g2_affine_t)(d_points),
+		(C.size_t)(count),
+		C.uint(bucketFactor),
+		0,
+	)
+	if status == 0 {
+		return 0
+	}
+
+	return -1
+}
+
+// CommitBatch forwards the given pointers to commit_batch_cuda_bw6_761
+// together with count and batch_size; the final C argument is fixed to 0.
+//
+// Returns 0 when the C call reports 0 and -1 for any other status.
+func CommitBatch(d_out, d_scalars, d_points unsafe.Pointer, count, batch_size int) int {
+	status := C.commit_batch_cuda_bw6_761(
+		(*C.BW6761_projective_t)(d_out),
+		(*C.BW6761_scalar_t)(d_scalars),
+		(*C.BW6761_affine_t)(d_points),
+		(C.size_t)(count),
+		(C.size_t)(batch_size),
+		0,
+	)
+	if status == 0 {
+		return 0
+	}
+
+	return -1
+}
+
+// CommitG2Batch forwards the given pointers to msm_batch_g2_cuda_bw6_761
+// together with count and batch_size; the final C argument is fixed to 0.
+//
+// Returns 0 when the C call reports 0 and -1 for any other status.
+//
+// NOTE(review): unlike Commit, CommitG2 and CommitBatch, this calls the
+// msm_batch_g2 entry point (the one MsmG2Batch uses with host slices) rather
+// than a commit_* function, and passes points before scalars. MsmG2Batch
+// passes (batchSize, msmSize) in the positions where this passes
+// (count, batch_size). Confirm against msm.h that this is intended.
+func CommitG2Batch(d_out, d_scalars, d_points unsafe.Pointer, count, batch_size int) int {
+	d_outC := (*C.BW6761_g2_projective_t)(d_out)
+	scalarsC := (*C.BW6761_scalar_t)(d_scalars)
+	pointsC := (*C.BW6761_g2_affine_t)(d_points)
+	countC := (C.size_t)(count)
+	batch_sizeC := (C.size_t)(batch_size)
+
+	ret := C.msm_batch_g2_cuda_bw6_761(d_outC, pointsC, scalarsC, countC, batch_sizeC, 0)
+
+	if ret != 0 {
+		return -1
+	}
+
+	return 0
+}
diff --git a/goicicle/curves/bw6761/msm_test.go b/goicicle/curves/bw6761/msm_test.go
new file mode 100644
index 000000000..53b70a4fc
--- /dev/null
+++ b/goicicle/curves/bw6761/msm_test.go
@@ -0,0 +1,367 @@
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by Ingonyama DO NOT EDIT
+
+package bw6761
+
+import (
+	"fmt"
+	"math"
+	"testing"
+	"time"
+	"unsafe"
+
+	"github.com/ingonyama-zk/icicle/goicicle"
+	"github.com/stretchr/testify/assert"
+)
+
+// GeneratePoints returns count affine G1 points for use in tests.
+//
+// Only 10 random points are generated; the slice is then doubled repeatedly
+// and cut down to count, so the result cycles through those 10 points.
+func GeneratePoints(count int) []G1PointAffine {
+	var points []G1PointAffine
+
+	// Seed the slice with 10 random points converted to affine form.
+	for seed := 0; seed < 10; seed++ {
+		var proj G1ProjectivePoint
+		proj.Random()
+
+		var aff G1PointAffine
+		aff.FromProjective(&proj)
+
+		points = append(points, aff)
+	}
+
+	// Number of doublings so that 10 * 2^doublings covers count.
+	doublings := int(math.Ceil(math.Log2(float64(count)) - math.Log2(10)))
+	for ; doublings > 0; doublings-- {
+		points = append(points, points...)
+	}
+
+	return points[:count]
+}
+
+// GeneratePointsProj returns count projective G1 points, each filled by its
+// own call to Random.
+func GeneratePointsProj(count int) []G1ProjectivePoint {
+	var points []G1ProjectivePoint
+
+	// Keep appending freshly randomized points until count is reached.
+	for len(points) < count {
+		var next G1ProjectivePoint
+		next.Random()
+		points = append(points, next)
+	}
+
+	return points
+}
+
+// GenerateScalars returns count scalars for use in tests.
+//
+// By default every scalar is random. When skewed is true and count exceeds
+// 1_200_000, the last 1_200_000 entries are instead 600_000 copies of one
+// random value, then 400_000 zeros, then 200_000 ones.
+func GenerateScalars(count int, skewed bool) []G1ScalarField {
+	var scalars []G1ScalarField
+
+	var rand, zero, one, randLarge G1ScalarField
+	zero.SetZero()
+	one.SetOne()
+	randLarge.Random()
+
+	useSkew := skewed && count > 1_200_000
+
+	// Leading run of independently random scalars.
+	randomCount := count
+	if useSkew {
+		randomCount = count - 1_200_000
+	}
+	for i := 0; i < randomCount; i++ {
+		rand.Random()
+		scalars = append(scalars, rand)
+	}
+
+	if useSkew {
+		// Fixed-size runs of repeated values form the skewed tail.
+		appendRepeated := func(value G1ScalarField, times int) {
+			for i := 0; i < times; i++ {
+				scalars = append(scalars, value)
+			}
+		}
+		appendRepeated(randLarge, 600_000)
+		appendRepeated(zero, 400_000)
+		appendRepeated(one, 200_000)
+	}
+
+	return scalars[:count]
+}
+
+// TestMSM runs Msm on 2^8 generated points and scalars and checks that no
+// error is returned and that the result is on the curve.
+func TestMSM(t *testing.T) {
+	fmt.Print() // this prevents the test from hanging. TODO: figure out why
+
+	logSizes := []int{8}
+	for _, logSize := range logSizes {
+		count := 1 << logSize
+
+		points := GeneratePoints(count)
+		fmt.Print("Finished generating points\n")
+		scalars := GenerateScalars(count, false)
+		fmt.Print("Finished generating scalars\n")
+
+		result := new(G1ProjectivePoint)
+		begin := time.Now()
+		_, err := Msm(result, points, scalars, 0) // non mont
+		elapsedMs := time.Since(begin).Milliseconds()
+		fmt.Printf("icicle MSM took: %d ms\n", elapsedMs)
+
+		assert.Equal(t, err, nil, "error should be nil")
+
+		assert.True(t, result.IsOnCurve())
+	}
+}
+
+// TestCommitMSM copies generated points and scalars to the device, runs
+// Commit on them, copies the result back and checks that it is on the curve.
+// The test stops immediately if any device allocation fails.
+func TestCommitMSM(t *testing.T) {
+	for _, v := range []int{8} {
+		count := 1<<v - 1
+		fmt.Print("Started generating points and scalars\n")
+		points := GeneratePoints(count)
+		scalars := GenerateScalars(count, false)
+		fmt.Print("Finished generating points and scalars\n")
+
+		// All byte sizes are derived from the Go types via unsafe.Sizeof.
+		var sizeOutD G1ProjectivePoint
+		outBytes := int(unsafe.Sizeof(sizeOutD))
+		out_d, err := goicicle.CudaMalloc(outBytes)
+		if err != nil {
+			t.Fatalf("CudaMalloc for output failed: %v", err)
+		}
+
+		var sizePoints G1PointAffine
+		pointsBytes := count * int(unsafe.Sizeof(sizePoints))
+		points_d, err := goicicle.CudaMalloc(pointsBytes)
+		if err != nil {
+			t.Fatalf("CudaMalloc for points failed: %v", err)
+		}
+		goicicle.CudaMemCpyHtoD[G1PointAffine](points_d, points, pointsBytes)
+
+		var sizeScalar G1ScalarField
+		scalarBytes := count * int(unsafe.Sizeof(sizeScalar))
+		scalars_d, err := goicicle.CudaMalloc(scalarBytes)
+		if err != nil {
+			t.Fatalf("CudaMalloc for scalars failed: %v", err)
+		}
+		goicicle.CudaMemCpyHtoD[G1ScalarField](scalars_d, scalars, scalarBytes)
+
+		startTime := time.Now()
+		e := Commit(out_d, scalars_d, points_d, count, 10)
+		fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds())
+
+		outHost := make([]G1ProjectivePoint, 1)
+		goicicle.CudaMemCpyDtoH[G1ProjectivePoint](outHost, out_d, outBytes)
+
+		assert.Equal(t, e, 0, "error should be 0")
+		assert.True(t, outHost[0].IsOnCurve())
+	}
+}
+
+func BenchmarkCommit(b *testing.B) {
+	LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26}
+
+	for _, logMsmSize := range LOG_MSM_SIZES {
+		msmSize := 1 << logMsmSize
+		points := GeneratePoints(msmSize)
+		scalars := GenerateScalars(msmSize, false)
+
+		out_d, _ := goicicle.CudaMalloc(96)
+
+		pointsBytes := msmSize * 64
+		points_d, _ := goicicle.CudaMalloc(pointsBytes)
+		goicicle.CudaMemCpyHtoD[G1PointAffine](points_d, points, pointsBytes)
+
+		scalarBytes := msmSize * 32
+		scalars_d, _ := goicicle.CudaMalloc(scalarBytes)
+		goicicle.CudaMemCpyHtoD[G1ScalarField](scalars_d, scalars, scalarBytes)
+
+		b.Run(fmt.Sprintf("MSM %d", logMsmSize), func(b *testing.B) {
+			for n := 0; n < b.N; n++ {
+				e := Commit(out_d, scalars_d, points_d, msmSize, 10)
+
+				assert.Equal(b, e, 0, "error should be 0")
+				outHost := make([]G1ProjectivePoint, 1)
+				goicicle.CudaMemCpyDtoH[G1ProjectivePoint](outHost, out_d, 288)
+				assert.True(b, outHost[0].IsOnCurve())
+				if e != 0 {
+					panic("Error occurred")
+				}
+			}
+		})
+	}
+}
+
+// TestBatchMSM runs MsmBatch for batch sizes 2^2 and 2^4 combined with MSM
+// sizes 2^4 and 2^6, checking the error, the number of results and that
+// every result is on the curve.
+func TestBatchMSM(t *testing.T) {
+	batchLogs := []int{2, 4}
+	msmLogs := []int{4, 6}
+
+	for _, batchLog := range batchLogs {
+		batchSize := 1 << batchLog
+
+		for _, msmLog := range msmLogs {
+			total := (1 << msmLog) * batchSize
+
+			points := GeneratePoints(total)
+			scalars := GenerateScalars(total, false)
+
+			results, err := MsmBatch(&points, &scalars, batchSize, 0)
+			if err != nil {
+				t.Errorf("MsmBatchBW6761 returned an error: %v", err)
+			}
+
+			if got := len(results); got != batchSize {
+				t.Errorf("Expected length %d, but got %d", batchSize, got)
+			}
+
+			for i := range results {
+				assert.True(t, results[i].IsOnCurve())
+			}
+		}
+	}
+}
+
+// BenchmarkMSM measures Msm with host-side inputs for MSM sizes 2^20
+// through 2^26, panicking if any call returns an error.
+func BenchmarkMSM(b *testing.B) {
+	for _, logSize := range []int{20, 21, 22, 23, 24, 25, 26} {
+		size := 1 << logSize
+		points := GeneratePoints(size)
+		scalars := GenerateScalars(size, false)
+
+		benchName := fmt.Sprintf("MSM %d", logSize)
+		b.Run(benchName, func(b *testing.B) {
+			for iter := 0; iter < b.N; iter++ {
+				result := new(G1ProjectivePoint)
+				if _, err := Msm(result, points, scalars, 0); err != nil {
+					panic("Error occurred")
+				}
+			}
+		})
+	}
+}
+
+// G2
+// GenerateG2Points returns count affine G2 points for use in tests.
+//
+// Only 10 random points are generated; the slice is then doubled repeatedly
+// and cut down to count, so the result cycles through those 10 points.
+func GenerateG2Points(count int) []G2PointAffine {
+	var points []G2PointAffine
+
+	// Seed the slice with 10 random points converted to affine form.
+	for seed := 0; seed < 10; seed++ {
+		fmt.Print() // this prevents the test from hanging. TODO: figure out why
+
+		var proj G2Point
+		proj.Random()
+
+		var aff G2PointAffine
+		aff.FromProjective(&proj)
+
+		points = append(points, aff)
+	}
+
+	// Number of doublings so that 10 * 2^doublings covers count.
+	doublings := int(math.Ceil(math.Log2(float64(count)) - math.Log2(10)))
+	for ; doublings > 0; doublings-- {
+		points = append(points, points...)
+	}
+
+	return points[:count]
+}
+
+// TestMsmG2BW6761 runs MsmG2 on 2^8 generated G2 points and scalars and
+// checks that no error is returned and that the result is on the curve.
+func TestMsmG2BW6761(t *testing.T) {
+	logSizes := []int{8}
+	for _, logSize := range logSizes {
+		count := 1 << logSize
+
+		points := GenerateG2Points(count)
+		fmt.Print("Finished generating points\n")
+		scalars := GenerateScalars(count, false)
+		fmt.Print("Finished generating scalars\n")
+
+		result := new(G2Point)
+		_, err := MsmG2(result, points, scalars, 0)
+
+		assert.Equal(t, err, nil, "error should be nil")
+		assert.True(t, result.IsOnCurve())
+	}
+}
+
+// BenchmarkMsmG2BW6761 measures MsmG2 with host-side inputs for MSM sizes
+// 2^20 through 2^26, panicking if any call returns an error.
+func BenchmarkMsmG2BW6761(b *testing.B) {
+	for _, logSize := range []int{20, 21, 22, 23, 24, 25, 26} {
+		size := 1 << logSize
+		points := GenerateG2Points(size)
+		scalars := GenerateScalars(size, false)
+
+		benchName := fmt.Sprintf("MSM G2 %d", logSize)
+		b.Run(benchName, func(b *testing.B) {
+			for iter := 0; iter < b.N; iter++ {
+				result := new(G2Point)
+				if _, err := MsmG2(result, points, scalars, 0); err != nil {
+					panic("Error occurred")
+				}
+			}
+		})
+	}
+}
+
+// TestCommitG2MSM copies generated G2 points and scalars to the device, runs
+// CommitG2 on them, copies the result back and checks that it is on the
+// curve. The test stops immediately if any device allocation fails.
+func TestCommitG2MSM(t *testing.T) {
+	for _, v := range []int{8} {
+		count := 1 << v
+
+		points := GenerateG2Points(count)
+		fmt.Print("Finished generating points\n")
+		scalars := GenerateScalars(count, false)
+		fmt.Print("Finished generating scalars\n")
+
+		var sizeCheckG2PointAffine G2PointAffine
+		inputPointsBytes := count * int(unsafe.Sizeof(sizeCheckG2PointAffine))
+
+		var sizeCheckG2Point G2Point
+		outBytes := int(unsafe.Sizeof(sizeCheckG2Point))
+		out_d, err := goicicle.CudaMalloc(outBytes)
+		if err != nil {
+			t.Fatalf("CudaMalloc for output failed: %v", err)
+		}
+
+		points_d, err := goicicle.CudaMalloc(inputPointsBytes)
+		if err != nil {
+			t.Fatalf("CudaMalloc for points failed: %v", err)
+		}
+		goicicle.CudaMemCpyHtoD[G2PointAffine](points_d, points, inputPointsBytes)
+
+		// Size the scalar buffer from the Go type instead of assuming 32
+		// bytes per scalar, so the whole slice is copied to the device.
+		var sizeCheckScalar G1ScalarField
+		scalarBytes := count * int(unsafe.Sizeof(sizeCheckScalar))
+		scalars_d, err := goicicle.CudaMalloc(scalarBytes)
+		if err != nil {
+			t.Fatalf("CudaMalloc for scalars failed: %v", err)
+		}
+		goicicle.CudaMemCpyHtoD[G1ScalarField](scalars_d, scalars, scalarBytes)
+
+		startTime := time.Now()
+		e := CommitG2(out_d, scalars_d, points_d, count, 10)
+		fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds())
+
+		outHost := make([]G2Point, 1)
+		goicicle.CudaMemCpyDtoH[G2Point](outHost, out_d, outBytes)
+
+		assert.Equal(t, e, 0, "error should be 0")
+		assert.Equal(t, len(outHost), 1)
+		result := outHost[0]
+
+		assert.True(t, result.IsOnCurve())
+	}
+}
+
+// TestBatchG2MSM runs MsmG2Batch for batch sizes 2^2 and 2^4 combined with
+// MSM sizes 2^4 and 2^6, checking the error, the number of results and that
+// every result is on the curve.
+func TestBatchG2MSM(t *testing.T) {
+	for _, batchPow2 := range []int{2, 4} {
+		for _, pow2 := range []int{4, 6} {
+			msmSize := 1 << pow2
+			batchSize := 1 << batchPow2
+			count := msmSize * batchSize
+
+			points := GenerateG2Points(count)
+			scalars := GenerateScalars(count, false)
+
+			pointsResults, e := MsmG2Batch(&points, &scalars, batchSize, 0)
+
+			if e != nil {
+				// Name the function under test (the G2 variant).
+				t.Errorf("MsmG2Batch returned an error: %v", e)
+			}
+
+			if len(pointsResults) != batchSize {
+				t.Errorf("Expected length %d, but got %d", batchSize, len(pointsResults))
+			}
+
+			for _, s := range pointsResults {
+				assert.True(t, s.IsOnCurve())
+			}
+		}
+	}
+}
diff --git a/goicicle/curves/bw6761/ntt.go b/goicicle/curves/bw6761/ntt.go
new file mode 100644
index 000000000..3cbb71f14
--- /dev/null
+++ b/goicicle/curves/bw6761/ntt.go
@@ -0,0 +1,222 @@
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by Ingonyama DO NOT EDIT
+
+package bw6761
+
+import (
+	"errors"
+	"fmt"
+	"unsafe"
+
+	"github.com/ingonyama-zk/icicle/goicicle"
+)
+
+// #cgo CFLAGS: -I./include/
+// #cgo CFLAGS: -I/usr/local/cuda/include
+// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbw6761
+// #include "ntt.h"
+import "C"
+
+// Integer selector constants.
+// NOTE(review): presumably DIF and DIT stand for decimation-in-frequency and
+// decimation-in-time; none of the three is referenced by the functions
+// visible in this file — confirm where they are consumed.
+const (
+	NONE = 0
+	DIF  = 1
+	DIT  = 2
+)
+
+// Ntt passes the slice behind scalars to ntt_cuda_bw6_761 along with its
+// length, the inverse flag and the device id, and returns the C status
+// converted to uint64. The slice must be non-empty, since the address of its
+// first element is taken.
+func Ntt(scalars *[]G1ScalarField, isInverse bool, deviceId int) uint64 {
+	data := *scalars
+
+	status := C.ntt_cuda_bw6_761(
+		(*C.BW6761_scalar_t)(unsafe.Pointer(&data[0])),
+		C.uint32_t(len(data)),
+		C.bool(isInverse),
+		C.size_t(deviceId),
+	)
+
+	return uint64(status)
+}
+
+// NttBatch passes the slice behind scalars to ntt_batch_cuda_bw6_761 along
+// with its total length, batchSize, the inverse flag and the device id, and
+// returns the C status converted to uint64. The slice must be non-empty,
+// since the address of its first element is taken.
+func NttBatch(scalars *[]G1ScalarField, isInverse bool, batchSize, deviceId int) uint64 {
+	data := *scalars
+
+	status := C.ntt_batch_cuda_bw6_761(
+		(*C.BW6761_scalar_t)(unsafe.Pointer(&data[0])),
+		C.uint32_t(len(data)),
+		C.uint32_t(batchSize),
+		C.bool(isInverse),
+		C.size_t(deviceId),
+	)
+
+	return uint64(status)
+}
+
+// EcNtt passes the projective points behind values to ecntt_cuda_bw6_761
+// along with their count, the inverse flag and the device id, and returns
+// the C status converted to uint64. The slice must be non-empty, since the
+// address of its first element is taken.
+func EcNtt(values *[]G1ProjectivePoint, isInverse bool, deviceId int) uint64 {
+	data := *values
+
+	status := C.ecntt_cuda_bw6_761(
+		(*C.BW6761_projective_t)(unsafe.Pointer(&data[0])),
+		C.uint32_t(len(data)),
+		C.bool(isInverse),
+		C.size_t(deviceId),
+	)
+
+	return uint64(status)
+}
+
+// EcNttBatch passes the projective points behind values to
+// ecntt_batch_cuda_bw6_761 along with their total count, batchSize, the
+// inverse flag and the device id, and returns the C status converted to
+// uint64. The slice must be non-empty, since the address of its first
+// element is taken.
+func EcNttBatch(values *[]G1ProjectivePoint, isInverse bool, batchSize, deviceId int) uint64 {
+	data := *values
+
+	status := C.ecntt_batch_cuda_bw6_761(
+		(*C.BW6761_projective_t)(unsafe.Pointer(&data[0])),
+		C.uint32_t(len(data)),
+		C.uint32_t(batchSize),
+		C.bool(isInverse),
+		C.size_t(deviceId),
+	)
+
+	return uint64(status)
+}
+
+// GenerateTwiddles calls build_domain_cuda_bw6_761 with the domain size, its
+// logarithm and the inverse flag; the two trailing C arguments are fixed to
+// 0. It returns the resulting pointer, or a nil pointer and an error when
+// the C call returns nil.
+func GenerateTwiddles(d_size int, log_d_size int, inverse bool) (up unsafe.Pointer, err error) {
+	domain := C.build_domain_cuda_bw6_761(
+		C.uint32_t(d_size),
+		C.uint32_t(log_d_size),
+		C.bool(inverse),
+		0,
+		0,
+	)
+
+	if domain != nil {
+		return unsafe.Pointer(domain), nil
+	}
+
+	err = errors.New("nullptr returned from generating twiddles")
+	return unsafe.Pointer(nil), err
+}
+
+// ReverseScalars reorders the len scalars at d_scalars in place by calling
+// reverse_order_scalars_cuda_bw6_761 with device id 0 and stream 0.
+// It returns (0, nil) on success and (-1, error) on a non-zero C status.
+func ReverseScalars(d_scalars unsafe.Pointer, len int) (int, error) {
+	status := C.reverse_order_scalars_cuda_bw6_761((*C.BW6761_scalar_t)(d_scalars), C.int(len), 0, 0)
+	if status == 0 {
+		return 0, nil
+	}
+
+	return -1, errors.New("reversing failed")
+}
+
+// Interpolate allocates size*48 bytes with goicicle.CudaMalloc and fills the
+// buffer via interpolate_scalars_cuda_bw6_761, or via
+// interpolate_scalars_on_coset_cuda_bw6_761 (which also receives
+// cosetPowers) when isCoset is true.
+//
+// It returns nil if the allocation fails. A non-zero status from the C call
+// is only printed; the output buffer is returned regardless.
+func Interpolate(scalars, twiddles, cosetPowers unsafe.Pointer, size int, isCoset bool) unsafe.Pointer {
+	buf, err := goicicle.CudaMalloc(size * 48)
+	if err != nil {
+		return nil
+	}
+
+	out := (*C.BW6761_scalar_t)(buf)
+	in := (*C.BW6761_scalar_t)(scalars)
+	tw := (*C.BW6761_scalar_t)(twiddles)
+	coset := (*C.BW6761_scalar_t)(cosetPowers)
+	n := C.uint(size)
+
+	var status C.int
+	switch {
+	case isCoset:
+		status = C.interpolate_scalars_on_coset_cuda_bw6_761(out, in, tw, n, coset, 0, 0)
+	default:
+		status = C.interpolate_scalars_cuda_bw6_761(out, in, tw, n, 0, 0)
+	}
+
+	if status != 0 {
+		fmt.Print("error interpolating")
+	}
+
+	return unsafe.Pointer(out)
+}
+
+// Evaluate writes into scalars_out by calling evaluate_scalars_cuda_bw6_761,
+// or evaluate_scalars_on_coset_cuda_bw6_761 (which also receives
+// coset_powers) when isCoset is true. The two trailing C arguments are fixed
+// to 0.
+//
+// Returns 0 on success. On a non-zero C status it prints a message and
+// returns -1.
+func Evaluate(scalars_out, scalars, twiddles, coset_powers unsafe.Pointer, scalars_size, twiddles_size int, isCoset bool) int {
+	scalars_outC := (*C.BW6761_scalar_t)(scalars_out)
+	scalarsC := (*C.BW6761_scalar_t)(scalars)
+	twiddlesC := (*C.BW6761_scalar_t)(twiddles)
+	coset_powersC := (*C.BW6761_scalar_t)(coset_powers)
+	sizeC := C.uint(scalars_size)
+	twiddlesC_size := C.uint(twiddles_size)
+
+	var ret C.int
+	if isCoset {
+		ret = C.evaluate_scalars_on_coset_cuda_bw6_761(scalars_outC, scalarsC, twiddlesC, twiddlesC_size, sizeC, coset_powersC, 0, 0)
+	} else {
+		ret = C.evaluate_scalars_cuda_bw6_761(scalars_outC, scalarsC, twiddlesC, twiddlesC_size, sizeC, 0, 0)
+	}
+
+	if ret != 0 {
+		// Report the operation that actually failed (evaluation).
+		fmt.Print("error evaluating")
+		return -1
+	}
+
+	return 0
+}
+
+// VecScalarAdd calls add_scalars_cuda_bw6_761 with in1_d as both the output
+// and the first input, so the result overwrites in1_d. The stream argument
+// is fixed to 0.
+//
+// Returns 0 on success. On a non-zero C status it prints a message and
+// returns -1.
+func VecScalarAdd(in1_d, in2_d unsafe.Pointer, size int) int {
+	dst := (*C.BW6761_scalar_t)(in1_d)
+	rhs := (*C.BW6761_scalar_t)(in2_d)
+
+	if status := C.add_scalars_cuda_bw6_761(dst, dst, rhs, C.uint(size), 0); status != 0 {
+		fmt.Print("error adding scalar vectors")
+		return -1
+	}
+
+	return 0
+}
+
+// VecScalarSub calls sub_scalars_cuda_bw6_761 with in1_d as both the output
+// and the first input, so the result overwrites in1_d. The stream argument
+// is fixed to 0.
+//
+// Returns 0 on success. On a non-zero C status it prints a message and
+// returns -1.
+func VecScalarSub(in1_d, in2_d unsafe.Pointer, size int) int {
+	dst := (*C.BW6761_scalar_t)(in1_d)
+	rhs := (*C.BW6761_scalar_t)(in2_d)
+
+	if status := C.sub_scalars_cuda_bw6_761(dst, dst, rhs, C.uint(size), 0); status != 0 {
+		fmt.Print("error subtracting scalar vectors")
+		return -1
+	}
+
+	return 0
+}
+
+// ToMontgomery calls to_montgomery_scalars_cuda_bw6_761 on the len scalars
+// at d_scalars, updating them in place. The stream argument is fixed to 0.
+// It returns (0, nil) on success and (-1, error) on a non-zero C status.
+func ToMontgomery(d_scalars unsafe.Pointer, len int) (int, error) {
+	scalarsC := (*C.BW6761_scalar_t)(d_scalars)
+	lenC := C.uint(len)
+	if success := C.to_montgomery_scalars_cuda_bw6_761(scalarsC, lenC, 0); success != 0 {
+		// Describe the operation that failed instead of "reversing".
+		return -1, errors.New("converting scalars to montgomery form failed")
+	}
+	return 0, nil
+}
+
+// FromMontgomery calls from_montgomery_scalars_cuda_bw6_761 on the len
+// scalars at d_scalars, updating them in place. The stream argument is fixed
+// to 0. It returns (0, nil) on success and (-1, error) on a non-zero C
+// status.
+func FromMontgomery(d_scalars unsafe.Pointer, len int) (int, error) {
+	scalarsC := (*C.BW6761_scalar_t)(d_scalars)
+	lenC := C.uint(len)
+	if success := C.from_montgomery_scalars_cuda_bw6_761(scalarsC, lenC, 0); success != 0 {
+		// Describe the operation that failed instead of "reversing".
+		return -1, errors.New("converting scalars from montgomery form failed")
+	}
+	return 0, nil
+}
+
+// AffinePointFromMontgomery calls from_montgomery_aff_points_cuda_bw6_761 on
+// the len affine points at d_points, updating them in place. The stream
+// argument is fixed to 0. It returns (0, nil) on success and (-1, error) on
+// a non-zero C status.
+func AffinePointFromMontgomery(d_points unsafe.Pointer, len int) (int, error) {
+	pointsC := (*C.BW6761_affine_t)(d_points)
+	lenC := C.uint(len)
+
+	if success := C.from_montgomery_aff_points_cuda_bw6_761(pointsC, lenC, 0); success != 0 {
+		// Describe the operation that failed instead of "reversing".
+		return -1, errors.New("converting affine points from montgomery form failed")
+	}
+	return 0, nil
+}
+
+// G2AffinePointFromMontgomery calls
+// from_montgomery_aff_points_g2_cuda_bw6_761 on the len G2 affine points at
+// d_points, updating them in place. The stream argument is fixed to 0. It
+// returns (0, nil) on success and (-1, error) on a non-zero C status.
+func G2AffinePointFromMontgomery(d_points unsafe.Pointer, len int) (int, error) {
+	pointsC := (*C.BW6761_g2_affine_t)(d_points)
+	lenC := C.uint(len)
+
+	if success := C.from_montgomery_aff_points_g2_cuda_bw6_761(pointsC, lenC, 0); success != 0 {
+		// Describe the operation that failed instead of "reversing".
+		return -1, errors.New("converting G2 affine points from montgomery form failed")
+	}
+	return 0, nil
+}
diff --git a/goicicle/curves/bw6761/ntt_test.go b/goicicle/curves/bw6761/ntt_test.go
new file mode 100644
index 000000000..be1579ba2
--- /dev/null
+++ b/goicicle/curves/bw6761/ntt_test.go
@@ -0,0 +1,148 @@
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by Ingonyama DO NOT EDIT
+
+package bw6761
+
+import (
+	"fmt"
+	"github.com/stretchr/testify/assert"
+	"reflect"
+	"testing"
+)
+
+func TestNttBW6761Batch(t *testing.T) {
+	count := 1 << 20
+	scalars := GenerateScalars(count, false)
+
+	nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
+	copy(nttResult, scalars)
+
+	assert.Equal(t, nttResult, scalars)
+	NttBatch(&nttResult, false, count, 0)
+	assert.NotEqual(t, nttResult, scalars)
+
+	assert.Equal(t, nttResult, nttResult)
+}
+
+func TestNttBW6761CompareToGnarkDIF(t *testing.T) {
+	count := 1 << 2
+	scalars := GenerateScalars(count, false)
+
+	nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
+	copy(nttResult, scalars)
+
+	assert.Equal(t, nttResult, scalars)
+	Ntt(&nttResult, false, 0)
+	assert.NotEqual(t, nttResult, scalars)
+
+	assert.Equal(t, nttResult, nttResult)
+}
+
+func TestINttBW6761CompareToGnarkDIT(t *testing.T) {
+	count := 1 << 3
+	scalars := GenerateScalars(count, false)
+
+	nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
+	copy(nttResult, scalars)
+
+	assert.Equal(t, nttResult, scalars)
+	Ntt(&nttResult, true, 0)
+	assert.NotEqual(t, nttResult, scalars)
+
+	assert.Equal(t, nttResult, nttResult)
+}
+
+func TestNttBW6761(t *testing.T) {
+	count := 1 << 3
+
+	scalars := GenerateScalars(count, false)
+
+	nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
+	copy(nttResult, scalars)
+
+	assert.Equal(t, nttResult, scalars)
+	Ntt(&nttResult, false, 0)
+	assert.NotEqual(t, nttResult, scalars)
+
+	inttResult := make([]G1ScalarField, len(nttResult))
+	copy(inttResult, nttResult)
+
+	assert.Equal(t, inttResult, nttResult)
+	Ntt(&inttResult, true, 0)
+	assert.Equal(t, inttResult, scalars)
+}
+
+// TestNttBatchBW6761 checks that one NttBatch call over 4 chunks of 2^5
+// scalars gives the same output as calling Ntt on each chunk separately.
+func TestNttBatchBW6761(t *testing.T) {
+	const (
+		count   = 1 << 5
+		batches = 4
+	)
+
+	scalars := GenerateScalars(count*batches, false)
+
+	// Independent copy of every count-sized chunk of the input.
+	scalarVecOfVec := make([][]G1ScalarField, 0)
+	for lo := 0; lo < count*batches; lo += count {
+		chunk := scalars[lo : lo+count]
+		batch := make([]G1ScalarField, len(chunk))
+		copy(batch, chunk)
+		scalarVecOfVec = append(scalarVecOfVec, batch)
+	}
+
+	// Batched transform over a copy of the whole input.
+	nttBatchResult := make([]G1ScalarField, len(scalars))
+	copy(nttBatchResult, scalars)
+	NttBatch(&nttBatchResult, false, count, 0)
+
+	// Per-chunk transform, each on its own clone.
+	var nttResultVecOfVec [][]G1ScalarField
+	for i, src := range scalarVecOfVec {
+		clone := make([]G1ScalarField, len(src))
+		copy(clone, src)
+		nttResultVecOfVec = append(nttResultVecOfVec, clone)
+
+		Ntt(&nttResultVecOfVec[i], false, 0)
+	}
+
+	assert.NotEqual(t, nttBatchResult, scalars)
+
+	// Each per-chunk result must match the matching window of the batch.
+	for i, single := range nttResultVecOfVec {
+		window := nttBatchResult[i*count : (i+1)*count]
+		if !reflect.DeepEqual(single, window) {
+			t.Errorf("ntt of vec of scalars not equal to intt of specific batch")
+		}
+	}
+}
+
+// BenchmarkNTT measures repeated forward Ntt calls on one buffer for sizes
+// 2^12, 2^15 and 2^20 through 2^26. The buffer is not reset between
+// iterations, so each call transforms the previous call's output.
+func BenchmarkNTT(b *testing.B) {
+	for _, logSize := range []int{12, 15, 20, 21, 22, 23, 24, 25, 26} {
+		size := 1 << logSize
+
+		benchName := fmt.Sprintf("NTT %d", logSize)
+		b.Run(benchName, func(b *testing.B) {
+			input := GenerateScalars(size, false)
+
+			nttResult := make([]G1ScalarField, len(input))
+			copy(nttResult, input)
+
+			for iter := 0; iter < b.N; iter++ {
+				Ntt(&nttResult, false, 0)
+			}
+		})
+	}
+}
diff --git a/goicicle/curves/bw6761/vec_mod.go b/goicicle/curves/bw6761/vec_mod.go
new file mode 100644
index 000000000..9b7932e40
--- /dev/null
+++ b/goicicle/curves/bw6761/vec_mod.go
@@ -0,0 +1,42 @@
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by Ingonyama DO NOT EDIT
+
+package bw6761
+
+// #cgo CFLAGS: -I./include/
+// #cgo CFLAGS: -I/usr/local/cuda/include
+// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbw6761
+// #include "ve_mod_mult.h"
+import "C"
+import (
+	"fmt"
+	"unsafe"
+)
+
+// VecScalarMulMod passes two scalar-vector pointers and their element count to
+// vec_mod_mult_device_scalar_bw6_761 and returns 0 on success or -1 on failure.
+func VecScalarMulMod(scalarVec1, scalarVec2 unsafe.Pointer, size int) int {
+	lhs := (*C.BW6761_scalar_t)(scalarVec1)
+	rhs := (*C.BW6761_scalar_t)(scalarVec2)
+
+	// The trailing 0 is forwarded unchanged; presumably a device id — TODO confirm.
+	if C.vec_mod_mult_device_scalar_bw6_761(lhs, rhs, C.size_t(size), 0) != 0 {
+		fmt.Print("error multiplying scalar vectors")
+		return -1
+	}
+
+	return 0
+}
diff --git a/goicicle/setup.sh b/goicicle/setup.sh
index 025942ef7..62f771b2c 100755
--- a/goicicle/setup.sh
+++ b/goicicle/setup.sh
@@ -10,6 +10,7 @@ fi
 TARGET_BN254="libbn254.so"
 TARGET_BLS12_381="libbls12_381.so"
 TARGET_BLS12_377="libbls12_377.so"
+TARGET_BW6_671="libbw6761.so" # file name must match the -lbw6761 linker flag used by the Go bindings
 
 MAKE_FAIL=0
 
@@ -23,6 +24,7 @@ fi
 TARGET_BN254_PATH=$(dirname "$(find `pwd` -name $TARGET_BN254 -print -quit)")/
 TARGET_BLS12_381_PATH=$(dirname "$(find `pwd` -name $TARGET_BLS12_381 -print -quit)")/
 TARGET_BLS12_377_PATH=$(dirname "$(find `pwd` -name $TARGET_BLS12_377 -print -quit)")/
+TARGET_BW6_671_PATH=$(dirname "$(find `pwd` -name $TARGET_BW6_671 -print -quit)")/
 
 
 if [[ "$TARGET_BLS12_377_PATH" != "" ]]; then
@@ -36,6 +38,11 @@ if [[ "$TARGET_BN254_PATH" != "" ]]; then
 fi
 
 if [[ "$TARGET_BLS12_381_PATH" != "" ]]; then
-    echo "BLS12_381_PATH found @ $TARGET_BLS12_381_PATH"
+    echo "BLS12_381 found @ $TARGET_BLS12_381_PATH"
     export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:$TARGET_BLS12_381_PATH
 fi
+
+if [[ "$TARGET_BW6_671_PATH" != "./" ]]; then # dirname of an empty find result is ".", so "not found" is "./"
+    echo "BW6_761 found @ $TARGET_BW6_671_PATH"
+    export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:$TARGET_BW6_671_PATH
+fi
diff --git a/goicicle/templates/curves/curves.go b/goicicle/templates/curves/curves.go
index 6847c8b48..535aef6d0 100644
--- a/goicicle/templates/curves/curves.go
+++ b/goicicle/templates/curves/curves.go
@@ -11,6 +11,16 @@ type Curve struct {
 	G2ElementSize      int
 }
 
+var BW6_761 = Curve{ // values substituted into the templates when generating the bw6761 package
+	PackageName:        "bw6761",
+	CurveNameUpperCase: "BW6761",
+	CurveNameLowerCase: "bw6_761",
+	SharedLib:          "-lbw6761", // linker flag: the built shared library must be named to match
+	ScalarSize:         12, // NOTE(review): unit (limb width) is not visible here; confirm
+	BaseSize:           24, // NOTE(review): same unit assumption as ScalarSize; confirm
+	G2ElementSize:      6, // NOTE(review): smaller than BaseSize; confirm this is intended
+}
+
 var BN_254 = Curve{
 	PackageName:        "bn254",
 	CurveNameUpperCase: "BN254",
diff --git a/goicicle/templates/curves/g2.go.tmpl b/goicicle/templates/curves/g2.go.tmpl
index b258d18f3..48be4fba0 100644
--- a/goicicle/templates/curves/g2.go.tmpl
+++ b/goicicle/templates/curves/g2.go.tmpl
@@ -33,14 +33,6 @@ func (p *G2Point) Random() *G2Point {
 	return p
 }
 
-func (p *G2Point) FromAffine(affine *G2PointAffine) *G2Point {
-	out := (*C.{{.CurveNameUpperCase}}_g2_projective_t)(unsafe.Pointer(p))
-	in := (*C.{{.CurveNameUpperCase}}_g2_affine_t)(unsafe.Pointer(affine))
-
-	C.g2_projective_from_affine_{{.CurveNameLowerCase}}(out, in)
-
-	return p
-}
 
 func (p *G2Point) Eq(pCompare *G2Point) bool {
 	// Cast *Point{{.CurveNameUpperCase}} to *C.{{.CurveNameUpperCase}}_projective_t
@@ -66,15 +58,13 @@ func (f *G2Element) ToBytesLe() []byte {
 	return bytes
 }
 
-func (p *G2PointAffine) ToProjective() G2Point {
-	return G2Point{
-		X: p.X,
-		Y: p.Y,
-		Z: ExtentionField{
-			A0: G2Element{1, 0, 0, 0},
-			A1: G2Element{0, 0, 0, 0},
-		},
-	}
+func (p *G2Point) FromAffine(affine *G2PointAffine) *G2Point { // passes p as the output of the C conversion routine; returns p
+	out := (*C.{{.CurveNameUpperCase}}_g2_projective_t)(unsafe.Pointer(p))
+	in := (*C.{{.CurveNameUpperCase}}_g2_affine_t)(unsafe.Pointer(affine))
+	// out and in are p and affine reinterpreted as the C structs, so no copy is made on the Go side.
+	C.g2_projective_from_affine_{{.CurveNameLowerCase}}(out, in)
+
+	return p
 }
 
 func (p *G2PointAffine) FromProjective(projective *G2Point) *G2PointAffine {
diff --git a/goicicle/templates/curves/g2_test.go.tmpl b/goicicle/templates/curves/g2_test.go.tmpl
index ff0341e57..3b90041f2 100644
--- a/goicicle/templates/curves/g2_test.go.tmpl
+++ b/goicicle/templates/curves/g2_test.go.tmpl
@@ -53,7 +53,8 @@ func TestG2ShouldConvertToProjective(t *testing.T) {
 	var pointAffine G2PointAffine
 	pointAffine.FromProjective(&pointProjective)
 
-	proj := pointAffine.ToProjective()
+	var proj G2Point
+	proj.FromAffine(&pointAffine)
 
 	assert.True(t, proj.IsOnCurve())
 	assert.True(t, pointProjective.Eq(&proj))
diff --git a/goicicle/templates/main.go b/goicicle/templates/main.go
index 43dbdccaa..570d927d8 100644
--- a/goicicle/templates/main.go
+++ b/goicicle/templates/main.go
@@ -31,9 +31,14 @@ func genMainFiles() {
 		{File: filepath.Join(baseDir, "bls12381", "g1.go"), Templates: []string{"g1.go.tmpl"}},
 	}
 
+	bw6761_entries := []bavard.Entry{
+		{File: filepath.Join(baseDir, "bw6761", "g1.go"), Templates: []string{"g1.go.tmpl"}},
+	}
+
 	assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./curves/", bls12377_entries...))
 	assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./curves/", bn254_entries...))
 	assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./curves/", bls12381_entries...))
+	assertNoError(bgen.Generate(config.BW6_761, config.BW6_761.PackageName, "./curves/", bw6761_entries...))
 
 	bn254_g2_entries := []bavard.Entry{
 		{File: filepath.Join(baseDir, "bn254", "g2.go"), Templates: []string{"g2.go.tmpl"}},
@@ -47,9 +52,15 @@ func genMainFiles() {
 		{File: filepath.Join(baseDir, "bls12381", "g2.go"), Templates: []string{"g2.go.tmpl"}},
 	}
 
+	bw6761_g2_entries := []bavard.Entry{
+		{File: filepath.Join(baseDir, "bw6761", "g2.go"), Templates: []string{"g2.go.tmpl"}},
+	}
+
 	assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./curves/", bls12377_g2_entries...))
 	assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./curves/", bn254_g2_entries...))
 	assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./curves/", bls12381_g2_entries...))
+	assertNoError(bgen.Generate(config.BW6_761, config.BW6_761.PackageName, "./curves/", bw6761_g2_entries...))
+
 	bn254_msm_entries := []bavard.Entry{
 		{File: filepath.Join(baseDir, "bn254", "msm.go"), Templates: []string{"msm.go.tmpl"}},
 	}
@@ -62,9 +73,14 @@ func genMainFiles() {
 		{File: filepath.Join(baseDir, "bls12381", "msm.go"), Templates: []string{"msm.go.tmpl"}},
 	}
 
+	bw6761_msm_entries := []bavard.Entry{
+		{File: filepath.Join(baseDir, "bw6761", "msm.go"), Templates: []string{"msm.go.tmpl"}},
+	}
+
 	assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./msm/", bls12377_msm_entries...))
 	assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./msm/", bn254_msm_entries...))
 	assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./msm/", bls12381_msm_entries...))
+	assertNoError(bgen.Generate(config.BW6_761, config.BW6_761.PackageName, "./msm/", bw6761_msm_entries...))
 
 	bn254_ntt_entries := []bavard.Entry{
 		{File: filepath.Join(baseDir, "bn254", "ntt.go"), Templates: []string{"ntt.go.tmpl"}},
@@ -78,9 +94,14 @@ func genMainFiles() {
 		{File: filepath.Join(baseDir, "bls12381", "ntt.go"), Templates: []string{"ntt.go.tmpl"}},
 	}
 
+	bw6761_ntt_entries := []bavard.Entry{
+		{File: filepath.Join(baseDir, "bw6761", "ntt.go"), Templates: []string{"ntt.go.tmpl"}},
+	}
+
 	assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./ntt/", bls12377_ntt_entries...))
 	assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./ntt/", bn254_ntt_entries...))
 	assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./ntt/", bls12381_ntt_entries...))
+	assertNoError(bgen.Generate(config.BW6_761, config.BW6_761.PackageName, "./ntt/", bw6761_ntt_entries...))
 
 	bn254_vec_mod_entries := []bavard.Entry{
 		{File: filepath.Join(baseDir, "bn254", "vec_mod.go"), Templates: []string{"vec_mod.go.tmpl"}},
@@ -94,9 +115,14 @@ func genMainFiles() {
 		{File: filepath.Join(baseDir, "bls12381", "vec_mod.go"), Templates: []string{"vec_mod.go.tmpl"}},
 	}
 
+	bw6761_vec_mod_entries := []bavard.Entry{
+		{File: filepath.Join(baseDir, "bw6761", "vec_mod.go"), Templates: []string{"vec_mod.go.tmpl"}},
+	}
+
 	assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./ops/", bls12377_vec_mod_entries...))
 	assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./ops/", bn254_vec_mod_entries...))
 	assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./ops/", bls12381_vec_mod_entries...))
+	assertNoError(bgen.Generate(config.BW6_761, config.BW6_761.PackageName, "./ops/", bw6761_vec_mod_entries...))
 
 	h_msm_bn254 := []bavard.Entry{
 		{File: filepath.Join(baseDir, "bn254", "include", "msm.h"), Templates: []string{"msm.h.tmpl"}},
@@ -110,9 +136,14 @@ func genMainFiles() {
 		{File: filepath.Join(baseDir, "bls12381", "include", "msm.h"), Templates: []string{"msm.h.tmpl"}},
 	}
 
+	h_msm_bw6761 := []bavard.Entry{
+		{File: filepath.Join(baseDir, "bw6761", "include", "msm.h"), Templates: []string{"msm.h.tmpl"}},
+	}
+
 	assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./hfiles/", h_msm_bls12377...))
 	assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./hfiles/", h_msm_bn254...))
 	assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./hfiles/", h_msm_bls12381...))
+	assertNoError(bgen.Generate(config.BW6_761, config.BW6_761.PackageName, "./hfiles/", h_msm_bw6761...))
 
 	h_ntt_bn254 := []bavard.Entry{
 		{File: filepath.Join(baseDir, "bn254", "include", "ntt.h"), Templates: []string{"ntt.h.tmpl"}},
@@ -126,9 +157,14 @@ func genMainFiles() {
 		{File: filepath.Join(baseDir, "bls12381", "include", "ntt.h"), Templates: []string{"ntt.h.tmpl"}},
 	}
 
+	h_ntt_bw6761 := []bavard.Entry{
+		{File: filepath.Join(baseDir, "bw6761", "include", "ntt.h"), Templates: []string{"ntt.h.tmpl"}},
+	}
+
 	assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./hfiles/", h_ntt_bls12377...))
 	assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./hfiles/", h_ntt_bn254...))
 	assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./hfiles/", h_ntt_bls12381...))
+	assertNoError(bgen.Generate(config.BW6_761, config.BW6_761.PackageName, "./hfiles/", h_ntt_bw6761...))
 
 	ve_mod_mult_h_bn254 := []bavard.Entry{
 		{File: filepath.Join(baseDir, "bn254", "include", "ve_mod_mult.h"), Templates: []string{"ve_mod_mult.h.tmpl"}},
@@ -142,9 +178,14 @@ func genMainFiles() {
 		{File: filepath.Join(baseDir, "bls12381", "include", "ve_mod_mult.h"), Templates: []string{"ve_mod_mult.h.tmpl"}},
 	}
 
+	ve_mod_mult_ht_bw6761 := []bavard.Entry{
+		{File: filepath.Join(baseDir, "bw6761", "include", "ve_mod_mult.h"), Templates: []string{"ve_mod_mult.h.tmpl"}},
+	}
+
 	assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./hfiles/", ve_mod_mult_h_bls12377...))
 	assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./hfiles/", ve_mod_mult_h_bn254...))
 	assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./hfiles/", ve_mod_mult_ht_bls12381...))
+	assertNoError(bgen.Generate(config.BW6_761, config.BW6_761.PackageName, "./hfiles/", ve_mod_mult_ht_bw6761...))
 
 	projective_bn254 := []bavard.Entry{
 		{File: filepath.Join(baseDir, "bn254", "include", "projective.h"), Templates: []string{"projective.h.tmpl"}},
@@ -158,9 +199,14 @@ func genMainFiles() {
 		{File: filepath.Join(baseDir, "bls12381", "include", "projective.h"), Templates: []string{"projective.h.tmpl"}},
 	}
 
+	projective_bw6761 := []bavard.Entry{
+		{File: filepath.Join(baseDir, "bw6761", "include", "projective.h"), Templates: []string{"projective.h.tmpl"}},
+	}
+
 	assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./hfiles/", projective_bls12377...))
 	assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./hfiles/", projective_bn254...))
 	assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./hfiles/", projective_bls12381...))
+	assertNoError(bgen.Generate(config.BW6_761, config.BW6_761.PackageName, "./hfiles/", projective_bw6761...))
 }
 
 func genTestFiles() {
@@ -177,9 +223,14 @@ func genTestFiles() {
 		{File: filepath.Join(baseDir, "bls12381", "g1_test.go"), Templates: []string{"g1_test.go.tmpl"}},
 	}
 
+	bw6761_entries := []bavard.Entry{
+		{File: filepath.Join(baseDir, "bw6761", "g1_test.go"), Templates: []string{"g1_test.go.tmpl"}},
+	}
+
 	assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./curves/", bls12377_entries...))
 	assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./curves/", bn254_entries...))
 	assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./curves/", bls12381_entries...))
+	assertNoError(bgen.Generate(config.BW6_761, config.BW6_761.PackageName, "./curves/", bw6761_entries...))
 
 	// G2 TESTS
 	bn254_entries_g2_test := []bavard.Entry{
@@ -194,9 +245,14 @@ func genTestFiles() {
 		{File: filepath.Join(baseDir, "bls12381", "g2_test.go"), Templates: []string{"g2_test.go.tmpl"}},
 	}
 
+	bw6761_entries_g2_test := []bavard.Entry{
+		{File: filepath.Join(baseDir, "bw6761", "g2_test.go"), Templates: []string{"g2_test.go.tmpl"}},
+	}
+
 	assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./curves/", bls12377_entries_g2_test...))
 	assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./curves/", bn254_entries_g2_test...))
 	assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./curves/", bls12381_entries_g2_test...))
+	assertNoError(bgen.Generate(config.BW6_761, config.BW6_761.PackageName, "./curves/", bw6761_entries_g2_test...))
 
 	// MSM TEST
 	bn254_entries_msm_test := []bavard.Entry{
@@ -211,9 +267,14 @@ func genTestFiles() {
 		{File: filepath.Join(baseDir, "bls12381", "msm_test.go"), Templates: []string{"msm_test.go.tmpl"}},
 	}
 
+	bw6761_entries_msm_test := []bavard.Entry{
+		{File: filepath.Join(baseDir, "bw6761", "msm_test.go"), Templates: []string{"msm_test.go.tmpl"}},
+	}
+
 	assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./msm/", bls12377_entries_msm_test...))
 	assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./msm/", bn254_entries_msm_test...))
 	assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./msm/", bls12381_entries_msm_test...))
+	assertNoError(bgen.Generate(config.BW6_761, config.BW6_761.PackageName, "./msm/", bw6761_entries_msm_test...))
 
 	// FFT TEST
 	bn254_entries_fft_test := []bavard.Entry{
@@ -228,9 +289,14 @@ func genTestFiles() {
 		{File: filepath.Join(baseDir, "bls12381", "ntt_test.go"), Templates: []string{"ntt_test.go.tmpl"}},
 	}
 
+	bw6761_entries_fft_test := []bavard.Entry{
+		{File: filepath.Join(baseDir, "bw6761", "ntt_test.go"), Templates: []string{"ntt_test.go.tmpl"}},
+	}
+
 	assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./ntt/", bls12377_entries_fft_test...))
 	assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./ntt/", bn254_entries_fft_test...))
 	assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./ntt/", bls12381_entries_fft_test...))
+	assertNoError(bgen.Generate(config.BW6_761, config.BW6_761.PackageName, "./ntt/", bw6761_entries_fft_test...))
 }
 
 func main() {
diff --git a/goicicle/templates/msm/msm_test.go.tmpl b/goicicle/templates/msm/msm_test.go.tmpl
index cef152d64..4f757ec33 100644
--- a/goicicle/templates/msm/msm_test.go.tmpl
+++ b/goicicle/templates/msm/msm_test.go.tmpl
@@ -161,7 +161,7 @@ func BenchmarkCommit(b *testing.B) {
 				e := Commit(out_d, scalars_d, points_d, msmSize, 10)
 
 				if e != 0 {
-					panic("Error occured")
+					panic("Error occurred")
 				}
 			}
 		})
@@ -208,7 +208,7 @@ func BenchmarkMSM(b *testing.B) {
 				_, e := Msm(out, points, scalars, 0)
 
 				if e != nil {
-					panic("Error occured")
+					panic("Error occurred")
 				}
 			}
 		})
@@ -270,7 +270,7 @@ func BenchmarkMsmG2{{.CurveNameUpperCase}}(b *testing.B) {
 				_, e := MsmG2(out, points, scalars, 0)
 
 				if e != nil {
-					panic("Error occured")
+					panic("Error occurred")
 				}
 			}
 		})
diff --git a/icicle/CMakeLists.txt b/icicle/CMakeLists.txt
index 526ceda1d..9e17f70f0 100644
--- a/icicle/CMakeLists.txt
+++ b/icicle/CMakeLists.txt
@@ -5,13 +5,54 @@ set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CUDA_STANDARD 17)
 set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
 set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
+
 # add the target cuda architectures
 # each additional architecture increases the compilation time and output file size
-if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
-    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
+if(${CMAKE_VERSION} VERSION_LESS "3.24.0")
+  set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
 else()
-    set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
-endif ()
+  # `native` needs a visible GPU at configure time, so count the GPUs that
+  # nvidia-smi reports and fall back to a fixed architecture when there are none.
+  set(DETECT_GPU_COUNT 0)
+  find_program(_nvidia_smi "nvidia-smi")
+
+  if(_nvidia_smi)
+    # execute nvidia-smi -L to get a short list of GPUs available
+    execute_process(COMMAND "${_nvidia_smi}" -L
+      OUTPUT_VARIABLE _nvidia_smi_out
+      RESULT_VARIABLE _nvidia_smi_ret
+      ERROR_QUIET
+      OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+    # process the stdout of nvidia-smi
+    if(_nvidia_smi_ret EQUAL 0)
+      # convert string with newlines to list of strings
+      string(REGEX REPLACE "\n" ";" _nvidia_smi_out "${_nvidia_smi_out}")
+
+      foreach(_line ${_nvidia_smi_out})
+        if(_line MATCHES "^GPU [0-9]+:")
+          math(EXPR DETECT_GPU_COUNT "${DETECT_GPU_COUNT}+1")
+
+          # the UUID is not very useful for the user, remove it
+          string(REGEX REPLACE " \\(UUID:.*\\)" "" _gpu_info "${_line}")
+
+          if(NOT _gpu_info STREQUAL "")
+            list(APPEND DETECT_GPU_INFO "${_gpu_info}")
+          endif()
+        endif()
+      endforeach()
+    endif()
+  endif()
+
+  if(DETECT_GPU_COUNT GREATER 0)
+    message(STATUS "Detected ${DETECT_GPU_COUNT} GPU(s): ${DETECT_GPU_INFO}")
+    set(CMAKE_CUDA_ARCHITECTURES native) # do native
+  else()
+    # no GPUs found, like on Github CI runners
+    set(CMAKE_CUDA_ARCHITECTURES 50) # some safe value
+  endif()
+endif()
+
 project(icicle LANGUAGES CUDA CXX)
 
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
@@ -20,7 +61,7 @@ set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
 
 
 # when adding a new curve/field, append its name to the end of this list
-set(SUPPORTED_CURVES bn254;bls12_381;bls12_377;bw6_671)
+set(SUPPORTED_CURVES bn254;bls12_381;bls12_377;bw6_761)
 
 set(IS_CURVE_SUPPORTED FALSE)
 set(I 0)
@@ -43,7 +84,6 @@ if (NOT BUILD_TESTS)
   add_library(
     icicle
     utils/error_handler.cu
-    utils/utils_kernels.cu
     utils/vec_ops.cu
     primitives/field.cu
     primitives/projective.cu
@@ -65,7 +105,10 @@ if (NOT BUILD_TESTS)
     TARGET icicle
     POST_BUILD
     COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym MSMCuda=${CURVE}MSMCuda ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/msm/msm.cu.o
-    COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym GetDefaultMSMConfig=${CURVE}GetDefaultMSMConfig ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/msm/msm.cu.o
+    COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym DefaultMSMConfig=${CURVE}DefaultMSMConfig ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/msm/msm.cu.o
+    COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym NTTCuda=${CURVE}NTTCuda ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/ntt/ntt.cu.o
+    COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym GetDefaultNTTConfig=${CURVE}DefaultNTTConfig ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/ntt/ntt.cu.o
+    COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym InitializeDomain=${CURVE}InitializeDomain ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/ntt/ntt.cu.o
     # COMMAND ${CMAKE_OBJCOPY} ARGS --prefix-symbols=${CURVE}_ ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/ntt/ntt.cu.o
     # COMMAND ${CMAKE_OBJCOPY} ARGS --prefix-symbols=${CURVE}_ ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/lde/lde.cu.o
     COMMAND ${CMAKE_AR} ARGS -rcs ${LIBRARY_OUTPUT_DIRECTORY}/libingo_${CURVE}.a 
@@ -73,7 +116,6 @@ if (NOT BUILD_TESTS)
     ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/utils/vec_ops.cu.o
     ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/primitives/field.cu.o
     ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/primitives/projective.cu.o
-    ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/utils/utils_kernels.cu.o 
     ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/msm/msm.cu.o 
     ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/ntt/ntt.cu.o
                                                                                  
diff --git a/icicle/README.md b/icicle/README.md
index 3d7b47807..1a0e3a4a5 100644
--- a/icicle/README.md
+++ b/icicle/README.md
@@ -10,7 +10,7 @@ mkdir -p build; cmake -S . -B build; cmake --build build; cd build && ctest; cd
 
 Before proceeding, make sure the following software installed:
 
-1. CMake at least version 3.16, which can be downloaded from [cmake.org](https://cmake.org/files/)
+1. CMake at least version 3.18, which can be downloaded from [cmake.org](https://cmake.org/files/)
    It is recommended to have the latest version installed.
 2. [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu) version 12.0 or newer.
 3. GCC - version 9 or newer recommended.
@@ -74,7 +74,7 @@ If the `update-alternatives` settings are broken, you can try to fix them with t
 
 If you encounter the error, check if the `$CUDA_HOME/bin/crt/link.stub` file is available.
 
-Othrewise create a symlink. For example, if the CUDA toolkit is installed with apt-get to the default path, you can create a symlink with the following command:
+Otherwise create a symlink. For example, if the CUDA toolkit is installed with apt-get to the default path, you can create a symlink with the following command:
 
 `ln -sf /usr/local/cuda-12.1/bin/crt/link.stub /usr/lib/nvidia-cuda-toolkit/bin/crt/link.stub`
 
diff --git a/icicle/appUtils/msm/msm.cu b/icicle/appUtils/msm/msm.cu
index 70b1dee7c..722c251c2 100644
--- a/icicle/appUtils/msm/msm.cu
+++ b/icicle/appUtils/msm/msm.cu
@@ -31,7 +31,7 @@ namespace msm {
     template <typename S>
     int get_optimal_c(int bitsize)
     {
-      return ceil(log2(bitsize)) - 4;
+      return max((int)ceil(log2(bitsize)) - 4, 1);
     }
 
     template <typename P>
@@ -904,13 +904,9 @@ namespace msm {
 
   } // namespace
 
-  MSMConfig DefaultMSMConfig()
+  extern "C" MSMConfig DefaultMSMConfig()
   {
-    device_context::DeviceContext ctx = {
-      0,               // device_id
-      (cudaStream_t)0, // stream
-      0,               // mempool
-    };
+    device_context::DeviceContext ctx = device_context::get_default_device_context();
     MSMConfig config = {
       false, // are_scalars_on_device
       false, // are_scalars_montgomery_form
@@ -925,7 +921,7 @@ namespace msm {
       false, // is_big_triangle
       10,    // large_bucket_factor
       false, // is_async
-      ctx,   // DeviceContext
+      ctx,   // ctx
     };
     return config;
   }
@@ -950,13 +946,7 @@ namespace msm {
   }
 
   /**
-   * Extern version of [DefaultMSMConfig](@ref DefaultMSMConfig) function.
-   * @return Default value of [MSMConfig](@ref MSMConfig).
-   */
-  extern "C" MSMConfig GetDefaultMSMConfig() { return DefaultMSMConfig(); }
-
-  /**
-   * Extern version of [MSM](@ref MSM) function with the following values of template parameters
+   * Extern "C" version of [MSM](@ref MSM) function with the following values of template parameters
    * (where the curve is given by `-DCURVE` env variable during build):
    *  - `S` is the [scalar field](@ref scalar_t) of the curve;
    *  - `A` is the [affine representation](@ref affine_t) of curve points;
@@ -977,7 +967,7 @@ namespace msm {
 #if defined(G2_DEFINED)
 
   /**
-   * Extern version of [MSM](@ref MSM) function with the following values of template parameters
+   * Extern "C" version of [MSM](@ref MSM) function with the following values of template parameters
    * (where the curve is given by `-DCURVE` env variable during build):
    *  - `S` is the [scalar field](@ref scalar_t) of the curve;
    *  - `A` is the [affine representation](@ref g2_affine_t) of G2 curve points;
diff --git a/icicle/appUtils/msm/msm.cuh b/icicle/appUtils/msm/msm.cuh
index 356033b1a..e1fcc9cce 100644
--- a/icicle/appUtils/msm/msm.cuh
+++ b/icicle/appUtils/msm/msm.cuh
@@ -33,47 +33,50 @@ namespace msm {
 
   /**
    * @struct MSMConfig
-   * Struct that encodes MSM parameters to be passed into the [msm](@ref msm) function.
+   * Struct that encodes MSM parameters to be passed into the [MSM](@ref MSM) function. The intended use of this struct
+   * is to create it using [DefaultMSMConfig](@ref DefaultMSMConfig) function and then you'll hopefully only need to 
+   * change a small number of default values for each of your MSMs.
    */
   struct MSMConfig {
-    bool
-      are_scalars_on_device; /**< True if scalars are on device and false if they're on host. Default value: false. */
+    bool are_scalars_on_device; /**< True if scalars are on device and false if they're on host. Default value: 
+                                 *   false. */
     bool are_scalars_montgomery_form; /**< True if scalars are in Montgomery form and false otherwise. Default value:
+                                       *   false. */
-    int points_size; /**< Number of points in the MSM. If a batch of MSMs needs to be computed, this should be a number
-                      *   of different points. So, if each MSM re-uses the same set of points, this variable is set
-                      * equal to the MSM size. And if every MSM uses a distinct set of points, it should be set to the
-                      * product of MSM size and [batch_size](@ref batch_size). Default value: 0 (meaning it's equal to
-                      * the MSM size). */
+                                       *   true. */
+    int points_size;           /**< Number of points in the MSM. If a batch of MSMs needs to be computed, this should be
+                                *   a number of different points. So, if each MSM re-uses the same set of points, this
+                                *   variable is set equal to the MSM size. And if every MSM uses a distinct set of
+                                *   points, it should be set to the product of MSM size and [batch_size](@ref
+                                *   batch_size). Default value: 0 (meaning it's equal to the MSM size). */
     int precompute_factor;     /**< The number of extra points to pre-compute for each point. Larger values decrease the
-                                * number of computations     to make, on-line memory footprint, but increase the static
-                                * memory     footprint. Default value: 1 (i.e. don't pre-compute). */
+                                *   number of computations to make, on-line memory footprint, but increase the static
+                                *   memory footprint. Default value: 1 (i.e. don't pre-compute). */
     bool are_points_on_device; /**< True if points are on device and false if they're on host. Default value: false. */
     bool are_points_montgomery_form; /**< True if coordinates of points are in Montgomery form and false otherwise.
-                                        Default value: true. */
+                                      *   Default value: true. */
     int batch_size;                  /**< The number of MSMs to compute. Default value: 1. */
     bool are_results_on_device; /**< True if the results should be on device and false if they should be on host. If set
-                                 * to false, `is_async` won't take effect because a synchronization is needed to
-                                 * transfer results to the host. Default value: false. */
-    int c;       /**< \f$ c \f$ value, or "window bitsize" which is the main parameter of the "bucket method"
-                  *   that we use to solve the MSM problem. As a rule of thumb, larger value means more on-line memory
-                  *   footprint but also more parallelism and less computational complexity (up to a certain point).
-                  *   Default value: 0 (the optimal value of \f$ c \f$ is chosen automatically). */
-    int bitsize; /**< Number of bits of the largest scalar. Typically equals the bitsize of scalar field, but if a
-                  * different (better) upper bound is known, it should be reflected in this variable. Default value: 0
-                  * (set to the bitsize of scalar field). */
-    bool is_big_triangle;    /**< Whether to do "bucket accumulation" serially. Decreases computational complexity, but
-                              * also greatly    decreases parallelism, so only suitable for large batches of MSMs. Default
-                              * value: false. */
-    int large_bucket_factor; /**< Variable that controls how sensitive the algorithm is to the buckets that occur very
-                              * frequently. Useful for efficient treatment of non-uniform distributions of scalars and
-                              * "top windows" with few bits. Can be set to 0 to disable separate treatment of large
-                              * buckets altogether. Default value: 10. */
-    int is_async; /**< Whether to run the MSM asyncronously. If set to `true`, the MSM function will be non-blocking
-                   *   and you'd need to synchronize it explicitly by running `cudaStreamSynchronize` or
-                   * `cudaDeviceSynchronize`. If set to false, the MSM function will block the current CPU thread. */
-    device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. See
-                                          [DeviceContext](@ref `device_context::DeviceContext`). */
+                                 *   to false, `is_async` won't take effect because a synchronization is needed to
+                                 *   transfer results to the host. Default value: false. */
+    int c;                      /**< \f$ c \f$ value, or "window bitsize" which is the main parameter of the "bucket
+                                 *   method" that we use to solve the MSM problem. As a rule of thumb, larger value
+                                 *   means more on-line memory footprint but also more parallelism and less computational
+                                 *   complexity (up to a certain point). Default value: 0 (the optimal value of \f$ c \f$
+                                 *   is chosen automatically). */
+    int bitsize;                /**< Number of bits of the largest scalar. Typically equals the bitsize of scalar field,
+                                 *   but if a different (better) upper bound is known, it should be reflected in this
+                                 *   variable. Default value: 0 (set to the bitsize of scalar field). */
+    bool is_big_triangle;       /**< Whether to do "bucket accumulation" serially. Decreases computational complexity
+                                 *   but also greatly decreases parallelism, so only suitable for large batches of MSMs.
+                                 *   Default value: false. */
+    int large_bucket_factor;    /**< Variable that controls how sensitive the algorithm is to the buckets that occur
+                                 *   very frequently. Useful for efficient treatment of non-uniform distributions of
+                                 *   scalars and "top windows" with few bits. Can be set to 0 to disable separate
+                                 *   treatment of large buckets altogether. Default value: 10. */
+    bool is_async;              /**< Whether to run the MSM asynchronously. If set to true, the MSM function will be
+                                 *   non-blocking and you'd need to synchronize it explicitly by running
+                                 *   `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the MSM
+                                 *   function will block the current CPU thread. */
+    device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. */
   };
 
   /**
@@ -89,8 +92,8 @@ namespace msm {
    * So, if for example all MSMs share the same base points, they can be repeated only once.
    * @param msm_size MSM size \f$ N \f$. If a batch of MSMs (which all need to have the same size) is computed, this is
    * the size of 1 MSM.
-   * @param results Result (or results in the case of batch MSM).
    * @param config [MSMConfig](@ref MSMConfig) used in this MSM.
+   * @param results Buffer for the result (or results in the case of batch MSM).
    * @tparam S Scalar field type.
    * @tparam A The type of points \f$ \{P_i\} \f$ which is typically an [affine
    * Weierstrass](https://hyperelliptic.org/EFD/g1p/auto-shortw.html) point.
@@ -98,10 +101,6 @@ namespace msm {
    * Weierstrass](https://hyperelliptic.org/EFD/g1p/auto-shortw-projective.html) point in our codebase.
    * @return `cudaSuccess` if the execution was successful and an error code otherwise.
    *
-   * This function is asyncronous, and to sync it with host, you need to call `cudaDeviceSyncronize()`. To syncronize
-   * with a different stream `stream1`, call `cudaStreamSynchronize(config.stream)` and
-   * `cudaStreamSynchronize(stream1)`.
-   *
    * **Note:** this function is still WIP and the following [MSMConfig](@ref MSMConfig) members do not yet have any
    * effect: `points_size` (it's always equal to the msm size currenly), `precompute_factor` (always equals 1) and
    * `ctx.device_id` (0 device is always used). Also, it's currently better to use `batch_size=1` in most cases (expept
diff --git a/icicle/appUtils/ntt/ntt.cu b/icicle/appUtils/ntt/ntt.cu
index 66016f208..e2acbfec1 100644
--- a/icicle/appUtils/ntt/ntt.cu
+++ b/icicle/appUtils/ntt/ntt.cu
@@ -1,5 +1,8 @@
 #include "ntt.cuh"
 
+#include <vector>
+#include <unordered_map>
+
 #include "../../curves/curve_config.cuh"
 #include "../../utils/sharedmem.cuh"
 #include "../../utils/utils_kernels.cuh"
@@ -42,59 +45,59 @@ namespace ntt {
     }
 
     /**
-     * Bit-reverses a batch of input arrays in-place inside GPU.
+     * Bit-reverses a batch of input arrays out-of-place inside GPU.
      * for example: on input array ([a[0],a[1],a[2],a[3]], 4, 2) it returns
      * [a[0],a[3],a[2],a[1]] (elements at indices 3 and 1 swhich places).
-     * @param arr batch of arrays of some object of type T. Should be on GPU.
+     * @param arr_in batch of arrays of some object of type T. Should be on GPU.
      * @param n length of `arr`.
      * @param logn log(n).
      * @param batch_size the size of the batch.
+     * @param arr_out buffer of the same size as `arr_in` on the GPU to write the bit-permuted array into.
      */
     template <typename E>
-    void reverse_order_batch(E* arr, uint32_t n, uint32_t logn, uint32_t batch_size, cudaStream_t stream)
+    void reverse_order_batch(E* arr_in, uint32_t n, uint32_t logn, uint32_t batch_size, cudaStream_t stream, E* arr_out)
     {
-      E* arr_reversed;
-      cudaMallocAsync(&arr_reversed, n * batch_size * sizeof(E), stream);
       int number_of_threads = MAX_THREADS_BATCH;
       int number_of_blocks = (n * batch_size + number_of_threads - 1) / number_of_threads;
-      reverse_order_kernel<<<number_of_blocks, number_of_threads, 0, stream>>>(arr, arr_reversed, n, logn, batch_size);
-      cudaMemcpyAsync(arr, arr_reversed, n * batch_size * sizeof(E), cudaMemcpyDefault, stream);
-      cudaFreeAsync(arr_reversed, stream);
+      reverse_order_kernel<<<number_of_blocks, number_of_threads, 0, stream>>>(arr_in, arr_out, n, logn, batch_size);
     }
 
     /**
-     * Bit-reverses an input array in-place inside GPU.
+     * Bit-reverses an input array out-of-place inside GPU.
      * for example: on array ([a[0],a[1],a[2],a[3]], 4, 2) it returns
      * [a[0],a[3],a[2],a[1]] (elements at indices 3 and 1 swhich places).
-     * @param arr array of some object of type T of size which is a power of 2. Should be on GPU.
+     * @param arr_in array of some object of type T of size which is a power of 2. Should be on GPU.
      * @param n length of `arr`.
      * @param logn log(n).
+     * @param arr_out buffer of the same size as `arr_in` on the GPU to write the bit-permuted array into.
      */
     template <typename E>
-    void reverse_order(E* arr, uint32_t n, uint32_t logn, cudaStream_t stream)
+    void reverse_order(E* arr_in, uint32_t n, uint32_t logn, cudaStream_t stream, E* arr_out)
     {
-      reverse_order_batch(arr, n, logn, 1, stream);
+      reverse_order_batch(arr_in, n, logn, 1, stream, arr_out);
     }
 
     /**
      * Cooley-Tuckey NTT.
      * NOTE! this function assumes that d_twiddles are located in the device memory.
-     * @param arr input array of type E (elements).
+     * @param arr_in input array of type E (elements).
      * @param n length of d_arr.
      * @param twiddles twiddle factors of type S (scalars) array allocated on the device memory (must be a power of 2).
-     * @param n_twiddles length of twiddles.
+     * @param n_twiddles length of twiddles, should be negative for intt.
      * @param max_task max count of parallel tasks.
      * @param s log2(n) loop index.
+     * @param arr_out buffer for the output.
      */
     template <typename E, typename S>
     __global__ void ntt_template_kernel_shared_rev(
-      E* __restrict__ arr_g,
-      uint32_t n,
+      E* __restrict__ arr_in,
+      int n,
       const S* __restrict__ r_twiddles,
-      uint32_t n_twiddles,
-      uint32_t max_task,
-      uint32_t ss,
-      uint32_t logn)
+      int n_twiddles,
+      int max_task,
+      int ss,
+      int logn,
+      E* __restrict__ arr_out)
     {
       SharedMemory<E> smem;
       E* arr = smem.getPointer();
@@ -128,13 +131,13 @@ namespace ntt {
             uint32_t oij = i + j;
             uint32_t k = oij + shift_s;
 
-            S tw = r_twiddles[j * n_twiddles_div];
+            S tw = *(r_twiddles + (int)(j * n_twiddles_div));
 
-            E u = is_beginning ? arr_g[offset + oij] : arr[oij];
-            E v = is_beginning ? arr_g[offset + k] : arr[k];
+            E u = is_beginning ? arr_in[offset + oij] : arr[oij];
+            E v = is_beginning ? arr_in[offset + k] : arr[k];
             if (is_end) {
-              arr_g[offset + oij] = u + v;
-              arr_g[offset + k] = tw * (u - v);
+              arr_out[offset + oij] = u + v;
+              arr_out[offset + k] = tw * (u - v);
             } else {
               arr[oij] = u + v;
               arr[k] = tw * (u - v);
@@ -149,22 +152,24 @@ namespace ntt {
     /**
      * Cooley-Tuckey NTT.
      * NOTE! this function assumes that d_twiddles are located in the device memory.
-     * @param arr input array of type E (elements).
+     * @param arr_in input array of type E (elements).
      * @param n length of d_arr.
      * @param twiddles twiddle factors of type S (scalars) array allocated on the device memory (must be a power of 2).
-     * @param n_twiddles length of twiddles.
+     * @param n_twiddles length of twiddles, should be negative for intt.
      * @param max_task max count of parallel tasks.
      * @param s log2(n) loop index.
+     * @param arr_out buffer for the output.
      */
     template <typename E, typename S>
     __global__ void ntt_template_kernel_shared(
-      E* __restrict__ arr_g,
-      uint32_t n,
+      E* __restrict__ arr_in,
+      int n,
       const S* __restrict__ r_twiddles,
-      uint32_t n_twiddles,
-      uint32_t max_task,
-      uint32_t s,
-      uint32_t logn)
+      int n_twiddles,
+      int max_task,
+      int s,
+      int logn,
+      E* __restrict__ arr_out)
     {
       SharedMemory<E> smem;
       E* arr = smem.getPointer();
@@ -194,14 +199,14 @@ namespace ntt {
             uint32_t i = ((l >> s) * shift2_s) & (n - 1); // (..) % n (assuming n is power of 2)
             uint32_t oij = i + j;
             uint32_t k = oij + shift_s;
-            S tw = r_twiddles[j * n_twiddles_div];
+            S tw = *(r_twiddles + (int)(j * n_twiddles_div));
 
-            E u = s == 0 ? arr_g[offset + oij] : arr[oij];
-            E v = s == 0 ? arr_g[offset + k] : arr[k];
+            E u = s == 0 ? arr_in[offset + oij] : arr[oij];
+            E v = s == 0 ? arr_in[offset + k] : arr[k];
             v = tw * v;
             if (s == (logn - 1)) {
-              arr_g[offset + oij] = u + v;
-              arr_g[offset + k] = u - v;
+              arr_out[offset + oij] = u + v;
+              arr_out[offset + k] = u - v;
             } else {
               arr[oij] = u + v;
               arr[k] = u - v;
@@ -219,13 +224,13 @@ namespace ntt {
      * @param arr input array of type E (elements).
      * @param n length of d_arr.
      * @param twiddles twiddle factors of type S (scalars) array allocated on the device memory (must be a power of 2).
-     * @param n_twiddles length of twiddles.
+     * @param n_twiddles length of twiddles, should be negative for intt.
      * @param max_task max count of parallel tasks.
      * @param s log2(n) loop index.
      */
     template <typename E, typename S>
     __global__ void
-    ntt_template_kernel(E* arr, uint32_t n, S* twiddles, uint32_t n_twiddles, uint32_t max_task, uint32_t s, bool rev)
+    ntt_template_kernel(E* arr_in, int n, S* twiddles, int n_twiddles, int max_task, int s, bool rev, E* arr_out)
     {
       int task = blockIdx.x;
       int chunks = n / (blockDim.x * 2);
@@ -248,15 +253,15 @@ namespace ntt {
           uint32_t i = ((l >> s) * shift2_s) & (n - 1); // (..) % n (assuming n is power of 2)
           uint32_t k = i + j + shift_s;
 
-          S tw = twiddles[j * n_twiddles_div];
+          S tw = *(twiddles + (int)(j * n_twiddles_div));
 
           uint32_t offset = (task / chunks) * n;
-          E u = arr[offset + i + j];
-          E v = arr[offset + k];
+          E u = arr_in[offset + i + j];
+          E v = arr_in[offset + k];
           if (!rev) v = tw * v;
-          arr[offset + i + j] = u + v;
+          arr_out[offset + i + j] = u + v;
           v = u - v;
-          arr[offset + k] = rev ? tw * v : v;
+          arr_out[offset + k] = rev ? tw * v : v;
         }
       }
     }
@@ -264,29 +269,32 @@ namespace ntt {
     /**
      * NTT/INTT inplace batch
      * Note: this function does not preform any bit-reverse permutations on its inputs or outputs.
-     * @param d_inout Array for inplace processing
-     * @param d_twiddles
-     * @param n Length of `d_twiddles` array
+     * @param d_input Input array; only the coset multiplication or the very first butterfly stage reads it
+     * @param n Size of a single NTT; `d_input` holds `n` * `batch_size` elements
+     * @param d_twiddles Twiddles; for iNTT the pointer is advanced by `n_twiddles` internally
+     * @param n_twiddles Size of `d_twiddles`; negated internally for iNTT
      * @param batch_size The size of the batch; the length of `d_inout` is `n` * `batch_size`.
      * @param inverse true for iNTT
-     * @param is_coset true for multiplication by coset
-     * @param coset should be array of lenght n - or in case of lesser than n, right-padded with zeroes
+     * @param coset_gen_index index of the coset generator among the twiddles; a value <= 0 means no coset is used
      * @param stream CUDA stream
-     * @param is_sync_needed do perform sync of the supplied CUDA stream at the end of processing
+     * @param is_async if false, perform sync of the supplied CUDA stream at the end of processing
+     * @param d_output Output array of `n` * `batch_size` elements; every kernel launched here writes into it
      */
     template <typename E, typename S>
     void ntt_inplace_batch_template(
-      E* d_inout,
+      E* d_input,
+      int n,
       S* d_twiddles,
-      unsigned n,
-      unsigned batch_size,
+      int n_twiddles,
+      int batch_size,
+      int logn, // number of butterfly stages; the caller passes log2(n)
       bool inverse,
-      bool is_coset,
-      S* coset,
+      bool ct_butterfly, // true: stages run s = 0..logn-1 with rev = false; false: reversed order with rev = true
+      int coset_gen_index,
       cudaStream_t stream,
-      bool is_sync_needed)
+      bool is_async,
+      E* d_output)
     {
-      const int logn = int(log(n) / log(2));
       bool is_shared_mem_enabled = sizeof(E) <= MAX_SHARED_MEM_ELEMENT_SIZE;
       const int log2_shmem_elems = is_shared_mem_enabled ? int(log(int(MAX_SHARED_MEM / sizeof(E))) / log(2)) : logn;
       int num_threads = max(min(min(n / 2, MAX_THREADS_BATCH), 1 << (log2_shmem_elems - 1)), 1);
@@ -297,215 +305,231 @@ namespace ntt {
                                                           // less then max to allow more concurrent blocks on SM
       const int logn_shmem = is_shared_mem_enabled ? int(log(2 * num_threads) / log(2))
                                                    : 0; // TODO: shared memory support only for types <= 32 bytes
+      int num_threads_coset = max(min(n / 2, MAX_NUM_THREADS), 1);
+      int num_blocks_coset = (n * batch_size + num_threads_coset - 1) / num_threads_coset;
 
       if (inverse) {
+        d_twiddles = d_twiddles + n_twiddles; // iNTT: the kernels expect the shifted pointer and a negative n_twiddles
+        n_twiddles = -n_twiddles;
+      }
+
+      bool is_on_coset = (coset_gen_index > 0);
+      bool direct_coset = (!inverse && is_on_coset); // forward coset NTT: multiply first, writing into d_output
+      if (direct_coset)
+        utils_internal::BatchMulKernel<E, S>
+          <<<num_blocks_coset, num_threads_coset, 0, stream>>>(d_input, n, batch_size, d_twiddles, coset_gen_index, n_twiddles, d_output);
+
+      if (ct_butterfly) {
         if (is_shared_mem_enabled)
           ntt_template_kernel_shared<<<num_blocks, num_threads, shared_mem, stream>>>(
-            d_inout, 1 << logn_shmem, d_twiddles, n, total_tasks, 0, logn_shmem);
+            direct_coset ? d_output : d_input, 1 << logn_shmem, d_twiddles, n_twiddles, total_tasks, 0, logn_shmem, d_output);
 
         for (int s = logn_shmem; s < logn; s++) // TODO: this loop also can be unrolled
         {
-          ntt_template_kernel<E, S>
-            <<<num_blocks, num_threads, 0, stream>>>(d_inout, n, d_twiddles, n, total_tasks, s, false);
+          ntt_template_kernel<E, S><<<num_blocks, num_threads, 0, stream>>>( // only stage 0 without a coset may read d_input
+            (direct_coset || (s > 0)) ? d_output : d_input, n, d_twiddles, n_twiddles, total_tasks, s, false, d_output);
         }
-
-        if (is_coset)
-          utils_internal::BatchMulKernel<E, S><<<num_blocks, num_threads, 0, stream>>>(d_inout, coset, n, batch_size);
-
-        num_threads = max(min(n / 2, MAX_NUM_THREADS), 1);
-        num_blocks = (n * batch_size + num_threads - 1) / num_threads;
-        utils_internal::NormalizeKernel<E, S>
-          <<<num_blocks, num_threads, 0, stream>>>(d_inout, S::inv_log_size(logn), n * batch_size);
       } else {
-        if (is_coset)
-          utils_internal::BatchMulKernel<E, S><<<num_blocks, num_threads, 0, stream>>>(d_inout, coset, n, batch_size);
-
         for (int s = logn - 1; s >= logn_shmem; s--) // TODO: this loop also can be unrolled
         {
-          ntt_template_kernel<<<num_blocks, num_threads, 0, stream>>>(d_inout, n, d_twiddles, n, total_tasks, s, true);
+          ntt_template_kernel<<<num_blocks, num_threads, 0, stream>>>(
+            (direct_coset || (s < logn - 1)) ? d_output : d_input, n, d_twiddles, n_twiddles, total_tasks, s, true, d_output);
         }
 
         if (is_shared_mem_enabled)
           ntt_template_kernel_shared_rev<<<num_blocks, num_threads, shared_mem, stream>>>(
-            d_inout, 1 << logn_shmem, d_twiddles, n, total_tasks, 0, logn_shmem);
+            (direct_coset || (logn > logn_shmem)) ? d_output : d_input, 1 << logn_shmem, d_twiddles,
+            n_twiddles, total_tasks, 0, logn_shmem, d_output);
       }
 
-      if (!is_sync_needed) return;
+      if (inverse) {
+        if (is_on_coset)
+          utils_internal::BatchMulKernel<E, S>
+            <<<num_blocks_coset, num_threads_coset, 0, stream>>>(d_output, n, batch_size, d_twiddles, -coset_gen_index, -n_twiddles, d_output);
+
+        utils_internal::NormalizeKernel<E, S>
+          <<<num_blocks_coset, num_threads_coset, 0, stream>>>(d_output, S::inv_log_size(logn), n * batch_size);
+      }
+
+      if (is_async) return;
 
       cudaStreamSynchronize(stream);
     }
 
   } // namespace
 
+  /**
+   * @class Domain
+   * Class containing information about the domain on which (i)NTT is evaluated i.e. twiddle factors.
+   * Twiddle factors are private, static and can only be set using [InitDomain](@ref InitDomain) function.
+   * The internal representation of twiddles is prone to change in accordance with changing [NTT](@ref NTT) algorithm.
+   * @tparam S The type of twiddle factors \f$ \{ \omega^i \} \f$. Must be a field.
+   */
+  template <typename S>
+  class Domain {
+    static int max_size; // multiplicative order of the primitive root given to InitDomain; 0 until it has run
+    static S* twiddles;  // buffer from cudaMallocAsync holding max_size + 1 successive powers of the root; nullptr until InitDomain has run
+    static std::unordered_map<S, int> coset_index; // maps each generated twiddle value to its exponent; NTT looks up config.coset_gen here
+
+    public:
+      template <typename U>
+      friend cudaError_t InitDomain<U>(U primitive_root, device_context::DeviceContext& ctx);
+
+      template <typename U, typename E>
+      friend cudaError_t NTT<U, E>(E* input, int size, bool is_inverse, NTTConfig<U>& config, E* output);
+  };
+
+  template<typename S> int Domain<S>::max_size = 0;
+  template<typename S> S* Domain<S>::twiddles = nullptr;
+  template<typename S> std::unordered_map<S, int> Domain<S>::coset_index = {};
+
   template <typename S>
-  cudaError_t GenerateTwiddleFactors(S* d_twiddles, int n_twiddles, S omega, device_context::DeviceContext ctx)
+  cudaError_t InitDomain(S primitive_root, device_context::DeviceContext& ctx)
   {
-    twiddle_factors_kernel<S><<<1, 1, 0, ctx.stream>>>(d_twiddles, n_twiddles, omega);
-    cudaStreamSynchronize(ctx.stream);
+    // Only generate twiddles if they haven't been generated yet; later calls are no-ops (TODO: thread safety)
+    if (!Domain<S>::twiddles) {
+      // TODO DmytroTym: the following line is just a temporary patch to make it work, having issues creating
+      // default stream on rust side. NOTE: this local `ctx` shadows the parameter, so the caller's context is ignored.
+      device_context::DeviceContext ctx = device_context::get_default_device_context();
+      std::vector<S> h_twiddles; // host-side list of successive powers of primitive_root
+      h_twiddles.push_back(S::one());
+      int n = 1; // number of powers generated so far
+      do { // walk the powers of primitive_root until they wrap around to one
+        Domain<S>::coset_index[h_twiddles.at(n - 1)] = n - 1; // remember the exponent of every power
+        h_twiddles.push_back(h_twiddles.at(n - 1) * primitive_root);
+      } while (h_twiddles.at(n++) != S::one());
+      cudaMallocAsync(&Domain<S>::twiddles, n * sizeof(S), ctx.stream); // NOTE(review): CUDA return codes are ignored here
+      cudaMemcpyAsync(Domain<S>::twiddles, &h_twiddles.front(), n * sizeof(S), cudaMemcpyHostToDevice, ctx.stream);
+      Domain<S>::max_size = n - 1; // order of primitive_root; the copied array has one extra element (the closing one)
+    }
     return cudaSuccess;
   }
 
-  template <typename E, typename S>
-  cudaError_t NTT(NTTConfig<E, S>* config)
+  template <typename S, typename E>
+  cudaError_t NTT(E* input, int size, bool is_inverse, NTTConfig<S>& config, E* output)
   {
     CHECK_LAST_CUDA_ERROR();
 
-    cudaStream_t stream = config->ctx.stream;
-    int size = config->size;
-    int batch_size = config->batch_size;
-    bool is_inverse = config->is_inverse;
-    int n_twiddles = size;
+    cudaStream_t stream = config.ctx.stream;
+    int batch_size = config.batch_size;
     int logn = int(log(size) / log(2));
     int input_size_bytes = size * batch_size * sizeof(E);
-    bool is_input_on_device = config->are_inputs_on_device;
-    bool is_output_on_device = config->is_output_on_device;
-    bool is_forward_twiddle_empty = config->twiddles == nullptr;
-    bool is_inverse_twiddle_empty = config->inv_twiddles == nullptr;
-    bool is_generating_twiddles = (is_forward_twiddle_empty && is_inverse_twiddle_empty) ||
-                                  (is_forward_twiddle_empty && !is_inverse) || (is_inverse_twiddle_empty && is_inverse);
-
-    S* d_twiddles;
-    if (is_generating_twiddles) {
-      cudaMallocAsync(&d_twiddles, n_twiddles * sizeof(S), stream);
-      S omega = is_inverse ? S::omega_inv(logn) : S::omega(logn);
-      GenerateTwiddleFactors(d_twiddles, n_twiddles, omega, config->ctx);
+    bool is_input_on_device = config.are_inputs_on_device;
+    bool is_output_on_device = config.are_outputs_on_device;
+
+    E* d_input; // device view of the input: the caller's buffer, or a scratch copy of the host data
+    if (is_input_on_device) {
+      d_input = input;
     } else {
-      d_twiddles = is_inverse ? config->inv_twiddles : config->twiddles;
+      cudaMallocAsync(&d_input, input_size_bytes, stream); // NOTE(review): this scratch buffer is never freed - TODO cudaFreeAsync
+      cudaMemcpyAsync(d_input, input, input_size_bytes, cudaMemcpyHostToDevice, stream);
     }
-
-    E* d_inout;
+    E* d_output; // device buffer the kernels write into: the caller's buffer, or a scratch one copied back below
-    if (is_input_on_device) {
+    if (is_output_on_device) {
-      d_inout = config->inout;
+      d_output = output;
     } else {
-      cudaMallocAsync(&d_inout, input_size_bytes, stream);
-      cudaMemcpyAsync(d_inout, config->inout, input_size_bytes, cudaMemcpyHostToDevice, stream);
+      cudaMallocAsync(&d_output, input_size_bytes, stream);
     }
 
-    bool reverse_input;
-    bool reverse_output;
-    switch (config->ordering) {
+    bool ct_butterfly = true;   // the two defaults cover Ordering::kRN, which has no case below
+    bool reverse_input = false; // when set, the input is bit-reversed into d_output before the butterflies
+    switch (config.ordering) {
     case Ordering::kNN:
-      reverse_input = is_inverse;
-      reverse_output = !is_inverse;
+      reverse_input = true;
       break;
     case Ordering::kNR:
-      reverse_input = is_inverse;
-      reverse_output = is_inverse;
-      break;
-    case Ordering::kRN:
-      reverse_input = !is_inverse;
-      reverse_output = !is_inverse;
+      ct_butterfly = false;
       break;
     case Ordering::kRR:
-      reverse_input = !is_inverse;
-      reverse_output = is_inverse;
+      reverse_input = true;
+      ct_butterfly = false;
       break;
     }
     CHECK_LAST_CUDA_ERROR();
 
-    if (reverse_input) reverse_order_batch(d_inout, size, logn, config->batch_size, stream);
+    if (reverse_input) reverse_order_batch(d_input, size, logn, batch_size, stream, d_output);
     CHECK_LAST_CUDA_ERROR();
 
     ntt_inplace_batch_template(
-      d_inout, d_twiddles, size, batch_size, is_inverse, config->is_coset, config->coset_gen, stream, false);
-    CHECK_LAST_CUDA_ERROR();
-
-    if (reverse_output) reverse_order_batch(d_inout, size, logn, batch_size, stream);
+      reverse_input ? d_output : d_input, size, Domain<S>::twiddles, Domain<S>::max_size, batch_size, logn,
+      is_inverse, ct_butterfly, Domain<S>::coset_index[config.coset_gen], stream, /*is_async=*/true, d_output);
     CHECK_LAST_CUDA_ERROR();
 
     if (is_output_on_device) {
       // free(config->inout); // TODO: ? or callback?+
-      config->inout = d_inout;
+      // nothing to do: d_output is the caller's device buffer, so the results are already in place
     } else {
-      if (is_input_on_device) {
-        E* h_output = (E*)malloc(input_size_bytes); // TODO: caller responsible for memory management
-        cudaMemcpyAsync(h_output, d_inout, input_size_bytes, cudaMemcpyDeviceToHost, stream);
-        config->inout = h_output;
-        CHECK_LAST_CUDA_ERROR();
-      } else {
-        cudaMemcpyAsync(config->inout, d_inout, input_size_bytes, cudaMemcpyDeviceToHost, stream);
-        CHECK_LAST_CUDA_ERROR();
-      }
-      cudaFreeAsync(d_inout, stream); // TODO: make it optional? so can be reused
+      cudaMemcpyAsync(output, d_output, input_size_bytes, cudaMemcpyDeviceToHost, stream);
+      cudaFreeAsync(d_output, stream); // release the scratch buffer; stream-ordered, so it runs after the copy above
     }
     CHECK_LAST_CUDA_ERROR();
 
-    if (is_generating_twiddles && !config->is_preserving_twiddles) { cudaFreeAsync(d_twiddles, stream); }
-
-    if (config->is_preserving_twiddles) {
-      if (is_inverse)
-        config->inv_twiddles = d_twiddles;
-      else {
-        config->twiddles = d_twiddles;
-      }
-    }
-
-    cudaStreamSynchronize(stream);
+    if (!config.is_async) cudaStreamSynchronize(stream); // the only host-side sync; skipped entirely in async mode
 
     CHECK_LAST_CUDA_ERROR();
 
     return cudaSuccess;
   }
 
+  template <typename S>
+  NTTConfig<S> DefaultNTTConfig() {
+    device_context::DeviceContext ctx = device_context::get_default_device_context();
+    NTTConfig<S> config = {
+      S::one(),      // coset_gen
+      Ordering::kNN, // ordering
+      false,         // are_inputs_on_device
+      false,         // are_outputs_on_device
+      1,             // batch_size
+      false,         // is_async
+      ctx,           // ctx
+    };
+    return config;
+  }
+
   /**
-   * Extern version of [ntt](@ref ntt) function with the following values of template parameters
-   * (where the curve is given by `-DCURVE` env variable during build):
-   *  - `S` and `E` are both the [scalar field](@ref scalar_t) of the curve;
-   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
+   * Extern "C" version of [DefaultNTTConfig](@ref DefaultNTTConfig) function with the following 
+   * value of template parameter (where the curve is given by `-DCURVE` env variable during build):
+   *  - `S` is the [scalar field](@ref scalar_t) of the curve;
+   * @return Default [NTTConfig](@ref NTTConfig).
    */
-  extern "C" cudaError_t NTTCuda(NTTConfig<curve_config::scalar_t, curve_config::scalar_t>* config)
-  {
-    return NTT<curve_config::scalar_t, curve_config::scalar_t>(config);
+  extern "C" NTTConfig<curve_config::scalar_t> GetDefaultNTTConfig() {
+    return DefaultNTTConfig<curve_config::scalar_t>();
   }
 
   /**
-   * Extern version of [ntt](@ref ntt) function with the following values of template parameters
-   * (where the curve is given by `-DCURVE` env variable during build):
-   *  - `S` and `E` are both the [scalar field](@ref scalar_t) of the curve;
-   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
+   * Extern "C" version of [InitDomain](@ref InitDomain) function with the following 
+   * value of template parameter (where the curve is given by `-DCURVE` env variable during build):
+   *  - `S` is the [scalar field](@ref scalar_t) of the curve;
    */
-  template <typename E, typename S>
-  cudaError_t NTTDefaultContext(NTTConfig<E, S>* config)
-  {
-    // TODO: if empty - create default
-    cudaMemPool_t mempool;
-    cudaDeviceGetDefaultMemPool(&mempool, config->ctx.device_id);
-
-    device_context::DeviceContext context = {
-      config->ctx.device_id,
-      0, // default stream
-      mempool};
-
-    config->ctx = context;
-
-    return NTT<E, S>(config);
+  extern "C" cudaError_t InitializeDomain(curve_config::scalar_t primitive_root, device_context::DeviceContext& ctx) {
+    return InitDomain(primitive_root, ctx);
   }
 
   /**
-   * Extern version of [ntt](@ref ntt) function with the following values of template parameters
+   * Extern "C" version of [NTT](@ref NTT) function with the following values of template parameters
    * (where the curve is given by `-DCURVE` env variable during build):
    *  - `S` and `E` are both the [scalar field](@ref scalar_t) of the curve;
    * @return `cudaSuccess` if the execution was successful and an error code otherwise.
    */
-  extern "C" cudaError_t NTTDefaultContextCuda(NTTConfig<curve_config::scalar_t, curve_config::scalar_t>* config)
+  extern "C" cudaError_t NTTCuda(curve_config::scalar_t* input, int size, bool is_inverse, NTTConfig<curve_config::scalar_t>& config, curve_config::scalar_t* output)
   {
-    return NTTDefaultContext(config);
+    return NTT<curve_config::scalar_t, curve_config::scalar_t>(input, size, is_inverse, config, output);
   }
 
 #if defined(ECNTT_DEFINED)
 
   /**
-   * Extern version of [NTT](@ref NTT) function with the following values of template parameters
+   * Extern "C" version of [NTT](@ref NTT) function with the following values of template parameters
    * (where the curve is given by `-DCURVE` env variable during build):
    *  - `S` is the [projective representation](@ref projective_t) of the curve (i.e. EC NTT is computed);
    *  - `E` is the [scalar field](@ref scalar_t) of the curve;
    * @return `cudaSuccess` if the execution was successful and an error code otherwise.
    */
-  extern "C" cudaError_t ECNTTCuda(NTTConfig<curve_config::projective_t, curve_config::scalar_t>* config)
+  extern "C" cudaError_t ECNTTCuda(curve_config::projective_t* input, int size, bool is_inverse, NTTConfig<curve_config::scalar_t>& config, curve_config::projective_t* output)
   {
-    return NTT<curve_config::projective_t, curve_config::scalar_t>(config);
+    return NTT<curve_config::scalar_t, curve_config::projective_t>(input, size, is_inverse, config, output);
   }
 
 #endif
 
-} // namespace ntt
+} // namespace ntt
\ No newline at end of file
diff --git a/icicle/appUtils/ntt/ntt.cuh b/icicle/appUtils/ntt/ntt.cuh
index 1bc976d6b..1eb57aee3 100644
--- a/icicle/appUtils/ntt/ntt.cuh
+++ b/icicle/appUtils/ntt/ntt.cuh
@@ -15,16 +15,36 @@
  * Number Theoretic Transform, or NTT is a version of [fast Fourier
  * transform](https://en.wikipedia.org/wiki/Fast_Fourier_transform) where instead of real or complex numbers, inputs and
  * outputs belong to certain finite groups or fields. NTT computes the values of a polynomial \f$ p(x) = p_0 + p_1 \cdot
- * x + \dots + p_{n-1} \cdot x^{n-1} \f$ on special subfields called "roots of unity", or "twiddle factors": \f[ NTT(p)
- * = \{ p(\omega^0), p(\omega^1), \dots, p(\omega^{n-1}) \} \f] Inverse NTT, or iNTT solves the inverse problem of
- * computing coefficients of \f$ p(x) \f$ from evaluations \f$ \{ p(\omega^0), p(\omega^1), \dots, p(\omega^{n-1}) \}
- * \f$. If not specified otherwise, \f$ n \f$ is a power of 2.
+ * x + \dots + p_{n-1} \cdot x^{n-1} \f$ on special subgroups called "roots of unity", or "twiddle factors" (optionally
+ * shifted by an additional element called "coset generator"): \f[ NTT(p) = \{ p(\omega^0), p(\omega^1), \dots,
+ * p(\omega^{n-1}) \} \f] Inverse NTT, or iNTT solves the inverse problem of computing coefficients of \f$ p(x) \f$
+ * given evaluations \f$ \{ p(\omega^0), p(\omega^1), \dots, p(\omega^{n-1}) \} \f$. If not specified otherwise,
+ * \f$ n \f$ is a power of 2.
  */
 namespace ntt {
 
+  /**
+   * Generate a domain that supports all NTTs of sizes under a certain threshold. Note that this function might
+   * be expensive, so if possible it should be called before all time-critical operations.
+   * It's assumed that during program execution only the coset generator might change, but twiddles stay fixed, so
+   * they are initialized at the first call of this function and don't change afterwards.
+   * @param primitive_root Primitive root in field `S` of order \f$ 2^s \f$. This should be the smallest power-of-2
+   * order that's large enough to support any NTT you might want to perform.
+   * @param ctx Details related to the device such as its id and stream id.
+   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
+   */
+  template <typename S>
+  cudaError_t InitDomain(S primitive_root, device_context::DeviceContext& ctx);
+
   /**
    * @enum Ordering
-   * How to order inputs and outputs of the NTT:
+   * How to order inputs and outputs of the NTT. If needed, use this field to specify decimation: decimation in time
+   * (DIT) corresponds to `Ordering::kRN` while decimation in frequency (DIF) to `Ordering::kNR`. Also, to specify
+   * butterfly to be used, select `Ordering::kRN` for Cooley-Tukey and `Ordering::kNR` for Gentleman-Sande. There's
+   * no implication that a certain decimation or butterfly will actually be used under the hood, this is just for
+   * compatibility with codebases that use "decimation" and "butterfly" to denote ordering of inputs and outputs.
+   *
+   * Ordering options are:
    * - kNN: inputs and outputs are natural-order (example of natural ordering: \f$ \{a_0, a_1, a_2, a_3, a_4, a_5, a_6,
    * a_7\} \f$).
    * - kNR: inputs are natural-order and outputs are bit-reversed-order (example of bit-reversed ordering: \f$ \{a_0,
@@ -34,100 +54,50 @@ namespace ntt {
    */
   enum class Ordering { kNN, kNR, kRN, kRR };
 
-  /**
-   * @enum Decimation
-   * Decimation of the NTT algorithm:
-   * - kDIT: decimation in time.
-   * - kDIF: decimation in frequency.
-   */
-  enum class Decimation { kDIT, kDIF };
-
-  /**
-   * @enum Butterfly
-   * [Butterfly](https://en.wikipedia.org/wiki/Butterfly_diagram) used in the NTT algorithm (i.e. what happens to each
-   * pair of inputs on every iteration):
-   * - kCooleyTukey: Cooley-Tukey butterfly.
-   * - kGentlemanSande: Gentleman-Sande butterfly.
-   */
-  enum class Butterfly { kCooleyTukey, kGentlemanSande };
-
   /**
    * @struct NTTConfig
-   * Struct that encodes NTT parameters to be passed into the [ntt](@ref ntt) function.
+   * Struct that encodes NTT parameters to be passed into the [NTT](@ref NTT) function.
    */
-  template <typename E, typename S>
+  template <typename S>
   struct NTTConfig {
-    E* inout; /**< Input that's mutated in-place by this function. Length of this array needs to be \f$ size \cdot
-               * config.batch_size \f$. Note that if inputs are in Montgomery form, the outputs will be as well and
-               * vice-verse: non-Montgomery inputs produce non-Montgomety outputs.*/
-    bool are_inputs_on_device; /**< True if inputs/outputs are on device and false if they're on host. Default value:
-                                  false. */
-    bool is_inverse;           /**< True if true . Default value: false. */
-    Ordering
-      ordering; /**< Ordering of inputs and outputs. See [Ordering](@ref Ordering). Default value: `Ordering::kNN`. */
-    Decimation
-      decimation; /**< Decimation of the algorithm, see [Decimation](@ref Decimation). Default value:
-                   * `Decimation::kDIT`.
-                   *   __Note:__ this variable exists mainly for compatibility with codebases that use similar notation.
-                   *   If [ordering](@ref ordering) is `Ordering::kRN`, the value of this variable will be overridden to
-                   *   `Decimation::kDIT` and if ordering is `Ordering::kNR` — to `Decimation::kDIF`. */
-    Butterfly
-      butterfly;     /**< Butterfly used by the NTT. See [Butterfly](@ref Butterfly). Default value:
-                      * `Butterfly::kCooleyTukey`.
-                      *   __Note:__ this variable exists mainly for compatibility with codebases that use similar notation.
-                      *   If [ordering](@ref ordering) is `Ordering::kRN`, the value of this variable will be overridden to
-                      *   `Butterfly::kCooleyTukey` and if ordering is `Ordering::kNR` — to `Butterfly::kGentlemanSande`. */
-    bool is_coset;   /**< If false, NTT is computed on a subfield given by [twiddles](@ref twiddles). If true, NTT is
-                      * computed   on a coset of [twiddles](@ref twiddles) given by [the coset generator](@ref coset_gen),
-                      * so:   \f$ \{coset\_gen\cdot\omega^0, coset\_gen\cdot\omega^1, \dots, coset\_gen\cdot\omega^{n-1}\}
-                      * \f$. Default value: false. */
-    S* coset_gen;    /**< The field element that generates a coset if [is_coset](@ref is_coset) is true.
-                      *   Otherwise should be set to `nullptr`. Default value: `nullptr`. */
-    S* twiddles;     /**< "Twiddle factors", (or "domain", or "roots of unity") on which the NTT is evaluated.
-                      *   This pointer is expected to live on device. The order is as follows:
-                      *   \f$ \{\omega^0=1, \omega^1, \dots, \omega^{n-1}\} \f$. If this pointer is `nullptr`, twiddle
-                      * factors     are generated online using the default generator (TODO: link to twiddle gen here) and
-                      * function     [GenerateTwiddleFactors](@ref GenerateTwiddleFactors). Default value: `nullptr`. */
-    S* inv_twiddles; /**< "Inverse twiddle factors", (or "domain", or "roots of unity") on which the iNTT is evaluated.
-                      *   This pointer is expected to live on device. The order is as follows:
-                      *   \f$ \{\omega^0=1, \omega^1, \dots, \omega^{n-1}\} \f$. If this pointer is `nullptr`, twiddle
-                      * factors are generated online using the default generator (TODO: link to twiddle gen here) and
-                      * function [GenerateTwiddleFactors](@ref GenerateTwiddleFactors). Default value: `nullptr`. */
-    int size; /**< NTT size \f$ n \f$. If a batch of NTTs (which all need to have the same size) is computed, this is
-                  the size of 1 NTT. */
-    int batch_size;              /**< The number of NTTs to compute. Default value: 1. */
-    bool is_preserving_twiddles; /**< If true, twiddle factors are preserved on device for subsequent use in config and
-                                    not freed after calculation. Default value: false. */
-    bool is_output_on_device;    /**< If true, output is preserved on device for subsequent use in config and not freed
-                                    after calculation. Default value: false. */
-    device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. See
-                                          [DeviceContext](@ref device_context::DeviceContext). */
+    S coset_gen;                /**< Coset generator. Used to perform coset (i)NTTs. Default value: `S::one()` 
+                                 *   (corresponding to no coset being used). */
+    Ordering ordering;          /**< Ordering of inputs and outputs. See [Ordering](@ref Ordering). Default value:
+                                 *   `Ordering::kNN`. */
+    bool are_inputs_on_device;  /**< True if inputs are on device and false if they're on host. Default value: false. */
+    bool are_outputs_on_device; /**< If true, output is preserved on device, otherwise on host. Default value: false. */
+    int batch_size;             /**< The number of NTTs to compute. Default value: 1. */
+    bool is_async;              /**< Whether to run the NTT asynchronously. If set to `true`, the NTT function will be
+                                 *   non-blocking and you'd need to synchronize it explicitly by running
+                                 *   `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the NTT
+                                 *   function will block the current CPU thread. */
+    device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream. */
   };
 
+  /**
+   * A function that returns the default value of [NTTConfig](@ref NTTConfig) for the [NTT](@ref NTT) function.
+   * @return Default value of [NTTConfig](@ref NTTConfig).
+   */
+  template <typename S>
+  NTTConfig<S> DefaultNTTConfig();
+
   /**
    * A function that computes NTT or iNTT in-place.
+   * @param input Input of the NTT. Length of this array needs to be \f$ size \cdot config.batch\_size \f$. Note
+   * that if inputs are in Montgomery form, the outputs will be as well and vice-versa: non-Montgomery inputs produce
+   * non-Montgomery outputs.
+   * @param size NTT size. If a batch of NTTs (which all need to have the same size) is computed, this is the size
+   * of 1 NTT, so it must equal the size of `input` divided by `config.batch_size`.
+   * @param is_inverse True for inverse NTT and false for direct NTT.
    * @param config [NTTConfig](@ref NTTConfig) used in this NTT.
+   * @param output Buffer for the output of the NTT. Should be of the same size as `input`.
    * @tparam E The type of inputs and outputs (i.e. coefficients \f$ \{p_i\} \f$ and values \f$ p(x) \f$). Must be a
    * group.
    * @tparam S The type of "twiddle factors" \f$ \{ \omega^i \} \f$. Must be a field. Often (but not always) `S=E`.
    * @return `cudaSuccess` if the execution was successful and an error code otherwise.
    */
-  template <typename E, typename S>
-  cudaError_t NTT(NTTConfig<E, S>* config);
-
-  /**
-   * Generates twiddles \f$ \{\omega^0=1, \omega^1, \dots, \omega^{n-1}\} \f$ from root of unity \f$ \omega \f$ and
-   * stores them on device.
-   * @param d_twiddles Input empty array on device to which twiddles are to be written.
-   * @param n_twiddles Number of twiddle \f$ n \f$ factors to generate.
-   * @param omega Root of unity \f$ \omega \f$.
-   * @param ctx Details related to the device such as its id and stream id. See [DeviceContext](@ref
-   * device_context::DeviceContext).
-   * @tparam S The type of twiddle factors \f$ \{ \omega^i \} \f$.
-   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
-   */
-  template <typename S>
-  cudaError_t GenerateTwiddleFactors(S* d_twiddles, int n_twiddles, S omega, device_context::DeviceContext ctx);
+  template <typename S, typename E>
+  cudaError_t NTT(E* input, int size, bool is_inverse, NTTConfig<S>& config, E* output);
 
 } // namespace ntt
 
diff --git a/icicle/curves/bls12_377_params.cuh b/icicle/curves/bls12_377_params.cuh
index 596dc6ff8..f6a8e457b 100644
--- a/icicle/curves/bls12_377_params.cuh
+++ b/icicle/curves/bls12_377_params.cuh
@@ -7,8 +7,9 @@
 namespace bls12_377 {
   struct fp_config {
     static constexpr unsigned limbs_count = 8;
-    static constexpr unsigned omegas_count = 32;
+    static constexpr unsigned omegas_count = 47;
     static constexpr unsigned modulus_bit_count = 253;
+    static constexpr unsigned num_of_reductions = 1;
 
     static constexpr storage<limbs_count> modulus = {0x00000001, 0x0a118000, 0xd0000001, 0x59aa76fe,
                                                      0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e};
@@ -16,6 +17,8 @@ namespace bls12_377 {
                                                        0xb86f6002, 0xc1689a3c, 0x34594aac, 0x2556cabd};
     static constexpr storage<limbs_count> modulus_4 = {0x00000004, 0x28460000, 0x40000004, 0x66a9dbfb,
                                                        0x70dec005, 0x82d13479, 0x68b29559, 0x4aad957a};
+    static constexpr storage<limbs_count> neg_modulus = {0xffffffff, 0xf5ee7fff, 0x2ffffffe, 0xa6558901,
+                                                         0xa3c84ffe, 0x9f4bb2e1, 0x65d35aa9, 0xed549aa1};
     static constexpr storage<2 * limbs_count> modulus_wide = {
       0x00000001, 0x0a118000, 0xd0000001, 0x59aa76fe, 0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e,
       0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
@@ -41,72 +44,102 @@ namespace bls12_377 {
                                                               0x27b28e2f, 0x838557e2, 0x2290c02c, 0x07b30191};
 
     static constexpr storage_array<omegas_count, limbs_count> omega = {
-      {{0xec2a895e, 0x476ef4a4, 0x63e3f04a, 0x9b506ee3, 0xd1a8a12f, 0x60c69477, 0x0cb92cc1, 0x11d4b7f6},
-       {0x01ab3a4d, 0x006f60fa, 0x814ba450, 0xe6600e15, 0xdf9eb147, 0xbde4df36, 0x33760d7b, 0x055d58fa},
-       {0xfdacff58, 0x8215b91d, 0x98331645, 0xd8d9177d, 0x439e803c, 0xe85223ad, 0xcca42c1f, 0x04aa8ef0},
-       {0x293f8481, 0xd52cc17a, 0x6f133205, 0x041178fb, 0xb2961832, 0xbbc70d18, 0x481760cd, 0x073d34d1},
-       {0x5e9020dd, 0xade9d4b4, 0x87db8813, 0x489259d2, 0x25051238, 0x5ddce740, 0xb5bc4d11, 0x0c775db1},
-       {0xd5fba57b, 0x90684fea, 0xe0defe98, 0xed237883, 0x030ae924, 0xc502b692, 0xe7a1ec2c, 0x08aa58e8},
-       {0x44ddbbdc, 0xbafb92a6, 0x26b01974, 0x63c7a02d, 0x5f28a274, 0x0ff86e13, 0x867f2e29, 0x0a7b462a},
-       {0x355dd694, 0x4258374d, 0x44c76a20, 0x5c31e8ac, 0xaa5fd062, 0x9b473969, 0x1a37b6b4, 0x0a693d77},
-       {0x22df9f13, 0x56313de8, 0x599e7536, 0xe2e75200, 0x6d163e50, 0xa1b4fce7, 0xc8111763, 0x0aec2172},
-       {0xf32d6bac, 0xa0b973d4, 0xf0d81b72, 0xae951889, 0x2e2daa0a, 0x51dbe098, 0x40d9af8f, 0x04679474},
-       {0x1b29736e, 0x8f267f19, 0x1d5a0c3a, 0xa2e04d58, 0x1ae99514, 0x76803064, 0x57f7c806, 0x12129439},
-       {0xbd83a3da, 0xd3b69b29, 0xe02ce197, 0x9543950f, 0xc2f87783, 0x80799665, 0xc15be215, 0x11ce8199},
-       {0xf284f768, 0xdeee484b, 0xe26a0475, 0x2a02e015, 0x88d968c2, 0xf0eb4925, 0x82a391c9, 0x0620ce9e},
-       {0xa90a2740, 0xfe3ca4f0, 0x512a7c7a, 0xd259ff36, 0xb41fe696, 0xbca3176a, 0xf33132ce, 0x05bd5ea3},
-       {0xb14361d4, 0x7f1db43f, 0x25ab6d51, 0x7927e578, 0x383bf21e, 0xb43e52a5, 0xd27fa99f, 0x077595e9},
-       {0xa9966ac4, 0x1ae0ea67, 0xda83fb3b, 0x4e2dbb1c, 0x0b51380e, 0xf77cf749, 0xb28a7670, 0x048b4b0e},
-       {0xa0234d2d, 0xe943054c, 0xe5f5be5e, 0x673b0ee0, 0x5048a19a, 0xcdd48e41, 0xabc3cb99, 0x0997d277},
-       {0x1912f7fa, 0x77d7da1d, 0x299fd7d6, 0xbcb7a5b2, 0x142a4480, 0x705e45dd, 0xb492dbd8, 0x0dc835fd},
-       {0x20b7298a, 0xd7652451, 0x65013b06, 0xc7c9a0b7, 0xad0d8457, 0x479b82a9, 0x0c99f5ce, 0x0bef1e5a},
-       {0xe5f8848a, 0x270a2326, 0xa727567d, 0x97d14afa, 0x48746fc7, 0x1a3a5a4e, 0xa42f077a, 0x0044e4b1},
-       {0x4dd87a5e, 0xf423a283, 0xd9a4c364, 0x1fe46601, 0xbfdc7e9b, 0xda4addbf, 0x3bf94b2b, 0x0a7f2bd8},
-       {0xf02ba42c, 0x553085d9, 0x1119b10d, 0x59662159, 0x6b8ea03f, 0xaa670958, 0x7ce92983, 0x066f6f5f},
-       {0xedc626c3, 0xf30e312d, 0xcf1f3a94, 0x8367a7ca, 0x917a1b28, 0x621e15e1, 0xf2e93b82, 0x07cd59f8},
-       {0xafeb494b, 0x97319dcd, 0x1d78404c, 0xab30c83e, 0xf26ffe90, 0x452d8a48, 0xa36452c7, 0x0bfc2e92},
-       {0xcc943028, 0xed2576ad, 0xfa4c6090, 0x846e49bc, 0x0049d8e6, 0xc74c1865, 0x665d7be5, 0x0e9c5a12},
-       {0xf45b9621, 0x102fbfb0, 0xf04faac0, 0xe80f4241, 0x7ca61177, 0x0b830bfd, 0x7033169d, 0x10521892},
-       {0x3358eb25, 0xdbc547bc, 0x722037db, 0x8909d398, 0x5e705b6d, 0x8b7075b5, 0x9bdaf407, 0x02694bb2},
-       {0x66a16869, 0x50c487c1, 0xd1fd4525, 0x380a66ab, 0x265e8539, 0xd455a01a, 0x064b5334, 0x0cd62875},
-       {0x4637701d, 0x0848f958, 0x4c8353af, 0x8a750076, 0x0ef6174a, 0x485f4e4f, 0xf38db632, 0x078d97a1},
-       {0x3d766f80, 0x1b4b71cf, 0x1069012d, 0x47d21195, 0x9151ebec, 0x5635235f, 0x2b13c808, 0x093f7d91},
+      {{0x00000000, 0x0a118000, 0xd0000001, 0x59aa76fe, 0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e},
+       {0x00000001, 0x8f1a4000, 0xb0000001, 0xcf664765, 0x970dec00, 0x23ed1347, 0x00000000, 0x00000000},
+       {0xfbfa0a01, 0x0f830f7e, 0xd75769a0, 0x20f8b46c, 0xf05d5033, 0x7108bd18, 0x0788de01, 0x07405e08},
+       {0x60b9bdae, 0xc78085a6, 0x789094f5, 0x3116ec22, 0xce87d660, 0x0a02a81d, 0xc2a94856, 0x0ead8236},
+       {0x3e83a7cc, 0x6ffc39d9, 0x958a0a74, 0x117d996e, 0x0b92e8c9, 0xc242289d, 0x29d977d6, 0x0484efb4},
+       {0x0111ec3f, 0x15455b00, 0xc5f6be6f, 0x6b62d7af, 0x337f2d07, 0xfcba0365, 0x43fccd26, 0x0f151842},
+       {0xc31ec69b, 0x57951b2e, 0x2a37ce1f, 0x3e0a4be7, 0xcf3b198a, 0x960aeb4a, 0x341fd5cd, 0x04fb0673},
+       {0xa921851f, 0x71c1b78e, 0x7808f239, 0x3c26340c, 0x976fb990, 0xbcc8f69b, 0xe880dc71, 0x06a5edb2},
+       {0xc0f5679e, 0x7619eab5, 0x0dc0b9cd, 0x1f4cd10e, 0xbf6a480a, 0x7e1b70aa, 0x7f5461bb, 0x0ffc66da},
+       {0xec5cbab2, 0x8159806d, 0x498264a3, 0x14ea1333, 0xe3abfaa6, 0x56bbe1d8, 0x02aa031f, 0x09d2b5c4},
+       {0xc010c48a, 0xd2aa9562, 0x3b004b60, 0x447e5c11, 0x11e243bb, 0xd5a21c13, 0x0ab418b1, 0x01eab23e},
+       {0xacff6986, 0x08715ee8, 0xa93924d0, 0xab01878a, 0x6e9ae5c4, 0xbfbc5e71, 0x26b08d6e, 0x0f8000bf},
+       {0x3ddbc679, 0x06bc13b0, 0x615256ce, 0x7269a1f1, 0x1f5221a2, 0xf7716fbf, 0x8c66c14f, 0x0fa1f02c},
+       {0x906f531f, 0xdd40f131, 0x30728eff, 0xb06b29c7, 0x88839294, 0xc891fd19, 0x646978e8, 0x04e88447},
+       {0x6e259cdc, 0xb1e4b769, 0x00514e5e, 0xbcb0b709, 0x05113e7f, 0x74edb7c0, 0xe92e22af, 0x10c88511},
+       {0x240ede5b, 0xebb2e898, 0x42cd84c6, 0xc2639185, 0x9408f956, 0xf79e8391, 0x94e87a7d, 0x06872fa1},
        {0x260678ff, 0xf8522249, 0xa8de9973, 0x6148cb16, 0x5a4e8d56, 0x5750f3f4, 0xbaeaf0c3, 0x0e805156},
-       {0x240ede5b, 0xebb2e898, 0x42cd84c6, 0xc2639185, 0x9408f956, 0xf79e8391, 0x94e87a7d, 0x06872fa1}}};
+       {0x3d766f80, 0x1b4b71cf, 0x1069012d, 0x47d21195, 0x9151ebec, 0x5635235f, 0x2b13c808, 0x093f7d91},
+       {0x4637701d, 0x0848f958, 0x4c8353af, 0x8a750076, 0x0ef6174a, 0x485f4e4f, 0xf38db632, 0x078d97a1},
+       {0x66a16869, 0x50c487c1, 0xd1fd4525, 0x380a66ab, 0x265e8539, 0xd455a01a, 0x064b5334, 0x0cd62875},
+       {0x3358eb25, 0xdbc547bc, 0x722037db, 0x8909d398, 0x5e705b6d, 0x8b7075b5, 0x9bdaf407, 0x02694bb2},
+       {0xf45b9621, 0x102fbfb0, 0xf04faac0, 0xe80f4241, 0x7ca61177, 0x0b830bfd, 0x7033169d, 0x10521892},
+       {0xcc943028, 0xed2576ad, 0xfa4c6090, 0x846e49bc, 0x0049d8e6, 0xc74c1865, 0x665d7be5, 0x0e9c5a12},
+       {0xafeb494b, 0x97319dcd, 0x1d78404c, 0xab30c83e, 0xf26ffe90, 0x452d8a48, 0xa36452c7, 0x0bfc2e92},
+       {0xedc626c3, 0xf30e312d, 0xcf1f3a94, 0x8367a7ca, 0x917a1b28, 0x621e15e1, 0xf2e93b82, 0x07cd59f8},
+       {0xf02ba42c, 0x553085d9, 0x1119b10d, 0x59662159, 0x6b8ea03f, 0xaa670958, 0x7ce92983, 0x066f6f5f},
+       {0x4dd87a5e, 0xf423a283, 0xd9a4c364, 0x1fe46601, 0xbfdc7e9b, 0xda4addbf, 0x3bf94b2b, 0x0a7f2bd8},
+       {0xe5f8848a, 0x270a2326, 0xa727567d, 0x97d14afa, 0x48746fc7, 0x1a3a5a4e, 0xa42f077a, 0x0044e4b1},
+       {0x20b7298a, 0xd7652451, 0x65013b06, 0xc7c9a0b7, 0xad0d8457, 0x479b82a9, 0x0c99f5ce, 0x0bef1e5a},
+       {0x1912f7fa, 0x77d7da1d, 0x299fd7d6, 0xbcb7a5b2, 0x142a4480, 0x705e45dd, 0xb492dbd8, 0x0dc835fd},
+       {0xa0234d2d, 0xe943054c, 0xe5f5be5e, 0x673b0ee0, 0x5048a19a, 0xcdd48e41, 0xabc3cb99, 0x0997d277},
+       {0xa9966ac4, 0x1ae0ea67, 0xda83fb3b, 0x4e2dbb1c, 0x0b51380e, 0xf77cf749, 0xb28a7670, 0x048b4b0e},
+       {0xb14361d4, 0x7f1db43f, 0x25ab6d51, 0x7927e578, 0x383bf21e, 0xb43e52a5, 0xd27fa99f, 0x077595e9},
+       {0xa90a2740, 0xfe3ca4f0, 0x512a7c7a, 0xd259ff36, 0xb41fe696, 0xbca3176a, 0xf33132ce, 0x05bd5ea3},
+       {0xf284f768, 0xdeee484b, 0xe26a0475, 0x2a02e015, 0x88d968c2, 0xf0eb4925, 0x82a391c9, 0x0620ce9e},
+       {0xbd83a3da, 0xd3b69b29, 0xe02ce197, 0x9543950f, 0xc2f87783, 0x80799665, 0xc15be215, 0x11ce8199},
+       {0x1b29736e, 0x8f267f19, 0x1d5a0c3a, 0xa2e04d58, 0x1ae99514, 0x76803064, 0x57f7c806, 0x12129439},
+       {0xf32d6bac, 0xa0b973d4, 0xf0d81b72, 0xae951889, 0x2e2daa0a, 0x51dbe098, 0x40d9af8f, 0x04679474},
+       {0x22df9f13, 0x56313de8, 0x599e7536, 0xe2e75200, 0x6d163e50, 0xa1b4fce7, 0xc8111763, 0x0aec2172},
+       {0x355dd694, 0x4258374d, 0x44c76a20, 0x5c31e8ac, 0xaa5fd062, 0x9b473969, 0x1a37b6b4, 0x0a693d77},
+       {0x44ddbbdc, 0xbafb92a6, 0x26b01974, 0x63c7a02d, 0x5f28a274, 0x0ff86e13, 0x867f2e29, 0x0a7b462a},
+       {0xd5fba57b, 0x90684fea, 0xe0defe98, 0xed237883, 0x030ae924, 0xc502b692, 0xe7a1ec2c, 0x08aa58e8},
+       {0x5e9020dd, 0xade9d4b4, 0x87db8813, 0x489259d2, 0x25051238, 0x5ddce740, 0xb5bc4d11, 0x0c775db1},
+       {0x293f8481, 0xd52cc17a, 0x6f133205, 0x041178fb, 0xb2961832, 0xbbc70d18, 0x481760cd, 0x073d34d1},
+       {0xfdacff58, 0x8215b91d, 0x98331645, 0xd8d9177d, 0x439e803c, 0xe85223ad, 0xcca42c1f, 0x04aa8ef0},
+       {0x01ab3a4d, 0x006f60fa, 0x814ba450, 0xe6600e15, 0xdf9eb147, 0xbde4df36, 0x33760d7b, 0x055d58fa},
+       {0xec2a895e, 0x476ef4a4, 0x63e3f04a, 0x9b506ee3, 0xd1a8a12f, 0x60c69477, 0x0cb92cc1, 0x11d4b7f6}}};
 
     static constexpr storage_array<omegas_count, limbs_count> omega_inv = {
-      {{0xb9112c51, 0x2542c2b2, 0x6e23b3ce, 0x36ead8da, 0x76476754, 0x9a268d13, 0xa1ad7cf1, 0x121f44ad},
-       {0x3b3625b6, 0x1e62401f, 0x28471e5a, 0xd0692164, 0x5cad6b77, 0xb85aa9ec, 0xaa95acf2, 0x063e4b66},
-       {0x4ffa086a, 0xecc89610, 0xca06afc6, 0x4db82291, 0x8f3a6426, 0x9ae7c68c, 0x2a874432, 0x0b3dae8c},
-       {0xe3b4dc56, 0xa0594a67, 0x91b698e1, 0xc8e6b582, 0x8df78057, 0x711cadbf, 0x396466f8, 0x0049abdf},
-       {0x6464580f, 0x33e6c8c0, 0x3c4aa09f, 0x9d560eb3, 0xcc98f404, 0xb3f1a899, 0x8ca24b48, 0x012c1ea5},
-       {0xaf858193, 0x2b955be2, 0x5fb5e378, 0xa513d8be, 0xa326aeb9, 0x88c4ebeb, 0xf3d45990, 0x00c378e2},
-       {0x33bf2a1c, 0x842b0c9c, 0xa29b9236, 0x1fd43c95, 0xc06795d3, 0x6b37a603, 0x0c1b712a, 0x00017b17},
-       {0x526bf9fc, 0x023031cc, 0x79c209ba, 0x0e4136c0, 0x3ec42e5c, 0xe5234df1, 0x1d455234, 0x00cb9592},
-       {0xef01ed78, 0xf2828212, 0xf103c9ca, 0xa66094ac, 0x7a2d5573, 0xdceb481d, 0x8af46aab, 0x0190fcde},
-       {0x89b0ca6f, 0xb4d938e2, 0x2c897570, 0x0214eb59, 0x2d4cf27a, 0x56c45327, 0x3ed546a4, 0x10a2f358},
-       {0x78500f1a, 0x98310dd7, 0x735ccb27, 0x1c6050bf, 0xb2081df4, 0x07b6fa7f, 0xfa0f1e20, 0x003edf24},
-       {0xa39b02a3, 0x8a3de898, 0xdc94422c, 0x068b2992, 0xf493db31, 0x1c5f019a, 0x11b0f668, 0x066b1790},
-       {0xdddb58ec, 0x41f8042f, 0x10886d85, 0x7dd54384, 0x622ff4b4, 0x19544f90, 0x050cc539, 0x02f0b49a},
-       {0x7998b62c, 0xbb53132b, 0x22c9b4aa, 0x064a9186, 0x71d61334, 0xd56de253, 0x04e416f6, 0x10fcf25f},
-       {0xdf80223d, 0x55f432c9, 0x11a2fed9, 0x23daf2f6, 0x41ae8c34, 0x9e43e003, 0x95f22373, 0x0d51533b},
-       {0x78fd3239, 0xaf29730b, 0x40c3e723, 0xbd907ac9, 0x77f214f7, 0x5dcc0aad, 0xb05fb3a1, 0x02d958da},
-       {0x498fb549, 0xd5993cd5, 0x09da9272, 0x718adcee, 0x72bd5bc0, 0x9e03cbb4, 0xc592813f, 0x07206942},
-       {0xe978594b, 0x4ddd3320, 0x3abe3f79, 0xe5f36fbe, 0xe4dcff8e, 0x5dba9ef2, 0x7105148f, 0x0bfc27e2},
-       {0x3e47b53f, 0x50380ce2, 0x3a9613fc, 0x6ea3c2d3, 0x4c87ab50, 0xfe743105, 0xd192221c, 0x07871979},
-       {0x49c6284a, 0x9ba6aa00, 0xeacbdc63, 0x0b8429fb, 0xedafdf37, 0x9b9c6c5b, 0xad0c78c6, 0x009907e8},
-       {0x5d4e643c, 0x3da791ea, 0x85bff013, 0xb6a956ef, 0xd73de6a3, 0x86c629a8, 0x6b8c48a9, 0x0a5a5f55},
-       {0x4b9ac952, 0x3d29f5ba, 0xc8ea8f94, 0x7c7f2662, 0xcefc3052, 0x736ccb63, 0x0981f3cb, 0x04bfce2f},
-       {0x930cee0b, 0x432d3626, 0xf26e8ba3, 0x55ed3efb, 0x14c5457f, 0x802eebcc, 0xe2310f22, 0x00d300e3},
-       {0x60cf1330, 0x840f913b, 0x1df5ed87, 0x5610cde6, 0x72b36ddf, 0x858381b0, 0x6f64e0b7, 0x109bf66c},
-       {0x03ad3139, 0x01d3f431, 0xa137ce16, 0xe56f6002, 0x1deb42e8, 0x97f53369, 0xaa37cddd, 0x033fa9ac},
-       {0xc161761f, 0x271d7caf, 0xc369a371, 0xf1001d6f, 0x00e60f51, 0x65286415, 0xb74d14b8, 0x00b918f9},
-       {0xa26c8c12, 0xa6f4e1d1, 0xf6610f7e, 0x13571553, 0x56701caf, 0xd95e5df6, 0x2263d69d, 0x050e7b89},
-       {0x1d75bec9, 0xe29ef6c0, 0xd4b0183b, 0xead287a2, 0xedfd3795, 0x75a017cf, 0x64427c8e, 0x107f8d0f},
-       {0x00db2b48, 0xa43c0e02, 0x933d10ee, 0x76585489, 0xc0ba6a80, 0x12d64af1, 0x2fad8d8e, 0x01940f43},
-       {0x4b1b63a9, 0x12998cbc, 0xcf420c9f, 0x0f780c6c, 0x129289ad, 0xa5e48723, 0x240a141d, 0x0a3a1223},
+      {{0x00000000, 0x0a118000, 0xd0000001, 0x59aa76fe, 0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e},
+       {0x00000000, 0x7af74000, 0x1fffffff, 0x8a442f99, 0xc529c400, 0x3cc739d6, 0x9a2ca556, 0x12ab655e},
+       {0xd60fb046, 0xc9fa190c, 0xc5b4674e, 0xdb5c179b, 0xbc7b8726, 0x2b2bce0b, 0xbf6e69bf, 0x0e4eb338},
+       {0x8ffc4ed5, 0x74732d1f, 0xb7f2eefc, 0x42d9f590, 0xa24dd4dd, 0xf70461e5, 0xef64676f, 0x03b6eba4},
+       {0x102bbab0, 0x5a21f98a, 0x8d8e2efb, 0xa6a147a9, 0x7612906f, 0x0eb4f005, 0x47d8d2e3, 0x0e1a5481},
+       {0xd01e5aa8, 0x6e509add, 0x6e3f123d, 0xe1582468, 0x8274db24, 0xbd6313ee, 0xd173a634, 0x05d5836e},
+       {0xe975c0cf, 0x6aab3344, 0x6f1dc38e, 0xca362e0e, 0x1dd1743a, 0x2fe72cda, 0xc1b4c4c2, 0x0c1c956e},
+       {0xec89a64f, 0x59fe97a0, 0xe8de5d4c, 0x579617d7, 0xc9c1ea7b, 0x256a305b, 0x53fa131b, 0x01ffae4e},
+       {0x29bcb088, 0x463a73ff, 0xe1438e80, 0xee9e9a5e, 0x3c9369e4, 0x2a00951f, 0x80a32052, 0x09711183},
+       {0x4bec8dd2, 0xa36899db, 0x96393687, 0x2946872e, 0x842df3c8, 0xd4b5734f, 0x5f5cd8fb, 0x0834098f},
+       {0xe3c711b9, 0x4bc485f6, 0x648d1d7e, 0xf43a2598, 0xee88abaa, 0x7f981a0e, 0xec6a3f27, 0x0c88c9c3},
+       {0x49046b52, 0x42bcc6c2, 0x56ab9ecc, 0xcc77294a, 0xe4df3ddd, 0x02ecb41a, 0x67f76726, 0x0e567d22},
+       {0x91c64fc2, 0x1cc56cc3, 0xd16a490b, 0x8cb71e65, 0x14fac366, 0x984be37e, 0xa25d7ba5, 0x0a08e032},
+       {0xd4f5941e, 0x966d9739, 0xe5772a73, 0x5805deb6, 0x5c1f970c, 0xe4eb0d33, 0xbdf35409, 0x039715db},
+       {0xcc6518ac, 0x8419686c, 0x9c7a2366, 0x96dec3a8, 0x71724384, 0xefbfcac6, 0xaf34c239, 0x0c44b99a},
+       {0xc18ff4fd, 0xcb66fe1b, 0x86c8d586, 0x588e18b3, 0x1dfab57c, 0xc6e6d2a3, 0x7d7d4efd, 0x10918ad2},
        {0x97a18f58, 0x56d6cf22, 0xd0d7abd9, 0x11710758, 0x5eb7a9c5, 0xd1a6608b, 0xc4937e38, 0x04059bdb},
-       {0xc18ff4fd, 0xcb66fe1b, 0x86c8d586, 0x588e18b3, 0x1dfab57c, 0xc6e6d2a3, 0x7d7d4efd, 0x10918ad2}}};
+       {0x4b1b63a9, 0x12998cbc, 0xcf420c9f, 0x0f780c6c, 0x129289ad, 0xa5e48723, 0x240a141d, 0x0a3a1223},
+       {0x00db2b48, 0xa43c0e02, 0x933d10ee, 0x76585489, 0xc0ba6a80, 0x12d64af1, 0x2fad8d8e, 0x01940f43},
+       {0x1d75bec9, 0xe29ef6c0, 0xd4b0183b, 0xead287a2, 0xedfd3795, 0x75a017cf, 0x64427c8e, 0x107f8d0f},
+       {0xa26c8c12, 0xa6f4e1d1, 0xf6610f7e, 0x13571553, 0x56701caf, 0xd95e5df6, 0x2263d69d, 0x050e7b89},
+       {0xc161761f, 0x271d7caf, 0xc369a371, 0xf1001d6f, 0x00e60f51, 0x65286415, 0xb74d14b8, 0x00b918f9},
+       {0x03ad3139, 0x01d3f431, 0xa137ce16, 0xe56f6002, 0x1deb42e8, 0x97f53369, 0xaa37cddd, 0x033fa9ac},
+       {0x60cf1330, 0x840f913b, 0x1df5ed87, 0x5610cde6, 0x72b36ddf, 0x858381b0, 0x6f64e0b7, 0x109bf66c},
+       {0x930cee0b, 0x432d3626, 0xf26e8ba3, 0x55ed3efb, 0x14c5457f, 0x802eebcc, 0xe2310f22, 0x00d300e3},
+       {0x4b9ac952, 0x3d29f5ba, 0xc8ea8f94, 0x7c7f2662, 0xcefc3052, 0x736ccb63, 0x0981f3cb, 0x04bfce2f},
+       {0x5d4e643c, 0x3da791ea, 0x85bff013, 0xb6a956ef, 0xd73de6a3, 0x86c629a8, 0x6b8c48a9, 0x0a5a5f55},
+       {0x49c6284a, 0x9ba6aa00, 0xeacbdc63, 0x0b8429fb, 0xedafdf37, 0x9b9c6c5b, 0xad0c78c6, 0x009907e8},
+       {0x3e47b53f, 0x50380ce2, 0x3a9613fc, 0x6ea3c2d3, 0x4c87ab50, 0xfe743105, 0xd192221c, 0x07871979},
+       {0xe978594b, 0x4ddd3320, 0x3abe3f79, 0xe5f36fbe, 0xe4dcff8e, 0x5dba9ef2, 0x7105148f, 0x0bfc27e2},
+       {0x498fb549, 0xd5993cd5, 0x09da9272, 0x718adcee, 0x72bd5bc0, 0x9e03cbb4, 0xc592813f, 0x07206942},
+       {0x78fd3239, 0xaf29730b, 0x40c3e723, 0xbd907ac9, 0x77f214f7, 0x5dcc0aad, 0xb05fb3a1, 0x02d958da},
+       {0xdf80223d, 0x55f432c9, 0x11a2fed9, 0x23daf2f6, 0x41ae8c34, 0x9e43e003, 0x95f22373, 0x0d51533b},
+       {0x7998b62c, 0xbb53132b, 0x22c9b4aa, 0x064a9186, 0x71d61334, 0xd56de253, 0x04e416f6, 0x10fcf25f},
+       {0xdddb58ec, 0x41f8042f, 0x10886d85, 0x7dd54384, 0x622ff4b4, 0x19544f90, 0x050cc539, 0x02f0b49a},
+       {0xa39b02a3, 0x8a3de898, 0xdc94422c, 0x068b2992, 0xf493db31, 0x1c5f019a, 0x11b0f668, 0x066b1790},
+       {0x78500f1a, 0x98310dd7, 0x735ccb27, 0x1c6050bf, 0xb2081df4, 0x07b6fa7f, 0xfa0f1e20, 0x003edf24},
+       {0x89b0ca6f, 0xb4d938e2, 0x2c897570, 0x0214eb59, 0x2d4cf27a, 0x56c45327, 0x3ed546a4, 0x10a2f358},
+       {0xef01ed78, 0xf2828212, 0xf103c9ca, 0xa66094ac, 0x7a2d5573, 0xdceb481d, 0x8af46aab, 0x0190fcde},
+       {0x526bf9fc, 0x023031cc, 0x79c209ba, 0x0e4136c0, 0x3ec42e5c, 0xe5234df1, 0x1d455234, 0x00cb9592},
+       {0x33bf2a1c, 0x842b0c9c, 0xa29b9236, 0x1fd43c95, 0xc06795d3, 0x6b37a603, 0x0c1b712a, 0x00017b17},
+       {0xaf858193, 0x2b955be2, 0x5fb5e378, 0xa513d8be, 0xa326aeb9, 0x88c4ebeb, 0xf3d45990, 0x00c378e2},
+       {0x6464580f, 0x33e6c8c0, 0x3c4aa09f, 0x9d560eb3, 0xcc98f404, 0xb3f1a899, 0x8ca24b48, 0x012c1ea5},
+       {0xe3b4dc56, 0xa0594a67, 0x91b698e1, 0xc8e6b582, 0x8df78057, 0x711cadbf, 0x396466f8, 0x0049abdf},
+       {0x4ffa086a, 0xecc89610, 0xca06afc6, 0x4db82291, 0x8f3a6426, 0x9ae7c68c, 0x2a874432, 0x0b3dae8c},
+       {0x3b3625b6, 0x1e62401f, 0x28471e5a, 0xd0692164, 0x5cad6b77, 0xb85aa9ec, 0xaa95acf2, 0x063e4b66},
+       {0xb9112c51, 0x2542c2b2, 0x6e23b3ce, 0x36ead8da, 0x76476754, 0x9a268d13, 0xa1ad7cf1, 0x121f44ad}}};
 
     static constexpr storage_array<omegas_count, limbs_count> inv = {
       {{0x00000001, 0x8508c000, 0x68000000, 0xacd53b7f, 0x2e1bd800, 0x305a268f, 0x4d1652ab, 0x0955b2af},
@@ -140,12 +173,29 @@ namespace bls12_377 {
        {0xaf740001, 0x8a117ff7, 0x02ac480a, 0x77ecf6f4, 0x5695470e, 0x8f4f226b, 0x04d17a61, 0x12ab655e},
        {0xd7ba0001, 0xca117ffb, 0x69562405, 0xe8cbb6f9, 0xd9667b87, 0xf801b7c4, 0x4f7f0fdb, 0x12ab655e},
        {0xebdd0001, 0x6a117ffd, 0x1cab1203, 0xa13b16fc, 0x9acf15c4, 0x2c5b0271, 0x74d5da99, 0x12ab655e},
-       {0xf5ee8001, 0x3a117ffe, 0x76558902, 0xfd72c6fd, 0xfb8362e2, 0xc687a7c7, 0x87813ff7, 0x12ab655e}}};
+       {0xf5ee8001, 0x3a117ffe, 0x76558902, 0xfd72c6fd, 0xfb8362e2, 0xc687a7c7, 0x87813ff7, 0x12ab655e},
+       {0x7af74001, 0xa2117fff, 0x232ac481, 0x2b8e9efe, 0x2bdd8972, 0x139dfa73, 0x90d6f2a7, 0x12ab655e},
+       {0xbd7ba001, 0x56117fff, 0x79956241, 0xc29c8afe, 0xc40a9cb9, 0xba2923c8, 0x9581cbfe, 0x12ab655e},
+       {0xdebdd001, 0x30117fff, 0xa4cab121, 0x8e2380fe, 0x9021265d, 0x8d6eb873, 0x97d738aa, 0x12ab655e},
+       {0xef5ee801, 0x1d117fff, 0xba655891, 0x73e6fbfe, 0xf62c6b2f, 0x771182c8, 0x9901ef00, 0x12ab655e},
+       {0xf7af7401, 0x13917fff, 0xc532ac49, 0x66c8b97e, 0xa9320d98, 0x6be2e7f3, 0x99974a2b, 0x12ab655e},
+       {0xfbd7ba01, 0x0ed17fff, 0xca995625, 0xe039983e, 0x02b4decc, 0xe64b9a89, 0x99e1f7c0, 0x12ab655e},
+       {0xfdebdd01, 0x0c717fff, 0xcd4cab13, 0x1cf2079e, 0xaf764767, 0xa37ff3d3, 0x9a074e8b, 0x12ab655e},
+       {0xfef5ee81, 0x0b417fff, 0xcea6558a, 0x3b4e3f4e, 0x05d6fbb4, 0x021a2079, 0x9a19f9f1, 0x12ab655e},
+       {0xff7af741, 0x8aa97fff, 0xcf532ac5, 0xca7c5b26, 0xb10755da, 0xb16736cb, 0x9a234fa3, 0x12ab655e},
+       {0xffbd7ba1, 0x4a5d7fff, 0xcfa99563, 0x12136912, 0x069f82ee, 0x090dc1f5, 0x9a27fa7d, 0x12ab655e},
+       {0xffdebdd1, 0x2a377fff, 0xcfd4cab2, 0xb5def008, 0xb16b9977, 0xb4e10789, 0x9a2a4fe9, 0x12ab655e},
+       {0xffef5ee9, 0x9a247fff, 0xcfea6559, 0x87c4b383, 0x06d1a4bc, 0x0acaaa54, 0x9a2b7aa0, 0x12ab655e},
+       {0xfff7af75, 0x521affff, 0x4ff532ad, 0xf0b79541, 0x3184aa5e, 0x35bf7bb9, 0x9a2c0ffb, 0x12ab655e},
+       {0xfffbd7bb, 0x2e163fff, 0x0ffa9957, 0x25310620, 0xc6de2d30, 0xcb39e46b, 0x9a2c5aa8, 0x12ab655e},
+       {0xfffdebde, 0x1c13dfff, 0x6ffd4cac, 0xbf6dbe8f, 0x118aee98, 0x95f718c5, 0x9a2c7fff, 0x12ab655e}}};
   };
 
   struct fq_config {
     static constexpr unsigned limbs_count = 12;
+    static constexpr unsigned omegas_count = 48;
     static constexpr unsigned modulus_bit_count = 377;
+    static constexpr unsigned num_of_reductions = 1;
     static constexpr storage<limbs_count> modulus = {0x00000001, 0x8508c000, 0x30000000, 0x170b5d44,
                                                      0xba094800, 0x1ef3622f, 0x00f5138f, 0x1a22d9f3,
                                                      0x6ca1493b, 0xc63b05c0, 0x17c510ea, 0x01ae3a46};
@@ -155,6 +205,9 @@ namespace bls12_377 {
     static constexpr storage<limbs_count> modulus_4 = {0x00000004, 0x14230000, 0xc0000002, 0x5c2d7510,
                                                        0xe8252000, 0x7bcd88be, 0x03d44e3c, 0x688b67cc,
                                                        0xb28524ec, 0x18ec1701, 0x5f1443ab, 0x06b8e918};
+    static constexpr storage<limbs_count> neg_modulus = {0xffffffff, 0x7af73fff, 0xcfffffff, 0xe8f4a2bb,
+                                                         0x45f6b7ff, 0xe10c9dd0, 0xff0aec70, 0xe5dd260c,
+                                                         0x935eb6c4, 0x39c4fa3f, 0xe83aef15, 0xfe51c5b9};
     static constexpr storage<2 * limbs_count> modulus_wide = {
       0x00000001, 0x8508c000, 0x30000000, 0x170b5d44, 0xba094800, 0x1ef3622f, 0x00f5138f, 0x1a22d9f3,
       0x6ca1493b, 0xc63b05c0, 0x17c510ea, 0x01ae3a46, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
@@ -179,37 +232,321 @@ namespace bls12_377 {
     static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
                                                   0x00000000, 0x00000000, 0x00000000, 0x00000000,
                                                   0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> montgomery_r = {0xffffff,   0xf73fffff, 0xffffff7a, 0xf4a2bbcf,
-                                                          0xf6b7ffe8, 0x0c9dd045, 0x0aec70e1, 0xdd260cff,
-                                                          0x5eb6c4e5, 0xc4fa3f93, 0x3aef1539, 0x51c5b9e8};
-    static constexpr storage<limbs_count> montgomery_r_inv = {0x934f3a1,  0xb0909a28, 0xc1cfac62, 0x3264aa55,
-                                                              0x2a491ae8, 0xaccd49ca, 0xe80e9a61, 0x28b2dce9,
-                                                              0x26f7c08a, 0x4d313ea1, 0x36254563, 0x161de1ee};
+    static constexpr storage<limbs_count> montgomery_r = {0xffffff68, 0x02cdffff, 0x7fffffb1, 0x51409f83,
+                                                          0x8a7d3ff2, 0x9f7db3a9, 0x6e7c6305, 0x7b4e97b7,
+                                                          0x803c84e8, 0x4cf495bf, 0xe2fdf49a, 0x008d6661};
+    static constexpr storage<limbs_count> montgomery_r_inv = {0x451269e8, 0xef129093, 0xe65839f5, 0x6e20bbcd,
+                                                              0xa5582c93, 0x852e3c88, 0xf7f2e657, 0xeeaaf41d,
+                                                              0xa4c49351, 0xeb89746c, 0x436b0736, 0x014212fc};
+
+    static constexpr storage_array<omegas_count, limbs_count> omega = {
+      {{0x00000000, 0x8508c000, 0x30000000, 0x170b5d44, 0xba094800, 0x1ef3622f, 0x00f5138f, 0x1a22d9f3, 0x6ca1493b,
+        0xc63b05c0, 0x17c510ea, 0x01ae3a46},
+       {0xf1391c63, 0x6e76d5ec, 0xbff27d8e, 0x99588459, 0x436b0f62, 0xbce649cf, 0x0ad1dec1, 0x400398f5, 0x1a79beb1,
+        0xc0c534db, 0x796537ca, 0x01680a40},
+       {0x554c85ba, 0x6cbff0e3, 0x0be8ff9d, 0xc07c7a91, 0x9dde4fa2, 0xc3c79f67, 0xb5726bde, 0x44bc6d1a, 0x76d6d607,
+        0xad812919, 0x95e8fd0e, 0x001bc0c2},
+       {0x6d5db237, 0xb8c206b0, 0xcabde6ba, 0x08fed85d, 0xcd92eb6f, 0xf2f54ffc, 0xe39c1788, 0xee81121f, 0x88e82edb,
+        0x852def4d, 0xb95fdb80, 0x00bf1268},
+       {0x192bf14f, 0x3663c26a, 0xe6351854, 0x99c859be, 0x159361b8, 0xf9430828, 0xfbe33d7d, 0x478ed715, 0xdb79c984,
+        0x41e220cf, 0xd961f2be, 0x00cedb38},
+       {0xcc724685, 0xb99caa69, 0x1388a46d, 0xc24087ba, 0x08f03491, 0xeb13a05a, 0x98fb0ff7, 0x558ab21e, 0x86bbd802,
+        0x0166d08d, 0xf5b5728a, 0x00d1dec9},
+       {0x92db32a2, 0x2e3951fe, 0x6014b201, 0x8f5a16c9, 0xa91fbb38, 0xa9e942b9, 0x17b4dbd2, 0xf7bf5b43, 0x81325c7d,
+        0x57f3934a, 0x615ad019, 0x012be78e},
+       {0xdce33f04, 0xb42b84a2, 0x0db0b91c, 0x7a0c1423, 0x88d9f8c8, 0xaed11a0c, 0xd484c501, 0x712d6bc0, 0xfa3f7633,
+        0x50aca1e5, 0xb90f34d0, 0x01002f29},
+       {0xf012f6a0, 0xbc3db054, 0x0d332ea7, 0x00d66897, 0xfd416167, 0x8278ef44, 0x20268e84, 0x1a1a3c4d, 0x4b11d215,
+        0x7c976aa6, 0x63b6e925, 0x00949581},
+       {0x339637c6, 0x9d73cf29, 0xa5642677, 0x8257d1a2, 0xcafd597c, 0xcb48f07f, 0x081435a3, 0x7a505010, 0xacbb9c39,
+        0xaaa45ce1, 0x7431b9c8, 0x013f2b13},
+       {0xd4710c0b, 0x9ef8bddb, 0x85047671, 0xb4c73188, 0x134695ba, 0x87a51d65, 0x022416dd, 0x67f3bc43, 0xcb2a157b,
+        0x21d965b2, 0x5ce4195d, 0x013a57e4},
+       {0xd2461368, 0xf2db3a9f, 0x3802aef2, 0x0595c232, 0x5ea85bd6, 0xa53d621a, 0xa34ee943, 0xce930fbc, 0x6b372bee,
+        0x1d216665, 0xa4535740, 0x009f0159},
+       {0x656bf68d, 0x73cf953a, 0xeac5c1d7, 0x50a5a5b5, 0xaa5355a9, 0x2697b2e1, 0x08de37d2, 0x6be70306, 0x44c5afab,
+        0x907f6976, 0xd4ec46b1, 0x0155cfa2},
+       {0x090e3e20, 0x034160c4, 0xf77a6fbb, 0xbc73cc59, 0x188e54f6, 0x437cd23b, 0x17e42614, 0x5a788edd, 0xebdc8eae,
+        0xf1ad4f54, 0x2f129bcd, 0x005d1440},
+       {0x4e269ee5, 0x5626c031, 0x0d1501ec, 0x5f97673e, 0x86d31c18, 0x4fe089bd, 0x62d1259a, 0x3e9fffcb, 0x1ff89d01,
+        0xe1898f32, 0x59d01a38, 0x00fa1331},
+       {0x38d427b1, 0xda80661b, 0xa814f14b, 0x1913027d, 0xcda4061d, 0xd3f61e24, 0x5da8fcb2, 0x9509e69d, 0x1f05e6d3,
+        0x0e7493a5, 0xa5c6bd06, 0x00dcb8db},
+       {0x61cff9ed, 0x88499d0a, 0x53718444, 0x0b317da2, 0x4b7eec5f, 0xc1624bfd, 0x5af10e6f, 0x6ffc3241, 0xd6c66ff2,
+        0x27d0edf3, 0x73ab0f4a, 0x013019b5},
+       {0x06027b24, 0x42dc7673, 0x3341b9e7, 0x018f8bbd, 0xa435f7e2, 0xd3b389d9, 0xea031176, 0x279739a5, 0x74c35801,
+        0x3555ca51, 0x049dcf87, 0x00748c30},
+       {0x81fe14de, 0x731b16f0, 0x333cc61a, 0x528d6ada, 0x5736dc15, 0x7ae87278, 0xc8bfd40c, 0xa94b9fd2, 0x299b0487,
+        0x714dd8ed, 0xf1a53233, 0x00642b62},
+       {0x5bc45170, 0x31270ddf, 0x7f72c758, 0x7efb6b06, 0xcf4973a8, 0x2eb9f2aa, 0xe556d234, 0xdcb534c9, 0x0e043fef,
+        0xf0b1a210, 0x54dda04e, 0x00e79c44},
+       {0x2d5f1bc2, 0x213b3f52, 0xfd933428, 0x9e115ba7, 0x434c9e2a, 0x7f77d57e, 0xcdb944ef, 0x47a78418, 0x699aa559,
+        0x8cb01cbb, 0xb064c4d7, 0x0075bf81},
+       {0x3fbfc66c, 0x0b6c2e65, 0x6fcab2f8, 0x7bece031, 0xb79dcd4d, 0x2ba7e325, 0xa5c6881b, 0x8c18f66a, 0x7283805a,
+        0x4d893e5a, 0xfc296bfe, 0x0107d3c5},
+       {0x948c881a, 0x53fbdbb4, 0x16803d18, 0xf27a9c14, 0xeddfafef, 0x8490f6c5, 0x3e57fa15, 0xfe068e1d, 0xd26b296b,
+        0xbe923119, 0x9fa377a1, 0x00d56016},
+       {0x6f5b2ad1, 0xb3bbaeb3, 0x11886a1c, 0x0efd4ba9, 0xdedb7083, 0x5911498f, 0x5bd0a90f, 0x0921fe19, 0x83d379cb,
+        0x38e05d4e, 0xb7ba3c73, 0x006b39e2},
+       {0xa55550ba, 0x61b560e4, 0xe7288461, 0xd9ac545b, 0xc6e3e282, 0xde8d2826, 0x7e49dd2c, 0x9e87a310, 0xc43080b7,
+        0xf2edfc44, 0x95b7d300, 0x012b4875},
+       {0x27591e60, 0x4048ddc3, 0xc5d21791, 0xb77c9738, 0x49826bea, 0xf2f82033, 0x42f97e95, 0xf60bb703, 0x5966139d,
+        0xef8f6f16, 0xc0e95e39, 0x00327618},
+       {0x441e395f, 0xf9059c8f, 0xbd087238, 0x29eab35f, 0x7dee5ff1, 0x5d4abeff, 0x771e60e9, 0x7222499b, 0x7ac324a2,
+        0xb70c1ea3, 0x0da51ce8, 0x015b3af9},
+       {0xe9a70026, 0xf7aa576b, 0x01c4a126, 0xb28733ef, 0xa3307647, 0x06b8e768, 0xe12588ce, 0x115500e1, 0x6c9f9b1d,
+        0x7e8dd6b9, 0x6ec020b3, 0x014d091e},
+       {0x8e5bbc8d, 0xd318265d, 0x141bee9b, 0x70b460ba, 0x1aa9df5b, 0x145dd6a6, 0xe3478cb3, 0xd9da2548, 0x7b509387,
+        0x47250509, 0xe967973c, 0x00de53d3},
+       {0xd2aa57b8, 0x5ff4399c, 0xa6ae9b07, 0x90360194, 0x6cfcdb7a, 0x68979991, 0x64e56abb, 0xf517467c, 0xad7a6573,
+        0x44227491, 0xa35ebf55, 0x0001da0b},
+       {0x4d80f6da, 0xd8b22d5a, 0x10ee1a06, 0x6e7b2bfb, 0x17faeac0, 0xac8d97e5, 0x7a12c923, 0x8b75540b, 0x5b42ce02,
+        0xa2787368, 0xe98d9998, 0x008d30a5},
+       {0x9dc292bb, 0xee29c02a, 0xc5b7e1c9, 0x9e7ea016, 0x9a908e5f, 0x62daf95d, 0x3e98eae9, 0x80a71c61, 0xfdda3bba,
+        0x2d514723, 0x068ef829, 0x00f65844},
+       {0x185b1ad6, 0xf62fdfa4, 0xf90ccbe6, 0x2ae7f104, 0x972ce78e, 0xfa435fb6, 0x45e59f91, 0x53a75d3c, 0x2f320b7a,
+        0x7290cac2, 0xe7cb5108, 0x01a2022a},
+       {0xd59dda24, 0xcf0a15be, 0xf2ec72b4, 0xbc77f6d4, 0x96c31202, 0xa8df0caf, 0xbb4f8842, 0xb95429c0, 0xd0087306,
+        0xb989b210, 0x5571e9f0, 0x002b1694},
+       {0x67ae536e, 0x7e84d4b5, 0xc8fb9b80, 0x3a920871, 0x1948ee86, 0x1a82df2b, 0xb3c66ed3, 0xdef79467, 0xef64d05a,
+        0x58fd84f2, 0xd999f400, 0x00c6d5b7},
+       {0x81ee0d53, 0x7639f9a2, 0xb5747565, 0x8ade807d, 0xe6235609, 0xfd9d6266, 0x53730f18, 0xea1948a3, 0xd890142e,
+        0xa356108a, 0xe3e8a723, 0x00a48ac6},
+       {0xd0ca5e04, 0x531c4b83, 0x2ba0a328, 0xff35ced6, 0xa4e563aa, 0x01613079, 0x1442dcd1, 0x6f52b3a3, 0x9e19b0a6,
+        0x813b4616, 0x9536db26, 0x004828c5},
+       {0x0bce1b4e, 0x8a9321a9, 0xae85d6ff, 0xb9759dbe, 0x5cb206e0, 0x1ce1d522, 0x35a1607a, 0x87df044f, 0x94e1329a,
+        0x2ebabee7, 0x73586cc9, 0x01a73170},
+       {0x3dd667f3, 0x69824754, 0x28fd63a2, 0x61a081a7, 0x99499385, 0x0b9f6d2e, 0x5c253e16, 0x6d45622b, 0x765a7f5f,
+        0xcd672e4d, 0x7150d847, 0x01182798},
+       {0x2742d2f6, 0x0af0bfd2, 0x3a02631d, 0x93616956, 0xac8a2203, 0x32dae751, 0x85cf4e2d, 0xea4ffbe7, 0x7dba6eb9,
+        0x673424f4, 0x61f4060d, 0x002ec230},
+       {0x5a5b5c2b, 0x226293ca, 0x0684dbc9, 0xbc0ca23e, 0x7d637c4f, 0x4510cf3a, 0x9b2f4a52, 0x7869c488, 0x2fd73a53,
+        0xec009b90, 0xa8c99cca, 0x003499d6},
+       {0xfd745afc, 0x9da60b0a, 0x41c5362e, 0xff0769ec, 0xfa9fd8ee, 0x487621e9, 0xab04558f, 0x138910d1, 0xc1ed03ce,
+        0x870903cf, 0xed3ffb51, 0x002c1cfa},
+       {0x42870c46, 0x271b1ff3, 0x13b4b491, 0x1e0a9cd1, 0x3c55c65e, 0x2d58cb1a, 0x74756f6e, 0xa6e12c32, 0x2e313bc4,
+        0xf774a43d, 0xcc386ffc, 0x00ca156d},
+       {0x4a67741c, 0x588f79b6, 0xc3590b63, 0xc0ae78b5, 0xc3576385, 0xad0bb97d, 0xb8473137, 0x0583dd49, 0x515d8604,
+        0xb31d9631, 0xd3ba3b12, 0x015337bc},
+       {0x8a458e8c, 0x976a14f5, 0xc3a26ae8, 0xc90809b4, 0x089acf15, 0x270a1575, 0x5013d4b1, 0x614a0d25, 0x6d09901e,
+        0x1314e076, 0xf208945e, 0x0022f414},
+       {0xc563b9a1, 0x7eca603c, 0x06fe0bc3, 0x06df0a43, 0x0ddff8c6, 0xb44d994a, 0x4512a3d4, 0x40fbe05b, 0x8aeffc9b,
+        0x30f15248, 0x05198a80, 0x0036a92e}}};
+
+    static constexpr storage_array<omegas_count, limbs_count> omega_inv = {
+      {{0x00000000, 0x8508c000, 0x30000000, 0x170b5d44, 0xba094800, 0x1ef3622f, 0x00f5138f, 0x1a22d9f3, 0x6ca1493b,
+        0xc63b05c0, 0x17c510ea, 0x01ae3a46},
+       {0x0ec6e39e, 0x1691ea13, 0x700d8272, 0x7db2d8ea, 0x769e389d, 0x620d1860, 0xf62334cd, 0xda1f40fd, 0x52278a89,
+        0x0575d0e5, 0x9e5fd920, 0x00463005},
+       {0x93997f11, 0x9403412c, 0xdfb2323f, 0x845557b3, 0x2d50c7fc, 0x66f2eaaa, 0xc103f92f, 0x992358fb, 0x5d7a3179,
+        0x01d60217, 0xd2af5da0, 0x0077b354},
+       {0xc1000ea4, 0x7ac2ca7a, 0x7f8d9495, 0x937db751, 0x0de62931, 0x401b3873, 0x980129ba, 0x59be7529, 0xa545a303,
+        0x2ba8f85d, 0xb6705512, 0x00573e3a},
+       {0x2c1b22e6, 0xb55712f9, 0x0f91cddd, 0x66cfc0f3, 0x8bb345d8, 0x8d5fcd42, 0x86c0abc3, 0x61e4cf98, 0x432fe8f3,
+        0x93556354, 0xad005fb6, 0x00ff87d5},
+       {0x7aba560e, 0x05065a97, 0x7918b9db, 0x333ff005, 0xdf6be708, 0x03938ae1, 0x7410a77b, 0x922d3376, 0x03a15063,
+        0xa5aeaa56, 0x4aea89e5, 0x01542cb6},
+       {0xe4d6a772, 0x61a6a2d6, 0x6e6239a7, 0xc18c9ef7, 0x04cac70f, 0x8772bb3f, 0x16c5916b, 0x8bbb4185, 0x46335dc0,
+        0x4aa656e2, 0x842c1664, 0x008187ac},
+       {0xdd4e93c5, 0xa002ea0a, 0x07458704, 0xb40a45e8, 0xbaa65f2a, 0xee9ee3ea, 0x8f3b8a87, 0xeffa4f9e, 0x95b5feba,
+        0xb6e03897, 0x81751c63, 0x003c41de},
+       {0x13043a4a, 0x50221a3b, 0xda73331a, 0x6537fca8, 0x8e85077c, 0x8b74cef4, 0x0e5bbe67, 0x65705341, 0xefa22d23,
+        0xf0f56caa, 0xd1865d98, 0x001f8eb5},
+       {0x3e26a605, 0xd9af8944, 0x6970166f, 0xad0efb6e, 0x2c7464ec, 0xc16d7972, 0xf788281b, 0xe0de4b04, 0xaa878b0e,
+        0x0c049e55, 0x63e2e7cd, 0x0135383a},
+       {0x6f6893f7, 0x6b12c42e, 0x44bbbf63, 0x831f38c0, 0x191be6c9, 0xa57797d4, 0x447475cb, 0x6af7f695, 0x4b8be189,
+        0x3295e9e7, 0x350d0aad, 0x00a9a32b},
+       {0x7656ef1d, 0xc2243f86, 0xf4211219, 0x3e4c3bc3, 0x3c9a3d21, 0xaa4db6e0, 0xe8a4c946, 0x29ac638a, 0xa4cf856e,
+        0x21449f8b, 0x7d4c9c67, 0x018cf097},
+       {0x6a8e0139, 0x18e472a2, 0xd6b1c835, 0xcc7c80fd, 0x6546fc0a, 0x1f760883, 0x4ea3417c, 0x5bcfc1fb, 0xe9acb8b0,
+        0x52c9a29b, 0xd9f265a2, 0x01a6d8b2},
+       {0xebb83ac0, 0x95eb1dc8, 0x9f390cf2, 0x1e8d70f5, 0xb0d85145, 0xf9e4955d, 0x89720ee1, 0xe9690d30, 0x50fc879f,
+        0x629972a5, 0x69ccd670, 0x00456e23},
+       {0x83f38be4, 0xfbfb11a1, 0x388e6726, 0xb90a19b9, 0xc860d62c, 0x3fc10bc7, 0xc3c4e575, 0xc9fe043e, 0x7396d780,
+        0x67aeff74, 0x01cadaee, 0x019059fa},
+       {0xfd581be8, 0x43506d6e, 0x018b1b76, 0xf09563e6, 0xe87f9d80, 0x5cd193b2, 0x0a933402, 0x18ba3260, 0x50524c77,
+        0x4de839d9, 0xd90315ce, 0x0018c2ed},
+       {0xa737701d, 0xf900eb81, 0x995e6672, 0x6874c90e, 0xa495900b, 0x69ade94a, 0xd07bd4b1, 0xd5f358e7, 0x6f88e8e4,
+        0xbd437e9d, 0x1d6b88cf, 0x0130d706},
+       {0xfc29b95f, 0x064629bd, 0xb546585c, 0x0a897bff, 0x54a80d9a, 0x856c8d4f, 0x944568ff, 0x85410cc4, 0x59fc4370,
+        0xc1978c65, 0xc668dc52, 0x017c86c8},
+       {0xf6109131, 0x65cecd55, 0x7d2f52e5, 0x6d7e892e, 0xb90b2403, 0xe9a09007, 0xae0a060d, 0x92ca9aac, 0xa22b1e96,
+        0x5ce1cc4f, 0x45201e6f, 0x012eb33c},
+       {0x20d1aac5, 0x9d2cb4cf, 0xded22997, 0x3e4a1e77, 0x07fae2e2, 0x09d692f7, 0xd49bdcbe, 0x6a6aa4f8, 0x09c01cab,
+        0xa8e21ead, 0x6b03b72e, 0x01a19e81},
+       {0x935650ca, 0xf3d94623, 0x2ffd937e, 0x4a688a46, 0xa622b139, 0xf55fd53a, 0x7a1a1e40, 0x227406aa, 0x9a3fea60,
+        0x40dd4504, 0x1edbb584, 0x00fc2332},
+       {0xf28db3fc, 0x9707402f, 0xc28593f1, 0x3d898bd7, 0xb30effcd, 0xcaee2dfd, 0x4fb6ec9d, 0xff1b0790, 0x09ed1120,
+        0x9cb0597e, 0xb78d15e9, 0x005c73a5},
+       {0xb0a8a3b9, 0x739a4c2e, 0xc57196ae, 0x083bde21, 0xba602f29, 0x247eb070, 0x1c2c7132, 0x4ba1dd6a, 0xe2187c6c,
+        0x4ce59fb6, 0x606880b1, 0x0014a7b5},
+       {0x484baf56, 0xdd0eccab, 0x4541b101, 0xe6c80eaf, 0xf7964f64, 0x35b8a558, 0xc50ccf94, 0xb3b824d4, 0x21c71aeb,
+        0xe1f6b4c8, 0x23031df0, 0x01a8a647},
+       {0x592a9620, 0x5338dc01, 0xd94a401b, 0xb217f96d, 0xf830b00e, 0xfefb6601, 0xafd3dee4, 0x1ec061b5, 0x05a199bd,
+        0x0d5d4d3c, 0xc8489913, 0x0196c768},
+       {0x1f980ca0, 0x4acb430e, 0x71c6821c, 0x8973a3cc, 0xb3e9aa75, 0x74414c20, 0x0c13f042, 0x79212a5f, 0x375c705b,
+        0x5c44d226, 0x29439af2, 0x000a2fdd},
+       {0xa387b60c, 0xf01901e6, 0x4561ff3d, 0xa7b1b7dc, 0x0558e085, 0x5d82d374, 0xf2bc1d29, 0x519298e5, 0x3d332207,
+        0x0ad719a8, 0xea19a807, 0x0150a138},
+       {0x9deb8e06, 0x7c6b3eb1, 0x28206b6c, 0x3a8f53c4, 0x7fed1065, 0x039f575f, 0x40c1f898, 0x31be74ba, 0x790ac003,
+        0x76db938e, 0x5508c5e4, 0x0096d5e1},
+       {0xb83f8358, 0x3e940e0e, 0x372a4b8b, 0x204d80e0, 0xa820b2ec, 0x956454b2, 0x2cc8078c, 0x8e2cb3d4, 0xc6f81363,
+        0xdd0d3e12, 0x49041a64, 0x0052f327},
+       {0x2aec0be2, 0x37ca2eb7, 0x555cc652, 0x05093570, 0xd2588d31, 0xe62f1adb, 0x798be240, 0x2fd2518e, 0x0ff6b579,
+        0x9302d4e3, 0x6ee95e5d, 0x0025ca57},
+       {0x233eed68, 0xcc664858, 0xece3a327, 0x600ca1ac, 0x93a2e34f, 0x330d1102, 0xdb5e3bb4, 0xc84ab55f, 0xe4d5576e,
+        0x5179c101, 0x0938f714, 0x00efb20e},
+       {0xfdddaf5c, 0x907f96e7, 0x1ffe49da, 0x348dab77, 0xc14ab779, 0x3eca44ad, 0x4cdc5d98, 0xe9b10b2e, 0xa95c5a36,
+        0x65a25d16, 0x6e616518, 0x00c9f759},
+       {0x7a5aff62, 0x9497d331, 0xb57cd01d, 0x21896195, 0x6c7ba745, 0xe09e22f7, 0x5a7acff0, 0xcc9f1064, 0xc93c46b0,
+        0x7b867cdf, 0x23eba5ae, 0x01a05dcb},
+       {0x4dcc71f4, 0xa56a8e33, 0xcbebdba2, 0xc480b083, 0x36ea43af, 0x748448fa, 0xe7859f3c, 0xee9b4b0e, 0x5af41919,
+        0x9ab2bb09, 0x65caa0ea, 0x0127262d},
+       {0x352a05cc, 0x77c7d12f, 0xdc7160c9, 0xb91ca5be, 0x5a3feda0, 0x245106da, 0x7669f7cd, 0xfd45012d, 0xdc5489fa,
+        0xc4774629, 0x2872daa0, 0x00241273},
+       {0x0d3e0b0b, 0x1838ae6f, 0xff67fc2c, 0x7fcc9b21, 0x23956100, 0xaedca59e, 0x1e79aa4b, 0x572ed634, 0xc7f0673c,
+        0xaeeda160, 0xc8047256, 0x00360e2c},
+       {0xe05044f9, 0xec5e4514, 0x7ec9b4ef, 0xe915b7e7, 0x9c4bec48, 0x9fb78cd8, 0xa38d95a3, 0xd7b84113, 0xb86fd119,
+        0x7be64440, 0xe4f9e70a, 0x009e3a60},
+       {0xc7435591, 0xc61cc546, 0xe5e94dc4, 0xea99a96f, 0xdb8ff17d, 0x5b10e2b4, 0x3dd0ff10, 0x13f8fb9d, 0xe118b9e9,
+        0xcbb1c0ce, 0x7ebf8a0d, 0x00b37258},
+       {0xce5943e7, 0xd44fdb9d, 0x79fa927a, 0xcb7d41ea, 0xdcee72ca, 0x9a4bcebf, 0x11634905, 0x2317799d, 0x584055ac,
+        0x3f1c302e, 0xdc2d0017, 0x013ef021},
+       {0xa78a1578, 0x345cb052, 0x5961b8fe, 0x1ed4d48a, 0x74a5e2af, 0x5858e93c, 0x0fd17e9f, 0xaf643f0a, 0x79d94009,
+        0x61530753, 0xde7b2f53, 0x010a3393},
+       {0x813925df, 0x548b1d28, 0xca3e79b6, 0xabab3a4e, 0x7e51071a, 0xb3c9c068, 0x6c5fcedb, 0x8014e879, 0x95d9facc,
+        0x3ba5db77, 0x7f5c3d2f, 0x0105c419},
+       {0x26bc1104, 0xbb9cbd28, 0xe03cc852, 0x27f09abb, 0x22e5be61, 0x02763b4a, 0xb94fa254, 0xa3940542, 0xff34c35f,
+        0xcf058850, 0x1482533c, 0x019f538f},
+       {0xb3f42de9, 0xf2126047, 0xbeb0a1b8, 0xdb0451c4, 0x9aabc291, 0x1a945bc0, 0x7fe3a6f2, 0x13d08312, 0x390e1c07,
+        0xd8fb13f1, 0x6b30562b, 0x005a41c4},
+       {0xe8b3d5dd, 0x1c60fcc5, 0x75b3a464, 0x5d7babba, 0xf3989910, 0x0d9f52c7, 0x9beec571, 0x464a2840, 0x79689d4b,
+        0x139c496f, 0x099e64c4, 0x0022c6a3},
+       {0x023e0cd1, 0x9df6c2d5, 0xa6b747de, 0x8e23def9, 0x90da6876, 0x7bc83eee, 0xc88bb007, 0xdaeac352, 0x68bb6a7f,
+        0x45cabb6f, 0x94697b34, 0x001e7154},
+       {0x0203d905, 0xffcee91d, 0xc99df56d, 0xd878ee01, 0x210d754c, 0xa0e882f9, 0x7d0aec6a, 0x26c96db8, 0x8ff7afe4,
+        0x46e2e145, 0x54749283, 0x015cd1b0}}};
+
+    static constexpr storage_array<omegas_count, limbs_count> inv = {
+      {{0x00000001, 0x42846000, 0x18000000, 0x0b85aea2, 0xdd04a400, 0x8f79b117, 0x807a89c7, 0x8d116cf9, 0x3650a49d,
+        0x631d82e0, 0x0be28875, 0x00d71d23},
+       {0x00000001, 0x63c69000, 0x24000000, 0x114885f3, 0xcb86f600, 0x573689a3, 0x40b7ceab, 0x539a2376, 0x5178f6ec,
+        0x14ac4450, 0x91d3ccb0, 0x0142abb4},
+       {0x00000001, 0x7467a800, 0xaa000000, 0x1429f19b, 0xc2c81f00, 0x3b14f5e9, 0xa0d6711d, 0xb6de7eb4, 0x5f0d2013,
+        0x6d73a508, 0x54cc6ecd, 0x017872fd},
+       {0x00000001, 0x7cb83400, 0xed000000, 0x159aa76f, 0xbe68b380, 0x2d042c0c, 0xd0e5c256, 0x6880ac53, 0x65d734a7,
+        0x19d75564, 0xb648bfdc, 0x019356a1},
+       {0x00000001, 0x80e07a00, 0x0e800000, 0x1653025a, 0x3c38fdc0, 0xa5fbc71e, 0x68ed6af2, 0x4151c323, 0x693c3ef1,
+        0x70092d92, 0xe706e863, 0x01a0c873},
+       {0x00000001, 0x82f49d00, 0x1f400000, 0x16af2fcf, 0xfb2122e0, 0xe27794a6, 0x34f13f40, 0x2dba4e8b, 0x6aeec416,
+        0x1b2219a9, 0xff65fca7, 0x01a7815c},
+       {0x00000001, 0x83feae80, 0xa7a00000, 0x16dd4689, 0x5a953570, 0x00b57b6b, 0x1af32968, 0xa3ee943f, 0xebc806a8,
+        0xf0ae8fb4, 0x8b9586c8, 0x01aaddd1},
+       {0x00000001, 0x8483b740, 0xebd00000, 0x16f451e6, 0x8a4f3eb8, 0x8fd46ecd, 0x0df41e7b, 0xdf08b719, 0xac34a7f1,
+        0xdb74caba, 0xd1ad4bd9, 0x01ac8c0b},
+       {0x00000001, 0x84c63ba0, 0x8de80000, 0x16ffd795, 0xa22c435c, 0x5763e87e, 0x07749905, 0x7c95c886, 0x8c6af896,
+        0x50d7e83d, 0xf4b92e62, 0x01ad6328},
+       {0x00000001, 0x84e77dd0, 0xdef40000, 0x17059a6c, 0x2e1ac5ae, 0x3b2ba557, 0x8434d64a, 0xcb5c513c, 0xfc8620e8,
+        0x8b8976fe, 0x863f1fa6, 0x01adceb7},
+       {0x00000001, 0x84f81ee8, 0x877a0000, 0x17087bd8, 0x741206d7, 0xad0f83c3, 0xc294f4ec, 0xf2bf9597, 0xb493b511,
+        0xa8e23e5f, 0xcf021848, 0x01ae047e},
+       {0x00000001, 0x85006f74, 0x5bbd0000, 0x9709ec8e, 0x970da76b, 0xe60172f9, 0x61c5043d, 0x867137c5, 0x109a7f26,
+        0xb78ea210, 0x73639499, 0x01ae1f62},
+       {0x00000001, 0x850497ba, 0x45de8000, 0xd70aa4e9, 0xa88b77b5, 0x827a6a94, 0x315d0be6, 0xd04a08dc, 0x3e9de430,
+        0x3ee4d3e8, 0x459452c2, 0x01ae2cd4},
+       {0x00000001, 0x8506abdd, 0xbaef4000, 0xf70b0116, 0x314a5fda, 0xd0b6e662, 0x99290fba, 0xf5367167, 0x559f96b5,
+        0x828fecd4, 0x2eacb1d6, 0x01ae338d},
+       {0x80000001, 0x8507b5ee, 0x7577a000, 0x870b2f2d, 0xf5a9d3ed, 0xf7d52448, 0x4d0f11a4, 0x87aca5ad, 0x61206ff8,
+        0xa465794a, 0xa338e160, 0x01ae36e9},
+       {0x40000001, 0x85083af7, 0xd2bbd000, 0xcf0b4638, 0x57d98df6, 0x0b64433c, 0x2702129a, 0xd0e7bfd0, 0x66e0dc99,
+        0xb5503f85, 0xdd7ef925, 0x01ae3897},
+       {0xa0000001, 0x85087d7b, 0x815de800, 0x730b51be, 0x08f16afb, 0x952bd2b6, 0x93fb9314, 0x75854ce1, 0xe9c112ea,
+        0x3dc5a2a2, 0xfaa20508, 0x01ae396e},
+       {0xd0000001, 0x85089ebd, 0x58aef400, 0xc50b5781, 0xe17d597d, 0xda0f9a72, 0x4a785351, 0xc7d4136a, 0xab312e12,
+        0x82005431, 0x89338af9, 0x01ae39da},
+       {0xe8000001, 0x8508af5e, 0xc4577a00, 0xee0b5a62, 0x4dc350be, 0x7c817e51, 0xa5b6b370, 0xf0fb76ae, 0x0be93ba6,
+        0x241dacf9, 0x507c4df2, 0x01ae3a10},
+       {0x74000001, 0x8508b7af, 0x7a2bbd00, 0x828b5bd3, 0x83e64c5f, 0xcdba7040, 0xd355e37f, 0x058f2850, 0xbc454271,
+        0x752c595c, 0x3420af6e, 0x01ae3a2b},
+       {0xba000001, 0x8508bbd7, 0xd515de80, 0xcccb5c8b, 0x1ef7ca2f, 0x7656e938, 0xea257b87, 0x0fd90121, 0x947345d6,
+        0x9db3af8e, 0xa5f2e02c, 0x01ae3a38},
+       {0xdd000001, 0x8508bdeb, 0x028aef40, 0xf1eb5ce8, 0xec808917, 0x4aa525b3, 0x758d478b, 0x94fded8a, 0x808a4788,
+        0xb1f75aa7, 0x5edbf88b, 0x01ae3a3f},
+       {0xee800001, 0x8508bef5, 0x194577a0, 0x047b5d16, 0xd344e88c, 0x34cc43f1, 0xbb412d8d, 0xd79063be, 0xf695c861,
+        0x3c193033, 0xbb5084bb, 0x01ae3a42},
+       {0xf7400001, 0x8508bf7a, 0x24a2bbd0, 0x0dc35d2d, 0xc6a71846, 0x29dfd310, 0xde1b208e, 0x78d99ed8, 0x319b88ce,
+        0x012a1afa, 0x698acad3, 0x01ae3a44},
+       {0x7ba00001, 0x8508bfbd, 0xaa515de8, 0x12675d38, 0x40583023, 0xa4699aa0, 0xef881a0e, 0xc97e3c65, 0x4f1e6904,
+        0xe3b2905d, 0x40a7edde, 0x01ae3a45},
+       {0xbdd00001, 0x8508bfde, 0x6d28aef4, 0x94b95d3e, 0xfd30bc11, 0xe1ae7e67, 0x783e96ce, 0xf1d08b2c, 0xdddfd91f,
+        0xd4f6cb0e, 0xac367f64, 0x01ae3a45},
+       {0x5ee80001, 0x8508bfef, 0x4e94577a, 0xd5e25d41, 0xdb9d0208, 0x0050f04b, 0xbc99d52f, 0x85f9b28f, 0xa540912d,
+        0xcd98e867, 0xe1fdc827, 0x01ae3a45},
+       {0xaf740001, 0x8508bff7, 0xbf4a2bbd, 0x7676dd42, 0xcad32504, 0x0fa2293d, 0x5ec7745f, 0x500e4641, 0x08f0ed34,
+        0x49e9f714, 0xfce16c89, 0x01ae3a45},
+       {0xd7ba0001, 0x0508bffb, 0x77a515df, 0x46c11d43, 0xc26e3682, 0x174ac5b6, 0x2fde43f7, 0xb518901a, 0x3ac91b37,
+        0x08127e6a, 0x0a533eba, 0x01ae3a46},
+       {0xebdd0001, 0xc508bffd, 0xd3d28aef, 0x2ee63d43, 0x3e3bbf41, 0x1b1f13f3, 0x9869abc3, 0x679db506, 0x53b53239,
+        0x6726c215, 0x110c27d2, 0x01ae3a46},
+       {0xf5ee8001, 0x2508bffe, 0x01e94578, 0xa2f8cd44, 0x7c2283a0, 0x1d093b11, 0xccaf5fa9, 0x40e0477c, 0xe02b3dba,
+        0x96b0e3ea, 0x14689c5e, 0x01ae3a46},
+       {0x7af74001, 0x5508bfff, 0x18f4a2bc, 0x5d021544, 0x9b15e5d0, 0x1dfe4ea0, 0xe6d2399c, 0xad8190b7, 0xa666437a,
+        0xae75f4d5, 0x1616d6a4, 0x01ae3a46},
+       {0xbd7ba001, 0x6d08bfff, 0x247a515e, 0x3a06b944, 0x2a8f96e8, 0x9e78d868, 0x73e3a695, 0xe3d23555, 0x0983c65a,
+        0xba587d4b, 0x16edf3c7, 0x01ae3a46},
+       {0xdebdd001, 0x7908bfff, 0x2a3d28af, 0x28890b44, 0xf24c6f74, 0x5eb61d4b, 0x3a6c5d12, 0xfefa87a4, 0xbb1287ca,
+        0x4049c185, 0x17598259, 0x01ae3a46},
+       {0xef5ee801, 0xff08bfff, 0x2d1e9457, 0x1fca3444, 0xd62adbba, 0xbed4bfbd, 0x9db0b850, 0x0c8eb0cb, 0x13d9e883,
+        0x034263a3, 0x178f49a2, 0x01ae3a46},
+       {0xf7af7401, 0x4208bfff, 0x2e8f4a2c, 0x1b6ac8c4, 0xc81a11dd, 0xeee410f6, 0x4f52e5ef, 0x1358c55f, 0xc03d98df,
+        0x64beb4b1, 0x17aa2d46, 0x01ae3a46},
+       {0xfbd7ba01, 0x6388bfff, 0x2f47a516, 0x993b1304, 0x4111acee, 0x86ebb993, 0x2823fcbf, 0x16bdcfa9, 0x166f710d,
+        0x957cdd39, 0x17b79f18, 0x01ae3a46},
+       {0xfdebdd01, 0x7448bfff, 0x2fa3d28b, 0x58233824, 0x7d8d7a77, 0x52ef8de1, 0x148c8827, 0x187054ce, 0xc1885d24,
+        0xaddbf17c, 0x17be5801, 0x01ae3a46},
+       {0xfef5ee81, 0xfca8bfff, 0x2fd1e945, 0xb7974ab4, 0x9bcb613b, 0x38f17808, 0x8ac0cddb, 0x99499760, 0x9714d32f,
+        0x3a0b7b9e, 0x17c1b476, 0x01ae3a46},
+       {0xff7af741, 0x40d8bfff, 0x2fe8f4a3, 0xe75153fc, 0x2aea549d, 0x2bf26d1c, 0xc5daf0b5, 0x59b638a9, 0x81db0e35,
+        0x802340af, 0x17c362b0, 0x01ae3a46},
+       {0xffbd7ba1, 0xe2f0bfff, 0x2ff47a51, 0xff2e58a0, 0xf279ce4e, 0x2572e7a5, 0x63680222, 0x39ec894e, 0xf73e2bb8,
+        0xa32f2337, 0x17c439cd, 0x01ae3a46},
+       {0xffdebdd1, 0x33fcbfff, 0x2ffa3d29, 0x8b1cdaf2, 0xd6418b27, 0xa23324ea, 0xb22e8ad8, 0xaa07b1a0, 0x31efba79,
+        0x34b5147c, 0x17c4a55c, 0x01ae3a46},
+       {0xffef5ee9, 0xdc82bfff, 0x2ffd1e94, 0xd1141c1b, 0x48256993, 0xe093438d, 0xd991cf33, 0x621545c9, 0x4f4881da,
+        0x7d780d1e, 0x17c4db23, 0x01ae3a46},
+       {0xfff7af75, 0xb0c5bfff, 0xaffe8f4a, 0xf40fbcaf, 0x811758c9, 0x7fc352de, 0x6d437161, 0xbe1c0fde, 0x5df4e58a,
+        0x21d9896f, 0x17c4f607, 0x01ae3a46},
+       {0xfffbd7bb, 0x9ae73fff, 0xefff47a5, 0x058d8cf9, 0x1d905065, 0x4f5b5a87, 0xb71c4278, 0xec1f74e8, 0xe54b1762,
+        0xf40a4797, 0x17c50378, 0x01ae3a46},
+       {0xfffdebde, 0x0ff7ffff, 0x0fffa3d3, 0x8e4c751f, 0x6bcccc32, 0xb7275e5b, 0xdc08ab03, 0x0321276d, 0x28f6304f,
+        0xdd22a6ac, 0x17c50a31, 0x01ae3a46}}};
+
     // i^2, the square of the imaginary unit for the extension field
     static constexpr uint32_t i_squared = 5;
     // true if i^2 is negative
     static constexpr bool i_squared_is_negative = true;
-    // G1 and G2 generators
-    static constexpr storage<limbs_count> g1_gen_x = {0xb21be9ef, 0xeab9b16e, 0xffcd394e, 0xd5481512,
-                                                      0xbd37cb5c, 0x188282c8, 0xaa9d41bb, 0x85951e2c,
-                                                      0xbf87ff54, 0xc8fc6225, 0xfe740a67, 0x008848de};
-    static constexpr storage<limbs_count> g1_gen_y = {0x559c8ea6, 0xfd82de55, 0x34a9591a, 0xc2fe3d36,
-                                                      0x4fb82305, 0x6d182ad4, 0xca3e52d9, 0xbd7fb348,
-                                                      0x30afeec4, 0x1f674f5d, 0xc5102eff, 0x01914a69};
-    static constexpr storage<limbs_count> g2_gen_x_re = {0x7c005196, 0x74e3e48f, 0xbb535402, 0x71889f52,
-                                                         0x57db6b9b, 0x7ea501f5, 0x203e5031, 0xc565f071,
-                                                         0xa3841d01, 0xc89630a2, 0x71c785fe, 0x018480be};
-    static constexpr storage<limbs_count> g2_gen_x_im = {0x6ea16afe, 0xb26bfefa, 0xbff76fe6, 0x5cf89984,
-                                                         0x0799c9de, 0xe7223ece, 0x6651cecb, 0x532777ee,
-                                                         0xb1b140d5, 0x70dc5a51, 0xe7004031, 0x00ea6040};
-    static constexpr storage<limbs_count> g2_gen_y_re = {0x09fd4ddf, 0xf0940944, 0x6d8c7c2e, 0xf2cf8888,
-                                                         0xf832d204, 0xe458c282, 0x74b49a58, 0xde03ed72,
-                                                         0xcbb2efb4, 0xd960736b, 0x5d446f7b, 0x00690d66};
-    static constexpr storage<limbs_count> g2_gen_y_im = {0x85eb8f93, 0xd9a1cdd1, 0x5e52270b, 0x4279b83f,
-                                                         0xcee304c2, 0x2463b01a, 0x3d591bf1, 0x61ef11ac,
-                                                         0x151a70aa, 0x9e549da3, 0xd2835518, 0x00f8169f};
   };
 
+  // G1 and G2 generators
+  static constexpr storage<fq_config::limbs_count> g1_gen_x = {0xb21be9ef, 0xeab9b16e, 0xffcd394e, 0xd5481512,
+                                                               0xbd37cb5c, 0x188282c8, 0xaa9d41bb, 0x85951e2c,
+                                                               0xbf87ff54, 0xc8fc6225, 0xfe740a67, 0x008848de};
+  static constexpr storage<fq_config::limbs_count> g1_gen_y = {0x559c8ea6, 0xfd82de55, 0x34a9591a, 0xc2fe3d36,
+                                                               0x4fb82305, 0x6d182ad4, 0xca3e52d9, 0xbd7fb348,
+                                                               0x30afeec4, 0x1f674f5d, 0xc5102eff, 0x01914a69};
+  static constexpr storage<fq_config::limbs_count> g2_gen_x_re = {0x7c005196, 0x74e3e48f, 0xbb535402, 0x71889f52,
+                                                                  0x57db6b9b, 0x7ea501f5, 0x203e5031, 0xc565f071,
+                                                                  0xa3841d01, 0xc89630a2, 0x71c785fe, 0x018480be};
+  static constexpr storage<fq_config::limbs_count> g2_gen_x_im = {0x6ea16afe, 0xb26bfefa, 0xbff76fe6, 0x5cf89984,
+                                                                  0x0799c9de, 0xe7223ece, 0x6651cecb, 0x532777ee,
+                                                                  0xb1b140d5, 0x70dc5a51, 0xe7004031, 0x00ea6040};
+  static constexpr storage<fq_config::limbs_count> g2_gen_y_re = {0x09fd4ddf, 0xf0940944, 0x6d8c7c2e, 0xf2cf8888,
+                                                                  0xf832d204, 0xe458c282, 0x74b49a58, 0xde03ed72,
+                                                                  0xcbb2efb4, 0xd960736b, 0x5d446f7b, 0x00690d66};
+  static constexpr storage<fq_config::limbs_count> g2_gen_y_im = {0x85eb8f93, 0xd9a1cdd1, 0x5e52270b, 0x4279b83f,
+                                                                  0xcee304c2, 0x2463b01a, 0x3d591bf1, 0x61ef11ac,
+                                                                  0x151a70aa, 0x9e549da3, 0xd2835518, 0x00f8169f};
+
   static constexpr storage<fq_config::limbs_count> weierstrass_b = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
                                                                     0x00000000, 0x00000000, 0x00000000, 0x00000000,
                                                                     0x00000000, 0x00000000, 0x00000000, 0x00000000};
diff --git a/icicle/curves/bls12_381_params.cuh b/icicle/curves/bls12_381_params.cuh
index 1b60c0c94..102590975 100644
--- a/icicle/curves/bls12_381_params.cuh
+++ b/icicle/curves/bls12_381_params.cuh
@@ -6,38 +6,34 @@
 
 namespace bls12_381 {
   struct fp_config {
-    // field structure size = 8 * 32 bit
     static constexpr unsigned limbs_count = 8;
     static constexpr unsigned omegas_count = 32;
-    // modulus = 52435875175126190479447740508185965837690552500527637822603658699938581184513
+    static constexpr unsigned modulus_bit_count = 255;
+    static constexpr unsigned num_of_reductions = 2;
+
     static constexpr storage<limbs_count> modulus = {0x00000001, 0xffffffff, 0xfffe5bfe, 0x53bda402,
                                                      0x09a1d805, 0x3339d808, 0x299d7d48, 0x73eda753};
-    // modulus*2 = 104871750350252380958895481016371931675381105001055275645207317399877162369026
     static constexpr storage<limbs_count> modulus_2 = {0x00000002, 0xfffffffe, 0xfffcb7fd, 0xa77b4805,
                                                        0x1343b00a, 0x6673b010, 0x533afa90, 0xe7db4ea6};
-    static constexpr storage<limbs_count> modulus_4 = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
-                                                       0x00000000, 0x00000000, 0x00000000, 0x00000000};
-
+    static constexpr storage<limbs_count> modulus_4 = {0x00000004, 0xfffffffc, 0xfff96ffb, 0x4ef6900b,
+                                                       0x26876015, 0xcce76020, 0xa675f520, 0xcfb69d4c};
+    static constexpr storage<limbs_count> neg_modulus = {0xffffffff, 0x00000000, 0x0001a401, 0xac425bfd,
+                                                         0xf65e27fa, 0xccc627f7, 0xd66282b7, 0x8c1258ac};
     static constexpr storage<2 * limbs_count> modulus_wide = {
       0x00000001, 0xffffffff, 0xfffe5bfe, 0x53bda402, 0x09a1d805, 0x3339d808, 0x299d7d48, 0x73eda753,
       0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    // modulus^2
     static constexpr storage<2 * limbs_count> modulus_squared = {
       0x00000001, 0xfffffffe, 0xfffcb7fe, 0xa77e9007, 0x1cdbb005, 0x698ae002, 0x5433f7b8, 0x48aa415e,
       0x4aa9c661, 0xc2611f6f, 0x59934a1d, 0x0e9593f9, 0xef2cc20f, 0x520c13db, 0xf4bc2778, 0x347f60f3};
-    // 2*modulus^2
     static constexpr storage<2 * limbs_count> modulus_squared_2 = {
       0x00000002, 0xfffffffc, 0xfff96ffd, 0x4efd200f, 0x39b7600b, 0xd315c004, 0xa867ef70, 0x915482bc,
       0x95538cc2, 0x84c23ede, 0xb326943b, 0x1d2b27f2, 0xde59841e, 0xa41827b7, 0xe9784ef0, 0x68fec1e7};
-    // note: doesnt actually fit into 384 bits, and shouldnt be used! is added for compilation
     static constexpr storage<2 * limbs_count> modulus_squared_4 = {
-      0x00000002, 0xfffffffc, 0xfff96ffd, 0x4efd200f, 0x39b7600b, 0xd315c004, 0xa867ef70, 0x915482bc,
-      0x95538cc2, 0x84c23ede, 0xb326943b, 0x1d2b27f2, 0xde59841e, 0xa41827b7, 0xe9784ef0, 0x68fec1e7};
-    static constexpr unsigned modulus_bit_count = 255;
-    // m = floor(2^(2*modulus_bit_count) / modulus)
-    static constexpr storage<limbs_count> m = {0x830358e4, 0x509cde80, 0x2f92eb5c, 0xd9410fad,
-                                               0xc1f823b4, 0xe2d772d,  0x7fb78ddf, 0x8d54253b};
+      0x00000004, 0xfffffff8, 0xfff2dffb, 0x9dfa401f, 0x736ec016, 0xa62b8008, 0x50cfdee1, 0x22a90579,
+      0x2aa71985, 0x09847dbd, 0x664d2877, 0x3a564fe5, 0xbcb3083c, 0x48304f6f, 0xd2f09de1, 0xd1fd83cf};
 
+    static constexpr storage<limbs_count> m = {0x830358e4, 0x509cde80, 0x2f92eb5c, 0xd9410fad,
+                                               0xc1f823b4, 0x0e2d772d, 0x7fb78ddf, 0x8d54253b};
     static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
                                                  0x00000000, 0x00000000, 0x00000000, 0x00000000};
     static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
@@ -47,322 +43,141 @@ namespace bls12_381 {
     static constexpr storage<limbs_count> montgomery_r_inv = {0xfe75c040, 0x13f75b69, 0x09dc705f, 0xab6fca8f,
                                                               0x4f77266a, 0x7204078a, 0x30009d57, 0x1bbe8693};
 
-    // static constexpr storage<limbs_count> omega[32]= { {0x00000000, 0xffffffff, 0xfffe5bfe, 0x53bda402, 0x09a1d805,
-    // 0x3339d808, 0x299d7d48, 0x73eda753}, {0x00000000, 0x00010000, 0x76030000, 0xec030002, 0x760304d0, 0x8d51ccce,
-    // 0x00000000, 0x00000000}, {0x688bc087, 0x8dd702cb, 0x78eaa4fe, 0xa0328240, 0x98ca5b22, 0xa733b23a, 0x25a31660,
-    // 0x3f96405d}, {0x0411fe73, 0x95df4b36, 0xebc1e1bb, 0x1ef4e672, 0x60afca4a, 0x6e92a9c4, 0x753e4fcc, 0x4f2c596e},
-    // {0xba60eaa6, 0x9733f3a6, 0x77487ae7, 0xbd7fdf9c, 0xc8b6cc00, 0xd84f8612, 0x6162ffab, 0x476fa2fb}, {0xac5db47f,
-    // 0xd2fc5e69, 0x15d0b8e4, 0xa12a70a6, 0xbc8de5d9, 0x293b1d67, 0x57f86f5e, 0x0e4840ac}, {0xab28e208, 0xb750da4c,
-    // 0x3be95635, 0x501dff64, 0xf0b4b276, 0x8cbe2437, 0xa94a946e, 0x07d0c802}, {0x2fe322b8, 0x2cabadec, 0x15412560,
-    // 0x752c84f3, 0x1a3b0aef, 0x32a732ae, 0xa33dcbf2, 0x2e95da59}, {0xfe0c65f4, 0x33811ea1, 0x687f28a2, 0x15c1ad4c,
-    // 0x42dee7f4, 0xecfbede3, 0x9a5d88b1, 0x1bb46667}, {0x2d010ff9, 0xd58a5af4, 0x570bf109, 0x79efd6b0, 0x6350721d,
-    // 0x3ed6d55a, 0x58f43cef, 0x2f27b098}, {0x8c130477, 0x74a1f671, 0xb61e0abe, 0xa534af14, 0x620890d7, 0xeb674a1a,
-    // 0xca252472, 0x43527a8b}, {0x7ea8ee05, 0x450d9f97, 0x37d56fc0, 0x565af171, 0x93f9e9ac, 0xe155cb48, 0xc8e9101b,
-    // 0x110cebd0}, {0x59a0be92, 0x23c91599, 0x7a027759, 0x87d188ce, 0xcab3c3cc, 0x70491431, 0xb3f7f8da, 0x0ac00eb8},
-    // {0x69583404, 0x13e96ade, 0x5306243d, 0x82c05727, 0x29ca9f2a, 0x77e48bf5, 0x1fe19595, 0x50646ac8}, {0xa97eccd4,
-    // 0xe6a354dd, 0x88fbbc57, 0x39929d2e, 0xd6e7b1c8, 0xa22ba63d, 0xf5f07f43, 0x42c22911}, {0xcfc35f7a, 0x137b458a,
-    // 0x29c01b06, 0x0caba63a, 0x7a02402c, 0x0409ee98, 0x56aa725b, 0x6709c6cd}, {0x8831e03e, 0x10251f7d, 0x7ff858ec,
-    // 0x77d85a93, 0x4fb9ac5c, 0xebe905bd, 0xf8727901, 0x05deb333}, {0xb9009408, 0xbf87b689, 0xdd3ccc96, 0x4f730e7d,
-    // 0x4610300c, 0xfd7f05ba, 0x0b8ac903, 0x5ef5e8db}, {0x17cd0c14, 0x64996884, 0x68812f7f, 0xa6728673, 0x22cc3253,
-    // 0x2e1d9a19, 0xaa0a1d80, 0x3a689e83}, {0x41144dea, 0x20b53cbe, 0xc2f0fcbd, 0x870c46fa, 0x537d6971, 0x556c35f6,
-    // 0x5f686d91, 0x3436287f}, {0x436ba2e7, 0x007e082a, 0x9116e877, 0x67c6630f, 0xfb4460f7, 0x36f8f165, 0x7e7046e0,
-    // 0x6eee34d5}, {0xa53a56d1, 0xc5b670ee, 0x53037d7b, 0x127d1f42, 0xa722c2e2, 0x57d4257e, 0x33cbd838, 0x03ae26a3},
-    // {0x76504cf8, 0x1e914848, 0xb63edd02, 0x55bbbf1e, 0x4e55aa02, 0xbcdafec8, 0x2dc0beb0, 0x5145c4cd}, {0x1ab70e2c,
-    // 0x5b90153a, 0x75fb0ab8, 0x8deffa31, 0x46900c95, 0xc553ae23, 0x6bd3118c, 0x1d31dcdc}, {0x59a2e8eb, 0x801c894c,
-    // 0xe12fc974, 0xbc535c5c, 0x47d39803, 0x95508d27, 0xac5d094f, 0x16d9d3cd}, {0xcca1d8be, 0x810fa372, 0x82e0bfa7,
-    // 0xc67b8c28, 0xe2d35bc2, 0xdbb4edf0, 0x5087c995, 0x712d1580}, {0xfd88f133, 0xeb162203, 0xf010ea74, 0xac96c38f,
-    // 0xe64cfc70, 0x4307987f, 0x37b7a114, 0x350fe98d}, {0x42f2a254, 0xaba2f518, 0xa71efc0c, 0x4d7f3c3a, 0xd274a80a,
-    // 0x97ae418d, 0x5e3e7682, 0x2967385d}, {0x575a0b79, 0x75c55c7b, 0x74a7ded1, 0x3ba4a157, 0xa04fccf3, 0xc3974d73,
-    // 0x4a939684, 0x705aba4f}, {0x14ebb608, 0x8409a9ea, 0x66bac611, 0xfad0084e, 0x811c1dfb, 0x04287254, 0x23b30c29,
-    // 0x086d072b}, {0x67e4756a, 0xb427c9b3, 0x02ebc38d, 0xc7537fb9, 0xcd6a205f, 0x51de21be, 0x7923597d, 0x6064ab72},
-    // {0x0b912f1f, 0x1b788f50, 0x70b3e094, 0xc4024ff2, 0xd168d6c0, 0x0fd56dc8, 0x5b416b6f, 0x0212d79e}}; Quick fix for
-    // linking issue
-    static constexpr storage<limbs_count> omega1 = {0x00000000, 0xffffffff, 0xfffe5bfe, 0x53bda402,
-                                                    0x09a1d805, 0x3339d808, 0x299d7d48, 0x73eda753};
-    static constexpr storage<limbs_count> omega2 = {0x00000000, 0x00010000, 0x76030000, 0xec030002,
-                                                    0x760304d0, 0x8d51ccce, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> omega3 = {0x688bc087, 0x8dd702cb, 0x78eaa4fe, 0xa0328240,
-                                                    0x98ca5b22, 0xa733b23a, 0x25a31660, 0x3f96405d};
-    static constexpr storage<limbs_count> omega4 = {0x0411fe73, 0x95df4b36, 0xebc1e1bb, 0x1ef4e672,
-                                                    0x60afca4a, 0x6e92a9c4, 0x753e4fcc, 0x4f2c596e};
-    static constexpr storage<limbs_count> omega5 = {0xba60eaa6, 0x9733f3a6, 0x77487ae7, 0xbd7fdf9c,
-                                                    0xc8b6cc00, 0xd84f8612, 0x6162ffab, 0x476fa2fb};
-    static constexpr storage<limbs_count> omega6 = {0xac5db47f, 0xd2fc5e69, 0x15d0b8e4, 0xa12a70a6,
-                                                    0xbc8de5d9, 0x293b1d67, 0x57f86f5e, 0x0e4840ac};
-    static constexpr storage<limbs_count> omega7 = {0xab28e208, 0xb750da4c, 0x3be95635, 0x501dff64,
-                                                    0xf0b4b276, 0x8cbe2437, 0xa94a946e, 0x07d0c802};
-    static constexpr storage<limbs_count> omega8 = {0x2fe322b8, 0x2cabadec, 0x15412560, 0x752c84f3,
-                                                    0x1a3b0aef, 0x32a732ae, 0xa33dcbf2, 0x2e95da59};
-    static constexpr storage<limbs_count> omega9 = {0xfe0c65f4, 0x33811ea1, 0x687f28a2, 0x15c1ad4c,
-                                                    0x42dee7f4, 0xecfbede3, 0x9a5d88b1, 0x1bb46667};
-    static constexpr storage<limbs_count> omega10 = {0x2d010ff9, 0xd58a5af4, 0x570bf109, 0x79efd6b0,
-                                                     0x6350721d, 0x3ed6d55a, 0x58f43cef, 0x2f27b098};
-    static constexpr storage<limbs_count> omega11 = {0x8c130477, 0x74a1f671, 0xb61e0abe, 0xa534af14,
-                                                     0x620890d7, 0xeb674a1a, 0xca252472, 0x43527a8b};
-    static constexpr storage<limbs_count> omega12 = {0x7ea8ee05, 0x450d9f97, 0x37d56fc0, 0x565af171,
-                                                     0x93f9e9ac, 0xe155cb48, 0xc8e9101b, 0x110cebd0};
-    static constexpr storage<limbs_count> omega13 = {0x59a0be92, 0x23c91599, 0x7a027759, 0x87d188ce,
-                                                     0xcab3c3cc, 0x70491431, 0xb3f7f8da, 0x0ac00eb8};
-    static constexpr storage<limbs_count> omega14 = {0x69583404, 0x13e96ade, 0x5306243d, 0x82c05727,
-                                                     0x29ca9f2a, 0x77e48bf5, 0x1fe19595, 0x50646ac8};
-    static constexpr storage<limbs_count> omega15 = {0xa97eccd4, 0xe6a354dd, 0x88fbbc57, 0x39929d2e,
-                                                     0xd6e7b1c8, 0xa22ba63d, 0xf5f07f43, 0x42c22911};
-    static constexpr storage<limbs_count> omega16 = {0xcfc35f7a, 0x137b458a, 0x29c01b06, 0x0caba63a,
-                                                     0x7a02402c, 0x0409ee98, 0x56aa725b, 0x6709c6cd};
-    static constexpr storage<limbs_count> omega17 = {0x8831e03e, 0x10251f7d, 0x7ff858ec, 0x77d85a93,
-                                                     0x4fb9ac5c, 0xebe905bd, 0xf8727901, 0x05deb333};
-    static constexpr storage<limbs_count> omega18 = {0xb9009408, 0xbf87b689, 0xdd3ccc96, 0x4f730e7d,
-                                                     0x4610300c, 0xfd7f05ba, 0x0b8ac903, 0x5ef5e8db};
-    static constexpr storage<limbs_count> omega19 = {0x17cd0c14, 0x64996884, 0x68812f7f, 0xa6728673,
-                                                     0x22cc3253, 0x2e1d9a19, 0xaa0a1d80, 0x3a689e83};
-    static constexpr storage<limbs_count> omega20 = {0x41144dea, 0x20b53cbe, 0xc2f0fcbd, 0x870c46fa,
-                                                     0x537d6971, 0x556c35f6, 0x5f686d91, 0x3436287f};
-    static constexpr storage<limbs_count> omega21 = {0x436ba2e7, 0x007e082a, 0x9116e877, 0x67c6630f,
-                                                     0xfb4460f7, 0x36f8f165, 0x7e7046e0, 0x6eee34d5};
-    static constexpr storage<limbs_count> omega22 = {0xa53a56d1, 0xc5b670ee, 0x53037d7b, 0x127d1f42,
-                                                     0xa722c2e2, 0x57d4257e, 0x33cbd838, 0x03ae26a3};
-    static constexpr storage<limbs_count> omega23 = {0x76504cf8, 0x1e914848, 0xb63edd02, 0x55bbbf1e,
-                                                     0x4e55aa02, 0xbcdafec8, 0x2dc0beb0, 0x5145c4cd};
-    static constexpr storage<limbs_count> omega24 = {0x1ab70e2c, 0x5b90153a, 0x75fb0ab8, 0x8deffa31,
-                                                     0x46900c95, 0xc553ae23, 0x6bd3118c, 0x1d31dcdc};
-    static constexpr storage<limbs_count> omega25 = {0x59a2e8eb, 0x801c894c, 0xe12fc974, 0xbc535c5c,
-                                                     0x47d39803, 0x95508d27, 0xac5d094f, 0x16d9d3cd};
-    static constexpr storage<limbs_count> omega26 = {0xcca1d8be, 0x810fa372, 0x82e0bfa7, 0xc67b8c28,
-                                                     0xe2d35bc2, 0xdbb4edf0, 0x5087c995, 0x712d1580};
-    static constexpr storage<limbs_count> omega27 = {0xfd88f133, 0xeb162203, 0xf010ea74, 0xac96c38f,
-                                                     0xe64cfc70, 0x4307987f, 0x37b7a114, 0x350fe98d};
-    static constexpr storage<limbs_count> omega28 = {0x42f2a254, 0xaba2f518, 0xa71efc0c, 0x4d7f3c3a,
-                                                     0xd274a80a, 0x97ae418d, 0x5e3e7682, 0x2967385d};
-    static constexpr storage<limbs_count> omega29 = {0x575a0b79, 0x75c55c7b, 0x74a7ded1, 0x3ba4a157,
-                                                     0xa04fccf3, 0xc3974d73, 0x4a939684, 0x705aba4f};
-    static constexpr storage<limbs_count> omega30 = {0x14ebb608, 0x8409a9ea, 0x66bac611, 0xfad0084e,
-                                                     0x811c1dfb, 0x04287254, 0x23b30c29, 0x086d072b};
-    static constexpr storage<limbs_count> omega31 = {0x67e4756a, 0xb427c9b3, 0x02ebc38d, 0xc7537fb9,
-                                                     0xcd6a205f, 0x51de21be, 0x7923597d, 0x6064ab72};
-    static constexpr storage<limbs_count> omega32 = {0x0b912f1f, 0x1b788f50, 0x70b3e094, 0xc4024ff2,
-                                                     0xd168d6c0, 0x0fd56dc8, 0x5b416b6f, 0x0212d79e};
-
     static constexpr storage_array<omegas_count, limbs_count> omega = {
-      omega1,  omega2,  omega3,  omega4,  omega5,  omega6,  omega7,  omega8,  omega9,  omega10, omega11,
-      omega12, omega13, omega14, omega15, omega16, omega17, omega18, omega19, omega20, omega21, omega22,
-      omega23, omega24, omega25, omega26, omega27, omega28, omega29, omega30, omega31, omega32,
-    };
-
-    // static constexpr storage<limbs_count> omega_inv[32]={ {0x00000000, 0xffffffff, 0xfffe5bfe, 0x53bda402,
-    // 0x09a1d805, 0x3339d808, 0x299d7d48, 0x73eda753}, {0x00000001, 0xfffeffff, 0x89fb5bfe, 0x67baa400, 0x939ed334,
-    // 0xa5e80b39, 0x299d7d47, 0x73eda753}, {0xae99502e, 0x6037fe81, 0x94b04fd8, 0x8e749036, 0xca86bf65, 0xbabc5aff,
-    // 0x5ce11044, 0x1333b22e}, {0x7dc08d74, 0x7f847ee4, 0x04eeaf5a, 0xbd433896, 0x1832fc60, 0xd66c91d6, 0x607e449b,
-    // 0x551115b4}, {0x4e7773cb, 0xee5bcecc, 0xf6dab086, 0x45593d6f, 0x4016e2bd, 0xa3a95d2d, 0xaf96816f, 0x047cb16c},
-    // {0x982b68c5, 0xb891fa3f, 0x1d426b52, 0xa41e8501, 0x882952d6, 0x566009b5, 0x7b3c79d6, 0x199cdaee}, {0xcf28601b,
-    // 0x571ba2fc, 0xac74db12, 0x166fb582, 0x3501370b, 0x51420be4, 0x52f970ba, 0x1996fa8d}, {0x6a2f777a, 0xe9561c17,
-    // 0x2393991b, 0xc03cae03, 0x5a5bfd4f, 0x91b00023, 0x272e58ee, 0x6d64ed25}, {0xf02a116e, 0xfb350dbe, 0xb4543a3e,
-    // 0x1c510ebf, 0x37ad4eca, 0xf675522e, 0x80f82b2d, 0x1907a56e}, {0x4eb71aa6, 0xb0ad8003, 0xaa67e0be, 0x50a32c41,
-    // 0x19141f44, 0x105f0672, 0xa3dad316, 0x2bcd9508}, {0x0f6fb2ac, 0x3dc9e560, 0x9aa58ff5, 0x3cc5bb32, 0x36f376e1,
-    // 0xdeae67bc, 0x65ba213e, 0x394fda0d}, {0x60b82267, 0x09f239f7, 0x8b24f123, 0x14180e0e, 0x45625d95, 0xad5a5340,
-    // 0x6d174692, 0x58c3ba63}, {0x348b416f, 0x0acf21c2, 0xbc086439, 0x798b6bf6, 0xb1ca111d, 0x222d411f, 0x30ba1e0f,
-    // 0x044107b7}, {0x014abe84, 0xa3b861b8, 0x427ed008, 0x37c017e4, 0xae0ff4f5, 0xae51f613, 0xcb1218d3, 0x1a2d00e1},
-    // {0x4de7eb2b, 0x48aaa3bf, 0x6772057d, 0x4a58d54d, 0x7093b551, 0xce25f16c, 0xd206337c, 0x242150ac}, {0x9ed57ae5,
-    // 0xdf3ec9ae, 0x7166577f, 0xea7df73a, 0x022fbbe4, 0x6ca8d281, 0x151e3f6b, 0x5850c003}, {0x645e1cfa, 0x903a0a0c,
-    // 0x34788c37, 0xfbac54cb, 0x8cf73d78, 0xdc127d11, 0x975d3c82, 0x6d0b5c7c}, {0x14b1ba04, 0xb49d6b05, 0xf00b84f2,
-    // 0x56e466b4, 0x0b904f22, 0x30c390cf, 0x3ee254cc, 0x3e11cfb7}, {0xbe8201ab, 0x84dfa547, 0x530715d2, 0x3887ce8b,
-    // 0x3eed4ed7, 0xa4c719c6, 0x8f8007b4, 0x18c44950}, {0x7d813cd1, 0xdaf0346d, 0xf755beb1, 0xeccf6f9a, 0xe08143e3,
-    // 0x167fce38, 0x6f5d6dfa, 0x545ad9b2}, {0x577605de, 0x973f5466, 0x974f953c, 0x0ce8986e, 0x074382f9, 0x8941cf4b,
-    // 0x6fa2672c, 0x156cd7f6}, {0x33b66141, 0x24315404, 0x1992f584, 0x5d1375ab, 0x8b20ca1a, 0xf193ffa6, 0x2701a503,
-    // 0x47880cd5}, {0xe9f7b9af, 0xf7b6847d, 0x62c83ce2, 0x9a339673, 0x6e5e6f79, 0xfabf4537, 0x35af33a3, 0x0975acd9},
-    // {0x0eddd248, 0x4fb4204a, 0xc9e509b3, 0x8c98706a, 0x2bb27eb1, 0xd0be8987, 0xc831438b, 0x6ec5f960}, {0x20238f62,
-    // 0xa13c95b7, 0x83b476b9, 0x130aa097, 0x14860881, 0x758a04e0, 0x97066493, 0x58e2f8d6}, {0xe8bff41e, 0x65b09c73,
-    // 0x37f1c6a3, 0x8b3280e8, 0x2846fb21, 0xe17b82ce, 0xb1ae27df, 0x476534bf}, {0xd5fdb757, 0x8480c0e7, 0x365bf9fd,
-    // 0x3644eea0, 0xb776be86, 0x4ca116ca, 0x8b58390c, 0x17b6395f}, {0x252eb0db, 0x2c811e9a, 0x7479e161, 0x1b7d960d,
-    // 0xb0a89a26, 0xb3afc7c1, 0x32b5e793, 0x6a2f9533}, {0x08b8a7ad, 0xe877b2c4, 0x341652b4, 0x68b0e8f0, 0xe8b6a2d9,
-    // 0x2d44da3b, 0xfd09be59, 0x092778ff}, {0x7988f244, 0x84a1aa6f, 0x24faf63f, 0xa164b3d9, 0xc1bbb915, 0x7aae9724,
-    // 0xf386c0d2, 0x24e5d287}, {0x41a1b30c, 0xa70a7efd, 0x39f0e511, 0xc49c55a5, 0x033bb323, 0xab307a8f, 0x17acbd7f,
-    // 0x0158abd6}, {0x0f642025, 0x2c228b30, 0x01bd882b, 0xb0878e8d, 0xd7377fea, 0xd862b255, 0xf0490536, 0x18ac3666}};
-    // Quick fix for linking issue
-    static constexpr storage<limbs_count> omega_inv1 = {0x00000000, 0xffffffff, 0xfffe5bfe, 0x53bda402,
-                                                        0x09a1d805, 0x3339d808, 0x299d7d48, 0x73eda753};
-    static constexpr storage<limbs_count> omega_inv2 = {0x00000001, 0xfffeffff, 0x89fb5bfe, 0x67baa400,
-                                                        0x939ed334, 0xa5e80b39, 0x299d7d47, 0x73eda753};
-    static constexpr storage<limbs_count> omega_inv3 = {0xae99502e, 0x6037fe81, 0x94b04fd8, 0x8e749036,
-                                                        0xca86bf65, 0xbabc5aff, 0x5ce11044, 0x1333b22e};
-    static constexpr storage<limbs_count> omega_inv4 = {0x7dc08d74, 0x7f847ee4, 0x04eeaf5a, 0xbd433896,
-                                                        0x1832fc60, 0xd66c91d6, 0x607e449b, 0x551115b4};
-    static constexpr storage<limbs_count> omega_inv5 = {0x4e7773cb, 0xee5bcecc, 0xf6dab086, 0x45593d6f,
-                                                        0x4016e2bd, 0xa3a95d2d, 0xaf96816f, 0x047cb16c};
-    static constexpr storage<limbs_count> omega_inv6 = {0x982b68c5, 0xb891fa3f, 0x1d426b52, 0xa41e8501,
-                                                        0x882952d6, 0x566009b5, 0x7b3c79d6, 0x199cdaee};
-    static constexpr storage<limbs_count> omega_inv7 = {0xcf28601b, 0x571ba2fc, 0xac74db12, 0x166fb582,
-                                                        0x3501370b, 0x51420be4, 0x52f970ba, 0x1996fa8d};
-    static constexpr storage<limbs_count> omega_inv8 = {0x6a2f777a, 0xe9561c17, 0x2393991b, 0xc03cae03,
-                                                        0x5a5bfd4f, 0x91b00023, 0x272e58ee, 0x6d64ed25};
-    static constexpr storage<limbs_count> omega_inv9 = {0xf02a116e, 0xfb350dbe, 0xb4543a3e, 0x1c510ebf,
-                                                        0x37ad4eca, 0xf675522e, 0x80f82b2d, 0x1907a56e};
-    static constexpr storage<limbs_count> omega_inv10 = {0x4eb71aa6, 0xb0ad8003, 0xaa67e0be, 0x50a32c41,
-                                                         0x19141f44, 0x105f0672, 0xa3dad316, 0x2bcd9508};
-    static constexpr storage<limbs_count> omega_inv11 = {0x0f6fb2ac, 0x3dc9e560, 0x9aa58ff5, 0x3cc5bb32,
-                                                         0x36f376e1, 0xdeae67bc, 0x65ba213e, 0x394fda0d};
-    static constexpr storage<limbs_count> omega_inv12 = {0x60b82267, 0x09f239f7, 0x8b24f123, 0x14180e0e,
-                                                         0x45625d95, 0xad5a5340, 0x6d174692, 0x58c3ba63};
-    static constexpr storage<limbs_count> omega_inv13 = {0x348b416f, 0x0acf21c2, 0xbc086439, 0x798b6bf6,
-                                                         0xb1ca111d, 0x222d411f, 0x30ba1e0f, 0x044107b7};
-    static constexpr storage<limbs_count> omega_inv14 = {0x014abe84, 0xa3b861b8, 0x427ed008, 0x37c017e4,
-                                                         0xae0ff4f5, 0xae51f613, 0xcb1218d3, 0x1a2d00e1};
-    static constexpr storage<limbs_count> omega_inv15 = {0x4de7eb2b, 0x48aaa3bf, 0x6772057d, 0x4a58d54d,
-                                                         0x7093b551, 0xce25f16c, 0xd206337c, 0x242150ac};
-    static constexpr storage<limbs_count> omega_inv16 = {0x9ed57ae5, 0xdf3ec9ae, 0x7166577f, 0xea7df73a,
-                                                         0x022fbbe4, 0x6ca8d281, 0x151e3f6b, 0x5850c003};
-    static constexpr storage<limbs_count> omega_inv17 = {0x645e1cfa, 0x903a0a0c, 0x34788c37, 0xfbac54cb,
-                                                         0x8cf73d78, 0xdc127d11, 0x975d3c82, 0x6d0b5c7c};
-    static constexpr storage<limbs_count> omega_inv18 = {0x14b1ba04, 0xb49d6b05, 0xf00b84f2, 0x56e466b4,
-                                                         0x0b904f22, 0x30c390cf, 0x3ee254cc, 0x3e11cfb7};
-    static constexpr storage<limbs_count> omega_inv19 = {0xbe8201ab, 0x84dfa547, 0x530715d2, 0x3887ce8b,
-                                                         0x3eed4ed7, 0xa4c719c6, 0x8f8007b4, 0x18c44950};
-    static constexpr storage<limbs_count> omega_inv20 = {0x7d813cd1, 0xdaf0346d, 0xf755beb1, 0xeccf6f9a,
-                                                         0xe08143e3, 0x167fce38, 0x6f5d6dfa, 0x545ad9b2};
-    static constexpr storage<limbs_count> omega_inv21 = {0x577605de, 0x973f5466, 0x974f953c, 0x0ce8986e,
-                                                         0x074382f9, 0x8941cf4b, 0x6fa2672c, 0x156cd7f6};
-    static constexpr storage<limbs_count> omega_inv22 = {0x33b66141, 0x24315404, 0x1992f584, 0x5d1375ab,
-                                                         0x8b20ca1a, 0xf193ffa6, 0x2701a503, 0x47880cd5};
-    static constexpr storage<limbs_count> omega_inv23 = {0xe9f7b9af, 0xf7b6847d, 0x62c83ce2, 0x9a339673,
-                                                         0x6e5e6f79, 0xfabf4537, 0x35af33a3, 0x0975acd9};
-    static constexpr storage<limbs_count> omega_inv24 = {0x0eddd248, 0x4fb4204a, 0xc9e509b3, 0x8c98706a,
-                                                         0x2bb27eb1, 0xd0be8987, 0xc831438b, 0x6ec5f960};
-    static constexpr storage<limbs_count> omega_inv25 = {0x20238f62, 0xa13c95b7, 0x83b476b9, 0x130aa097,
-                                                         0x14860881, 0x758a04e0, 0x97066493, 0x58e2f8d6};
-    static constexpr storage<limbs_count> omega_inv26 = {0xe8bff41e, 0x65b09c73, 0x37f1c6a3, 0x8b3280e8,
-                                                         0x2846fb21, 0xe17b82ce, 0xb1ae27df, 0x476534bf};
-    static constexpr storage<limbs_count> omega_inv27 = {0xd5fdb757, 0x8480c0e7, 0x365bf9fd, 0x3644eea0,
-                                                         0xb776be86, 0x4ca116ca, 0x8b58390c, 0x17b6395f};
-    static constexpr storage<limbs_count> omega_inv28 = {0x252eb0db, 0x2c811e9a, 0x7479e161, 0x1b7d960d,
-                                                         0xb0a89a26, 0xb3afc7c1, 0x32b5e793, 0x6a2f9533};
-    static constexpr storage<limbs_count> omega_inv29 = {0x08b8a7ad, 0xe877b2c4, 0x341652b4, 0x68b0e8f0,
-                                                         0xe8b6a2d9, 0x2d44da3b, 0xfd09be59, 0x092778ff};
-    static constexpr storage<limbs_count> omega_inv30 = {0x7988f244, 0x84a1aa6f, 0x24faf63f, 0xa164b3d9,
-                                                         0xc1bbb915, 0x7aae9724, 0xf386c0d2, 0x24e5d287};
-    static constexpr storage<limbs_count> omega_inv31 = {0x41a1b30c, 0xa70a7efd, 0x39f0e511, 0xc49c55a5,
-                                                         0x033bb323, 0xab307a8f, 0x17acbd7f, 0x0158abd6};
-    static constexpr storage<limbs_count> omega_inv32 = {0x0f642025, 0x2c228b30, 0x01bd882b, 0xb0878e8d,
-                                                         0xd7377fea, 0xd862b255, 0xf0490536, 0x18ac3666};
+      {{0x00000000, 0xffffffff, 0xfffe5bfe, 0x53bda402, 0x09a1d805, 0x3339d808, 0x299d7d48, 0x73eda753},
+       {0x00000000, 0x00010000, 0x76030000, 0xec030002, 0x760304d0, 0x8d51ccce, 0x00000000, 0x00000000},
+       {0x688bc087, 0x8dd702cb, 0x78eaa4fe, 0xa0328240, 0x98ca5b22, 0xa733b23a, 0x25a31660, 0x3f96405d},
+       {0x0411fe73, 0x95df4b36, 0xebc1e1bb, 0x1ef4e672, 0x60afca4a, 0x6e92a9c4, 0x753e4fcc, 0x4f2c596e},
+       {0xba60eaa6, 0x9733f3a6, 0x77487ae7, 0xbd7fdf9c, 0xc8b6cc00, 0xd84f8612, 0x6162ffab, 0x476fa2fb},
+       {0xac5db47f, 0xd2fc5e69, 0x15d0b8e4, 0xa12a70a6, 0xbc8de5d9, 0x293b1d67, 0x57f86f5e, 0x0e4840ac},
+       {0xab28e208, 0xb750da4c, 0x3be95635, 0x501dff64, 0xf0b4b276, 0x8cbe2437, 0xa94a946e, 0x07d0c802},
+       {0x2fe322b8, 0x2cabadec, 0x15412560, 0x752c84f3, 0x1a3b0aef, 0x32a732ae, 0xa33dcbf2, 0x2e95da59},
+       {0xfe0c65f4, 0x33811ea1, 0x687f28a2, 0x15c1ad4c, 0x42dee7f4, 0xecfbede3, 0x9a5d88b1, 0x1bb46667},
+       {0x2d010ff9, 0xd58a5af4, 0x570bf109, 0x79efd6b0, 0x6350721d, 0x3ed6d55a, 0x58f43cef, 0x2f27b098},
+       {0x8c130477, 0x74a1f671, 0xb61e0abe, 0xa534af14, 0x620890d7, 0xeb674a1a, 0xca252472, 0x43527a8b},
+       {0x7ea8ee05, 0x450d9f97, 0x37d56fc0, 0x565af171, 0x93f9e9ac, 0xe155cb48, 0xc8e9101b, 0x110cebd0},
+       {0x59a0be92, 0x23c91599, 0x7a027759, 0x87d188ce, 0xcab3c3cc, 0x70491431, 0xb3f7f8da, 0x0ac00eb8},
+       {0x69583404, 0x13e96ade, 0x5306243d, 0x82c05727, 0x29ca9f2a, 0x77e48bf5, 0x1fe19595, 0x50646ac8},
+       {0xa97eccd4, 0xe6a354dd, 0x88fbbc57, 0x39929d2e, 0xd6e7b1c8, 0xa22ba63d, 0xf5f07f43, 0x42c22911},
+       {0xcfc35f7a, 0x137b458a, 0x29c01b06, 0x0caba63a, 0x7a02402c, 0x0409ee98, 0x56aa725b, 0x6709c6cd},
+       {0x8831e03e, 0x10251f7d, 0x7ff858ec, 0x77d85a93, 0x4fb9ac5c, 0xebe905bd, 0xf8727901, 0x05deb333},
+       {0xb9009408, 0xbf87b689, 0xdd3ccc96, 0x4f730e7d, 0x4610300c, 0xfd7f05ba, 0x0b8ac903, 0x5ef5e8db},
+       {0x17cd0c14, 0x64996884, 0x68812f7f, 0xa6728673, 0x22cc3253, 0x2e1d9a19, 0xaa0a1d80, 0x3a689e83},
+       {0x41144dea, 0x20b53cbe, 0xc2f0fcbd, 0x870c46fa, 0x537d6971, 0x556c35f6, 0x5f686d91, 0x3436287f},
+       {0x436ba2e7, 0x007e082a, 0x9116e877, 0x67c6630f, 0xfb4460f7, 0x36f8f165, 0x7e7046e0, 0x6eee34d5},
+       {0xa53a56d1, 0xc5b670ee, 0x53037d7b, 0x127d1f42, 0xa722c2e2, 0x57d4257e, 0x33cbd838, 0x03ae26a3},
+       {0x76504cf8, 0x1e914848, 0xb63edd02, 0x55bbbf1e, 0x4e55aa02, 0xbcdafec8, 0x2dc0beb0, 0x5145c4cd},
+       {0x1ab70e2c, 0x5b90153a, 0x75fb0ab8, 0x8deffa31, 0x46900c95, 0xc553ae23, 0x6bd3118c, 0x1d31dcdc},
+       {0x59a2e8eb, 0x801c894c, 0xe12fc974, 0xbc535c5c, 0x47d39803, 0x95508d27, 0xac5d094f, 0x16d9d3cd},
+       {0xcca1d8be, 0x810fa372, 0x82e0bfa7, 0xc67b8c28, 0xe2d35bc2, 0xdbb4edf0, 0x5087c995, 0x712d1580},
+       {0xfd88f133, 0xeb162203, 0xf010ea74, 0xac96c38f, 0xe64cfc70, 0x4307987f, 0x37b7a114, 0x350fe98d},
+       {0x42f2a254, 0xaba2f518, 0xa71efc0c, 0x4d7f3c3a, 0xd274a80a, 0x97ae418d, 0x5e3e7682, 0x2967385d},
+       {0x575a0b79, 0x75c55c7b, 0x74a7ded1, 0x3ba4a157, 0xa04fccf3, 0xc3974d73, 0x4a939684, 0x705aba4f},
+       {0x14ebb608, 0x8409a9ea, 0x66bac611, 0xfad0084e, 0x811c1dfb, 0x04287254, 0x23b30c29, 0x086d072b},
+       {0x67e4756a, 0xb427c9b3, 0x02ebc38d, 0xc7537fb9, 0xcd6a205f, 0x51de21be, 0x7923597d, 0x6064ab72},
+       {0x0b912f1f, 0x1b788f50, 0x70b3e094, 0xc4024ff2, 0xd168d6c0, 0x0fd56dc8, 0x5b416b6f, 0x0212d79e}}};
 
     static constexpr storage_array<omegas_count, limbs_count> omega_inv = {
-      omega_inv1,  omega_inv2,  omega_inv3,  omega_inv4,  omega_inv5,  omega_inv6,  omega_inv7,  omega_inv8,
-      omega_inv9,  omega_inv10, omega_inv11, omega_inv12, omega_inv13, omega_inv14, omega_inv15, omega_inv16,
-      omega_inv17, omega_inv18, omega_inv19, omega_inv20, omega_inv21, omega_inv22, omega_inv23, omega_inv24,
-      omega_inv25, omega_inv26, omega_inv27, omega_inv28, omega_inv29, omega_inv30, omega_inv31, omega_inv32,
-    };
-
-    // Quick fix for linking issue
-    static constexpr storage<limbs_count> inv1 = {0x80000001, 0x7fffffff, 0x7fff2dff, 0xa9ded201,
-                                                  0x04d0ec02, 0x199cec04, 0x94cebea4, 0x39f6d3a9};
-    static constexpr storage<limbs_count> inv2 = {0x40000001, 0x3fffffff, 0x3ffec4ff, 0xfece3b02,
-                                                  0x07396203, 0x266b6206, 0x5f361df6, 0x56f23d7e};
-    static constexpr storage<limbs_count> inv3 = {0x20000001, 0x1fffffff, 0x9ffe907f, 0xa945ef82,
-                                                  0x086d9d04, 0x2cd29d07, 0xc469cd9f, 0x656ff268};
-    static constexpr storage<limbs_count> inv4 = {0x10000001, 0x0fffffff, 0xcffe763f, 0xfe81c9c2,
-                                                  0x8907ba84, 0xb0063a87, 0xf703a573, 0x6caeccdd};
-    static constexpr storage<limbs_count> inv5 = {0x08000001, 0x07ffffff, 0xe7fe691f, 0x291fb6e2,
-                                                  0xc954c945, 0xf1a00947, 0x9050915d, 0x704e3a18};
-    static constexpr storage<limbs_count> inv6 = {0x04000001, 0x03ffffff, 0xf3fe628f, 0x3e6ead72,
-                                                  0xe97b50a5, 0x126cf0a7, 0xdcf70753, 0x721df0b5};
-    static constexpr storage<limbs_count> inv7 = {0x02000001, 0x01ffffff, 0xf9fe5f47, 0x491628ba,
-                                                  0xf98e9455, 0xa2d36457, 0x834a424d, 0x7305cc04};
-    static constexpr storage<limbs_count> inv8 = {0x01000001, 0x00ffffff, 0xfcfe5da3, 0x4e69e65e,
-                                                  0x0198362d, 0xeb069e30, 0xd673dfca, 0x7379b9ab};
-    static constexpr storage<limbs_count> inv9 = {0x00800001, 0x007fffff, 0xfe7e5cd1, 0x5113c530,
-                                                  0x059d0719, 0x8f203b1c, 0x8008ae89, 0x73b3b07f};
-    static constexpr storage<limbs_count> inv10 = {0x00400001, 0x003fffff, 0xff3e5c68, 0x5268b499,
-                                                   0x079f6f8f, 0xe12d0992, 0x54d315e8, 0x73d0abe9};
-    static constexpr storage<limbs_count> inv11 = {0x00200001, 0x801fffff, 0x7f9e5c33, 0x53132c4e,
-                                                   0x08a0a3ca, 0x8a3370cd, 0x3f384998, 0x73df299e};
-    static constexpr storage<limbs_count> inv12 = {0x00100001, 0x400fffff, 0xbfce5c19, 0xd3686828,
-                                                   0x89213de7, 0x5eb6a46a, 0xb46ae370, 0x73e66878};
-    static constexpr storage<limbs_count> inv13 = {0x00080001, 0x2007ffff, 0xdfe65c0c, 0x93930615,
-                                                   0x49618af6, 0x48f83e39, 0xef04305c, 0x73ea07e5};
-    static constexpr storage<limbs_count> inv14 = {0x00040001, 0x9003ffff, 0x6ff25c05, 0xf3a8550c,
-                                                   0xa981b17d, 0x3e190b20, 0x8c50d6d2, 0x73ebd79c};
-    static constexpr storage<limbs_count> inv15 = {0x00020001, 0x4801ffff, 0xb7f85c02, 0xa3b2fc87,
-                                                   0x5991c4c1, 0x38a97194, 0xdaf72a0d, 0x73ecbf77};
-    static constexpr storage<limbs_count> inv16 = {0x00010001, 0xa400ffff, 0x5bfb5c00, 0x7bb85045,
-                                                   0x3199ce63, 0xb5f1a4ce, 0x824a53aa, 0x73ed3365};
-    static constexpr storage<limbs_count> inv17 = {0x00008001, 0xd2007fff, 0x2dfcdbff, 0x67bafa24,
-                                                   0x1d9dd334, 0x7495be6b, 0x55f3e879, 0x73ed6d5c};
-    static constexpr storage<limbs_count> inv18 = {0x00004001, 0x69003fff, 0x96fd9bff, 0xddbc4f13,
-                                                   0x939fd59c, 0xd3e7cb39, 0xbfc8b2e0, 0x73ed8a57};
-    static constexpr storage<limbs_count> inv19 = {0x00002001, 0x34801fff, 0x4b7dfbff, 0x18bcf98b,
-                                                   0xcea0d6d1, 0x8390d1a0, 0x74b31814, 0x73ed98d5};
-    static constexpr storage<limbs_count> inv20 = {0x00001001, 0x1a400fff, 0x25be2bff, 0x363d4ec7,
-                                                   0x6c21576b, 0x5b6554d4, 0x4f284aae, 0x73eda014};
-    static constexpr storage<limbs_count> inv21 = {0x00000801, 0x0d2007ff, 0x12de43ff, 0x44fd7965,
-                                                   0x3ae197b8, 0x474f966e, 0xbc62e3fb, 0x73eda3b3};
-    static constexpr storage<limbs_count> inv22 = {0x00000401, 0x069003ff, 0x096e4fff, 0xcc5d8eb4,
-                                                   0x2241b7de, 0xbd44b73b, 0x730030a1, 0x73eda583};
-    static constexpr storage<limbs_count> inv23 = {0x00000201, 0x034801ff, 0x84b655ff, 0x100d995b,
-                                                   0x95f1c7f2, 0xf83f47a1, 0x4e4ed6f4, 0x73eda66b};
-    static constexpr storage<limbs_count> inv24 = {0x00000101, 0x01a400ff, 0x425a58ff, 0xb1e59eaf,
-                                                   0xcfc9cffb, 0x95bc8fd4, 0x3bf62a1e, 0x73eda6df};
-    static constexpr storage<limbs_count> inv25 = {0x00000081, 0x00d2007f, 0x212c5a7f, 0x82d1a159,
-                                                   0x6cb5d400, 0x647b33ee, 0x32c9d3b3, 0x73eda719};
-    static constexpr storage<limbs_count> inv26 = {0x00000041, 0x0069003f, 0x10955b3f, 0xeb47a2ae,
-                                                   0x3b2bd602, 0xcbda85fb, 0x2e33a87d, 0x73eda736};
-    static constexpr storage<limbs_count> inv27 = {0x00000021, 0x0034801f, 0x8849db9f, 0x1f82a358,
-                                                   0xa266d704, 0xff8a2f01, 0xabe892e2, 0x73eda744};
-    static constexpr storage<limbs_count> inv28 = {0x00000011, 0x001a400f, 0xc4241bcf, 0xb9a023ad,
-                                                   0xd6045784, 0x99620384, 0xeac30815, 0x73eda74b};
-    static constexpr storage<limbs_count> inv29 = {0x00000009, 0x000d2007, 0x62113be7, 0x06aee3d8,
-                                                   0x6fd317c5, 0xe64dedc6, 0x8a3042ae, 0x73eda74f};
-    static constexpr storage<limbs_count> inv30 = {0x00000005, 0x00069003, 0xb107cbf3, 0x2d3643ed,
-                                                   0x3cba77e5, 0x8cc3e2e7, 0x59e6dffb, 0x73eda751};
-    static constexpr storage<limbs_count> inv31 = {0x00000003, 0x00034801, 0x588313f9, 0x4079f3f8,
-                                                   0xa32e27f5, 0xdffedd77, 0x41c22ea1, 0x73eda752};
-    static constexpr storage<limbs_count> inv32 = {0x00000002, 0x0001a400, 0xac40b7fc, 0x4a1bcbfd,
-                                                   0xd667fffd, 0x099c5abf, 0xb5afd5f5, 0x73eda752};
+      {{0x00000000, 0xffffffff, 0xfffe5bfe, 0x53bda402, 0x09a1d805, 0x3339d808, 0x299d7d48, 0x73eda753},
+       {0x00000001, 0xfffeffff, 0x89fb5bfe, 0x67baa400, 0x939ed334, 0xa5e80b39, 0x299d7d47, 0x73eda753},
+       {0xae99502e, 0x6037fe81, 0x94b04fd8, 0x8e749036, 0xca86bf65, 0xbabc5aff, 0x5ce11044, 0x1333b22e},
+       {0x7dc08d74, 0x7f847ee4, 0x04eeaf5a, 0xbd433896, 0x1832fc60, 0xd66c91d6, 0x607e449b, 0x551115b4},
+       {0x4e7773cb, 0xee5bcecc, 0xf6dab086, 0x45593d6f, 0x4016e2bd, 0xa3a95d2d, 0xaf96816f, 0x047cb16c},
+       {0x982b68c5, 0xb891fa3f, 0x1d426b52, 0xa41e8501, 0x882952d6, 0x566009b5, 0x7b3c79d6, 0x199cdaee},
+       {0xcf28601b, 0x571ba2fc, 0xac74db12, 0x166fb582, 0x3501370b, 0x51420be4, 0x52f970ba, 0x1996fa8d},
+       {0x6a2f777a, 0xe9561c17, 0x2393991b, 0xc03cae03, 0x5a5bfd4f, 0x91b00023, 0x272e58ee, 0x6d64ed25},
+       {0xf02a116e, 0xfb350dbe, 0xb4543a3e, 0x1c510ebf, 0x37ad4eca, 0xf675522e, 0x80f82b2d, 0x1907a56e},
+       {0x4eb71aa6, 0xb0ad8003, 0xaa67e0be, 0x50a32c41, 0x19141f44, 0x105f0672, 0xa3dad316, 0x2bcd9508},
+       {0x0f6fb2ac, 0x3dc9e560, 0x9aa58ff5, 0x3cc5bb32, 0x36f376e1, 0xdeae67bc, 0x65ba213e, 0x394fda0d},
+       {0x60b82267, 0x09f239f7, 0x8b24f123, 0x14180e0e, 0x45625d95, 0xad5a5340, 0x6d174692, 0x58c3ba63},
+       {0x348b416f, 0x0acf21c2, 0xbc086439, 0x798b6bf6, 0xb1ca111d, 0x222d411f, 0x30ba1e0f, 0x044107b7},
+       {0x014abe84, 0xa3b861b8, 0x427ed008, 0x37c017e4, 0xae0ff4f5, 0xae51f613, 0xcb1218d3, 0x1a2d00e1},
+       {0x4de7eb2b, 0x48aaa3bf, 0x6772057d, 0x4a58d54d, 0x7093b551, 0xce25f16c, 0xd206337c, 0x242150ac},
+       {0x9ed57ae5, 0xdf3ec9ae, 0x7166577f, 0xea7df73a, 0x022fbbe4, 0x6ca8d281, 0x151e3f6b, 0x5850c003},
+       {0x645e1cfa, 0x903a0a0c, 0x34788c37, 0xfbac54cb, 0x8cf73d78, 0xdc127d11, 0x975d3c82, 0x6d0b5c7c},
+       {0x14b1ba04, 0xb49d6b05, 0xf00b84f2, 0x56e466b4, 0x0b904f22, 0x30c390cf, 0x3ee254cc, 0x3e11cfb7},
+       {0xbe8201ab, 0x84dfa547, 0x530715d2, 0x3887ce8b, 0x3eed4ed7, 0xa4c719c6, 0x8f8007b4, 0x18c44950},
+       {0x7d813cd1, 0xdaf0346d, 0xf755beb1, 0xeccf6f9a, 0xe08143e3, 0x167fce38, 0x6f5d6dfa, 0x545ad9b2},
+       {0x577605de, 0x973f5466, 0x974f953c, 0x0ce8986e, 0x074382f9, 0x8941cf4b, 0x6fa2672c, 0x156cd7f6},
+       {0x33b66141, 0x24315404, 0x1992f584, 0x5d1375ab, 0x8b20ca1a, 0xf193ffa6, 0x2701a503, 0x47880cd5},
+       {0xe9f7b9af, 0xf7b6847d, 0x62c83ce2, 0x9a339673, 0x6e5e6f79, 0xfabf4537, 0x35af33a3, 0x0975acd9},
+       {0x0eddd248, 0x4fb4204a, 0xc9e509b3, 0x8c98706a, 0x2bb27eb1, 0xd0be8987, 0xc831438b, 0x6ec5f960},
+       {0x20238f62, 0xa13c95b7, 0x83b476b9, 0x130aa097, 0x14860881, 0x758a04e0, 0x97066493, 0x58e2f8d6},
+       {0xe8bff41e, 0x65b09c73, 0x37f1c6a3, 0x8b3280e8, 0x2846fb21, 0xe17b82ce, 0xb1ae27df, 0x476534bf},
+       {0xd5fdb757, 0x8480c0e7, 0x365bf9fd, 0x3644eea0, 0xb776be86, 0x4ca116ca, 0x8b58390c, 0x17b6395f},
+       {0x252eb0db, 0x2c811e9a, 0x7479e161, 0x1b7d960d, 0xb0a89a26, 0xb3afc7c1, 0x32b5e793, 0x6a2f9533},
+       {0x08b8a7ad, 0xe877b2c4, 0x341652b4, 0x68b0e8f0, 0xe8b6a2d9, 0x2d44da3b, 0xfd09be59, 0x092778ff},
+       {0x7988f244, 0x84a1aa6f, 0x24faf63f, 0xa164b3d9, 0xc1bbb915, 0x7aae9724, 0xf386c0d2, 0x24e5d287},
+       {0x41a1b30c, 0xa70a7efd, 0x39f0e511, 0xc49c55a5, 0x033bb323, 0xab307a8f, 0x17acbd7f, 0x0158abd6},
+       {0x0f642025, 0x2c228b30, 0x01bd882b, 0xb0878e8d, 0xd7377fea, 0xd862b255, 0xf0490536, 0x18ac3666}}};
 
     static constexpr storage_array<omegas_count, limbs_count> inv = {
-      inv1,  inv2,  inv3,  inv4,  inv5,  inv6,  inv7,  inv8,  inv9,  inv10, inv11, inv12, inv13, inv14, inv15, inv16,
-      inv17, inv18, inv19, inv20, inv21, inv22, inv23, inv24, inv25, inv26, inv27, inv28, inv29, inv30, inv31, inv32,
-    };
+      {{0x80000001, 0x7fffffff, 0x7fff2dff, 0xa9ded201, 0x04d0ec02, 0x199cec04, 0x94cebea4, 0x39f6d3a9},
+       {0x40000001, 0x3fffffff, 0x3ffec4ff, 0xfece3b02, 0x07396203, 0x266b6206, 0x5f361df6, 0x56f23d7e},
+       {0x20000001, 0x1fffffff, 0x9ffe907f, 0xa945ef82, 0x086d9d04, 0x2cd29d07, 0xc469cd9f, 0x656ff268},
+       {0x10000001, 0x0fffffff, 0xcffe763f, 0xfe81c9c2, 0x8907ba84, 0xb0063a87, 0xf703a573, 0x6caeccdd},
+       {0x08000001, 0x07ffffff, 0xe7fe691f, 0x291fb6e2, 0xc954c945, 0xf1a00947, 0x9050915d, 0x704e3a18},
+       {0x04000001, 0x03ffffff, 0xf3fe628f, 0x3e6ead72, 0xe97b50a5, 0x126cf0a7, 0xdcf70753, 0x721df0b5},
+       {0x02000001, 0x01ffffff, 0xf9fe5f47, 0x491628ba, 0xf98e9455, 0xa2d36457, 0x834a424d, 0x7305cc04},
+       {0x01000001, 0x00ffffff, 0xfcfe5da3, 0x4e69e65e, 0x0198362d, 0xeb069e30, 0xd673dfca, 0x7379b9ab},
+       {0x00800001, 0x007fffff, 0xfe7e5cd1, 0x5113c530, 0x059d0719, 0x8f203b1c, 0x8008ae89, 0x73b3b07f},
+       {0x00400001, 0x003fffff, 0xff3e5c68, 0x5268b499, 0x079f6f8f, 0xe12d0992, 0x54d315e8, 0x73d0abe9},
+       {0x00200001, 0x801fffff, 0x7f9e5c33, 0x53132c4e, 0x08a0a3ca, 0x8a3370cd, 0x3f384998, 0x73df299e},
+       {0x00100001, 0x400fffff, 0xbfce5c19, 0xd3686828, 0x89213de7, 0x5eb6a46a, 0xb46ae370, 0x73e66878},
+       {0x00080001, 0x2007ffff, 0xdfe65c0c, 0x93930615, 0x49618af6, 0x48f83e39, 0xef04305c, 0x73ea07e5},
+       {0x00040001, 0x9003ffff, 0x6ff25c05, 0xf3a8550c, 0xa981b17d, 0x3e190b20, 0x8c50d6d2, 0x73ebd79c},
+       {0x00020001, 0x4801ffff, 0xb7f85c02, 0xa3b2fc87, 0x5991c4c1, 0x38a97194, 0xdaf72a0d, 0x73ecbf77},
+       {0x00010001, 0xa400ffff, 0x5bfb5c00, 0x7bb85045, 0x3199ce63, 0xb5f1a4ce, 0x824a53aa, 0x73ed3365},
+       {0x00008001, 0xd2007fff, 0x2dfcdbff, 0x67bafa24, 0x1d9dd334, 0x7495be6b, 0x55f3e879, 0x73ed6d5c},
+       {0x00004001, 0x69003fff, 0x96fd9bff, 0xddbc4f13, 0x939fd59c, 0xd3e7cb39, 0xbfc8b2e0, 0x73ed8a57},
+       {0x00002001, 0x34801fff, 0x4b7dfbff, 0x18bcf98b, 0xcea0d6d1, 0x8390d1a0, 0x74b31814, 0x73ed98d5},
+       {0x00001001, 0x1a400fff, 0x25be2bff, 0x363d4ec7, 0x6c21576b, 0x5b6554d4, 0x4f284aae, 0x73eda014},
+       {0x00000801, 0x0d2007ff, 0x12de43ff, 0x44fd7965, 0x3ae197b8, 0x474f966e, 0xbc62e3fb, 0x73eda3b3},
+       {0x00000401, 0x069003ff, 0x096e4fff, 0xcc5d8eb4, 0x2241b7de, 0xbd44b73b, 0x730030a1, 0x73eda583},
+       {0x00000201, 0x034801ff, 0x84b655ff, 0x100d995b, 0x95f1c7f2, 0xf83f47a1, 0x4e4ed6f4, 0x73eda66b},
+       {0x00000101, 0x01a400ff, 0x425a58ff, 0xb1e59eaf, 0xcfc9cffb, 0x95bc8fd4, 0x3bf62a1e, 0x73eda6df},
+       {0x00000081, 0x00d2007f, 0x212c5a7f, 0x82d1a159, 0x6cb5d400, 0x647b33ee, 0x32c9d3b3, 0x73eda719},
+       {0x00000041, 0x0069003f, 0x10955b3f, 0xeb47a2ae, 0x3b2bd602, 0xcbda85fb, 0x2e33a87d, 0x73eda736},
+       {0x00000021, 0x0034801f, 0x8849db9f, 0x1f82a358, 0xa266d704, 0xff8a2f01, 0xabe892e2, 0x73eda744},
+       {0x00000011, 0x001a400f, 0xc4241bcf, 0xb9a023ad, 0xd6045784, 0x99620384, 0xeac30815, 0x73eda74b},
+       {0x00000009, 0x000d2007, 0x62113be7, 0x06aee3d8, 0x6fd317c5, 0xe64dedc6, 0x8a3042ae, 0x73eda74f},
+       {0x00000005, 0x00069003, 0xb107cbf3, 0x2d3643ed, 0x3cba77e5, 0x8cc3e2e7, 0x59e6dffb, 0x73eda751},
+       {0x00000003, 0x00034801, 0x588313f9, 0x4079f3f8, 0xa32e27f5, 0xdffedd77, 0x41c22ea1, 0x73eda752},
+       {0x00000002, 0x0001a400, 0xac40b7fc, 0x4a1bcbfd, 0xd667fffd, 0x099c5abf, 0xb5afd5f5, 0x73eda752}}};
   };
 
   struct fq_config {
-    // field structure size = 12 * 32 bit
     static constexpr unsigned limbs_count = 12;
-    // modulus =
-    // 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787
+    static constexpr unsigned modulus_bit_count = 381;
+    static constexpr unsigned num_of_reductions = 1;
     static constexpr storage<limbs_count> modulus = {0xffffaaab, 0xb9feffff, 0xb153ffff, 0x1eabfffe,
                                                      0xf6b0f624, 0x6730d2a0, 0xf38512bf, 0x64774b84,
                                                      0x434bacd7, 0x4b1ba7b6, 0x397fe69a, 0x1a0111ea};
-    // modulus*2 =
-    // 8004819110443334786835579651471808313113765639878015770664116272248063300981675728885375258258031328075788545119574
     static constexpr storage<limbs_count> modulus_2 = {0xffff5556, 0x73fdffff, 0x62a7ffff, 0x3d57fffd,
                                                        0xed61ec48, 0xce61a541, 0xe70a257e, 0xc8ee9709,
                                                        0x869759ae, 0x96374f6c, 0x72ffcd34, 0x340223d4};
-    // modulus*4 =
-    // 16009638220886669573671159302943616626227531279756031541328232544496126601963351457770750516516062656151577090239148
     static constexpr storage<limbs_count> modulus_4 = {0xfffeaaac, 0xe7fbffff, 0xc54ffffe, 0x7aaffffa,
                                                        0xdac3d890, 0x9cc34a83, 0xce144afd, 0x91dd2e13,
-                                                       0xd2eb35d,  0x2c6e9ed9, 0xe5ff9a69, 0x680447a8};
-
+                                                       0x0d2eb35d, 0x2c6e9ed9, 0xe5ff9a69, 0x680447a8};
+    static constexpr storage<limbs_count> neg_modulus = {0x00005555, 0x46010000, 0x4eac0000, 0xe1540001,
+                                                         0x094f09db, 0x98cf2d5f, 0x0c7aed40, 0x9b88b47b,
+                                                         0xbcb45328, 0xb4e45849, 0xc6801965, 0xe5feee15};
     static constexpr storage<2 * limbs_count> modulus_wide = {
       0xffffaaab, 0xb9feffff, 0xb153ffff, 0x1eabfffe, 0xf6b0f624, 0x6730d2a0, 0xf38512bf, 0x64774b84,
       0x434bacd7, 0x4b1ba7b6, 0x397fe69a, 0x1a0111ea, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
       0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-
-    // modulus^2
     static constexpr storage<2 * limbs_count> modulus_squared = {
       0x1c718e39, 0x26aa0000, 0x76382eab, 0x7ced6b1d, 0x62113cfd, 0x162c3383, 0x3e71b743, 0x66bf91ed,
       0x7091a049, 0x292e85a8, 0x86185c7b, 0x1d68619c, 0x0978ef01, 0xf5314933, 0x16ddca6e, 0x50a62cfd,
       0x349e8bd0, 0x66e59e49, 0x0e7046b4, 0xe2dc90e5, 0xa22f25e9, 0x4bd278ea, 0xb8c35fc7, 0x02a437a4};
-    // 2*modulus^2
     static constexpr storage<2 * limbs_count> modulus_squared_2 = {
       0x38e31c72, 0x4d540000, 0xec705d56, 0xf9dad63a, 0xc42279fa, 0x2c586706, 0x7ce36e86, 0xcd7f23da,
       0xe1234092, 0x525d0b50, 0x0c30b8f6, 0x3ad0c339, 0x12f1de02, 0xea629266, 0x2dbb94dd, 0xa14c59fa,
       0x693d17a0, 0xcdcb3c92, 0x1ce08d68, 0xc5b921ca, 0x445e4bd3, 0x97a4f1d5, 0x7186bf8e, 0x05486f49};
-    // 4*modulus^2
     static constexpr storage<2 * limbs_count> modulus_squared_4 = {
       0x71c638e4, 0x9aa80000, 0xd8e0baac, 0xf3b5ac75, 0x8844f3f5, 0x58b0ce0d, 0xf9c6dd0c, 0x9afe47b4,
       0xc2468125, 0xa4ba16a1, 0x186171ec, 0x75a18672, 0x25e3bc04, 0xd4c524cc, 0x5b7729bb, 0x4298b3f4,
       0xd27a2f41, 0x9b967924, 0x39c11ad1, 0x8b724394, 0x88bc97a7, 0x2f49e3aa, 0xe30d7f1d, 0x0a90de92};
-    static constexpr unsigned modulus_bit_count = 381;
-    // m = floor(2^(2*modulus_bit_count) / modulus)
     static constexpr storage<limbs_count> m = {0xd59646e8, 0xec4f881f, 0x8163c701, 0x4e65c59e, 0x80a19de7, 0x2f7d1dc7,
                                                0x7fda82a5, 0xa46e09d0, 0x331e9ae8, 0x38a0406c, 0xcf327917, 0x2760d74b};
     static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
@@ -371,37 +186,38 @@ namespace bls12_381 {
     static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
                                                   0x00000000, 0x00000000, 0x00000000, 0x00000000,
                                                   0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> montgomery_r = {0x0005555,  0x60100000, 0xeac00004, 0x15400014,
-                                                          0x94f09dbe, 0x8cf2d5f0, 0xc7aed409, 0xb88b47b0,
-                                                          0xcb453289, 0x4e45849b, 0x6801965b, 0x5feee15c};
-    static constexpr storage<limbs_count> montgomery_r_inv = {0x05c40fe,  0xaa212c9c, 0xccfd7e14, 0x70093ae9,
-                                                              0xc85a96b4, 0x6d05c02d, 0x025fecd3, 0x1f193851,
-                                                              0xeb48f4c6, 0x84d32f44, 0xed8ffb1a, 0xbefcc91e};
+    static constexpr storage<limbs_count> montgomery_r = {0x0002fffd, 0x76090000, 0xc40c0002, 0xebf4000b,
+                                                          0x53c758ba, 0x5f489857, 0x70525745, 0x77ce5853,
+                                                          0xa256ec6d, 0x5c071a97, 0xfa80e493, 0x15f65ec3};
+    static constexpr storage<limbs_count> montgomery_r_inv = {0x380b4820, 0xf4d38259, 0xd898fafb, 0x7fe11274,
+                                                              0x14956dc8, 0x343ea979, 0x58a88de9, 0x1797ab14,
+                                                              0x3c4f538b, 0xed5e6427, 0xe8fb0ce9, 0x14fec701};
     // i^2, the square of the imaginary unit for the extension field
     static constexpr uint32_t i_squared = 1;
     // true if i^2 is negative
     static constexpr bool i_squared_is_negative = true;
-    // G1 and G2 generators
-    static constexpr storage<limbs_count> g1_gen_x = {0xdb22c6bb, 0xfb3af00a, 0xf97a1aef, 0x6c55e83f,
-                                                      0x171bac58, 0xa14e3a3f, 0x9774b905, 0xc3688c4f,
-                                                      0x4fa9ac0f, 0x2695638c, 0x3197d794, 0x17f1d3a7};
-    static constexpr storage<limbs_count> g1_gen_y = {0x46c5e7e1, 0x0caa2329, 0xa2888ae4, 0xd03cc744,
-                                                      0x2c04b3ed, 0x00db18cb, 0xd5d00af6, 0xfcf5e095,
-                                                      0x741d8ae4, 0xa09e30ed, 0xe3aaa0f1, 0x08b3f481};
-    static constexpr storage<limbs_count> g2_gen_x_re = {0xc121bdb8, 0xd48056c8, 0xa805bbef, 0x0bac0326,
-                                                         0x7ae3d177, 0xb4510b64, 0xfa403b02, 0xc6e47ad4,
-                                                         0x2dc51051, 0x26080527, 0xf08f0a91, 0x024aa2b2};
-    static constexpr storage<limbs_count> g2_gen_x_im = {0x5d042b7e, 0xe5ac7d05, 0x13945d57, 0x334cf112,
-                                                         0xdc7f5049, 0xb5da61bb, 0x9920b61a, 0x596bd0d0,
-                                                         0x88274f65, 0x7dacd3a0, 0x52719f60, 0x13e02b60};
-    static constexpr storage<limbs_count> g2_gen_y_re = {0x08b82801, 0xe1935486, 0x3baca289, 0x923ac9cc,
-                                                         0x5160d12c, 0x6d429a69, 0x8cbdd3a7, 0xadfd9baa,
-                                                         0xda2e351a, 0x8cc9cdc6, 0x727d6e11, 0x0ce5d527};
-    static constexpr storage<limbs_count> g2_gen_y_im = {0xf05f79be, 0xaaa9075f, 0x5cec1da1, 0x3f370d27,
-                                                         0x572e99ab, 0x267492ab, 0x85a763af, 0xcb3e287e,
-                                                         0x2bc28b99, 0x32acd2b0, 0x2ea734cc, 0x0606c4a0};
   };
 
+  // G1 and G2 generators
+  static constexpr storage<fq_config::limbs_count> g1_gen_x = {0xdb22c6bb, 0xfb3af00a, 0xf97a1aef, 0x6c55e83f,
+                                                               0x171bac58, 0xa14e3a3f, 0x9774b905, 0xc3688c4f,
+                                                               0x4fa9ac0f, 0x2695638c, 0x3197d794, 0x17f1d3a7};
+  static constexpr storage<fq_config::limbs_count> g1_gen_y = {0x46c5e7e1, 0x0caa2329, 0xa2888ae4, 0xd03cc744,
+                                                               0x2c04b3ed, 0x00db18cb, 0xd5d00af6, 0xfcf5e095,
+                                                               0x741d8ae4, 0xa09e30ed, 0xe3aaa0f1, 0x08b3f481};
+  static constexpr storage<fq_config::limbs_count> g2_gen_x_re = {0xc121bdb8, 0xd48056c8, 0xa805bbef, 0x0bac0326,
+                                                                  0x7ae3d177, 0xb4510b64, 0xfa403b02, 0xc6e47ad4,
+                                                                  0x2dc51051, 0x26080527, 0xf08f0a91, 0x024aa2b2};
+  static constexpr storage<fq_config::limbs_count> g2_gen_x_im = {0x5d042b7e, 0xe5ac7d05, 0x13945d57, 0x334cf112,
+                                                                  0xdc7f5049, 0xb5da61bb, 0x9920b61a, 0x596bd0d0,
+                                                                  0x88274f65, 0x7dacd3a0, 0x52719f60, 0x13e02b60};
+  static constexpr storage<fq_config::limbs_count> g2_gen_y_re = {0x08b82801, 0xe1935486, 0x3baca289, 0x923ac9cc,
+                                                                  0x5160d12c, 0x6d429a69, 0x8cbdd3a7, 0xadfd9baa,
+                                                                  0xda2e351a, 0x8cc9cdc6, 0x727d6e11, 0x0ce5d527};
+  static constexpr storage<fq_config::limbs_count> g2_gen_y_im = {0xf05f79be, 0xaaa9075f, 0x5cec1da1, 0x3f370d27,
+                                                                  0x572e99ab, 0x267492ab, 0x85a763af, 0xcb3e287e,
+                                                                  0x2bc28b99, 0x32acd2b0, 0x2ea734cc, 0x0606c4a0};
+
   static constexpr storage<fq_config::limbs_count> weierstrass_b = {0x00000004, 0x00000000, 0x00000000, 0x00000000,
                                                                     0x00000000, 0x00000000, 0x00000000, 0x00000000,
                                                                     0x00000000, 0x00000000, 0x00000000, 0x00000000};
diff --git a/icicle/curves/bn254_params.cuh b/icicle/curves/bn254_params.cuh
index 0708a15dd..d06474a38 100644
--- a/icicle/curves/bn254_params.cuh
+++ b/icicle/curves/bn254_params.cuh
@@ -9,6 +9,7 @@ namespace bn254 {
     static constexpr unsigned limbs_count = 8;
     static constexpr unsigned omegas_count = 28;
     static constexpr unsigned modulus_bit_count = 254;
+    static constexpr unsigned num_of_reductions = 1;
 
     static constexpr storage<limbs_count> modulus = {0xf0000001, 0x43e1f593, 0x79b97091, 0x2833e848,
                                                      0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72};
@@ -16,6 +17,8 @@ namespace bn254 {
                                                        0x0302b0ba, 0x70a08b6d, 0xc2634053, 0x60c89ce5};
     static constexpr storage<limbs_count> modulus_4 = {0xc0000004, 0x0f87d64f, 0xe6e5c245, 0xa0cfa121,
                                                        0x06056174, 0xe14116da, 0x84c680a6, 0xc19139cb};
+    static constexpr storage<limbs_count> neg_modulus = {0x0fffffff, 0xbc1e0a6c, 0x86468f6e, 0xd7cc17b7,
+                                                         0x7e7ea7a2, 0x47afba49, 0x1ece5fd6, 0xcf9bb18d};
     static constexpr storage<2 * limbs_count> modulus_wide = {
       0xf0000001, 0x43e1f593, 0x79b97091, 0x2833e848, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72,
       0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
@@ -36,8 +39,8 @@ namespace bn254 {
     static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
                                                   0x00000000, 0x00000000, 0x00000000, 0x00000000};
     static constexpr storage<limbs_count> montgomery_r = {0x4ffffffb, 0xac96341c, 0x9f60cd29, 0x36fc7695,
-                                                          0x7879462e, 0x666ea36f, 0x9a07df2f, 0xe0a77c1};
-    static constexpr storage<limbs_count> montgomery_r_inv = {0x6db1194e, 0xdc5ba005, 0xe111ec87, 0x90ef5a9,
+                                                          0x7879462e, 0x666ea36f, 0x9a07df2f, 0x0e0a77c1};
+    static constexpr storage<limbs_count> montgomery_r_inv = {0x6db1194e, 0xdc5ba005, 0xe111ec87, 0x090ef5a9,
                                                               0xaeb85d5d, 0xc8260de4, 0x82c5551c, 0x15ebf951};
 
     static constexpr storage_array<omegas_count, limbs_count> omega = {
@@ -134,12 +137,15 @@ namespace bn254 {
   struct fq_config {
     static constexpr unsigned limbs_count = 8;
     static constexpr unsigned modulus_bit_count = 254;
+    static constexpr unsigned num_of_reductions = 1;
     static constexpr storage<limbs_count> modulus = {0xd87cfd47, 0x3c208c16, 0x6871ca8d, 0x97816a91,
                                                      0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72};
     static constexpr storage<limbs_count> modulus_2 = {0xb0f9fa8e, 0x7841182d, 0xd0e3951a, 0x2f02d522,
                                                        0x0302b0bb, 0x70a08b6d, 0xc2634053, 0x60c89ce5};
     static constexpr storage<limbs_count> modulus_4 = {0x61f3f51c, 0xf082305b, 0xa1c72a34, 0x5e05aa45,
                                                        0x06056176, 0xe14116da, 0x84c680a6, 0xc19139cb};
+    static constexpr storage<limbs_count> neg_modulus = {0x278302b9, 0xc3df73e9, 0x978e3572, 0x687e956e,
+                                                         0x7e7ea7a2, 0x47afba49, 0x1ece5fd6, 0xcf9bb18d};
     static constexpr storage<2 * limbs_count> modulus_wide = {
       0xd87cfd47, 0x3c208c16, 0x6871ca8d, 0x97816a91, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72,
       0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
@@ -158,30 +164,30 @@ namespace bn254 {
                                                  0x00000000, 0x00000000, 0x00000000, 0x00000000};
     static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
                                                   0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> montgomery_r = {0xc58f0d9d, 0xd35d438d, 0xf5c70b3d, 0xa78eb28,
-                                                          0x7879462c, 0x666ea36f, 0x9a07df2f, 0xe0a77c1};
-    static constexpr storage<limbs_count> montgomery_r_inv = {0x14afa37,  0xed84884a, 0x278edf8,  0xeb202285,
+    static constexpr storage<limbs_count> montgomery_r = {0xc58f0d9d, 0xd35d438d, 0xf5c70b3d, 0x0a78eb28,
+                                                          0x7879462c, 0x666ea36f, 0x9a07df2f, 0x0e0a77c1};
+    static constexpr storage<limbs_count> montgomery_r_inv = {0x014afa37, 0xed84884a, 0x0278edf8, 0xeb202285,
                                                               0xb74492d9, 0xcf63e9cf, 0x59e5c639, 0x2e671571};
-
     // i^2, the square of the imaginary unit for the extension field
     static constexpr uint32_t i_squared = 1;
     // true if i^2 is negative
     static constexpr bool i_squared_is_negative = true;
-    // G1 and G2 generators
-    static constexpr storage<limbs_count> g1_gen_x = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
-                                                      0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> g1_gen_y = {0x00000002, 0x00000000, 0x00000000, 0x00000000,
-                                                      0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> g2_gen_x_re = {0xd992f6ed, 0x46debd5c, 0xf75edadd, 0x674322d4,
-                                                         0x5e5c4479, 0x426a0066, 0x121f1e76, 0x1800deef};
-    static constexpr storage<limbs_count> g2_gen_x_im = {0xaef312c2, 0x97e485b7, 0x35a9e712, 0xf1aa4933,
-                                                         0x31fb5d25, 0x7260bfb7, 0x920d483a, 0x198e9393};
-    static constexpr storage<limbs_count> g2_gen_y_re = {0x66fa7daa, 0x4ce6cc01, 0x0c43d37b, 0xe3d1e769,
-                                                         0x8dcb408f, 0x4aab7180, 0xdb8c6deb, 0x12c85ea5};
-    static constexpr storage<limbs_count> g2_gen_y_im = {0xd122975b, 0x55acdadc, 0x70b38ef3, 0xbc4b3133,
-                                                         0x690c3395, 0xec9e99ad, 0x585ff075, 0x090689d0};
   };
 
+  // G1 and G2 generators
+  static constexpr storage<fq_config::limbs_count> g1_gen_x = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
+                                                               0x00000000, 0x00000000, 0x00000000, 0x00000000};
+  static constexpr storage<fq_config::limbs_count> g1_gen_y = {0x00000002, 0x00000000, 0x00000000, 0x00000000,
+                                                               0x00000000, 0x00000000, 0x00000000, 0x00000000};
+  static constexpr storage<fq_config::limbs_count> g2_gen_x_re = {0xd992f6ed, 0x46debd5c, 0xf75edadd, 0x674322d4,
+                                                                  0x5e5c4479, 0x426a0066, 0x121f1e76, 0x1800deef};
+  static constexpr storage<fq_config::limbs_count> g2_gen_x_im = {0xaef312c2, 0x97e485b7, 0x35a9e712, 0xf1aa4933,
+                                                                  0x31fb5d25, 0x7260bfb7, 0x920d483a, 0x198e9393};
+  static constexpr storage<fq_config::limbs_count> g2_gen_y_re = {0x66fa7daa, 0x4ce6cc01, 0x0c43d37b, 0xe3d1e769,
+                                                                  0x8dcb408f, 0x4aab7180, 0xdb8c6deb, 0x12c85ea5};
+  static constexpr storage<fq_config::limbs_count> g2_gen_y_im = {0xd122975b, 0x55acdadc, 0x70b38ef3, 0xbc4b3133,
+                                                                  0x690c3395, 0xec9e99ad, 0x585ff075, 0x090689d0};
+
   static constexpr storage<fq_config::limbs_count> weierstrass_b = {0x00000003, 0x00000000, 0x00000000, 0x00000000,
                                                                     0x00000000, 0x00000000, 0x00000000, 0x00000000};
   static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_re = {
diff --git a/icicle/curves/bw6_761_params.cuh b/icicle/curves/bw6_761_params.cuh
new file mode 100644
index 000000000..d9ab08b3b
--- /dev/null
+++ b/icicle/curves/bw6_761_params.cuh
@@ -0,0 +1,106 @@
+#pragma once
+#ifndef BW6_761_PARAMS_H
+#define BW6_761_PARAMS_H
+
+#include "../utils/storage.cuh"
+
+namespace bw6_761 {
+  struct fq_config {
+    static constexpr unsigned limbs_count = 24;
+    static constexpr unsigned modulus_bit_count = 761;
+    static constexpr unsigned num_of_reductions = 1;
+    static constexpr storage<limbs_count> modulus = {
+      0x0000008b, 0xf49d0000, 0x70000082, 0xe6913e68, 0xeaf0a437, 0x160cf8ae, 0x5667a8f8, 0x98a116c2,
+      0x73ebff2e, 0x71dcd3dc, 0x12f9fd90, 0x8689c8ed, 0x25b42304, 0x03cebaff, 0xe584e919, 0x707ba638,
+      0x8087be41, 0x528275ef, 0x81d14688, 0xb926186a, 0x04faff3e, 0xd187c940, 0xfb83ce0a, 0x0122e824};
+    static constexpr storage<limbs_count> modulus_2 = {
+      0x00000116, 0xe93a0000, 0xe0000105, 0xcd227cd0, 0xd5e1486f, 0x2c19f15d, 0xaccf51f0, 0x31422d84,
+      0xe7d7fe5d, 0xe3b9a7b8, 0x25f3fb20, 0x0d1391da, 0x4b684609, 0x079d75fe, 0xcb09d232, 0xe0f74c71,
+      0x010f7c82, 0xa504ebdf, 0x03a28d10, 0x724c30d5, 0x09f5fe7d, 0xa30f9280, 0xf7079c15, 0x0245d049};
+    static constexpr storage<limbs_count> modulus_4 = {
+      0x0000022c, 0xd2740000, 0xc000020b, 0x9a44f9a1, 0xabc290df, 0x5833e2bb, 0x599ea3e0, 0x62845b09,
+      0xcfaffcba, 0xc7734f71, 0x4be7f641, 0x1a2723b4, 0x96d08c12, 0x0f3aebfc, 0x9613a464, 0xc1ee98e3,
+      0x021ef905, 0x4a09d7be, 0x07451a21, 0xe49861aa, 0x13ebfcfa, 0x461f2500, 0xee0f382b, 0x048ba093};
+    static constexpr storage<limbs_count> neg_modulus = {
+      0xffffff75, 0x0b62ffff, 0x8fffff7d, 0x196ec197, 0x150f5bc8, 0xe9f30751, 0xa9985707, 0x675ee93d,
+      0x8c1400d1, 0x8e232c23, 0xed06026f, 0x79763712, 0xda4bdcfb, 0xfc314500, 0x1a7b16e6, 0x8f8459c7,
+      0x7f7841be, 0xad7d8a10, 0x7e2eb977, 0x46d9e795, 0xfb0500c1, 0x2e7836bf, 0x047c31f5, 0xfedd17db};
+    static constexpr storage<2 * limbs_count> modulus_wide = {
+      0x0000008b, 0xf49d0000, 0x70000082, 0xe6913e68, 0xeaf0a437, 0x160cf8ae, 0x5667a8f8, 0x98a116c2,
+      0x73ebff2e, 0x71dcd3dc, 0x12f9fd90, 0x8689c8ed, 0x25b42304, 0x03cebaff, 0xe584e919, 0x707ba638,
+      0x8087be41, 0x528275ef, 0x81d14688, 0xb926186a, 0x04faff3e, 0xd187c940, 0xfb83ce0a, 0x0122e824,
+      0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+      0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+      0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<2 * limbs_count> modulus_squared = {
+      0x00004b79, 0xa27e0000, 0xa0008e35, 0xbae96db2, 0x82ebf7b1, 0x4aaf1d22, 0x7224cb3d, 0x7908fd92,
+      0x29b17ed1, 0x6fe68290, 0xafc968db, 0xfe1b7282, 0x9028bbf0, 0xe1e548cb, 0x3a8ffc03, 0x09094ed6,
+      0x61e9cf95, 0xd63ea631, 0x54918abf, 0xe834ca62, 0x52aa651e, 0xe52594ed, 0xb4c46a4f, 0xe2423252,
+      0x6c09aae4, 0xa8cf17d8, 0xc5f5cee5, 0x2d80ffb0, 0x55bbc10d, 0x2dede100, 0xe2360382, 0x1f4e7a7c,
+      0xae2fe433, 0x586c3847, 0x78eadae1, 0x915c56e1, 0x69a5ce00, 0xa35b2945, 0x767c08ca, 0x9d66e7fe,
+      0xd8b88c77, 0x7e44cf6a, 0x67c9c873, 0xb29bfc93, 0xbbc80af9, 0x6a24005a, 0xc64ce3d5, 0x00014a92};
+    static constexpr storage<2 * limbs_count> modulus_squared_2 = {
+      0x000096f2, 0x44fc0000, 0x40011c6b, 0x75d2db65, 0x05d7ef63, 0x955e3a45, 0xe449967a, 0xf211fb24,
+      0x5362fda2, 0xdfcd0520, 0x5f92d1b6, 0xfc36e505, 0x205177e1, 0xc3ca9197, 0x751ff807, 0x12129dac,
+      0xc3d39f2a, 0xac7d4c62, 0xa923157f, 0xd06994c4, 0xa554ca3d, 0xca4b29da, 0x6988d49f, 0xc48464a5,
+      0xd81355c9, 0x519e2fb0, 0x8beb9dcb, 0x5b01ff61, 0xab77821a, 0x5bdbc200, 0xc46c0704, 0x3e9cf4f9,
+      0x5c5fc866, 0xb0d8708f, 0xf1d5b5c2, 0x22b8adc2, 0xd34b9c01, 0x46b6528a, 0xecf81195, 0x3acdcffc,
+      0xb17118ef, 0xfc899ed5, 0xcf9390e6, 0x6537f926, 0x779015f3, 0xd44800b5, 0x8c99c7aa, 0x00029525};
+    static constexpr storage<2 * limbs_count> modulus_squared_4 = {
+      0x00012de4, 0x89f80000, 0x800238d6, 0xeba5b6ca, 0x0bafdec6, 0x2abc748a, 0xc8932cf5, 0xe423f649,
+      0xa6c5fb45, 0xbf9a0a40, 0xbf25a36d, 0xf86dca0a, 0x40a2efc3, 0x8795232e, 0xea3ff00f, 0x24253b58,
+      0x87a73e54, 0x58fa98c5, 0x52462aff, 0xa0d32989, 0x4aa9947b, 0x949653b5, 0xd311a93f, 0x8908c94a,
+      0xb026ab93, 0xa33c5f61, 0x17d73b96, 0xb603fec3, 0x56ef0434, 0xb7b78401, 0x88d80e08, 0x7d39e9f3,
+      0xb8bf90cc, 0x61b0e11e, 0xe3ab6b85, 0x45715b85, 0xa6973802, 0x8d6ca515, 0xd9f0232a, 0x759b9ff9,
+      0x62e231de, 0xf9133dab, 0x9f2721cd, 0xca6ff24d, 0xef202be6, 0xa890016a, 0x19338f55, 0x00052a4b};
+    static constexpr storage<limbs_count> m = {0x2507e899, 0x11629ccd, 0x2e4424dd, 0xab1eef5b, 0x481d2cfa, 0xb82146a9,
+                                               0x34e4227b, 0xf3182afa, 0xbeb25621, 0xf615fdb5, 0xccc261d6, 0xc4d8988c,
+                                               0xaaf4fab0, 0x3590d652, 0x2ab9ff30, 0x9c5d0a04, 0x6ec3f460, 0xf6e8534f,
+                                               0x88075ab4, 0xe8d78b06, 0x6f3fc8fe, 0xa8d3675b, 0x7bc5cd4b, 0x03852086};
+    static constexpr storage<limbs_count> one = {
+      0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+      0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+      0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> zero = {
+      0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+      0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+      0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> montgomery_r = {
+      0xffff85d5, 0x0202ffff, 0x8fff8ce7, 0x5a582635, 0x827faade, 0x9e996e43, 0x0ee47df4, 0xda6aff32,
+      0x1d94b80b, 0xece9cb3e, 0x5248240b, 0xc0e667a2, 0xdcad3905, 0xa74da5bf, 0x462f2103, 0x2352e7fe,
+      0x08b1c87c, 0x7b565880, 0xe711022f, 0x45848a63, 0x9f65a9df, 0xd7a81ebb, 0xf127e87d, 0x0051f77e};
+    static constexpr storage<limbs_count> montgomery_r_inv = {
+      0x181fa3f1, 0x27c2b2a0, 0x25a0e1b8, 0x7d9ca9f9, 0x0a004a5d, 0x35a910f0, 0xdb6b8539, 0x54655b3f,
+      0x7695ef18, 0x5e763565, 0x4fae56bb, 0x226022c2, 0xb70d7652, 0x80e7f067, 0x72116b89, 0x435a8b4a,
+      0x5d84e0d4, 0xac258fd6, 0x4427c7b2, 0x47ee8ac5, 0xd04e621b, 0x478c4048, 0x2add3e93, 0x00e0aa7d};
+  };
+
+  // G1 and G2 generators
+  static constexpr storage<fq_config::limbs_count> g1_gen_x = {
+    0x66e5b43d, 0x4088f3af, 0xa6af603f, 0x055928ac, 0x56133e82, 0x6750dd03, 0x280ca27f, 0x03758f9a,
+    0xc9ea0971, 0x5bd71fa0, 0x47729b90, 0xa17a54ce, 0x94c2e746, 0x11dbfcd2, 0xc15520ac, 0x79017ffa,
+    0x85f56fc7, 0xee05c54b, 0x551b27f0, 0xe6a0cfb7, 0xa477beae, 0xb277ce98, 0x0ea190c8, 0x01075b02};
+  static constexpr storage<fq_config::limbs_count> g1_gen_y = {
+    0xb4e95363, 0xbafc8f2d, 0x0b20d2a1, 0xad1cb2be, 0xcad0fb93, 0xb2b08119, 0xb3053253, 0x9f9df141,
+    0x6fc2cdd4, 0xbe3fb90b, 0x717a4c55, 0xcc685d31, 0x71b5b806, 0xc5b8fa17, 0xaf7e0dba, 0x265909f1,
+    0xa2e573a3, 0x1a7348d2, 0x884c9ec6, 0x0f952589, 0x45cc2a42, 0xe6fd637b, 0x0a6fc574, 0x0058b84e};
+  static constexpr storage<fq_config::limbs_count> g2_gen_x = {
+    0xcd025f1c, 0xa830c194, 0xe1bf995b, 0x6410cf4f, 0xc2ad54b0, 0x00e96efb, 0x3cd208d7, 0xce6948cb,
+    0x00e1b6ba, 0x963317a3, 0xac70e7c7, 0xc5bbcae9, 0xf09feb58, 0x734ec3f1, 0xab3da268, 0x26b41c5d,
+    0x13890f6d, 0x4c062010, 0xc5a7115f, 0xd61053aa, 0x69d660f9, 0xc852a82e, 0x41d9b816, 0x01101332};
+  static constexpr storage<fq_config::limbs_count> g2_gen_y = {
+    0x28c73b61, 0xeb70a167, 0xf9eac689, 0x91ec0594, 0x3c5a02a5, 0x58aa2d3a, 0x504affc7, 0x3ea96fcd,
+    0xffa82300, 0x8906c170, 0xd2c712b8, 0x64f293db, 0x33293fef, 0x94c97eb7, 0x0b95a59c, 0x0a1d86c8,
+    0x53ffe316, 0x81a78e27, 0xcec2181c, 0x26b7cf9a, 0xe4b6d2dc, 0x8179eb10, 0x7761369f, 0x0017c335};
+
+  static constexpr storage<fq_config::limbs_count> weierstrass_b = {
+    0x0000008a, 0xf49d0000, 0x70000082, 0xe6913e68, 0xeaf0a437, 0x160cf8ae, 0x5667a8f8, 0x98a116c2,
+    0x73ebff2e, 0x71dcd3dc, 0x12f9fd90, 0x8689c8ed, 0x25b42304, 0x03cebaff, 0xe584e919, 0x707ba638,
+    0x8087be41, 0x528275ef, 0x81d14688, 0xb926186a, 0x04faff3e, 0xd187c940, 0xfb83ce0a, 0x0122e824};
+  static constexpr storage<fq_config::limbs_count> g2_weierstrass_b = {
+    0x00000004, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+} // namespace bw6_761
+
+#endif
diff --git a/icicle/curves/curve_config.cuh b/icicle/curves/curve_config.cuh
index 2600db954..510e75db5 100644
--- a/icicle/curves/curve_config.cuh
+++ b/icicle/curves/curve_config.cuh
@@ -5,6 +5,7 @@
 #define BN254     1
 #define BLS12_381 2
 #define BLS12_377 3
+#define BW6_761   4
 
 #include "../primitives/field.cuh"
 #include "../primitives/projective.cuh"
@@ -21,21 +22,41 @@ using namespace bls12_381;
 #elif CURVE_ID == BLS12_377
 #include "bls12_377_params.cuh"
 using namespace bls12_377;
+#elif CURVE_ID == BW6_761
+#include "bls12_377_params.cuh"
+#include "bw6_761_params.cuh"
+using namespace bw6_761;
 #endif
 
 namespace curve_config {
 
+#if CURVE_ID == BW6_761
+  typedef bls12_377::fq_config fp_config;
+#endif
   typedef Field<fp_config> scalar_t;
   typedef Field<fq_config> point_field_t;
+  static constexpr point_field_t generator_x = point_field_t{g1_gen_x};
+  static constexpr point_field_t generator_y = point_field_t{g1_gen_y};
   static constexpr point_field_t b = point_field_t{weierstrass_b};
-  typedef Projective<point_field_t, scalar_t, b> projective_t;
+  typedef Projective<point_field_t, scalar_t, b, generator_x, generator_y> projective_t;
   typedef Affine<point_field_t> affine_t;
 
 #if defined(G2_DEFINED)
+#if CURVE_ID == BW6_761
+  typedef point_field_t g2_point_field_t;
+  static constexpr g2_point_field_t g2_generator_x = g2_point_field_t{g2_gen_x};
+  static constexpr g2_point_field_t g2_generator_y = g2_point_field_t{g2_gen_y};
+  static constexpr g2_point_field_t g2_b = g2_point_field_t{g2_weierstrass_b};
+#else
   typedef ExtensionField<fq_config> g2_point_field_t;
-  static constexpr g2_point_field_t b_g2 =
-    g2_point_field_t{point_field_t{weierstrass_b_g2_re}, point_field_t{weierstrass_b_g2_im}};
-  typedef Projective<g2_point_field_t, scalar_field_t, b_g2> g2_projective_t;
+  static constexpr g2_point_field_t g2_generator_x = g2_point_field_t{
+    point_field_t{g2_gen_x_re}, point_field_t{g2_gen_x_im}};
+  static constexpr g2_point_field_t g2_generator_y = g2_point_field_t{
+    point_field_t{g2_gen_y_re}, point_field_t{g2_gen_y_im}};
+  static constexpr g2_point_field_t g2_b = g2_point_field_t{
+    point_field_t{weierstrass_b_g2_re}, point_field_t{weierstrass_b_g2_im}};
+#endif
+  typedef Projective<g2_point_field_t, scalar_t, g2_b, g2_generator_x, g2_generator_y> g2_projective_t;
   typedef Affine<g2_point_field_t> g2_affine_t;
 #endif
 
diff --git a/icicle/primitives/field.cuh b/icicle/primitives/field.cuh
index f7b4d4150..d4e3affda 100644
--- a/icicle/primitives/field.cuh
+++ b/icicle/primitives/field.cuh
@@ -1,3 +1,21 @@
+/**
+ * This file contains methods for working with elements of a prime field. It is based on and evolved from Matter Labs'
+ * [Zprize
+ * submission](https://github.com/matter-labs/z-prize-msm-gpu/blob/main/bellman-cuda-rust/bellman-cuda-sys/native/ff_dispatch_st.cuh).
+ *
+ * TODO: DmytroTym: current version needs refactoring (e.g. there's no reason to have different classes Field and
+ * ff_storage among other issues). But because this is an internal file and correctness and performance are unaffected,
+ * refactoring it is low in the priority list.
+ *
+ * Documentation of methods is intended to explain inner workings to developers working on icicle. In its current state
+ * it mostly explains modular multiplication and related methods. One important quirk of modern CUDA that's affecting
+ * most methods is explained by [Niall Emmart](https://youtu.be/KAWlySN7Hm8?si=h7nzDujnvubWXeDX&t=4039). In short, when
+ * 64-bit MAD (`r = a * b + c`) instructions get compiled down to SASS (CUDA assembly) they require two-register values
+ * `r` and `c` to start from even register (e.g. `r` can live in registers 20 and 21, or 14 and 15, but not 15 and 16).
+ * This complicates implementations forcing us to segregate terms into two categories depending on their alignment.
+ * Which is where `even` and `odd` arrays across the codebase come from.
+ */
+
 #pragma once
 
 #include "../utils/host_math.cuh"
@@ -34,10 +52,6 @@ public:
     return Field{scalar};
   }
 
-  static constexpr HOST_DEVICE_INLINE Field generator_x() { return Field{CONFIG::g1_gen_x}; }
-
-  static constexpr HOST_DEVICE_INLINE Field generator_y() { return Field{CONFIG::g1_gen_y}; }
-
   static HOST_INLINE Field omega(uint32_t logn)
   {
     if (logn == 0) { return Field{CONFIG::one}; }
@@ -67,12 +81,20 @@ public:
     return Field{inv.storages[logn - 1]};
   }
 
-  static constexpr HOST_DEVICE_INLINE Field modulus() { return Field{CONFIG::modulus}; }
-
   // private:
   typedef storage<TLC> ff_storage;
   typedef storage<2 * TLC> ff_wide_storage;
 
+  /**
+   * A new addition to the config file - \f$ 2^{32 \cdot num\_limbs} - p \f$.
+   */
+  static constexpr HOST_DEVICE_INLINE ff_storage get_neg_modulus() { return CONFIG::neg_modulus; }
+
+  /**
+   * A new addition to the config file - the number of times to reduce in [reduce](@ref reduce) function.
+   */
+  static constexpr HOST_DEVICE_INLINE unsigned num_of_reductions() { return CONFIG::num_of_reductions; }
+
   static constexpr unsigned slack_bits = 32 * TLC - NBITS;
 
   struct Wide {
@@ -89,6 +111,17 @@ public:
       return out;
     }
 
+    static constexpr Field HOST_DEVICE_INLINE get_higher(const Wide& xs)
+    {
+      Field out{};
+#ifdef __CUDA_ARCH__
+#pragma unroll
+#endif
+      for (unsigned i = 0; i < TLC; i++)
+        out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i + TLC];
+      return out;
+    }
+
     static constexpr Field HOST_DEVICE_INLINE get_higher_with_slack(const Wide& xs)
     {
       Field out{};
@@ -98,10 +131,10 @@ public:
       for (unsigned i = 0; i < TLC; i++) {
 #ifdef __CUDA_ARCH__
         out.limbs_storage.limbs[i] =
-          __funnelshift_lc(xs.limbs_storage.limbs[i + TLC - 1], xs.limbs_storage.limbs[i + TLC], slack_bits);
+          __funnelshift_lc(xs.limbs_storage.limbs[i + TLC - 1], xs.limbs_storage.limbs[i + TLC], 2 * slack_bits);
 #else
-        out.limbs_storage.limbs[i] =
-          (xs.limbs_storage.limbs[i + TLC] << slack_bits) + (xs.limbs_storage.limbs[i + TLC - 1] >> (32 - slack_bits));
+        out.limbs_storage.limbs[i] = (xs.limbs_storage.limbs[i + TLC] << 2 * slack_bits) +
+                                     (xs.limbs_storage.limbs[i + TLC - 1] >> (32 - 2 * slack_bits));
 #endif
       }
       return out;
@@ -143,7 +176,7 @@ public:
     }
   };
 
-  // return modulus
+  // return modulus multiplied by 1, 2 or 4
   template <unsigned MULTIPLIER = 1>
   static constexpr HOST_DEVICE_INLINE ff_storage get_modulus()
   {
@@ -184,27 +217,31 @@ public:
     }
   }
 
-  // add or subtract limbs
   template <bool SUBTRACT, bool CARRY_OUT>
-  static constexpr DEVICE_INLINE uint32_t
-  add_sub_limbs_device(const ff_storage& xs, const ff_storage& ys, ff_storage& rs)
+  static constexpr __device__ __forceinline__ uint32_t
+  add_sub_u32_device(const uint32_t* x, const uint32_t* y, uint32_t* r, size_t n = (TLC >> 1))
   {
-    const uint32_t* x = xs.limbs;
-    const uint32_t* y = ys.limbs;
-    uint32_t* r = rs.limbs;
     r[0] = SUBTRACT ? ptx::sub_cc(x[0], y[0]) : ptx::add_cc(x[0], y[0]);
-#ifdef __CUDA_ARCH__
-#pragma unroll
-#endif
-    for (unsigned i = 1; i < (CARRY_OUT ? TLC : TLC - 1); i++)
+    for (unsigned i = 1; i < (CARRY_OUT ? n : n - 1); i++)
       r[i] = SUBTRACT ? ptx::subc_cc(x[i], y[i]) : ptx::addc_cc(x[i], y[i]);
     if (!CARRY_OUT) {
-      r[TLC - 1] = SUBTRACT ? ptx::subc(x[TLC - 1], y[TLC - 1]) : ptx::addc(x[TLC - 1], y[TLC - 1]);
+      r[n - 1] = SUBTRACT ? ptx::subc(x[n - 1], y[n - 1]) : ptx::addc(x[n - 1], y[n - 1]);
       return 0;
     }
     return SUBTRACT ? ptx::subc(0, 0) : ptx::addc(0, 0);
   }
 
+  // add or subtract limbs
+  template <bool SUBTRACT, bool CARRY_OUT>
+  static constexpr DEVICE_INLINE uint32_t
+  add_sub_limbs_device(const ff_storage& xs, const ff_storage& ys, ff_storage& rs)
+  {
+    const uint32_t* x = xs.limbs;
+    const uint32_t* y = ys.limbs;
+    uint32_t* r = rs.limbs;
+    return add_sub_u32_device<SUBTRACT, CARRY_OUT>(x, y, r, TLC);
+  }
+
   template <bool SUBTRACT, bool CARRY_OUT>
   static constexpr DEVICE_INLINE uint32_t
   add_sub_limbs_device(const ff_wide_storage& xs, const ff_wide_storage& ys, ff_wide_storage& rs)
@@ -212,17 +249,7 @@ public:
     const uint32_t* x = xs.limbs;
     const uint32_t* y = ys.limbs;
     uint32_t* r = rs.limbs;
-    r[0] = SUBTRACT ? ptx::sub_cc(x[0], y[0]) : ptx::add_cc(x[0], y[0]);
-#ifdef __CUDA_ARCH__
-#pragma unroll
-#endif
-    for (unsigned i = 1; i < (CARRY_OUT ? 2 * TLC : 2 * TLC - 1); i++)
-      r[i] = SUBTRACT ? ptx::subc_cc(x[i], y[i]) : ptx::addc_cc(x[i], y[i]);
-    if (!CARRY_OUT) {
-      r[2 * TLC - 1] = SUBTRACT ? ptx::subc(x[2 * TLC - 1], y[2 * TLC - 1]) : ptx::addc(x[2 * TLC - 1], y[2 * TLC - 1]);
-      return 0;
-    }
-    return SUBTRACT ? ptx::subc(0, 0) : ptx::addc(0, 0);
+    return add_sub_u32_device<SUBTRACT, CARRY_OUT>(x, y, r, 2 * TLC);
   }
 
   template <bool SUBTRACT, bool CARRY_OUT>
@@ -252,16 +279,6 @@ public:
     return CARRY_OUT ? carry : 0;
   }
 
-  static constexpr HOST_INLINE uint32_t
-  sub_limbs_partial_host(uint32_t* x, uint32_t* y, uint32_t* r, uint32_t num_limbs)
-  {
-    uint32_t carry = 0;
-    host_math::carry_chain<2 * TLC, false, true> chain;
-    for (unsigned i = 0; i < num_limbs; i++)
-      r[i] = chain.sub(x[i], y[i], carry);
-    return carry;
-  }
-
   template <bool CARRY_OUT, typename T>
   static constexpr HOST_DEVICE_INLINE uint32_t add_limbs(const T& xs, const T& ys, T& rs)
   {
@@ -300,12 +317,14 @@ public:
     }
   }
 
-  static DEVICE_INLINE void cmad_n(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
+  template <bool CARRY_IN = false>
+  static __device__ __forceinline__ void
+  cmad_n(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC, uint32_t optional_carry = 0)
   {
-    // multiply scalar by vector
-    // acc = acc + bi*A[::2]
-    acc[0] = ptx::mad_lo_cc(a[0], bi, acc[0]);
+    if (CARRY_IN) ptx::add_cc(UINT32_MAX, optional_carry);
+    acc[0] = CARRY_IN ? ptx::madc_lo_cc(a[0], bi, acc[0]) : ptx::mad_lo_cc(a[0], bi, acc[0]);
     acc[1] = ptx::madc_hi_cc(a[0], bi, acc[1]);
+
 #pragma unroll
     for (size_t i = 2; i < n; i += 2) {
       acc[i] = ptx::madc_lo_cc(a[i], bi, acc[i]);
@@ -313,320 +332,277 @@ public:
     }
   }
 
-  static DEVICE_INLINE void
-  cmad_n_msb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC, size_t a_start_idx = 0)
+  template <bool EVEN_PHASE>
+  static __device__ __forceinline__ void cmad_n_msb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
   {
-    // multiply scalar by vector
-    // acc = acc + bi*A[::2]
-    acc[a_start_idx] = ptx::mad_lo_cc(a[a_start_idx], bi, acc[a_start_idx]);
-    acc[a_start_idx + 1] = ptx::madc_hi_cc(a[a_start_idx], bi, acc[a_start_idx + 1]);
+    if (EVEN_PHASE) {
+      acc[0] = ptx::mad_lo_cc(a[0], bi, acc[0]);
+      acc[1] = ptx::madc_hi_cc(a[0], bi, acc[1]);
+    } else {
+      acc[1] = ptx::mad_hi_cc(a[0], bi, acc[1]);
+    }
+
 #pragma unroll
-    for (size_t i = a_start_idx + 2; i < n; i += 2) {
+    for (size_t i = 2; i < n; i += 2) {
       acc[i] = ptx::madc_lo_cc(a[i], bi, acc[i]);
       acc[i + 1] = ptx::madc_hi_cc(a[i], bi, acc[i + 1]);
     }
   }
 
-  static DEVICE_INLINE void mad_row(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC)
+  static __device__ __forceinline__ void cmad_n_lsb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
   {
-    // odd = odd + bi*A
-    // even = even + bi*A
-    cmad_n(odd, a + 1, bi, n - 2);
-    odd[n - 2] = ptx::madc_lo_cc(a[n - 1], bi, 0);
-    odd[n - 1] = ptx::madc_hi(a[n - 1], bi, 0);
+    if (n > 1)
+      acc[0] = ptx::mad_lo_cc(a[0], bi, acc[0]);
+    else
+      acc[0] = ptx::mad_lo(a[0], bi, acc[0]);
+
+    size_t i;
+#pragma unroll
+    for (i = 1; i < n - 1; i += 2) {
+      acc[i] = ptx::madc_hi_cc(a[i - 1], bi, acc[i]);
+      if (i == n - 2)
+        acc[i + 1] = ptx::madc_lo(a[i + 1], bi, acc[i + 1]);
+      else
+        acc[i + 1] = ptx::madc_lo_cc(a[i + 1], bi, acc[i + 1]);
+    }
+    if (i == n - 1) acc[i] = ptx::madc_hi(a[i - 1], bi, acc[i]);
+  }
+
+  template <bool CARRY_OUT = false, bool CARRY_IN = false>
+  static __device__ __forceinline__ uint32_t mad_row(
+    uint32_t* odd,
+    uint32_t* even,
+    const uint32_t* a,
+    uint32_t bi,
+    size_t n = TLC,
+    uint32_t ci = 0,
+    uint32_t di = 0,
+    uint32_t carry_for_high = 0,
+    uint32_t carry_for_low = 0)
+  {
+    cmad_n<CARRY_IN>(odd, a + 1, bi, n - 2, carry_for_low);
+    odd[n - 2] = ptx::madc_lo_cc(a[n - 1], bi, ci);
+    odd[n - 1] = CARRY_OUT ? ptx::madc_hi_cc(a[n - 1], bi, di) : ptx::madc_hi(a[n - 1], bi, di);
+    uint32_t cr = CARRY_OUT ? ptx::addc(0, 0) : 0;
     cmad_n(even, a, bi, n);
-    odd[n - 1] = ptx::addc(odd[n - 1], 0);
+    if (CARRY_OUT) {
+      odd[n - 1] = ptx::addc_cc(odd[n - 1], carry_for_high);
+      cr = ptx::addc(cr, 0);
+    } else
+      odd[n - 1] = ptx::addc(odd[n - 1], carry_for_high);
+    return cr;
   }
 
-  static DEVICE_INLINE void
-  mad_row_msb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC, size_t a_start_idx = 0)
+  template <bool EVEN_PHASE>
+  static __device__ __forceinline__ void
+  mad_row_msb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC)
   {
-    // odd = odd + bi*A
-    // even = even + bi*A
-    cmad_n_msb(odd, a + 1, bi, n - 2, a_start_idx - 1);
-    odd[n - 2] = ptx::madc_lo_cc(a[n - 1], bi, 0);
-    odd[n - 1] = ptx::madc_hi(a[n - 1], bi, 0);
-    cmad_n_msb(even, a, bi, n, a_start_idx);
-    odd[n - 1] = ptx::addc(odd[n - 1], 0);
+    cmad_n_msb<!EVEN_PHASE>(odd, EVEN_PHASE ? a : (a + 1), bi, n - 2);
+    odd[EVEN_PHASE ? (n - 1) : (n - 2)] = ptx::madc_lo_cc(a[n - 1], bi, 0);
+    odd[EVEN_PHASE ? n : (n - 1)] = ptx::madc_hi(a[n - 1], bi, 0);
+    cmad_n_msb<EVEN_PHASE>(even, EVEN_PHASE ? (a + 1) : a, bi, n - 1);
+    odd[EVEN_PHASE ? n : (n - 1)] = ptx::addc(odd[EVEN_PHASE ? n : (n - 1)], 0);
   }
 
-  static DEVICE_INLINE void multiply_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
+  static __device__ __forceinline__ void
+  mad_row_lsb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC)
   {
-    const uint32_t* a = as.limbs;
-    const uint32_t* b = bs.limbs;
-    uint32_t* even = rs.limbs;
-    __align__(8) uint32_t odd[2 * TLC - 2];
-    mul_n(even, a, b[0]);
-    mul_n(odd, a + 1, b[0]);
-    mad_row(&even[2], &odd[0], a, b[1]);
-    size_t i;
-#pragma unroll
-    for (i = 2; i < TLC - 1; i += 2) {
-      mad_row(&odd[i], &even[i], a, b[i]);
-      mad_row(&even[i + 2], &odd[i], a, b[i + 1]);
+    // bi here is constant so we can do a compile-time check for zero (which does happen once for bls12-381 scalar field
+    // modulus)
+    if (bi != 0) {
+      if (n > 1) cmad_n_lsb(odd, a + 1, bi, n - 1);
+      cmad_n_lsb(even, a, bi, n);
     }
-    // merge |even| and |odd|
-    even[1] = ptx::add_cc(even[1], odd[0]);
-    for (i = 1; i < 2 * TLC - 2; i++)
-      even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]);
-    even[i + 1] = ptx::addc(even[i + 1], 0);
+    return;
   }
 
-  static DEVICE_INLINE void mult_no_carry(uint32_t a, uint32_t b, uint32_t* r)
+  static __device__ __forceinline__ uint32_t
+  mul_n_and_add(uint32_t* acc, const uint32_t* a, uint32_t bi, uint32_t* extra, size_t n = (TLC >> 1))
   {
-    r[0] = ptx::mul_lo(a, b);
-    r[1] = ptx::mul_hi(a, b);
-  }
+    acc[0] = ptx::mad_lo_cc(a[0], bi, extra[0]);
 
-  static DEVICE_INLINE void ingo_multiply_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
+#pragma unroll
+    for (size_t i = 1; i < n - 1; i += 2) {
+      acc[i] = ptx::madc_hi_cc(a[i - 1], bi, extra[i]);
+      acc[i + 1] = ptx::madc_lo_cc(a[i + 1], bi, extra[i + 1]);
+    }
+
+    acc[n - 1] = ptx::madc_hi_cc(a[n - 2], bi, extra[n - 1]);
+    return ptx::addc(0, 0);
+  }
+
+  /**
+   * A function that computes wide product \f$ rs = as \cdot bs \f$ that's correct for the higher TLC + 1 limbs with a
+   * small maximum error.
+   *
+   * The way this function saves computations (as compared to regular school-book multiplication) is by not including
+   * terms that are too small. Namely, limb product \f$ a_i \cdot b_j \f$ is excluded if \f$ i + j < TLC - 2 \f$ and
+   * only the higher half is included if \f$ i + j = TLC - 2 \f$. All other limb products are included. So, the error
+   * i.e. difference between true product and the result of this function written to `rs` is exactly the sum of all
+   * dropped limbs products, which we can bound: \f$ a_0 \cdot b_0 + 2^{32}(a_0 \cdot b_1 + a_1 \cdot b_0) + \dots +
+   * 2^{32(TLC - 3)}(a_{TLC - 3} \cdot b_0 + \dots + a_0 \cdot b_{TLC - 3}) + 2^{32(TLC - 2)}(\lfloor \frac{a_{TLC - 2}
+   * \cdot b_0}{2^{32}} \rfloor + \dots + \lfloor \frac{a_0 \cdot b_{TLC - 2}}{2^{32}} \rfloor) \leq 2^{64} + 2\cdot 2^{96} + \dots +
+   * (TLC - 2) \cdot 2^{32(TLC - 1)} + (TLC - 1) \cdot 2^{32(TLC - 1)} \leq 2(TLC - 1) \cdot 2^{32(TLC - 1)}\f$.
+   */
+  static __device__ __forceinline__ void
+  multiply_msb_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
   {
     const uint32_t* a = as.limbs;
     const uint32_t* b = bs.limbs;
-    uint32_t* r = rs.limbs;
-    uint32_t i, j;
     uint32_t* even = rs.limbs;
-    __align__(8) uint32_t odd[2 * TLC];
-    for (uint32_t i = 0; i < 2 * TLC; i++) {
-      even[i] = 0;
-      odd[i] = 0;
-    }
-    // first row special case, no carry in no carry out. split to non parts, even and odd.
-    for (i = 0; i < TLC - 1; i += 2) {
-      mult_no_carry(b[0], a[i], &even[i]);
-      mult_no_carry(b[0], a[i + 1], &odd[i]);
-    }
-
-    // doing two rows at one loop
-    for (i = 1; i < TLC - 1; i += 2) {
-      // odd bi's
-      // multiply accumulate even part of new row with odd part prev row (needs a carry)
-      // // j = 0, no carry in, only carry out
-      odd[i - 1] = ptx::mad_lo_cc(a[0], b[i], odd[i - 1]);
-      odd[i] = ptx::madc_hi_cc(a[0], b[i], odd[i]);
-      // for loop carry in carry out
-      for (j = 2; j < TLC; j += 2) // 2, 4, 6
-      {
-        odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]);
-        odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]);
-      }
-      odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry
-
-      // multiply accumulate odd part of new row with even part prev row (doesnt need a carry)
-      // j = 1, no carry in, only carry out
-      even[i + 1] = ptx::mad_lo_cc(a[1], b[i], even[i + 1]);
-      even[i + 2] = ptx::madc_hi_cc(a[1], b[i], even[i + 2]);
-      // for loop carry in carry out
-      for (j = 3; j < TLC; j += 2) {
-        even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]);
-        even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]);
-      }
-
-      // even bi's
-      // multiply accumulate even part of new row with even part of prev row // needs a carry
-      // j = 0, no carry in, only carry out
-      even[i + 1] = ptx::mad_lo_cc(a[0], b[i + 1], even[i + 1]);
-      even[i + 2] = ptx::madc_hi_cc(a[0], b[i + 1], even[i + 2]);
-      // for loop, carry in, carry out.
-      for (j = 2; j < TLC; j += 2) {
-        even[i + j + 1] = ptx::madc_lo_cc(a[j], b[i + 1], even[i + j + 1]);
-        even[i + j + 2] = ptx::madc_hi_cc(a[j], b[i + 1], even[i + j + 2]);
-      }
-      even[i + j + 1] = ptx::addc(even[i + j + 1], 0); // handling last carry
-
-      // multiply accumulate odd part of new row with odd part of prev row
-      // j = 1, no carry in, only carry out
-      odd[i + 1] = ptx::mad_lo_cc(a[1], b[i + 1], odd[i + 1]);
-      odd[i + 2] = ptx::madc_hi_cc(a[1], b[i + 1], odd[i + 2]);
-      // for loop, carry in, carry out.
-      for (j = 3; j < TLC; j += 2) {
-        odd[i + j] = ptx::madc_lo_cc(a[j], b[i + 1], odd[i + j]);
-        odd[i + j + 1] = ptx::madc_hi_cc(a[j], b[i + 1], odd[i + j + 1]);
-      }
-    }
+    __align__(16) uint32_t odd[2 * TLC - 2];
 
-    odd[i - 1] = ptx::mad_lo_cc(a[0], b[i], odd[i - 1]);
-    odd[i] = ptx::madc_hi_cc(a[0], b[i], odd[i]);
-    // for loop carry in carry out
-    for (j = 2; j < TLC; j += 2) {
-      odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]);
-      odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]);
-    }
-    odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry
-
-    // multiply accumulate odd part of new row with even part prev row
-    // j = 1, no carry in, only carry out
-    even[i + 1] = ptx::mad_lo_cc(a[1], b[i], even[i + 1]);
-    even[i + 2] = ptx::madc_hi_cc(a[1], b[i], even[i + 2]);
-    // for loop carry in carry out
-    for (j = 3; j < TLC; j += 2) {
-      even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]);
-      even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]);
+    even[TLC - 1] = ptx::mul_hi(a[TLC - 2], b[0]);
+    odd[TLC - 2] = ptx::mul_lo(a[TLC - 1], b[0]);
+    odd[TLC - 1] = ptx::mul_hi(a[TLC - 1], b[0]);
+    size_t i;
+#pragma unroll
+    for (i = 2; i < TLC - 1; i += 2) {
+      mad_row_msb<true>(&even[TLC - 2], &odd[TLC - 2], &a[TLC - i - 1], b[i - 1], i + 1);
+      mad_row_msb<false>(&odd[TLC - 2], &even[TLC - 2], &a[TLC - i - 2], b[i], i + 2);
     }
+    mad_row(&even[TLC], &odd[TLC - 2], a, b[TLC - 1]);
 
-    // add even and odd parts
-    even[1] = ptx::add_cc(even[1], odd[0]);
-    for (i = 1; i < 2 * TLC - 2; i++)
+    // merge |even| and |odd|
+    ptx::add_cc(even[TLC - 1], odd[TLC - 2]);
+    for (i = TLC - 1; i < 2 * TLC - 2; i++)
       even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]);
     even[i + 1] = ptx::addc(even[i + 1], 0);
   }
 
-  static DEVICE_INLINE void
-  ingo_msb_multiply_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
+  /**
+   * A function that computes the low half of the fused multiply-and-add \f$ rs = as \cdot bs + cs \f$.
+   *
+   * For efficiency, this method does not include terms that are too large. Namely, limb product \f$ a_i \cdot b_j \f$
+   * is excluded if \f$ i + j > TLC - 1 \f$ and only the lower half is included if \f$ i + j = TLC - 1 \f$. All other
+   * limb products are included.
+   */
+  static __device__ __forceinline__ void
+  multiply_and_add_lsb_raw_device(const ff_storage& as, const ff_storage& bs, ff_storage& cs, ff_storage& rs)
   {
     const uint32_t* a = as.limbs;
     const uint32_t* b = bs.limbs;
-    uint32_t* r = rs.limbs;
-    uint32_t i, j;
     uint32_t* even = rs.limbs;
-    __align__(8) uint32_t odd[2 * TLC];
-    for (uint32_t i = 0; i < 2 * TLC; i++) {
-      even[i] = 0;
-      odd[i] = 0;
-    }
-    // only last element from first row.
-    mult_no_carry(b[0], a[TLC - 1], &odd[TLC - 2]);
-
-// doing two rows at one loop
-#pragma unroll
-    for (i = 1; i < TLC - 1; i += 2) {
-      const uint32_t first_active_j = TLC - 1 - i;
-      const uint32_t first_active_j_odd = first_active_j + (1 - (first_active_j % 2));
-      const uint32_t first_active_j_even = first_active_j + first_active_j % 2;
-      // odd bi's
-      // multiply accumulate even part of new row with odd part prev row (needs a carry)
-      // j = 0, no carry in, only carry out
-      odd[first_active_j_even + i - 1] = ptx::mad_lo_cc(a[first_active_j_even], b[i], odd[first_active_j_even + i - 1]);
-      odd[first_active_j_even + i] = ptx::madc_hi_cc(a[first_active_j_even], b[i], odd[first_active_j_even + i]);
-// for loop carry in carry out
-#pragma unroll
-      for (j = first_active_j_even + 2; j < TLC; j += 2) {
-        odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]);
-        odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]);
-      }
-      odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry
-
-      // multiply accumulate odd part of new row with even part prev row (doesnt need a carry)
-      // j = 1, no carry in, only carry out
-      even[i + first_active_j_odd] = ptx::mad_lo_cc(a[first_active_j_odd], b[i], even[i + first_active_j_odd]);
-      even[i + first_active_j_odd + 1] = ptx::madc_hi_cc(a[first_active_j_odd], b[i], even[i + first_active_j_odd + 1]);
-// for loop carry in carry out
-#pragma unroll
-      for (j = first_active_j_odd + 2; j < TLC; j += 2) {
-        even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]);
-        even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]);
-      }
-
-      // even bi's
-      uint32_t const first_active_j1 = TLC - 1 - (i + 1);
-      uint32_t const first_active_j_odd1 = first_active_j1 + (1 - (first_active_j1 % 2));
-      uint32_t const first_active_j_even1 = first_active_j1 + first_active_j1 % 2;
-      // multiply accumulate even part of new row with even part of prev row // needs a carry
-      // j = 0, no carry in, only carry out
-      even[first_active_j_even1 + i + 1] =
-        ptx::mad_lo_cc(a[first_active_j_even1], b[i + 1], even[first_active_j_even1 + i + 1]);
-      even[first_active_j_even1 + i + 2] =
-        ptx::madc_hi_cc(a[first_active_j_even1], b[i + 1], even[first_active_j_even1 + i + 2]);
-// for loop, carry in, carry out.
-#pragma unroll
-      for (j = first_active_j_even1 + 2; j < TLC; j += 2) {
-        even[i + j + 1] = ptx::madc_lo_cc(a[j], b[i + 1], even[i + j + 1]);
-        even[i + j + 2] = ptx::madc_hi_cc(a[j], b[i + 1], even[i + j + 2]);
-      }
-      even[i + j + 1] = ptx::addc(even[i + j + 1], 0); // handling last carry
-
-      // multiply accumulate odd part of new row with odd part of prev row
-      // j = 1, no carry in, only carry out
-      odd[first_active_j_odd1 + i] = ptx::mad_lo_cc(a[first_active_j_odd1], b[i + 1], odd[first_active_j_odd1 + i]);
-      odd[first_active_j_odd1 + i + 1] =
-        ptx::madc_hi_cc(a[first_active_j_odd1], b[i + 1], odd[first_active_j_odd1 + i + 1]);
-// for loop, carry in, carry out.
+    __align__(16) uint32_t odd[TLC - 1];
+    size_t i;
+    // `b[0]` is \f$ 2^{32} \f$ minus the last limb of prime modulus. Because most scalar (and some base) primes
+    // are necessarily NTT-friendly, `b[0]` often turns out to be \f$ 2^{32} - 1 \f$. This actually leads to
+    // less efficient SASS generated by nvcc, so this case needed separate handling.
+    if (b[0] == UINT32_MAX) {
+      add_sub_u32_device<true, false>(cs.limbs, a, even, TLC);
+      for (i = 0; i < TLC - 1; i++)
+        odd[i] = a[i];
+    } else {
+      mul_n_and_add(even, a, b[0], cs.limbs, TLC);
+      mul_n(odd, a + 1, b[0], TLC - 1);
+    }
+    mad_row_lsb(&even[2], &odd[0], a, b[1], TLC - 1);
 #pragma unroll
-      for (j = first_active_j_odd1 + 2; j < TLC; j += 2) {
-        odd[i + j] = ptx::madc_lo_cc(a[j], b[i + 1], odd[i + j]);
-        odd[i + j + 1] = ptx::madc_hi_cc(a[j], b[i + 1], odd[i + j + 1]);
-      }
+    for (i = 2; i < TLC - 1; i += 2) {
+      mad_row_lsb(&odd[i], &even[i], a, b[i], TLC - i);
+      mad_row_lsb(&even[i + 2], &odd[i], a, b[i + 1], TLC - i - 1);
     }
 
-    // last round, i = TLC - 1
-    odd[i - 1] = ptx::mad_lo_cc(a[0], b[i], odd[i - 1]);
-    odd[i] = ptx::madc_hi_cc(a[0], b[i], odd[i]);
-// for loop carry in carry out
-#pragma unroll
-    for (j = 2; j < TLC; j += 2) {
-      odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]);
-      odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]);
-    }
-    odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry
+    // merge |even| and |odd|
+    even[1] = ptx::add_cc(even[1], odd[0]);
+    for (i = 1; i < TLC - 2; i++)
+      even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]);
+    even[i + 1] = ptx::addc(even[i + 1], odd[i]);
+  }
 
-    // multiply accumulate odd part of new row with even part prev row
-    // j = 1, no carry in, only carry out
-    even[i + 1] = ptx::mad_lo_cc(a[1], b[i], even[i + 1]);
-    even[i + 2] = ptx::madc_hi_cc(a[1], b[i], even[i + 2]);
-// for loop carry in carry out
-#pragma unroll
-    for (j = 3; j < TLC; j += 2) {
-      even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]);
-      even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]);
-    }
+  /**
+   * This method multiplies `a` and `b` (both assumed to have TLC / 2 limbs) and adds `in1` and `in2` (TLC limbs each)
+   * to the result which is written to `even`.
+   *
+   * It is used to compute the "middle" part of Karatsuba: \f$ a_{lo} \cdot b_{hi} + b_{lo} \cdot a_{hi} =
+   * (a_{hi} - a_{lo})(b_{lo} - b_{hi}) + a_{lo} \cdot b_{lo} + a_{hi} \cdot b_{hi} \f$. Currently this method assumes
+   * that the top bit of \f$ a_{hi} \f$ and \f$ b_{hi} \f$ are unset. This ensures correctness by allowing to keep the
+   * result inside TLC limbs and ignore the carries from the highest limb.
+   */
+  static __device__ __forceinline__ void
+  multiply_and_add_short_raw_device(const uint32_t* a, const uint32_t* b, uint32_t* even, uint32_t* in1, uint32_t* in2)
+  {
+    __align__(16) uint32_t odd[TLC - 2];
+    uint32_t first_row_carry = mul_n_and_add(even, a, b[0], in1);
+    uint32_t carry = mul_n_and_add(odd, a + 1, b[0], &in2[1]);
 
-    // add even and odd parts
-    even[1] = ptx::add_cc(even[1], odd[0]);
+    size_t i;
 #pragma unroll
-    for (i = 1; i < 2 * TLC - 2; i++)
+    for (i = 2; i < ((TLC >> 1) - 1); i += 2) {
+      carry = mad_row<true, false>(
+        &even[i], &odd[i - 2], a, b[i - 1], TLC >> 1, in1[(TLC >> 1) + i - 2], in1[(TLC >> 1) + i - 1], carry);
+      carry =
+        mad_row<true, false>(&odd[i], &even[i], a, b[i], TLC >> 1, in2[(TLC >> 1) + i - 1], in2[(TLC >> 1) + i], carry);
+    }
+    mad_row<false, true>(
+      &even[TLC >> 1], &odd[(TLC >> 1) - 2], a, b[(TLC >> 1) - 1], TLC >> 1, in1[TLC - 2], in1[TLC - 1], carry,
+      first_row_carry);
+    // merge |even| and |odd| plus the parts of `in2` we haven't added yet (first and last limbs)
+    even[0] = ptx::add_cc(even[0], in2[0]);
+    for (i = 0; i < (TLC - 2); i++)
       even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]);
-    even[i + 1] = ptx::addc(even[i + 1], 0);
+    even[i + 1] = ptx::addc(even[i + 1], in2[i + 1]);
   }
 
-  static DEVICE_INLINE void multiply_lsb_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
+  /**
+   * This method multiplies `a` and `b` and writes the result into `even`. It assumes that `a` and `b` are TLC/2 limbs
+   * long. The usual schoolbook algorithm is used.
+   */
+  static __device__ __forceinline__ void multiply_short_raw_device(const uint32_t* a, const uint32_t* b, uint32_t* even)
   {
-    // r = a * b is correcrt for the first TLC + 1 digits. (not computing from TLC + 1 to 2*TLC - 2).
-    const uint32_t* a = as.limbs;
-    const uint32_t* b = bs.limbs;
-    uint32_t* even = rs.limbs;
-    __align__(8) uint32_t odd[2 * TLC - 2];
-    mul_n(even, a, b[0]);
-    mul_n(odd, a + 1, b[0]);
-    mad_row(&even[2], &odd[0], a, b[1]);
+    __align__(16) uint32_t odd[TLC - 2];
+    mul_n(even, a, b[0], TLC >> 1);
+    mul_n(odd, a + 1, b[0], TLC >> 1);
+    mad_row(&even[2], &odd[0], a, b[1], TLC >> 1);
+
     size_t i;
 #pragma unroll
-    for (i = 2; i < TLC - 1; i += 2) {
-      mad_row(&odd[i], &even[i], a, b[i], TLC - i + 2);
-      mad_row(&even[i + 2], &odd[i], a, b[i + 1], TLC - i + 2);
+    for (i = 2; i < ((TLC >> 1) - 1); i += 2) {
+      mad_row(&odd[i], &even[i], a, b[i], TLC >> 1);
+      mad_row(&even[i + 2], &odd[i], a, b[i + 1], TLC >> 1);
     }
-
     // merge |even| and |odd|
     even[1] = ptx::add_cc(even[1], odd[0]);
-    for (i = 1; i < TLC + 1; i++)
+    for (i = 1; i < TLC - 2; i++)
       even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]);
     even[i + 1] = ptx::addc(even[i + 1], 0);
   }
 
-  static DEVICE_INLINE void multiply_msb_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
+  /**
+   * This method multiplies `as` and `bs` and writes the (wide) result into `rs`.
+   *
+   * It is assumed that the highest bits of `as` and `bs` are unset which is true for all the numbers icicle had to deal
+   * with so far. This method implements [subtractive
+   * Karatsuba](https://en.wikipedia.org/wiki/Karatsuba_algorithm#Implementation).
+   */
+  static DEVICE_INLINE void multiply_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
   {
     const uint32_t* a = as.limbs;
     const uint32_t* b = bs.limbs;
-    uint32_t* even = rs.limbs;
-    __align__(8) uint32_t odd[2 * TLC - 2];
-    for (int i = 0; i < 2 * TLC - 1; i++) {
-      even[i] = 0;
-      odd[i] = 0;
-    }
-    uint32_t min_indexes_sum = TLC - 1;
-    // only diagonal
-    mul_n_msb(even, a, b[0], TLC, min_indexes_sum);
-    mul_n_msb(odd, a + 1, b[0], TLC, min_indexes_sum - 1);
-    mad_row_msb(&even[2], &odd[0], a, b[1], TLC, min_indexes_sum - 1);
-    size_t i;
-#pragma unroll
-    for (i = 2; i < TLC - 1; i += 2) {
-      mad_row(&odd[i], &even[i], a, b[i]);
-      mad_row(&even[i + 2], &odd[i], a, b[i + 1]);
-    }
-    // merge |even| and |odd|
-    even[1] = ptx::add_cc(even[1], odd[0]);
-    for (i = 1; i < 2 * TLC - 2; i++)
-      even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]);
-    even[i + 1] = ptx::addc(even[i + 1], 0);
+    uint32_t* r = rs.limbs;
+    // Next two lines multiply high and low halves of operands (\f$ a_{lo} \cdot b_{lo}; a_{hi} \cdot b_{hi} \f$) and
+    // write the results into `r`.
+    multiply_short_raw_device(a, b, r);
+    multiply_short_raw_device(&a[TLC >> 1], &b[TLC >> 1], &r[TLC]);
+    __align__(16) uint32_t middle_part[TLC];
+    __align__(16) uint32_t diffs[TLC];
+    // Differences of halves \f$ a_{hi} - a_{lo}; b_{lo} - b_{hi} \f$ are written into `diffs`, signs written to
+    // `carry1` and `carry2`.
+    uint32_t carry1 = add_sub_u32_device<true, true>(&a[TLC >> 1], a, diffs);
+    uint32_t carry2 = add_sub_u32_device<true, true>(b, &b[TLC >> 1], &diffs[TLC >> 1]);
+    // Compute the "middle part" of Karatsuba: \f$ a_{lo} \cdot b_{hi} + b_{lo} \cdot a_{hi} \f$.
+    // This is where the assumption about unset high bit of `a` and `b` is relevant.
+    multiply_and_add_short_raw_device(diffs, &diffs[TLC >> 1], middle_part, r, &r[TLC]);
+    // Corrections that need to be performed when differences are negative.
+    // Again, carry doesn't need to be propagated due to unset high bits of `a` and `b`.
+    if (carry1) add_sub_u32_device<true, false>(&middle_part[TLC >> 1], &diffs[TLC >> 1], &middle_part[TLC >> 1]);
+    if (carry2) add_sub_u32_device<true, false>(&middle_part[TLC >> 1], diffs, &middle_part[TLC >> 1]);
+    // Now that middle part is fully correct, it can be added to the result.
+    add_sub_u32_device<false, true>(&r[TLC >> 1], middle_part, &r[TLC >> 1], TLC);
+
+    // Carry from adding middle part has to be propagated to the highest limb.
+    for (size_t i = TLC + (TLC >> 1); i < 2 * TLC; i++)
+      r[i] = ptx::addc_cc(r[i], 0);
   }
 
   static HOST_INLINE void multiply_raw_host(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
@@ -651,19 +627,23 @@ public:
 #endif
   }
 
-  static HOST_DEVICE_INLINE void multiply_raw_lsb(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
+  static HOST_DEVICE_INLINE void
+  multiply_and_add_lsb_raw(const ff_storage& as, const ff_storage& bs, ff_storage& cs, ff_storage& rs)
   {
 #ifdef __CUDA_ARCH__
-    return multiply_lsb_raw_device(as, bs, rs);
+    return multiply_and_add_lsb_raw_device(as, bs, cs, rs);
 #else
-    return multiply_raw_host(as, bs, rs);
+    Wide r_wide = {};
+    multiply_raw_host(as, bs, r_wide.limbs_storage);
+    Field r = Wide::get_lower(r_wide);
+    add_limbs<false>(cs, r.limbs_storage, rs);
 #endif
   }
 
-  static HOST_DEVICE_INLINE void multiply_raw_msb(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
+  static HOST_DEVICE_INLINE void multiply_msb_raw(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
   {
 #ifdef __CUDA_ARCH__
-    return multiply_raw_device(as, bs, rs);
+    return multiply_msb_raw_device(as, bs, rs);
 #else
     return multiply_raw_host(as, bs, rs);
 #endif
@@ -694,8 +674,8 @@ public:
     Field value{};
     for (unsigned i = 0; i < TLC; i++)
       value.limbs_storage.limbs[i] = distribution(generator);
-    while (lt(modulus(), value))
-      value = value - modulus();
+    while (lt(Field{get_modulus()}, value))
+      value = value - Field{get_modulus()};
     return value;
   }
 
@@ -752,55 +732,64 @@ public:
     return rs;
   }
 
-  static constexpr DEVICE_INLINE uint32_t
-  sub_limbs_partial_device(uint32_t* x, uint32_t* y, uint32_t* r, uint32_t num_limbs)
-  {
-    r[0] = ptx::sub_cc(x[0], y[0]);
-#pragma unroll
-    for (unsigned i = 1; i < num_limbs; i++)
-      r[i] = ptx::subc_cc(x[i], y[i]);
-    return ptx::subc(0, 0);
-  }
+  static constexpr HOST_DEVICE_INLINE Field to_montgomery(const Field& xs) { return xs * Field{CONFIG::montgomery_r}; }
 
-  static constexpr HOST_DEVICE_INLINE uint32_t
-  sub_limbs_partial(uint32_t* x, uint32_t* y, uint32_t* r, uint32_t num_limbs)
+  static constexpr HOST_DEVICE_INLINE Field from_montgomery(const Field& xs)
   {
-#ifdef __CUDA_ARCH__
-    return sub_limbs_partial_device(x, y, r, num_limbs);
-#else
-    return sub_limbs_partial_host(x, y, r, num_limbs);
-#endif
+    return xs * Field{CONFIG::montgomery_r_inv};
   }
 
+  /**
+   * This method reduces a Wide number `xs` modulo `p` and returns the result as a Field element.
+   *
+   * It is assumed that the high `2 * slack_bits` bits of `xs` are unset which is always the case for the product of 2
+   * numbers with their high `slack_bits` unset. Larger Wide numbers should be reduced by subtracting an appropriate
+   * factor of `modulus_squared` first.
+   *
+   * This function implements ["multi-precision Barrett"](https://github.com/ingonyama-zk/modular_multiplication). As
+   * opposed to Montgomery reduction, it doesn't require numbers to have a special representation but lets us work with
+   * them as-is. The general idea of Barrett reduction is to estimate the quotient \f$ l \approx \floor{\frac{xs}{p}}
+   * \f$ and return \f$ xs - l \cdot p \f$. But \f$ l \f$ is inevitably computed with an error (it's always less than
+   * or equal to the real quotient), so the modulus `p` might need to be subtracted several times before the result is
+   * in the desired range \f$ [0;p-1] \f$. The estimate of the error is as follows: \f[ \frac{xs}{p} - l = \frac{xs}{p}
+   * - \frac{xs \cdot m}{2^{2n}} + \frac{xs \cdot m}{2^{2n}} - \floor{\frac{xs}{2^k}}\frac{m}{2^{2n-k}}
+   *  + \floor{\frac{xs}{2^k}}\frac{m}{2^{2n-k}} - l \leq p^2(\frac{1}{p}-\frac{m}{2^{2n}}) + \frac{m}{2^{2n-k}} + 2(TLC
+   * - 1) \cdot 2^{-32} \f] Here \f$ l \f$ is the result of [multiply_msb_raw](@ref multiply_msb_raw) function and the
+   * last term in the error is due to its approximation. \f$ n \f$ is the number of bits in \f$ p \f$ and \f$ k = 2n -
+   * 32\cdot TLC \f$. Overall, the error is always less than 2 so at most 2 reductions are needed. However, in most
+   * cases it's less than 1, so setting the [num_of_reductions](@ref num_of_reductions) variable for a field equal to 1
+   * will cause only 1 reduction to be performed.
+   */
   template <unsigned MODULUS_MULTIPLE = 1>
   static constexpr HOST_DEVICE_INLINE Field reduce(const Wide& xs)
   {
-    Field xs_hi = Wide::get_higher_with_slack(xs); // xy << slack_bits
+    // `xs` is left-shifted by `2 * slack_bits` and higher half is written to `xs_hi`
+    Field xs_hi = Wide::get_higher_with_slack(xs);
     Wide l = {};
-    multiply_raw_msb(xs_hi.limbs_storage, get_m(), l.limbs_storage); // MSB mult
-    Field l_hi = Wide::get_higher_with_slack(l);
-    Wide lp = {};
-    multiply_raw_lsb(l_hi.limbs_storage, get_modulus(), lp.limbs_storage); // LSB mult
-    Wide r_wide = xs - lp;
-    Wide r_wide_reduced = {};
-    for (unsigned i = 0; i < TLC + 1; i++) {
-      uint32_t carry = sub_limbs_partial(
-        r_wide.limbs_storage.limbs, modulus_wide().limbs, r_wide_reduced.limbs_storage.limbs, TLC + 1);
-      if (carry == 0) // continue to reduce
-        r_wide = r_wide_reduced;
-      else // done
-        break;
-    }
+    multiply_msb_raw(xs_hi.limbs_storage, get_m(), l.limbs_storage); // MSB mult by `m`
+    Field l_hi = Wide::get_higher(l);
+    Field r = {};
+    Field xs_lo = Wide::get_lower(xs);
+    // Here we need to compute the lsb of `xs - l \cdot p` and to make use of fused multiply-and-add, we rewrite it as
+    // `xs + l \cdot (2^{32 \cdot TLC}-p)` which is the same as original (up to higher limbs which we don't care about).
+    multiply_and_add_lsb_raw(l_hi.limbs_storage, get_neg_modulus(), xs_lo.limbs_storage, r.limbs_storage);
+    ff_storage r_reduced = {};
+    uint32_t carry;
+    // As mentioned, either 2 or 1 reduction can be performed depending on the field in question.
+    if (num_of_reductions() == 2) {
+      carry = sub_limbs<true>(r.limbs_storage, get_modulus<2>(), r_reduced);
+      if (carry == 0) r = Field{r_reduced};
+    }
+    carry = sub_limbs<true>(r.limbs_storage, get_modulus<1>(), r_reduced);
+    if (carry == 0) r = Field{r_reduced};
 
-    // number of wrap around is bounded by TLC +  1 times.
-    Field r = Wide::get_lower(r_wide);
     return r;
   }
 
   friend HOST_DEVICE_INLINE Field operator*(const Field& xs, const Field& ys)
   {
     Wide xy = mul_wide(xs, ys); // full mult
-    return reduce(xy);
+    return reduce(xy);          // reduce mod p
   }
 
   friend HOST_DEVICE_INLINE bool operator==(const Field& xs, const Field& ys)
@@ -949,3 +938,16 @@ public:
     return (u == one) ? b : c;
   }
 };
+
+template <class CONFIG>
+struct std::hash<Field<CONFIG>>
+{
+  std::size_t operator()(const Field<CONFIG>& key) const
+  {
+    std::size_t hash = 0;
+    // boost hashing, see https://stackoverflow.com/questions/35985960/c-why-is-boosthash-combine-the-best-way-to-combine-hash-values/35991300#35991300
+    for (int i = 0; i < CONFIG::limbs_count; i++)
+      hash ^= std::hash<uint32_t>()(key.limbs_storage.limbs[i]) + 0x9e3779b9 + (hash<<6) + (hash>>2);
+    return hash;
+  }
+};
diff --git a/icicle/primitives/projective.cuh b/icicle/primitives/projective.cuh
index bc0ca067f..4aa81609b 100644
--- a/icicle/primitives/projective.cuh
+++ b/icicle/primitives/projective.cuh
@@ -2,7 +2,7 @@
 
 #include "affine.cuh"
 
-template <typename FF, class SCALAR_FF, const FF& B_VALUE>
+template <typename FF, class SCALAR_FF, const FF& B_VALUE, const FF& GENERATOR_X, const FF& GENERATOR_Y>
 class Projective
 {
   friend Affine<FF>;
@@ -32,7 +32,7 @@ public:
     return {FF::FromMontgomery(point.x), FF::FromMontgomery(point.y), FF::FromMontgomery(point.z)};
   }
 
-  static HOST_DEVICE_INLINE Projective generator() { return {FF::generator_x(), FF::generator_y(), FF::one()}; }
+  static HOST_DEVICE_INLINE Projective generator() { return {GENERATOR_X, GENERATOR_Y, FF::one()}; }
 
   static HOST_DEVICE_INLINE Projective neg(const Projective& point) { return {point.x, FF::neg(point.y), point.z}; }
 
diff --git a/icicle/primitives/test.cu b/icicle/primitives/test.cu
index d76dcc8b0..fb6cd7729 100644
--- a/icicle/primitives/test.cu
+++ b/icicle/primitives/test.cu
@@ -30,22 +30,22 @@ protected:
   projective_t* points2{};
   g2_projective_t* g2_points1{};
   g2_projective_t* g2_points2{};
-  scalar_field_t* scalars1{};
-  scalar_field_t* scalars2{};
+  scalar_t* scalars1{};
+  scalar_t* scalars2{};
   projective_t* zero_points{};
   g2_projective_t* g2_zero_points{};
-  scalar_field_t* zero_scalars{};
-  scalar_field_t* one_scalars{};
+  scalar_t* zero_scalars{};
+  scalar_t* one_scalars{};
   affine_t* aff_points{};
   g2_affine_t* g2_aff_points{};
   projective_t* res_points1{};
   projective_t* res_points2{};
   g2_projective_t* g2_res_points1{};
   g2_projective_t* g2_res_points2{};
-  scalar_field_t* res_scalars1{};
-  scalar_field_t* res_scalars2{};
-  scalar_field_t::Wide* res_scalars_wide{};
-  scalar_field_t::Wide* res_scalars_wide_full{};
+  scalar_t* res_scalars1{};
+  scalar_t* res_scalars2{};
+  scalar_t::Wide* res_scalars_wide{};
+  scalar_t::Wide* res_scalars_wide_full{};
 
   PrimitivesTest()
   {
@@ -54,22 +54,20 @@ protected:
     assert(!cudaMallocManaged(&points2, n * sizeof(projective_t)));
     assert(!cudaMallocManaged(&g2_points1, n * sizeof(g2_projective_t)));
     assert(!cudaMallocManaged(&g2_points2, n * sizeof(g2_projective_t)));
-    assert(!cudaMallocManaged(&scalars1, n * sizeof(scalar_field_t)));
-    assert(!cudaMallocManaged(&scalars2, n * sizeof(scalar_field_t)));
+    assert(!cudaMallocManaged(&scalars1, n * sizeof(scalar_t)));
+    assert(!cudaMallocManaged(&scalars2, n * sizeof(scalar_t)));
     assert(!cudaMallocManaged(&zero_points, n * sizeof(projective_t)));
     assert(!cudaMallocManaged(&g2_zero_points, n * sizeof(g2_projective_t)));
-    assert(!cudaMallocManaged(&zero_scalars, n * sizeof(scalar_field_t)));
-    assert(!cudaMallocManaged(&one_scalars, n * sizeof(scalar_field_t)));
+    assert(!cudaMallocManaged(&zero_scalars, n * sizeof(scalar_t)));
+    assert(!cudaMallocManaged(&one_scalars, n * sizeof(scalar_t)));
     assert(!cudaMallocManaged(&aff_points, n * sizeof(affine_t)));
     assert(!cudaMallocManaged(&g2_aff_points, n * sizeof(g2_affine_t)));
     assert(!cudaMallocManaged(&res_points1, n * sizeof(projective_t)));
     assert(!cudaMallocManaged(&res_points2, n * sizeof(projective_t)));
     assert(!cudaMallocManaged(&g2_res_points1, n * sizeof(g2_projective_t)));
     assert(!cudaMallocManaged(&g2_res_points2, n * sizeof(g2_projective_t)));
-    assert(!cudaMallocManaged(&res_scalars1, n * sizeof(scalar_field_t)));
-    assert(!cudaMallocManaged(&res_scalars2, n * sizeof(scalar_field_t)));
-    assert(!cudaMallocManaged(&res_scalars_wide, n * sizeof(scalar_field_t::Wide)));
-    assert(!cudaMallocManaged(&res_scalars_wide_full, n * sizeof(scalar_field_t::Wide)));
+    assert(!cudaMallocManaged(&res_scalars1, n * sizeof(scalar_t)));
+    assert(!cudaMallocManaged(&res_scalars2, n * sizeof(scalar_t)));
   }
 
   ~PrimitivesTest() override
@@ -93,9 +91,6 @@ protected:
     cudaFree(res_scalars1);
     cudaFree(res_scalars2);
 
-    cudaFree(res_scalars_wide);
-    cudaFree(res_scalars_wide_full);
-
     cudaDeviceReset();
   }
 
@@ -105,22 +100,20 @@ protected:
     ASSERT_EQ(device_populate_random<projective_t>(points2, n), cudaSuccess);
     ASSERT_EQ(device_populate_random<g2_projective_t>(g2_points1, n), cudaSuccess);
     ASSERT_EQ(device_populate_random<g2_projective_t>(g2_points2, n), cudaSuccess);
-    ASSERT_EQ(device_populate_random<scalar_field_t>(scalars1, n), cudaSuccess);
-    ASSERT_EQ(device_populate_random<scalar_field_t>(scalars2, n), cudaSuccess);
+    ASSERT_EQ(device_populate_random<scalar_t>(scalars1, n), cudaSuccess);
+    ASSERT_EQ(device_populate_random<scalar_t>(scalars2, n), cudaSuccess);
     ASSERT_EQ(device_set<projective_t>(zero_points, projective_t::zero(), n), cudaSuccess);
     ASSERT_EQ(device_set<g2_projective_t>(g2_zero_points, g2_projective_t::zero(), n), cudaSuccess);
-    ASSERT_EQ(device_set<scalar_field_t>(zero_scalars, scalar_field_t::zero(), n), cudaSuccess);
-    ASSERT_EQ(device_set<scalar_field_t>(one_scalars, scalar_field_t::one(), n), cudaSuccess);
+    ASSERT_EQ(device_set<scalar_t>(zero_scalars, scalar_t::zero(), n), cudaSuccess);
+    ASSERT_EQ(device_set<scalar_t>(one_scalars, scalar_t::one(), n), cudaSuccess);
     ASSERT_EQ(cudaMemset(aff_points, 0, n * sizeof(affine_t)), cudaSuccess);
     ASSERT_EQ(cudaMemset(g2_aff_points, 0, n * sizeof(g2_affine_t)), cudaSuccess);
     ASSERT_EQ(cudaMemset(res_points1, 0, n * sizeof(projective_t)), cudaSuccess);
     ASSERT_EQ(cudaMemset(res_points2, 0, n * sizeof(projective_t)), cudaSuccess);
     ASSERT_EQ(cudaMemset(g2_res_points1, 0, n * sizeof(g2_projective_t)), cudaSuccess);
     ASSERT_EQ(cudaMemset(g2_res_points2, 0, n * sizeof(g2_projective_t)), cudaSuccess);
-    ASSERT_EQ(cudaMemset(res_scalars1, 0, n * sizeof(scalar_field_t)), cudaSuccess);
-    ASSERT_EQ(cudaMemset(res_scalars2, 0, n * sizeof(scalar_field_t)), cudaSuccess);
-    ASSERT_EQ(cudaMemset(res_scalars_wide, 0, n * sizeof(scalar_field_t::Wide)), cudaSuccess);
-    ASSERT_EQ(cudaMemset(res_scalars_wide_full, 0, n * sizeof(scalar_field_t::Wide)), cudaSuccess);
+    ASSERT_EQ(cudaMemset(res_scalars1, 0, n * sizeof(scalar_t)), cudaSuccess);
+    ASSERT_EQ(cudaMemset(res_scalars2, 0, n * sizeof(scalar_t)), cudaSuccess);
   }
 };
 
@@ -319,82 +312,6 @@ TEST_F(PrimitivesTest, ECMixedAdditionOfNegatedPointEqSubtraction)
     ASSERT_EQ(res_points1[i], points1[i] + res_points2[i]);
 }
 
-TEST_F(PrimitivesTest, MP_LSB_MULT)
-{
-  // LSB multiply, check correctness of first TLC + 1 digits result.
-  ASSERT_EQ(mp_lsb_mult(scalars1, scalars2, res_scalars_wide), cudaSuccess);
-  std::cout << "first GPU lsb mult output  = 0x";
-  for (int i = 0; i < 2 * scalar_field_t::TLC; i++) {
-    std::cout << std::hex << res_scalars_wide[0].limbs_storage.limbs[i];
-  }
-  std::cout << std::endl;
-
-  ASSERT_EQ(mp_mult(scalars1, scalars2, res_scalars_wide_full), cudaSuccess);
-  std::cout << "first GPU full mult output = 0x";
-  for (int i = 0; i < 2 * scalar_field_t::TLC; i++) {
-    std::cout << std::hex << res_scalars_wide_full[0].limbs_storage.limbs[i];
-  }
-  std::cout << std::endl;
-  for (int j = 0; j < n; j++) {
-    for (int i = 0; i < scalar_field_t::TLC + 1; i++) {
-      ASSERT_EQ(res_scalars_wide_full[j].limbs_storage.limbs[i], res_scalars_wide[j].limbs_storage.limbs[i]);
-    }
-  }
-}
-
-TEST_F(PrimitivesTest, MP_MSB_MULT)
-{
-  // MSB multiply, take n msb bits of multiplication, assert that the error is up to 1.
-  ASSERT_EQ(mp_msb_mult(scalars1, scalars2, res_scalars_wide), cudaSuccess);
-  std::cout << "first GPU msb mult output  = 0x";
-  for (int i = 2 * scalar_field_t::TLC - 1; i >= 0; i--) {
-    std::cout << std::hex << res_scalars_wide[0].limbs_storage.limbs[i] << " ";
-  }
-  std::cout << std::endl;
-
-  ASSERT_EQ(mp_mult(scalars1, scalars2, res_scalars_wide_full), cudaSuccess);
-  std::cout << "first GPU full mult output = 0x";
-  for (int i = 2 * scalar_field_t::TLC - 1; i >= 0; i--) {
-    std::cout << std::hex << res_scalars_wide_full[0].limbs_storage.limbs[i] << " ";
-  }
-
-  std::cout << std::endl;
-
-  for (int i = 0; i < 2 * scalar_field_t::TLC - 1; i++) {
-    if (res_scalars_wide_full[0].limbs_storage.limbs[i] == res_scalars_wide[0].limbs_storage.limbs[i])
-      std::cout << "matched word idx = " << i << std::endl;
-  }
-}
-
-TEST_F(PrimitivesTest, INGO_MP_MULT)
-{
-  // MSB multiply, take n msb bits of multiplication, assert that the error is up to 1.
-  ASSERT_EQ(ingo_mp_mult(scalars1, scalars2, res_scalars_wide), cudaSuccess);
-  std::cout << "INGO   = 0x";
-  for (int i = 0; i < 2 * scalar_field_t::TLC; i++) {
-    std::cout << std::hex << res_scalars_wide[0].limbs_storage.limbs[i] << " ";
-  }
-  std::cout << std::endl;
-
-  ASSERT_EQ(mp_mult(scalars1, scalars2, res_scalars_wide_full), cudaSuccess);
-  std::cout << "ZKSYNC = 0x";
-  for (int i = 0; i < 2 * scalar_field_t::TLC; i++) {
-    std::cout << std::hex << res_scalars_wide_full[0].limbs_storage.limbs[i] << " ";
-  }
-
-  std::cout << std::endl;
-
-  for (int i = 0; i < 2 * scalar_field_t::TLC - 1; i++) {
-    if (res_scalars_wide_full[0].limbs_storage.limbs[i] == res_scalars_wide[0].limbs_storage.limbs[i])
-      std::cout << "matched word idx = " << i << std::endl;
-  }
-  for (int j = 0; j < n; j++) {
-    for (int i = 0; i < 2 * scalar_field_t::TLC - 1; i++) {
-      ASSERT_EQ(res_scalars_wide_full[j].limbs_storage.limbs[i], res_scalars_wide[j].limbs_storage.limbs[i]);
-    }
-  }
-}
-
 TEST_F(PrimitivesTest, G2ECRandomPointsAreOnCurve)
 {
   for (unsigned i = 0; i < n; i++)
diff --git a/icicle/primitives/test_kernels.cuh b/icicle/primitives/test_kernels.cuh
index 2555ab569..bd8d2e145 100644
--- a/icicle/primitives/test_kernels.cuh
+++ b/icicle/primitives/test_kernels.cuh
@@ -75,28 +75,28 @@ int vec_mul(const F* x, const G* y, G* result, const unsigned count)
   return error ? error : cudaDeviceSynchronize();
 }
 
-__global__ void inv_field_elements_kernel(const scalar_field_t* x, scalar_field_t* result, const unsigned count)
+__global__ void inv_field_elements_kernel(const scalar_t* x, scalar_t* result, const unsigned count)
 {
   const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
   if (gid >= count) return;
-  result[gid] = scalar_field_t::inverse(x[gid]);
+  result[gid] = scalar_t::inverse(x[gid]);
 }
 
-int field_vec_inv(const scalar_field_t* x, scalar_field_t* result, const unsigned count)
+int field_vec_inv(const scalar_t* x, scalar_t* result, const unsigned count)
 {
   inv_field_elements_kernel<<<(count - 1) / 32 + 1, 32>>>(x, result, count);
   int error = cudaGetLastError();
   return error ? error : cudaDeviceSynchronize();
 }
 
-__global__ void sqr_field_elements_kernel(const scalar_field_t* x, scalar_field_t* result, const unsigned count)
+__global__ void sqr_field_elements_kernel(const scalar_t* x, scalar_t* result, const unsigned count)
 {
   const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
   if (gid >= count) return;
-  result[gid] = scalar_field_t::sqr(x[gid]);
+  result[gid] = scalar_t::sqr(x[gid]);
 }
 
-int field_vec_sqr(const scalar_field_t* x, scalar_field_t* result, const unsigned count)
+int field_vec_sqr(const scalar_t* x, scalar_t* result, const unsigned count)
 {
   sqr_field_elements_kernel<<<(count - 1) / 32 + 1, 32>>>(x, result, count);
   int error = cudaGetLastError();
@@ -118,81 +118,3 @@ int point_vec_to_affine(const P* x, A* result, const unsigned count)
   int error = cudaGetLastError();
   return error ? error : cudaDeviceSynchronize();
 }
-
-__global__ void mp_mult_kernel(const scalar_field_t* x, const scalar_field_t* y, scalar_field_t::Wide* result)
-{
-  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
-  scalar_field_t::multiply_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage);
-}
-
-int mp_mult(const scalar_field_t* x, scalar_field_t* y, scalar_field_t::Wide* result)
-{
-  mp_mult_kernel<<<1, 32>>>(x, y, result);
-  int error = cudaGetLastError();
-  return error ? error : cudaDeviceSynchronize();
-}
-
-__global__ void mp_lsb_mult_kernel(const scalar_field_t* x, const scalar_field_t* y, scalar_field_t::Wide* result)
-{
-  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
-  scalar_field_t::multiply_lsb_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage);
-}
-
-int mp_lsb_mult(const scalar_field_t* x, scalar_field_t* y, scalar_field_t::Wide* result)
-{
-  mp_lsb_mult_kernel<<<1, 32>>>(x, y, result);
-  int error = cudaGetLastError();
-  return error ? error : cudaDeviceSynchronize();
-}
-
-__global__ void mp_msb_mult_kernel(const scalar_field_t* x, const scalar_field_t* y, scalar_field_t::Wide* result)
-{
-  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
-  scalar_field_t::multiply_msb_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage);
-}
-
-int mp_msb_mult(const scalar_field_t* x, scalar_field_t* y, scalar_field_t::Wide* result)
-{
-  mp_msb_mult_kernel<<<1, 1>>>(x, y, result);
-  int error = cudaGetLastError();
-  return error ? error : cudaDeviceSynchronize();
-}
-
-__global__ void ingo_mp_mult_kernel(const scalar_field_t* x, const scalar_field_t* y, scalar_field_t::Wide* result)
-{
-  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
-  scalar_field_t::ingo_multiply_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage);
-}
-
-int ingo_mp_mult(const scalar_field_t* x, scalar_field_t* y, scalar_field_t::Wide* result)
-{
-  ingo_mp_mult_kernel<<<1, 32>>>(x, y, result);
-  int error = cudaGetLastError();
-  return error ? error : cudaDeviceSynchronize();
-}
-
-__global__ void ingo_mp_msb_mult_kernel(const scalar_field_t* x, const scalar_field_t* y, scalar_field_t::Wide* result)
-{
-  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
-  scalar_field_t::ingo_msb_multiply_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage);
-}
-
-int ingo_mp_msb_mult(const scalar_field_t* x, scalar_field_t* y, scalar_field_t::Wide* result, const unsigned n)
-{
-  ingo_mp_msb_mult_kernel<<<1, n>>>(x, y, result);
-  int error = cudaGetLastError();
-  return error ? error : cudaDeviceSynchronize();
-}
-
-__global__ void ingo_mp_mod_mult_kernel(const scalar_field_t* x, const scalar_field_t* y, scalar_field_t* result)
-{
-  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
-  result[gid] = x[gid] * y[gid];
-}
-
-int ingo_mp_mod_mult(const scalar_field_t* x, scalar_field_t* y, scalar_field_t* result, const unsigned n)
-{
-  ingo_mp_mod_mult_kernel<<<1, n>>>(x, y, result);
-  int error = cudaGetLastError();
-  return error ? error : cudaDeviceSynchronize();
-}
\ No newline at end of file
diff --git a/icicle/utils/device_context.cu b/icicle/utils/device_context.cu
new file mode 100644
index 000000000..7e2c42f98
--- /dev/null
+++ b/icicle/utils/device_context.cu
@@ -0,0 +1,9 @@
+#include "device_context.cuh"
+
+namespace device_context {
+
+    extern "C" DeviceContext GetDefaultDeviceContext() {
+        return get_default_device_context();
+    }
+
+}
diff --git a/icicle/utils/device_context.cuh b/icicle/utils/device_context.cuh
index 5ce30e485..21ac61cdf 100644
--- a/icicle/utils/device_context.cuh
+++ b/icicle/utils/device_context.cuh
@@ -15,6 +15,17 @@ namespace device_context {
     cudaMemPool_t mempool; /**< Mempool to use. Default value: 0. */
   };
 
+  /**
+   * Return default device context that corresponds to using the default stream of the first GPU
+   */
+  inline DeviceContext get_default_device_context() {
+    return DeviceContext {
+      0,               // device_id
+      (cudaStream_t)0, // stream
+      0,               // mempool
+    };
+  }
+
 } // namespace device_context
 
 #endif
diff --git a/icicle/utils/error_handler.cuh b/icicle/utils/error_handler.cuh
index 58c008153..b8af83050 100644
--- a/icicle/utils/error_handler.cuh
+++ b/icicle/utils/error_handler.cuh
@@ -6,12 +6,32 @@
 
 #define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__)
 template <typename T>
-void check(T err, const char* const func, const char* const file, const int line);
+void inline check(T err, const char* const func, const char* const file, const int line)
+{
+  if (err != cudaSuccess) {
+    std::cerr << "CUDA Runtime Error at: " << file << ":" << line << std::endl;
+    std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
+  }
+}
 
 #define CHECK_LAST_CUDA_ERROR() checkLast(__FILE__, __LINE__)
-void checkLast(const char* const file, const int line);
+void inline checkLast(const char* const file, const int line)
+{
+  cudaError_t err{cudaGetLastError()};
+  if (err != cudaSuccess) {
+    std::cerr << "CUDA Runtime Error at: " << file << ":" << line << std::endl;
+    std::cerr << cudaGetErrorString(err) << std::endl;
+  }
+}
 
 #define CHECK_SYNC_DEVICE_ERROR() syncDevice(__FILE__, __LINE__)
-void syncDevice(const char* const file, const int line);
+void inline syncDevice(const char* const file, const int line)
+{
+  cudaError_t err{cudaDeviceSynchronize()};
+  if (err != cudaSuccess) {
+    std::cerr << "CUDA Runtime Error at: " << file << ":" << line << std::endl;
+    std::cerr << cudaGetErrorString(err) << std::endl;
+  }
+}
 
 #endif
diff --git a/icicle/utils/utils_kernels.cu b/icicle/utils/utils_kernels.cu
deleted file mode 100644
index e1099cc36..000000000
--- a/icicle/utils/utils_kernels.cu
+++ /dev/null
@@ -1,29 +0,0 @@
-#include "utils_kernels.cuh"
-
-namespace utils_internal {
-  // TODO: weird linking issue - only works in headers
-  // template <typename E, typename S>
-  // __global__ void NormalizeKernel(E* arr, S scalar, unsigned n)
-  // {
-  //   int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  //   if (tid < n) { arr[tid] = scalar * arr[tid]; }
-  // }
-
-  template <typename E, typename S>
-  __global__ void NormalizeKernel(E* arr, S scalar, int n)
-  {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    if (tid < n) { arr[tid] = scalar * arr[tid]; }
-  }
-
-  template <typename E, typename S>
-  __global__ void BatchMulKernel(E* element_vec, S* scalar_vec, int n_scalars, int batch_size)
-  {
-    int tid = blockDim.x * blockIdx.x + threadIdx.x;
-    if (tid < n_scalars * batch_size) {
-      int scalar_id = tid % n_scalars;
-      element_vec[tid] = scalar_vec[scalar_id] * element_vec[tid];
-    }
-  }
-
-} // namespace utils_internal
diff --git a/icicle/utils/utils_kernels.cuh b/icicle/utils/utils_kernels.cuh
index ae73da595..5ef3dd2b5 100644
--- a/icicle/utils/utils_kernels.cuh
+++ b/icicle/utils/utils_kernels.cuh
@@ -2,22 +2,31 @@
 #ifndef UTILS_KERNELS_H
 #define UTILS_KERNELS_H
 
+#include "utils_kernels.cuh"
+
 namespace utils_internal {
+  // TODO: weird linking issue - only works in headers
+  // template <typename E, typename S>
+  // __global__ void NormalizeKernel(E* arr, S scalar, unsigned n)
+  // {
+  //   int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  //   if (tid < n) { arr[tid] = scalar * arr[tid]; }
+  // }
 
   template <typename E, typename S>
-  __global__ void NormalizeKernel(E* arr, S scalar, unsigned n)
+  __global__ void NormalizeKernel(E* arr, S scalar, int n)
   {
     int tid = blockIdx.x * blockDim.x + threadIdx.x;
     if (tid < n) { arr[tid] = scalar * arr[tid]; }
   }
 
   template <typename E, typename S>
-  __global__ void BatchMulKernel(E* element_vec, S* scalar_vec, unsigned n_scalars, unsigned batch_size)
+  __global__ void BatchMulKernel(E* in_vec, int n_elements, int batch_size, S* scalar_vec, int step, int n_scalars, E* out_vec)
   {
     int tid = blockDim.x * blockIdx.x + threadIdx.x;
-    if (tid < n_scalars * batch_size) {
-      int scalar_id = tid % n_scalars;
-      element_vec[tid] = scalar_vec[scalar_id] * element_vec[tid];
+    if (tid < n_elements * batch_size) {
+      int scalar_id = tid % n_elements;
+      out_vec[tid] = *(scalar_vec + ((scalar_id * step) % n_scalars)) * in_vec[tid];
     }
   }
 
diff --git a/scripts/hooks/pre-push b/scripts/hooks/pre-push
index cbd7844d2..f26b8bbcb 100755
--- a/scripts/hooks/pre-push
+++ b/scripts/hooks/pre-push
@@ -3,9 +3,13 @@
 status=0
 # Run clang-format on CUDA, C, and CPP files
 # clang-format writes to stderr in dry-run mode. In order to capture the output to detect if there are changes needed we redirect stderr to stdin
-if [[ $(find ./ -path ./icicle/build -prune -o -path ./target -prune -iname *.h -or -iname *.cuh -or -iname *.cu -or -iname *.c -or -iname *.cpp | xargs clang-format --dry-run -ferror-limit=1 -style=file 2>&1) ]];
+# Capture clang-format's output so the files that need formatting can be listed below.
+unformatted_files=$(find ./ -path ./icicle/build -prune -o -path ./target -prune -iname *.h -or -iname *.cuh -or -iname *.cu -or -iname *.c -or -iname *.cpp | xargs clang-format --dry-run -ferror-limit=1 -style=file 2>&1)
+
+if [[ $unformatted_files ]];
 then
     echo "🚨 There are files in Icicle Core that need formatting."
+    echo $unformatted_files
     echo "Please format all .c, .cpp, .h, .cu, .cuh files using the following command:"
     echo "find ./ -path ./icicle/build -prune -o -path ./target -prune -iname *.h -or -iname *.cuh -or -iname *.cu -or -iname *.c -or -iname *.cpp | xargs clang-format -i -style=file"
     status=1
diff --git a/wrappers/rust/icicle-core/src/ntt/mod.rs b/wrappers/rust/icicle-core/src/ntt/mod.rs
index d9f150e0e..426abe896 100644
--- a/wrappers/rust/icicle-core/src/ntt/mod.rs
+++ b/wrappers/rust/icicle-core/src/ntt/mod.rs
@@ -3,9 +3,17 @@ use std::os::raw::c_int;
 
 /**
  * @enum Ordering
- * How to order inputs and outputs of the NTT:
- * - kNN: inputs and outputs are natural-order (example of natural ordering: \f$ \{a_0, a_1, a_2, a_3, a_4, a_5, a_6, a_7\} \f$).
- * - kNR: inputs are natural-order and outputs are bit-reversed-order (example of bit-reversed ordering: \f$ \{a_0, a_4, a_2, a_6, a_1, a_5, a_3, a_7\} \f$).
+ * How to order inputs and outputs of the NTT. If needed, use this field to specify decimation: decimation in time
+ * (DIT) corresponds to `Ordering::kRN` while decimation in frequency (DIF) to `Ordering::kNR`. Also, to specify
+ * butterfly to be used, select `Ordering::kRN` for Cooley-Tukey and `Ordering::kNR` for Gentleman-Sande. There's
+ * no implication that a certain decimation or butterfly will actually be used under the hood, this is just for
+ * compatibility with codebases that use "decimation" and "butterfly" to denote ordering of inputs and outputs.
+ *
+ * Ordering options are:
+ * - kNN: inputs and outputs are natural-order (example of natural ordering: \f$ \{a_0, a_1, a_2, a_3, a_4, a_5, a_6,
+ * a_7\} \f$).
+ * - kNR: inputs are natural-order and outputs are bit-reversed-order (example of bit-reversed ordering: \f$ \{a_0,
+ * a_4, a_2, a_6, a_1, a_5, a_3, a_7\} \f$).
  * - kRN: inputs are bit-reversed-order and outputs are natural-order.
  * - kRR: inputs and outputs are bit-reversed-order.
  */
@@ -19,86 +27,41 @@ pub enum Ordering {
     kRR,
 }
 
-/**
- * @enum Decimation
- * Decimation of the NTT algorithm:
- * - kDIT: decimation in time.
- * - kDIF: decimation in frequency.
- */
-#[allow(non_camel_case_types)]
-#[repr(C)]
-#[derive(Clone, Copy, Debug, PartialEq, Eq)]
-pub enum Decimation {
-    kDIT,
-    kDIF,
-}
-
-/**
- * @enum Butterfly
- * [Butterfly](https://en.wikipedia.org/wiki/Butterfly_diagram) used in the NTT algorithm (i.e. what happens to each pair of inputs on every iteration):
- * - kCooleyTukey: Cooley-Tukey butterfly.
- * - kGentlemanSande: Gentleman-Sande butterfly.
- */
-#[allow(non_camel_case_types)]
-#[repr(C)]
-#[derive(Clone, Copy, Debug, PartialEq, Eq)]
-pub enum Butterfly {
-    kCooleyTukey,
-    kGentlemanSande,
-}
-
 /**
  * @struct NTTConfig
  * Struct that encodes NTT parameters to be passed into the [ntt](@ref ntt) function.
  */
 #[repr(C)]
 #[derive(Debug)]
-pub struct NTTConfigCuda<'a, E, S> {
-    pub inout: *mut E,
-    /**< Input that's mutated in-place by this function. Length of this array needs to be \f$ size \cdot config.batch_size \f$.
-     *   Note that if inputs are in Montgomery form, the outputs will be as well and vice-verse: non-Montgomery inputs produce non-Montgomety outputs.*/
-    pub is_input_on_device: bool,
-    /**< True if inputs/outputs are on device and false if they're on host. Default value: false. */
-    pub is_inverse: bool,
-    /**< True if true . Default value: false. */
+pub struct NTTConfig<'a, S> {
+    /** Coset generator. Used to perform coset (i)NTTs. Default value: `S::one()` (corresponding to no coset being used). */
+    pub coset_gen: S,
+    /** Ordering of inputs and outputs. See [Ordering](@ref Ordering). Default value: `Ordering::kNN`. */
     pub ordering: Ordering,
-    /**< Ordering of inputs and outputs. See [Ordering](@ref Ordering). Default value: `Ordering::kNN`. */
-    pub decimation: Decimation,
-    /**< Decimation of the algorithm, see [Decimation](@ref Decimation). Default value: `Decimation::kDIT`.
-     *   __Note:__ this variable exists mainly for compatibility with codebases that use similar notation.
-     *   If [ordering](@ref ordering) is `Ordering::kRN`, the value of this variable will be overridden to
-     *   `Decimation::kDIT` and if ordering is `Ordering::kNR` — to `Decimation::kDIF`. */
-    pub butterfly: Butterfly,
-    /**< Butterfly used by the NTT. See [Butterfly](@ref Butterfly). Default value: `Butterfly::kCooleyTukey`.
-     *   __Note:__ this variable exists mainly for compatibility with codebases that use similar notation.
-     *   If [ordering](@ref ordering) is `Ordering::kRN`, the value of this variable will be overridden to
-     *   `Butterfly::kCooleyTukey` and if ordering is `Ordering::kNR` — to `Butterfly::kGentlemanSande`. */
-    pub is_coset: bool,
-    /**< If false, NTT is computed on a subfield given by [twiddles](@ref twiddles). If true, NTT is computed
-     *   on a coset of [twiddles](@ref twiddles) given by [the coset generator](@ref coset_gen), so:
-     *   \f$ \{coset\_gen\cdot\omega^0, coset\_gen\cdot\omega^1, \dots, coset\_gen\cdot\omega^{n-1}\} \f$. Default value: false. */
-    pub coset_gen: *const S,
-    /**< The field element that generates a coset if [is_coset](@ref is_coset) is true.
-     *   Otherwise should be set to `nullptr`. Default value: `nullptr`. */
-    pub twiddles: *const S,
-    /**< "Twiddle factors", (or "domain", or "roots of unity") on which the NTT is evaluated.
-     *   This pointer is expected to live on device. The order is as follows:
-     *   \f$ \{\omega^0=1, \omega^1, \dots, \omega^{n-1}\} \f$. If this pointer is `nullptr`, twiddle factors
-     *   are generated online using the default generator (TODO: link to twiddle gen here) and function
-     *   [GenerateTwiddleFactors](@ref GenerateTwiddleFactors). Default value: `nullptr`. */
-    pub inv_twiddles: *const S,
-    /**< "Inverse twiddle factors", (or "domain", or "roots of unity") on which the iNTT is evaluated.
-     *   This pointer is expected to live on device. The order is as follows:
-     *   \f$ \{\omega^0=1, \omega^1, \dots, \omega^{n-1}\} \f$. If this pointer is `nullptr`, twiddle factors
-     *   are generated online using the default generator (TODO: link to twiddle gen here) and function
-     *   [GenerateTwiddleFactors](@ref GenerateTwiddleFactors). Default value: `nullptr`. */
-    pub size: c_int,
-    /**< NTT size \f$ n \f$. If a batch of NTTs (which all need to have the same size) is computed, this is the size of 1 NTT. */
+    /** True if inputs are on device and false if they're on host. Default value: false. */
+    pub are_inputs_on_device: bool,
+    /** If true, output is preserved on device for subsequent use in config and not freed after calculation. Default value: false. */
+    pub are_outputs_on_device: bool,
+    /** The number of NTTs to compute. Default value: 1. */
     pub batch_size: c_int,
-    /**< The number of NTTs to compute. Default value: 1. */
-    pub is_preserving_twiddles: bool,
-    /**< If true, twiddle factors are preserved on device for subsequent use in config and not freed after calculation. Default value: false. */
-    pub is_output_on_device: bool,
-    /**< If true, output is preserved on device for subsequent use in config and not freed after calculation. Default value: false. */
-    pub ctx: DeviceContext<'a>, /*< Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext). */
+    /** Whether to run the NTT asynchronously. If set to `true`, the NTT function will be non-blocking and you'd need to synchronize
+     *  it explicitly by running `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the NTT
+     *  function will block the current CPU thread. */
+    pub is_async: bool,
+    /** Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext). */
+    pub ctx: DeviceContext<'a>,
 }
+
+// /**
+//  * @struct Domain
+//  * Struct containing information about the domain on which (i)NTT is evaluated: twiddle factors and coset generator.
+//  * Twiddle factors are private, static and can only be set using [GenerateDomain](@ref GenerateDomain) function.
+//  * The internal representation of twiddles is prone to change in accordance with changing [NTT](@ref NTT) algorithm.
+//  */
+// #[repr(C)]
+// #[derive(Debug)]
+// pub struct Domain<'a, T> {
+//     /** Scalar elements that specify a coset to be used in (i)NTT. Default value: None (no coset or alternatively coset
+//      *  generated by `S::one()` is used). */
+//     pub coset_table: Option<&'a [T]>,
+// }
diff --git a/wrappers/rust/icicle-cuda-runtime/Cargo.toml b/wrappers/rust/icicle-cuda-runtime/Cargo.toml
index 3cc5951c1..74fc7f4a5 100644
--- a/wrappers/rust/icicle-cuda-runtime/Cargo.toml
+++ b/wrappers/rust/icicle-cuda-runtime/Cargo.toml
@@ -8,7 +8,7 @@ homepage = "https://www.ingonyama.com"
 repository = "https://github.com/ingonyama-zk/icicle"
 
 [dependencies]
-bitflags = "2.4"
+bitflags = "1.3"
 
 [build-dependencies]
 bindgen = "*"
\ No newline at end of file
diff --git a/wrappers/rust/icicle-cuda-runtime/src/device_context.rs b/wrappers/rust/icicle-cuda-runtime/src/device_context.rs
index 742d44b3a..11e72894c 100644
--- a/wrappers/rust/icicle-cuda-runtime/src/device_context.rs
+++ b/wrappers/rust/icicle-cuda-runtime/src/device_context.rs
@@ -1,5 +1,5 @@
 use crate::memory::CudaMemPool;
-use crate::stream::CudaStream;
+use crate::stream::{CudaStream, CudaStreamCreateFlags};
 
 /// Properties of the device used in icicle functions.
 #[repr(C)]
diff --git a/wrappers/rust/icicle-cuda-runtime/src/stream.rs b/wrappers/rust/icicle-cuda-runtime/src/stream.rs
index cad33d795..d8b474980 100644
--- a/wrappers/rust/icicle-cuda-runtime/src/stream.rs
+++ b/wrappers/rust/icicle-cuda-runtime/src/stream.rs
@@ -1,5 +1,6 @@
 use crate::bindings::{
-    cudaStreamCreate, cudaStreamDefault, cudaStreamDestroy, cudaStreamNonBlocking, cudaStreamSynchronize, cudaStream_t,
+    cudaStreamCreate, cudaStreamCreateWithFlags, cudaStreamDefault, cudaStreamDestroy, cudaStreamNonBlocking,
+    cudaStreamSynchronize, cudaStream_t,
 };
 use crate::error::{CudaResult, CudaResultWrap};
 use bitflags::bitflags;
@@ -34,6 +35,15 @@ impl CudaStream {
         }
     }
 
+    pub fn create_with_flags(flags: CudaStreamCreateFlags) -> CudaResult<Self> { // builds a stream via `cudaStreamCreateWithFlags`
+        let mut handle = MaybeUninit::<cudaStream_t>::uninit(); // out-parameter slot handed to the CUDA call
+        unsafe {
+            cudaStreamCreateWithFlags(handle.as_mut_ptr(), flags.bits()) // `bits()`: public bitflags accessor, valid on 1.x and 2.x
+                .wrap_maybe_uninit(handle) // presumably turns status + slot into a CudaResult; confirm in `error`
+                .map(CudaStream::from_handle)
+        }
+    }
+
     pub fn destroy(self) -> CudaResult<()> {
         let handle = self.handle;
         forget(self);
diff --git a/wrappers/rust/icicle-curves/icicle-bn254/build.rs b/wrappers/rust/icicle-curves/icicle-bn254/build.rs
index baa64e81d..f65dfe5f9 100644
--- a/wrappers/rust/icicle-curves/icicle-bn254/build.rs
+++ b/wrappers/rust/icicle-curves/icicle-bn254/build.rs
@@ -10,11 +10,9 @@ fn main() {
 
     let target_output_dir = format!("{}/../../target/{}", cargo_dir, profile);
 
-    Config::new("./icicle")
+    Config::new("../../../../icicle")
                 .define("BUILD_TESTS", "OFF") //TODO: feature
-                // .define("CURVE", "bls12_381")
                 .define("CURVE", "bn254")
-                // .define("ECNTT_DEFINED", "") //TODO: feature
                 .define("LIBRARY_OUTPUT_DIRECTORY", &target_output_dir)
                 .define("CMAKE_BUILD_TYPE", "Release")
                 .build_target("icicle")
@@ -22,7 +20,6 @@ fn main() {
 
     println!("cargo:rustc-link-search={}", &target_output_dir);
 
-    // println!("cargo:rustc-link-lib=icicle");
     println!("cargo:rustc-link-lib=ingo_bn254");
     println!("cargo:rustc-link-lib=stdc++");
     // println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
diff --git a/wrappers/rust/icicle-curves/icicle-bn254/src/msm/mod.rs b/wrappers/rust/icicle-curves/icicle-bn254/src/msm/mod.rs
index 69b3a72cf..9885914e5 100644
--- a/wrappers/rust/icicle-curves/icicle-bn254/src/msm/mod.rs
+++ b/wrappers/rust/icicle-curves/icicle-bn254/src/msm/mod.rs
@@ -12,12 +12,12 @@ extern "C" {
         out: *mut G1Projective,
     ) -> CudaError;
 
-    #[link_name = "bn254GetDefaultMSMConfig"]
-    fn GetDefaultMSMConfig() -> MSMConfig<'static>;
+    #[link_name = "bn254DefaultMSMConfig"]
+    fn default_msm_config() -> MSMConfig<'static>;
 }
 
 pub fn get_default_msm_config() -> MSMConfig<'static> {
-    unsafe { GetDefaultMSMConfig() }
+    unsafe { default_msm_config() }
 }
 
 pub fn msm<'a>(
diff --git a/wrappers/rust/icicle-curves/icicle-bn254/src/ntt/config.rs b/wrappers/rust/icicle-curves/icicle-bn254/src/ntt/config.rs
deleted file mode 100644
index 0d37527ad..000000000
--- a/wrappers/rust/icicle-curves/icicle-bn254/src/ntt/config.rs
+++ /dev/null
@@ -1,61 +0,0 @@
-use std::os::raw::c_int;
-
-use crate::curve::*;
-use icicle_core::ntt::{Butterfly, Decimation, NTTConfigCuda, Ordering};
-use icicle_cuda_runtime::device_context::{get_default_device_context, DeviceContext};
-
-pub(super) type ECNTTConfig<'a> = NTTConfigCuda<'a, G1Projective, ScalarField>;
-pub(super) type NTTConfig<'a> = NTTConfigCuda<'a, ScalarField, ScalarField>;
-
-pub(super) fn get_ntt_config<E, S>(size: usize, ctx: DeviceContext) -> NTTConfigCuda<E, S> {
-    //TODO: implement on CUDA side
-
-    NTTConfigCuda::<E, S> {
-        inout: 0 as _, // inout as *mut _ as *mut ScalarField,
-        is_input_on_device: false,
-        is_inverse: false,
-        ordering: Ordering::kNN,
-        decimation: Decimation::kDIF,
-        butterfly: Butterfly::kCooleyTukey,
-        is_coset: false,
-        coset_gen: 0 as _,    //TODO: ?
-        twiddles: 0 as _,     //TODO: ?,
-        inv_twiddles: 0 as _, //TODO: ?,
-        size: size as i32,
-        batch_size: 0 as i32,
-        is_preserving_twiddles: true,
-        is_output_on_device: false,
-        ctx,
-    }
-}
-
-pub(super) fn get_ntt_default_config<E, S>(size: usize) -> NTTConfigCuda<'static, E, S> {
-    //TODO: implement on CUDA side
-    let ctx = get_default_device_context();
-
-    // let root_of_unity = S::default(); //TODO: implement on CUDA side
-
-    let config = get_ntt_config(size, ctx);
-
-    config
-}
-
-pub(super) fn get_ntt_config_with_input(ntt_intt_result: &mut [ScalarField], size: usize, batches: usize) -> NTTConfig {
-    NTTConfig {
-        inout: ntt_intt_result as *mut _ as *mut ScalarField,
-        is_input_on_device: false,
-        is_inverse: false,
-        ordering: Ordering::kNN,
-        decimation: Decimation::kDIF,
-        butterfly: Butterfly::kCooleyTukey,
-        is_coset: false,
-        coset_gen: &[ScalarField::zero()] as _, //TODO: ?
-        twiddles: 0 as *const ScalarField,      //TODO: ?,
-        inv_twiddles: 0 as *const ScalarField,  //TODO: ?,
-        size: size as _,
-        batch_size: batches as i32,
-        is_preserving_twiddles: true,
-        is_output_on_device: true,
-        ctx: get_default_device_context(),
-    }
-}
diff --git a/wrappers/rust/icicle-curves/icicle-bn254/src/ntt/domain.rs b/wrappers/rust/icicle-curves/icicle-bn254/src/ntt/domain.rs
deleted file mode 100644
index bd9120528..000000000
--- a/wrappers/rust/icicle-curves/icicle-bn254/src/ntt/domain.rs
+++ /dev/null
@@ -1,192 +0,0 @@
-use icicle_core::ntt::{Butterfly, Decimation, NTTConfigCuda, Ordering};
-use icicle_cuda_runtime::device_context::{get_default_device_context, DeviceContext};
-use icicle_cuda_runtime::memory::DeviceSlice;
-use std::default;
-
-pub(super) type ECNTTDomain<'a> = Domain<'a, G1Projective, ScalarField>;
-pub(super) type NTTDomain<'a> = Domain<'a, ScalarField, ScalarField>;
-
-use crate::curve::*;
-
-use super::{config::*, ntt_internal};
-
-/// Represents the NTT domain
-pub struct Domain<'a, E, S> {
-    config: NTTConfigCuda<'a, E, S>,
-}
-
-impl<'a, E, S> Domain<'a, E, S> {
-    pub fn new(size: usize, ctx: DeviceContext<'a>) -> Self {
-        Domain {
-            config: get_ntt_config(size, ctx),
-        }
-    }
-
-    pub fn get_output_on_device(&self) -> Result<*mut E, &'static str> {
-        if self
-            .config
-            .is_output_on_device
-        {
-            Ok(self
-                .config
-                .inout)
-        } else {
-            Err("Output should be on device.")
-        }
-    }
-
-    pub fn get_input_on_device(&self) -> Result<*mut E, &'static str> {
-        if self
-            .config
-            .is_input_on_device
-        {
-            Ok(self
-                .config
-                .inout)
-        } else {
-            Err("Input should be on device.")
-        }
-    }
-
-    pub fn get_input(&self) -> Result<*mut E, &'static str> {
-        if !self
-            .config
-            .is_input_on_device
-        {
-            Ok(self
-                .config
-                .inout)
-        } else {
-            Err("Output is on device.")
-        }
-    }
-
-    pub fn get_output(&self) -> Result<*mut E, &'static str> {
-        if !self
-            .config
-            .is_output_on_device
-        {
-            Ok(self
-                .config
-                .inout)
-        } else {
-            Err("Output is on device.")
-        }
-    }
-
-    pub(crate) fn new_for_default_context(size: usize) -> Self {
-        let ctx = get_default_device_context();
-        // let default_root_of_unity = S::default(); //TODO: implement
-        let domain = Domain::new(size, ctx);
-        domain
-    }
-}
-
-// Add implementations for other methods and structs as needed.
-
-impl<'a, E: 'static, S: 'static> Domain<'a, E, S> {
-    // ... previous methods ...
-
-    // NTT methods
-    pub fn ntt(&mut self, inout: &mut [E]) {
-        let batch_size = 1;
-
-        let size = inout.len();
-
-        if size
-            != self
-                .config
-                .size as _
-        {
-            //TODO: test for this error
-            panic!(
-                "input lenght: {} does not match domain size: {}",
-                size,
-                self.config
-                    .size
-            )
-        }
-
-        self.config
-            .inout = inout.as_mut_ptr(); // as *mut _ as *mut E;
-        self.config
-            .is_inverse = false;
-        self.config
-            .is_input_on_device = false;
-        self.config
-            .is_output_on_device = false;
-        // self.config
-        //     .ordering = Ordering::default(); //TODO: each call?
-        self.config
-            .batch_size = batch_size as i32;
-
-        ntt_internal(&mut self.config);
-    }
-
-    pub fn ntt_on_device(&mut self, inout: &mut DeviceSlice<E>) {
-        // Implementation for NTT on device
-    }
-
-    pub fn ntt_batch(&mut self, inout: &mut [E]) {
-        // Implementation for batched NTT
-    }
-
-    pub fn ntt_batch_on_device(&mut self, inout: &mut DeviceSlice<E>) {
-        // Implementation for batched NTT on device
-    }
-
-    pub fn ntt_coset(&mut self, inout: &mut [E], coset: &mut [E]) {
-        // Implementation for NTT with coset
-    }
-
-    pub fn ntt_coset_on_device(&mut self, inout: &mut DeviceSlice<E>, coset: &mut DeviceSlice<E>) {
-        // Implementation for NTT with coset on device
-    }
-
-    pub fn ntt_coset_batch(&mut self, inout: &mut [E], coset: &mut [E]) {
-        // Implementation for batched NTT with coset
-    }
-
-    pub fn ntt_coset_batch_on_device(&mut self, inout: &mut DeviceSlice<E>, coset: &mut DeviceSlice<E>) {
-        // Implementation for batched NTT with coset on device
-    }
-
-    // iNTT methods
-    pub fn intt(&mut self, inout: &mut [E]) {
-        // Implementation for iNTT
-    }
-
-    pub fn intt_on_device(&mut self, inout: &mut DeviceSlice<E>) {
-        // Implementation for iNTT on device
-    }
-
-    pub fn intt_batch(&mut self, inout: &mut [E]) {
-        // Implementation for batched iNTT
-    }
-
-    pub fn intt_batch_on_device(&mut self, inout: &mut DeviceSlice<E>) {
-        // Implementation for batched iNTT on device
-    }
-
-    pub fn intt_coset(&mut self, inout: &mut [E], coset: &mut [E]) {
-        // Implementation for iNTT with coset
-    }
-
-    pub fn intt_coset_on_device(&mut self, inout: &mut DeviceSlice<E>, coset: &mut DeviceSlice<E>) {
-        // Implementation for iNTT with coset on device
-    }
-
-    pub fn intt_coset_batch(&mut self, inout: &mut [E], coset: &mut [E]) {
-        // Implementation for batched iNTT with coset
-    }
-
-    pub fn intt_coset_batch_on_device(&mut self, inout: &mut DeviceSlice<E>, coset: &mut DeviceSlice<E>) {
-        // Implementation for batched iNTT with coset on device
-    }
-
-    // Ordering setter
-    pub fn set_ordering(&mut self, ordering: Ordering) {
-        self.config
-            .ordering = ordering;
-    }
-}
diff --git a/wrappers/rust/icicle-curves/icicle-bn254/src/ntt/mod.rs b/wrappers/rust/icicle-curves/icicle-bn254/src/ntt/mod.rs
index d418917a3..1f581b0ea 100644
--- a/wrappers/rust/icicle-curves/icicle-bn254/src/ntt/mod.rs
+++ b/wrappers/rust/icicle-curves/icicle-bn254/src/ntt/mod.rs
@@ -1,87 +1,70 @@
-mod config;
-pub mod domain;
+use crate::curve::ScalarField;
 
-use std::any::TypeId;
-
-use crate::curve::*;
-
-use self::config::*;
-
-use icicle_core::ntt::{Butterfly, Decimation, Ordering};
-use icicle_cuda_runtime::error::CudaError;
+use icicle_core::ntt::NTTConfig;
+use icicle_cuda_runtime::device_context::DeviceContext;
+use icicle_cuda_runtime::error::{CudaError, CudaResult, CudaResultWrap};
 
 extern "C" {
-    #[link_name = "NTTDefaultContextCuda"]
-    fn ntt_cuda(config: *mut NTTConfig) -> CudaError;
+    #[link_name = "bn254NTTCuda"]
+    fn ntt_cuda<'a>(
+        input: *const ScalarField,
+        size: usize,
+        is_inverse: bool,
+        config: &NTTConfig<'a, ScalarField>,
+        output: *mut ScalarField,
+    ) -> CudaError;
+
+    #[link_name = "bn254DefaultNTTConfig"]
+    fn default_ntt_config() -> NTTConfig<'static, ScalarField>;
+
+    #[link_name = "bn254InitializeDomain"]
+    fn initialize_ntt_domain(primitive_root: ScalarField, ctx: &DeviceContext) -> CudaError;
 }
 
-pub(crate) fn ntt_wip(
-    inout: &mut [ScalarField],
-    is_inverse: bool,
-    is_input_on_device: bool,
-    ordering: Ordering,
-    is_output_on_device: bool,
-    batch_size: usize,
-) {
-    let mut batch_size = batch_size;
-    if batch_size == 0 {
-        batch_size = 1;
-    }
-
-    let size = inout.len() / batch_size;
-
-    let mut config = get_ntt_default_config::<ScalarField, ScalarField>(size);
-
-    config.inout = inout as *mut _ as *mut ScalarField;
-    config.is_inverse = is_inverse;
-    config.is_input_on_device = is_input_on_device;
-    config.is_output_on_device = is_output_on_device;
-    config.ordering = ordering;
-    config.batch_size = batch_size as i32;
-
-    ntt_internal(&mut config);
+pub fn get_default_ntt_config() -> NTTConfig<'static, ScalarField> {
+    unsafe { default_ntt_config() }
 }
 
-pub(self) fn ntt_internal<TConfig>(config: *mut TConfig) -> CudaError {
-    let result_code = unsafe { ntt_cuda(config as _) };
-    // let typeid = TypeId::of::<TConfig>();
-    // if typeid == TypeId::of::<NTTConfig>() {
-    //     result_code = unsafe { ntt_cuda(config as _) };
-    // } else {
-    //     result_code = CudaError::cudaSuccess; //TODO: unsafe { ecntt_cuda(config as _) };
-    // }
-
-    // if result_code != CudaError::cudaSuccess {
-    //     println!("_result_code = {:?}", result_code);
-    // }
-
-    return CudaError::cudaSuccess;
+pub fn initialize_domain(primitive_root: ScalarField, ctx: &DeviceContext) -> CudaResult<()> {
+    unsafe { initialize_ntt_domain(primitive_root, ctx).wrap() }
 }
 
-pub(self) fn ecntt_internal(config: *mut ECNTTConfig) -> u32 {
-    let result_code = 0; //TODO: unsafe { ecntt_cuda(config) };
-    if result_code != 0 {
-        println!("_result_code = {}", result_code);
+pub fn ntt( // safe wrapper around the `bn254NTTCuda` binding declared as `ntt_cuda`
+    input: &[ScalarField],
+    is_inverse: bool, // forwarded unchanged to `ntt_cuda`
+    cfg: &NTTConfig<ScalarField>,
+    output: &mut [ScalarField],
+) -> CudaResult<()> {
+    if input.len() != output.len() { // reject mismatched buffers before crossing the FFI boundary
+        return Err(CudaError::cudaErrorInvalidValue);
     }
 
-    return result_code;
+    unsafe {
+        ntt_cuda(
+            input as *const _ as *const ScalarField,
+            input.len(), // NOTE(review): whole slice length is sent as `size`; confirm this is intended when `cfg.batch_size` > 1
+            is_inverse,
+            cfg,
+            output as *mut _ as *mut ScalarField,
+        )
+        .wrap() // converts the returned `CudaError` into `CudaResult<()>`
+    }
 }
 
 #[cfg(test)]
 pub(crate) mod tests {
-    use ark_bn254::{Fr, G1Affine as arkG1Affine, G1Projective as arkG1Projective};
-    // use ark_bls12_381::{Fr, G1Projective};
-    use ark_ff::PrimeField;
-    use ark_poly::EvaluationDomain;
-    use ark_poly::GeneralEvaluationDomain;
-    use ark_std::UniformRand;
-    use std::slice;
-
-    use crate::ntt::domain::NTTDomain;
-    use crate::{curve::*, ntt::*};
     use icicle_core::traits::ArkConvertible;
+    use icicle_core::ntt::Ordering;
+    use icicle_cuda_runtime::device_context::get_default_device_context;
+
+    use crate::curve::generate_random_scalars;
+    use crate::ntt::{get_default_ntt_config, initialize_domain, ntt, ScalarField};
 
-    pub fn reverse_bit_order(n: u32, order: u32) -> u32 {
+    use ark_bn254::Fr;
+    use ark_ff::FftField;
+    use ark_poly::{EvaluationDomain, GeneralEvaluationDomain};
+
+    fn reverse_bit_order(n: u32, order: u32) -> u32 {
         fn is_power_of_two(n: u32) -> bool {
             n != 0 && n & (n - 1) == 0
         }
@@ -95,7 +78,7 @@ pub(crate) mod tests {
         u32::from_str_radix(&reversed, 2).unwrap()
     }
 
-    pub fn list_to_reverse_bit_order<T: Copy>(l: &[T]) -> Vec<T> {
+    fn list_to_reverse_bit_order<T: Copy>(l: &[T]) -> Vec<T> {
         l.iter()
             .enumerate()
             .map(|(i, _)| l[reverse_bit_order(i as u32, l.len() as u32) as usize])
@@ -104,219 +87,98 @@ pub(crate) mod tests {
 
     #[test]
     fn test_ntt() {
-        //NTT
-        let test_size = 1 << 11;
-        let batches = 1;
-
-        let full_test_size = test_size * batches;
-        let scalars_batch: Vec<ScalarField> = generate_random_scalars(full_test_size);
-
-        // let scalars_batch: Vec<ScalarField> = (0..full_test_size)
-        //     .into_iter()
-        //     .map(|x| {
-        //         // if x % 1 == 0 {
-        //         if x % 2 == 0 {
-        //             ScalarField::one()
-        //         } else {
-        //             ScalarField::zero()
-        //         }
-        //     })
-        //     .collect();
+        let test_size = 1 << 16;
+        let ctx = get_default_device_context();
+        // two roughly analogous calls for icicle and arkworks. one difference is that icicle call creates
+        // domain for all NTTs of size <= `test_size`. also for icicle domain is a hidden static object
+        initialize_domain(
+            ScalarField::from_ark(Fr::get_root_of_unity(test_size as u64).unwrap()),
+            &ctx,
+        ).unwrap();
+        let ark_domain = GeneralEvaluationDomain::<Fr>::new(test_size).unwrap();
 
-        let mut ntt_result = scalars_batch.clone();
+        let scalars: Vec<ScalarField> = generate_random_scalars(test_size);
 
-        let ark_domain = GeneralEvaluationDomain::<Fr>::new(test_size).unwrap();
-        let mut domain = NTTDomain::new_for_default_context(test_size);
+        let config = get_default_ntt_config();
+        let mut ntt_result = vec![ScalarField::zero(); test_size];
+        ntt(&scalars, false, &config, &mut ntt_result).unwrap();
+        assert_ne!(ntt_result, scalars);
 
-        let ark_scalars_batch = scalars_batch
-            .clone()
+        let ark_scalars = scalars
             .iter()
             .map(|v| v.to_ark())
             .collect::<Vec<Fr>>();
-        let mut ark_ntt_result = ark_scalars_batch.clone();
-
+        let mut ark_ntt_result = ark_scalars.clone();
         ark_domain.fft_in_place(&mut ark_ntt_result);
+        assert_ne!(ark_ntt_result, ark_scalars);
 
-        assert_ne!(ark_ntt_result, ark_scalars_batch);
-
-        // do ntt
-        // ntt_wip(&mut ntt_result, false, false, Ordering::kNN, false, batches);
-        domain.ntt(&mut ntt_result); //single ntt
         let ntt_result_as_ark = ntt_result
             .iter()
             .map(|p| p.to_ark())
             .collect::<Vec<Fr>>();
-
-        assert_ne!(ntt_result, scalars_batch);
         assert_eq!(ark_ntt_result, ntt_result_as_ark);
 
-        let mut ark_intt_result = ark_ntt_result;
+        let mut intt_result = vec![ScalarField::zero(); test_size];
+        ntt(&ntt_result, true, &config, &mut intt_result).unwrap();
 
-        ark_domain.ifft_in_place(&mut ark_intt_result);
-        assert_eq!(ark_intt_result, ark_scalars_batch);
-
-        // check that ntt output is different from input
-        assert_ne!(ntt_result, scalars_batch);
-
-        // do intt
-        let mut intt_result = ntt_result;
-
-        ntt_wip(&mut intt_result, true, false, Ordering::kNN, false, batches);
-
-        assert!(ark_intt_result == ark_scalars_batch);
-        assert!(intt_result == scalars_batch);
-
-        let mut ntt_intt_result = intt_result;
-        ntt_wip(&mut ntt_intt_result, false, false, Ordering::kNR, false, batches);
-        assert!(ntt_intt_result != scalars_batch);
-        ntt_wip(&mut ntt_intt_result, true, false, Ordering::kRN, false, batches);
-        assert!(ntt_intt_result == scalars_batch);
-
-        let mut ntt_intt_result = list_to_reverse_bit_order(&ntt_intt_result);
-        ntt_wip(&mut ntt_intt_result, false, false, Ordering::kRR, false, batches);
-        assert!(ntt_intt_result != scalars_batch);
-        ntt_wip(&mut ntt_intt_result, true, false, Ordering::kRN, false, batches);
-        assert!(ntt_intt_result == scalars_batch);
-
-        ////
-        let size = ntt_intt_result.len() / batches;
-
-        let mut config = get_ntt_config_with_input(&mut ntt_intt_result, size, batches);
-
-        ntt_internal(&mut config);
-
-        //host
-        let mut ntt_result = scalars_batch.clone();
-        ntt_wip(&mut ntt_result, false, false, Ordering::kNR, false, batches);
-
-        // let mut buff1 = DeviceBuffer::from_slice(&scalars_batch[..]).unwrap();
-        // let dev_ptr1 = buff1
-        //     .as_device_ptr()
-        //     .as_raw_mut();
-
-        // let buff_len = buff1.len();
-
-        // std::mem::forget(buff1);
-
-        // let buff_from_dev_ptr = unsafe { DeviceBuffer::from_raw_parts(DevicePointer::wrap(dev_ptr1), buff_len) };
-        // let mut from_device = vec![ScalarField::zero(); scalars_batch.len()];
-        // buff_from_dev_ptr
-        //     .copy_to(&mut from_device)
-        //     .unwrap();
-
-        // assert_eq!(from_device, scalars_batch);
-
-        // host - device - device - host
-        let mut ntt_intt_result = scalars_batch.clone();
-
-        let mut config = get_ntt_config_with_input(&mut ntt_intt_result, size, batches);
-
-        config.is_input_on_device = false;
-        config.is_output_on_device = true;
-        // config.is_preserving_twiddles = true; // TODO: same as in get_ntt_config
-        config.ordering = Ordering::kNR;
-
-        ntt_internal(&mut config); //twiddles are preserved after first call
-
-        // config.is_preserving_twiddles = true;        //TODO: same as in get_ntt_config
-        config.is_inverse = true;
-        config.is_input_on_device = false;
-        config.is_output_on_device = true;
-        config.ordering = Ordering::kNR;
-
-        ntt_internal(&mut config); //inv_twiddles are preserved after first call
-
-        let ntt_intt_result = &mut scalars_batch.clone()[..];
-        let raw_scalars_batch_copy = ntt_intt_result as *mut _ as *mut ScalarField;
-
-        let config_inout2: &mut [ScalarField] =
-            unsafe { std::slice::from_raw_parts_mut(raw_scalars_batch_copy, config.size as usize) };
-        assert_eq!(config_inout2, scalars_batch);
-
-        config.is_preserving_twiddles = true; //TODO: same as in get_ntt_config
-
-        config.inout = raw_scalars_batch_copy;
-
-        config.is_inverse = false;
-        config.is_input_on_device = false;
-        config.is_output_on_device = true;
-        config.ordering = Ordering::kNR;
-
-        ntt_internal(&mut config);
-
-        config.is_inverse = true;
-        config.is_input_on_device = true;
-        config.is_output_on_device = false;
-        config.ordering = Ordering::kRN;
-
-        ntt_internal(&mut config);
-
-        let result_from_device: &mut [ScalarField] =
-            unsafe { std::slice::from_raw_parts_mut(config.inout, scalars_batch.len()) };
-
-        assert_eq!(result_from_device, &scalars_batch);
+        assert_eq!(intt_result, scalars);
+        // check that ntt_result wasn't mutated by the latest `ntt` call
+        assert_eq!(ntt_result_as_ark[1], ntt_result[1].to_ark());
     }
 
     #[test]
-    fn test_batch_ntt() {
-        //NTT
-        let test_size = 1 << 11;
-        let batches = 2;
-
-        let full_test_size = test_size * batches;
-        let scalars_batch: Vec<ScalarField> = generate_random_scalars(full_test_size);
-
-        let mut scalar_vec_of_vec: Vec<Vec<ScalarField>> = Vec::new();
-
-        for i in 0..batches {
-            scalar_vec_of_vec.push(scalars_batch[i * test_size..(i + 1) * test_size].to_vec());
-        }
-
-        let mut ntt_result = scalars_batch.clone();
-
-        // do batch ntt
-        ntt_wip(&mut ntt_result, false, false, Ordering::kNN, false, batches);
-
-        let mut ntt_result_vec_of_vec = Vec::new();
-
-        // do ntt for every chunk
-        for i in 0..batches {
-            ntt_result_vec_of_vec.push(scalar_vec_of_vec[i].clone());
-
-            ntt_wip(&mut ntt_result_vec_of_vec[i], false, false, Ordering::kNN, false, 1);
-        }
-
-        // check that the ntt of each vec of scalars is equal to the ntt of the specific batch
-        for i in 0..batches {
-            assert_eq!(ntt_result_vec_of_vec[i], ntt_result[i * test_size..(i + 1) * test_size]);
-        }
-
-        // check that ntt output is different from input
-        assert_ne!(ntt_result, scalars_batch);
-
-        let mut intt_result = ntt_result.clone();
-
-        // do batch intt
-        // intt_batch(&mut intt_result, test_size, 0);
-        ntt_wip(&mut intt_result, true, false, Ordering::kNN, false, batches);
-
-        let mut intt_result_vec_of_vec = Vec::new();
-
-        // do intt for every chunk
-        for i in 0..batches {
-            intt_result_vec_of_vec.push(ntt_result_vec_of_vec[i].clone());
-            // intt(&mut intt_result_vec_of_vec[i], 0);
-            ntt_wip(&mut intt_result_vec_of_vec[i], true, false, Ordering::kNN, false, 1);
-        }
+    fn test_ntt_coset_from_subgroup() {
+        let test_size = 1 << 16;
+        let small_size = test_size >> 1;
+        let test_size_rou = Fr::get_root_of_unity(test_size as u64).unwrap();
+        let ctx = get_default_device_context();
+        // Two roughly analogous setup calls for icicle and arkworks. One difference is that the icicle call
+        // creates the domain for all NTTs of size <= `test_size`; also, the icicle domain is a hidden static object.
+        initialize_domain(ScalarField::from_ark(test_size_rou), &ctx).unwrap();
+        let ark_small_domain = GeneralEvaluationDomain::<Fr>::new(small_size).unwrap().get_coset(test_size_rou).unwrap();
+        let ark_large_domain = GeneralEvaluationDomain::<Fr>::new(test_size).unwrap();
+
+        let mut scalars: Vec<ScalarField> = generate_random_scalars(small_size);
+
+        let mut config = get_default_ntt_config();
+        config.ordering = Ordering::kNR;
+        let mut ntt_result = vec![ScalarField::zero(); test_size];
+        ntt(&scalars, false, &config, &mut ntt_result[..small_size]).unwrap();
+        assert_ne!(ntt_result[..small_size], scalars);
+        config.coset_gen = ScalarField::from_ark(test_size_rou);
+        ntt(&scalars, false, &config, &mut ntt_result[small_size..]).unwrap();
+        let mut ntt_large_result = vec![ScalarField::zero(); test_size];
+        // Back to a non-coset NTT: reset the coset generator to one and zero-pad the input to `test_size`.
+        config.coset_gen = ScalarField::one();
+        scalars.resize(test_size, ScalarField::zero());
+        ntt(&scalars, false, &config, &mut ntt_large_result).unwrap();
+        assert_eq!(ntt_result, ntt_large_result); // both halves together match the full-size NTT
+
+        let mut ark_scalars = scalars
+            .iter()
+            .map(|v| v.to_ark())
+            .collect::<Vec<Fr>>();
+        let mut ark_large_scalars = ark_scalars.clone();
+        ark_small_domain.fft_in_place(&mut ark_scalars);
+        let ntt_result_as_ark = ntt_result
+            .iter()
+            .map(|p| p.to_ark())
+            .collect::<Vec<Fr>>();
+        assert_eq!(ark_scalars[..small_size], list_to_reverse_bit_order(&ntt_result_as_ark[small_size..]));
+        ark_large_domain.fft_in_place(&mut ark_large_scalars);
+        assert_eq!(ark_large_scalars, list_to_reverse_bit_order(&ntt_result_as_ark));
 
-        // check that the intt of each vec of scalars is equal to the intt of the specific batch
-        for i in 0..batches {
-            assert_eq!(
-                intt_result_vec_of_vec[i],
-                intt_result[i * test_size..(i + 1) * test_size]
-            );
-        }
+        config.coset_gen = ScalarField::from_ark(test_size_rou);
+        config.ordering = Ordering::kRN;
+        let mut intt_result = vec![ScalarField::zero(); small_size];
+        ntt(&ntt_result[small_size..], true, &config, &mut intt_result).unwrap();
+        assert_eq!(intt_result, scalars[..small_size]);
 
-        assert_eq!(intt_result, scalars_batch);
+        ark_small_domain.ifft_in_place(&mut ark_scalars);
+        let intt_result_as_ark = intt_result
+            .iter()
+            .map(|p| p.to_ark())
+            .collect::<Vec<Fr>>();
+        assert_eq!(ark_scalars[..small_size], intt_result_as_ark);
     }
 }