diff --git a/.github/ISSUE_TEMPLATE/bug_issue.md b/.github/ISSUE_TEMPLATE/bug_issue.md index 818d00e14..60b439855 100644 --- a/.github/ISSUE_TEMPLATE/bug_issue.md +++ b/.github/ISSUE_TEMPLATE/bug_issue.md @@ -2,7 +2,7 @@ name: ":bug: Bug Report" about: Create a bug report to help us improve the repo title: "[BUG]: " -labels: bug +labels: type:bug --- ## Description diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index 02a4be3ec..de824ce46 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -2,7 +2,7 @@ name: ":sparkles: Feature Request" about: Request the inclusion of a new feature or functionality title: "[FEAT]: " -labels: enhancement +labels: type:feature --- ## Description diff --git a/.github/workflows/dev-pr.yml b/.github/workflows/dev-pr.yml deleted file mode 100644 index d418ba3d9..000000000 --- a/.github/workflows/dev-pr.yml +++ /dev/null @@ -1,101 +0,0 @@ -name: Dev PR - -on: - pull_request: - branches: - - dev - -env: - CARGO_TERM_COLOR: always - ARCH_TYPE: sm_70 - LD_LIBRARY_PATH: $GITHUB_WORKSPACE/goicicle - -jobs: - build-rust-linux: - name: Build Rust on Linux - runs-on: [self-hosted, Linux, X64, icicle] - steps: - - name: Checkout Repo - uses: actions/checkout@v3 - - name: Build Rust - run: cargo build --release --verbose - - test-rust-linux: - name: Test Rust on Linux - needs: build-rust-linux - runs-on: [self-hosted, Linux, X64, icicle] - steps: - - name: Checkout Repo - uses: actions/checkout@v3 - - name: Run Rust Tests - run: cargo test --release --verbose -- --test-threads=1 - - formatting-rust: - name: Check Rust Code Formatting - runs-on: ubuntu-22.04 - steps: - - name: Checkout - uses: actions/checkout@v3 - - name: Check rustfmt - run: if [[ $(cargo fmt --check) ]]; then echo "Please run cargo fmt"; exit 1; fi - # - name: Check clippy - # run: cargo clippy --no-deps --all-features --all-targets - - build-rust-windows: - name: Build Rust 
on Windows - runs-on: windows-2022 - steps: - - name: Checkout Repo - uses: actions/checkout@v3 - - name: Download and Install Cuda - uses: Jimver/cuda-toolkit@v0.2.11 - with: - cuda: '12.0.0' - method: 'network' - # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html - sub-packages: '["cudart", "nvcc", "thrust"]' - - name: Build Rust Targets - run: cargo build --release --verbose - - test-golang-linux: - name: Test Golang on Linux - runs-on: [self-hosted, Linux, X64, icicle] - steps: - - name: Checkout Repo - uses: actions/checkout@v3 - - name: Build CUDA libs - run: make libbn254.so - working-directory: ./goicicle - - name: Run Golang Tests - run: | - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$(pwd)/goicicle - go test ./goicicle/curves/bn254 -count=1 - # TODO: Fix tests for bls12377 - # TODO: Fix tests for bls12381 - # run: go test ./goicicle/curves/bn254 ./goicicle/curves/bls12377 ./goicicle/curves/bls12381 -count=1 - - formatting-golang: - name: Check Golang Code Formatting - runs-on: ubuntu-22.04 - steps: - - name: Checkout - uses: actions/checkout@v3 - - name: Check gofmt - run: if [[ $(go list ./... 
| xargs go fmt) ]]; then echo "Please run go fmt"; exit 1; fi - - # TODO: Add once Golang make file supports building for Windows - # build-golang-windows: - # name: Build Golang on Windows - # runs-on: windows-2022 - # steps: - # - name: Checkout Repo - # uses: actions/checkout@v3 - # - name: Download and Install Cuda - # uses: Jimver/cuda-toolkit@v0.2.11 - # with: - # cuda: '12.0.0' - # method: 'network' - # # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html - # sub-packages: '["cudart", "nvcc", "thrust"]' - # - name: Build cpp libs - # run: cd goicicle && make all diff --git a/.github/workflows/main-build.yml b/.github/workflows/main-build.yml index c40959a42..fe2e37ae9 100644 --- a/.github/workflows/main-build.yml +++ b/.github/workflows/main-build.yml @@ -5,6 +5,10 @@ on: branches: - main +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + env: CARGO_TERM_COLOR: always ARCH_TYPE: native @@ -59,9 +63,11 @@ jobs: cuda: '12.0.0' method: 'network' # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html - sub-packages: '["cudart", "nvcc", "thrust"]' + sub-packages: '["cudart", "nvcc", "thrust", "visual_studio_integration"]' - name: Build Rust Targets if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true' + env: + CUDA_PATH: ${{steps.cuda-toolkit.outputs.CUDA_PATH}} run: cargo build --release --verbose build-golang-linux: diff --git a/.github/workflows/main-format.yml b/.github/workflows/main-format.yml index 4f0c4d2b2..646d0221f 100644 --- a/.github/workflows/main-format.yml +++ b/.github/workflows/main-format.yml @@ -5,6 +5,10 @@ on: branches: - main +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + jobs: formatting-rust: name: Check Rust Code Formatting @@ -33,6 +37,5 @@ jobs: - name: Checkout uses: actions/checkout@v3 - name: Check 
clang-format - run: | - if [[ $(find ./ -path ./icicle/build -prune -iname *.h -or -iname *.cuh -or -iname *.cu -or -iname *.c -or -iname *.cpp | xargs clang-format --dry-run -ferror-limit=1 -style=file 2>&1) ]]; then echo "Please run clang-format"; exit 1; fi - + run: unformatted_files=$(find ./ \( -path ./icicle/build -o -path ./target \) -prune -o -type f \( -iname '*.h' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.c' -o -iname '*.cpp' \) -print | xargs clang-format --dry-run -ferror-limit=1 -style=file 2>&1); if [[ $unformatted_files ]]; then echo "$unformatted_files"; echo "Please run clang-format"; exit 1; fi + diff --git a/.github/workflows/main-test.yml b/.github/workflows/main-test.yml index 6b13fe21a..4bc2e2c63 100644 --- a/.github/workflows/main-test.yml +++ b/.github/workflows/main-test.yml @@ -5,6 +5,10 @@ on: branches: - main +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + env: CARGO_TERM_COLOR: always ARCH_TYPE: native @@ -57,7 +61,7 @@ jobs: if: needs.check-changed-files.outputs.cpp_cuda == 'true' run: | mkdir -p build - cmake -S . -B build + cmake -DBUILD_TESTS=ON -S . -B build cmake --build build - name: Run C++ Tests working-directory: ./icicle/build diff --git a/README.md b/README.md index f9a55a416..4d95f18b5 100644 --- a/README.md +++ b/README.md @@ -5,13 +5,15 @@ ![image (4)](https://user-images.githubusercontent.com/2446179/223707486-ed8eb5ab-0616-4601-8557-12050df8ccf7.png) -
- -![Build status](https://github.com/ingonyama-zk/icicle/actions/workflows/main-build.yml/badge.svg) -![Discord server](https://img.shields.io/discord/1063033227788423299?label=Discord&logo=Discord&logoColor=%23&style=plastic) -![Follow us on twitter](https://img.shields.io/twitter/follow/Ingo_zk?style=social) - -
+

+ Build status + + Chat with us on Discord + + + Follow us on Twitter + +

## Background @@ -34,6 +36,7 @@ ICICLE is a CUDA implementation of general functions widely used in ZKP. ICICLE - [BLS12-381] - [BLS12-377] - [BN254] + - [BW6-671] ## Build and usage @@ -43,6 +46,10 @@ ICICLE is a CUDA implementation of general functions widely used in ZKP. ICICLE - [NVCC] (version 12.0 or newer) - cmake 3.18 and above - follow [these instructions](https://github.com/ingonyama-zk/icicle/tree/main/icicle#prerequisites-on-ubuntu) +- Any Nvidia GPU + +If you don't have access to a Nvidia GPU check out [google-colab](#google-colab). If you require more compute power and are looking to build or do research with ICICLE refer to our [grant program][GRANT_PROGRAM]. + ### Steps @@ -59,6 +66,14 @@ nvcc -o build/ ./icicle/curves/index.cu -lib -arch=native We are using [googletest] library for testing. To build and run [the test suite](./icicle/README.md) for finite field and elliptic curve arithmetic, run from the `icicle` folder: +For testing, ensure the `BUILD_TESTS` option is enabled in cmake. If not, toggle it on by adding `-DBUILD_TESTS=ON` in the cmake configuration command: + +```sh +cmake -S . -B build -DBUILD_TESTS=ON +``` + +Proceed with the following commands: + ```sh mkdir -p build cmake -S . -B build @@ -68,6 +83,7 @@ cd build && ctest NOTE: If you are using cmake versions < 3.24 add `-DCUDA_ARCH=` to the command `cmake -S . -B build` + ### Rust Bindings For convenience, we also provide rust bindings to the ICICLE library for the following primitives: @@ -82,7 +98,7 @@ For convenience, we also provide rust bindings to the ICICLE library for the fol - Scalar Vector Multiplication - Point Vector Multiplication -A custom [build script][B_SCRIPT] is used to compile and link the ICICLE library. The environement variable `ARCH_TYPE` is used to determine which GPU type the library should be compiled for and it defaults to `native` when it is not set allowing the compiler to detect the installed GPU type. 
+A custom [build script][B_SCRIPT] is used to compile and link the ICICLE library. The environment variable `ARCH_TYPE` is used to determine which GPU type the library should be compiled for and it defaults to `native` when it is not set allowing the compiler to detect the installed GPU type. > NOTE: A GPU must be detectable and therefore installed if the `ARCH_TYPE` is not set. @@ -115,20 +131,21 @@ Create a JSON file with the curve parameters. The curve is defined by the follow - ``curve_name`` - e.g. ``bls12_381``. - ``modulus_p`` - scalar field modulus (in decimal). - ``bit_count_p`` - number of bits needed to represent `` modulus_p`` . -- ``limb_p`` - number of bytes needed to represent `` modulus_p`` (rounded). -- ``ntt_size`` - log of the maximal size subgroup of the scalar field. +- ``limb_p`` - number of (32-bit) limbs needed to represent `` modulus_p`` (rounded up). +- ``ntt_size`` - log of the maximal size subgroup of the scalar field. - ``modulus_q`` - base field modulus (in decimal). - ``bit_count_q`` - number of bits needed to represent `` modulus_q`` . -- ``limb_q`` number of bytes needed to represent `` modulus_p`` (rounded). -- ``weierstrass_b`` - Weierstrauss constant of the curve. -- ``weierstrass_b_g2_re`` - Weierstrauss real constant of the g2 curve. -- ``weierstrass_b_g2_im`` - Weierstrauss imaginary constant of the g2 curve. -- ``gen_x`` - x-value of a generator element for the curve. -- ``gen_y`` - y-value of a generator element for the curve. -- ``gen_x_re`` - real x-value of a generator element for the g2 curve. -- ``gen_x_im`` - imaginary x-value of a generator element for the g2 curve. -- ``gen_y_re`` - real y-value of a generator element for the g2 curve. -- ``gen_y_im`` - imaginary y-value of a generator element for the g2 curve. +- ``limb_q`` - number of (32-bit) limbs needed to represent `` modulus_q`` (rounded up). +- ``weierstrass_b`` - `b` of the curve in Weierstrass form.
+- ``weierstrass_b_g2_re`` - real part of the `b` value of the g2 curve in Weierstrass form. +- ``weierstrass_b_g2_im`` - imaginary part of the `b` value of the g2 curve in Weierstrass form. +- ``gen_x`` - `x` coordinate of a generator element for the curve. +- ``gen_y`` - `y` coordinate of a generator element for the curve. +- ``gen_x_re`` - real part of the `x` coordinate of a generator element for the g2 curve. +- ``gen_x_im`` - imaginary part of the `x` coordinate of a generator element for the g2 curve. +- ``gen_y_re`` - real part of the `y` coordinate of a generator element for the g2 curve. +- ``gen_y_im`` - imaginary part of the `y` coordinate of a generator element for the g2 curve. +- ``nonresidue`` - nonresidue, or `i^2`, or `u^2` - square of the element that generates the quadratic extension field of the base field. Here's an example for BLS12-381. ``` @@ -142,14 +159,15 @@ Here's an example for BLS12-381. "bit_count_q" : 381, "limb_q" : 12, "weierstrass_b" : 4, - "weierstrass_b_g2_re":4, - "weierstrass_b_g2_im":4, + "weierstrass_b_g2_re" : 4, + "weierstrass_b_g2_im" : 4, "gen_x" : 3685416753713387016781088315183077757961620795782546409894578378688607592378376318836054947676345821548104185464507, "gen_y" : 1339506544944476473020471379941921221584933875938349620426543736416511423956333506472724655353366534992391756441569, "gen_x_re" : 352701069587466618187139116011060144890029952792775240219908644239793785735715026873347600343865175952761926303160, "gen_x_im" : 3059144344244213709971259814753781636986470325476647558659373206291635324768958432433509563104347017837885763365758, "gen_y_re" : 1985150602287291935568054521177171638300868978215655730859378665066344726373823718423869104263333984641494340347905, - "gen_y_im" : 927553665492332455747201965776037880757740193453592970025027978793976877002675564980949289727957565575433344219582 + "gen_y_im" : 927553665492332455747201965776037880757740193453592970025027978793976877002675564980949289727957565575433344219582,
+ "nonresidue" : -1 } ``` @@ -167,17 +185,27 @@ The script does the following: - Creates a file with the curve name in ``src/curves`` with the relevant objects for the curve. - Creates a test file with the curve name in ``src``. +Also, the files from ``./icicle/curves/<curve_name>/supported_operations.cu`` should be added individually to the ``add_library`` section of [``./icicle/CMakeLists.txt``][CMAKELISTS]. + Testing the new curve could be done by running the tests in ``tests_curve_name`` (e.g. ``tests_bls12_381``). ## Docker -We offer a simple Docker container so you can simply run ICICLE without settig everything up locally. +We offer a simple Docker container so you can simply run ICICLE without setting everything up locally. ``` docker build -t <image_name> . docker run --gpus all -it <image_name> /bin/bash ``` +## Google Colab + +[Colab](https://colab.google/) is a hosted Jupyter Notebook service that requires no setup to use and provides free access to computing resources including GPUs! + +You can easily run ICICLE in Google Colab on a free GPU instance; this is a great option for those who want to get started with ICICLE instantly without any local setup or GPU. + +Follow this [guide][GOOGLE_COLAB_ICICLE] for more details. + ## Contributions Join our [Discord Server][DISCORD] and find us on the icicle channel. We will be happy to work together to support your use case and talk features, bugs and design. @@ -190,12 +218,19 @@ If you are changing code, please make sure to change your [git hooks path][HOOKS git config core.hooksPath ./scripts/hooks ``` +In case `clang-format` is missing on your system, you can install it using the following command: + +```sh +sudo apt install clang-format +``` + This will ensure our custom hooks are run and will make it easier to follow our coding guidelines.
### Hall of Fame -- [Robik](https://github.com/robik75), for his on-going support and mentorship +- [Robik](https://github.com/robik75), for his ongoing support and mentorship - [liuxiao](https://github.com/liuxiaobleach), for being a top notch bug smasher +- [gkigiermo](https://github.com/gkigiermo), for making it intuitive to use ICICLE in Google Colab. ## Help & Support @@ -212,6 +247,7 @@ See [LICENSE-MIT][LMIT] for details. [BLS12-381]: ./icicle/curves/bls12_381/supported_operations.cu [BLS12-377]: ./icicle/curves/bls12_377/supported_operations.cu [BN254]: ./icicle/curves/bn254/supported_operations.cu +[BW6-671]: ./icicle/curves/bw6_761/supported_operations.cu [NVCC]: https://docs.nvidia.com/cuda/#installation-guides [CRV_TEMPLATE]: ./icicle/curves/curve_template/ [CRV_CONFIG]: ./icicle/curves/index.cu @@ -222,5 +258,8 @@ See [LICENSE-MIT][LMIT] for details. [googletest]: https://github.com/google/googletest/ [HOOKS_DOCS]: https://git-scm.com/docs/githooks [HOOKS_PATH]: ./scripts/hooks/ +[CMAKELISTS]: https://github.com/ingonyama-zk/icicle/blob/f0e6b465611227b858ec4590f4de5432e892748d/icicle/CMakeLists.txt#L28 +[GOOGLE_COLAB_ICICLE]: https://github.com/gkigiermo/rust-cuda-colab +[GRANT_PROGRAM]: https://docs.google.com/forms/d/e/1FAIpQLSc967TnNwxZZ4akejcSi4KOUmGrEc68ZZV-FHLfo8KnP1wbpg/viewform diff --git a/curve_parameters/bls12_377.json b/curve_parameters/bls12_377.json index 22814b746..ae896f1fd 100644 --- a/curve_parameters/bls12_377.json +++ b/curve_parameters/bls12_377.json @@ -3,7 +3,7 @@ "modulus_p" : 8444461749428370424248824938781546531375899335154063827935233455917409239041, "bit_count_p" : 253, "limb_p" : 8, - "ntt_size" : 32, + "ntt_size" : 47, "modulus_q" : 258664426012969094010652733694893533536393512754914660539884262666720468348340822774968888139573360124440321458177, "bit_count_q" : 377, "limb_q" : 12, @@ -16,5 +16,6 @@ "g2_gen_x_re" :
233578398248691099356572568220835526895379068987715365179118596935057653620464273615301663571204657964920925606294, "g2_gen_x_im" : 140913150380207355837477652521042157274541796891053068589147167627541651775299824604154852141315666357241556069118, "g2_gen_y_re" : 63160294768292073209381361943935198908131692476676907196754037919244929611450776219210369229519898517858833747423, - "g2_gen_y_im" : 149157405641012693445398062341192467754805999074082136895788947234480009303640899064710353187729182149407503257491 + "g2_gen_y_im" : 149157405641012693445398062341192467754805999074082136895788947234480009303640899064710353187729182149407503257491, + "nonresidue" : -5 } \ No newline at end of file diff --git a/curve_parameters/bls12_381.json b/curve_parameters/bls12_381.json index f7557bab1..ddbce2931 100644 --- a/curve_parameters/bls12_381.json +++ b/curve_parameters/bls12_381.json @@ -16,5 +16,6 @@ "g2_gen_x_re" : 352701069587466618187139116011060144890029952792775240219908644239793785735715026873347600343865175952761926303160, "g2_gen_x_im" : 3059144344244213709971259814753781636986470325476647558659373206291635324768958432433509563104347017837885763365758, "g2_gen_y_re" : 1985150602287291935568054521177171638300868978215655730859378665066344726373823718423869104263333984641494340347905, - "g2_gen_y_im" : 927553665492332455747201965776037880757740193453592970025027978793976877002675564980949289727957565575433344219582 + "g2_gen_y_im" : 927553665492332455747201965776037880757740193453592970025027978793976877002675564980949289727957565575433344219582, + "nonresidue" : -1 } \ No newline at end of file diff --git a/curve_parameters/bn254.json b/curve_parameters/bn254.json index 4fcaa16d0..766e0a416 100644 --- a/curve_parameters/bn254.json +++ b/curve_parameters/bn254.json @@ -16,5 +16,6 @@ "g2_gen_x_re" : 10857046999023057135944570762232829481370756359578518086990519993285655852781, "g2_gen_x_im" : 11559732032986387107991004021392285783925812861821192530917403151452391805634, 
"g2_gen_y_re" : 8495653923123431417604973247489272438418190587263600148770280649306958101930, - "g2_gen_y_im" : 4082367875863433681332203403145435568316851327593401208105741076214120093531 + "g2_gen_y_im" : 4082367875863433681332203403145435568316851327593401208105741076214120093531, + "nonresidue" : -1 } \ No newline at end of file diff --git a/curve_parameters/bw6-761.json b/curve_parameters/bw6-761.json new file mode 100644 index 000000000..71fdb1690 --- /dev/null +++ b/curve_parameters/bw6-761.json @@ -0,0 +1,21 @@ +{ + "curve_name" : "bw6_761", + "modulus_p" : 258664426012969094010652733694893533536393512754914660539884262666720468348340822774968888139573360124440321458177, + "bit_count_p" : 377, + "limb_p" : 12, + "ntt_size" : 46, + "modulus_q" : 6891450384315732539396789682275657542479668912536150109513790160209623422243491736087683183289411687640864567753786613451161759120554247759349511699125301598951605099378508850372543631423596795951899700429969112842764913119068299, + "bit_count_q" : 761, + "limb_q" : 24, + "root_of_unity" : 32863578547254505029601261939868325669770508939375122462904745766352256812585773382134936404344547323199885654433, + "weierstrass_b" : 6891450384315732539396789682275657542479668912536150109513790160209623422243491736087683183289411687640864567753786613451161759120554247759349511699125301598951605099378508850372543631423596795951899700429969112842764913119068298, + "weierstrass_b_g2_re" : 4, + "weierstrass_b_g2_im" : 0, + "g1_gen_x" : 6238772257594679368032145693622812838779005809760824733138787810501188623461307351759238099287535516224314149266511977132140828635950940021790489507611754366317801811090811367945064510304504157188661901055903167026722666149426237, + "g1_gen_y" : 2101735126520897423911504562215834951148127555913367997162789335052900271653517958562461315794228241561913734371411178226936527683203879553093934185950470971848972085321797958124416462268292467002957525517188485984766314758624099, + "g2_gen_x_re" : 
6445332910596979336035888152774071626898886139774101364933948236926875073754470830732273879639675437155036544153105017729592600560631678554299562762294743927912429096636156401171909259073181112518725201388196280039960074422214428, + "g2_gen_x_im" : 1, + "g2_gen_y_re" : 562923658089539719386922163444547387757586534741080263946953401595155211934630598999300396317104182598044793758153214972605680357108252243146746187917218885078195819486220416605630144001533548163105316661692978285266378674355041, + "g2_gen_y_im" : 1, + "nonresidue" : -1 +} diff --git a/curve_parameters/new_curve_script.py b/curve_parameters/new_curve_script.py index ea8c4ac13..5220a3553 100644 --- a/curve_parameters/new_curve_script.py +++ b/curve_parameters/new_curve_script.py @@ -17,7 +17,7 @@ def to_hex(val: int, length): n = 8 chunks = [x[i:i+n] for i in range(0, len(x), n)][::-1] s = "" - for c in chunks: + for c in chunks[:length // n]: s += f'0x{c}, ' return s[:-2] @@ -30,15 +30,15 @@ def compute_values(modulus, modulus_bit_count, limbs): modulus_2 = to_hex(modulus*2,limb_size) modulus_4 = to_hex(modulus*4,limb_size) modulus_wide = to_hex(modulus,limb_size*2) - modulus_squared = to_hex(modulus*modulus,limb_size) - modulus_squared_2 = to_hex(modulus*modulus*2,limb_size) - modulus_squared_4 = to_hex(modulus*modulus*4,limb_size) + modulus_squared = to_hex(modulus*modulus,limb_size*2) + modulus_squared_2 = to_hex(modulus*modulus*2,limb_size*2) + modulus_squared_4 = to_hex(modulus*modulus*4,limb_size*2) m_raw = int(math.floor(int(pow(2,2*modulus_bit_count) // modulus))) m = to_hex(m_raw,limb_size) one = to_hex(1,limb_size) zero = to_hex(0,limb_size) - montgomery_r = to_hex((2 ** bit_size) % modulus, limb_size) - montgomery_r_inv = to_hex(((modulus+1)//2)**bit_size % modulus, limb_size) + montgomery_r = to_hex(pow(2,bit_size,modulus),limb_size) + montgomery_r_inv = to_hex(pow(2,-bit_size,modulus),limb_size) return ( modulus_, @@ -56,7 +56,7 @@ def compute_values(modulus, modulus_bit_count, limbs): ) 
-def get_fq_params(modulus, modulus_bit_count, limbs, g1_gen_x, g1_gen_y, g2_gen_x_re, g2_gen_x_im, g2_gen_y_re, g2_gen_y_im): +def get_fq_params(modulus, modulus_bit_count, limbs, nonresidue): ( modulus, modulus_2, @@ -73,6 +73,8 @@ def get_fq_params(modulus, modulus_bit_count, limbs, g1_gen_x, g1_gen_y, g2_gen_ ) = compute_values(modulus, modulus_bit_count, limbs) limb_size = 8*limbs + nonresidue_is_negative = str(nonresidue < 0).lower() + nonresidue = abs(nonresidue) return { 'fq_modulus': modulus, 'fq_modulus_2': modulus_2, @@ -86,12 +88,8 @@ def get_fq_params(modulus, modulus_bit_count, limbs, g1_gen_x, g1_gen_y, g2_gen_ 'fq_zero': zero, 'fq_montgomery_r': montgomery_r, 'fq_montgomery_r_inv': montgomery_r_inv, - 'fq_gen_x': to_hex(g1_gen_x, limb_size), - 'fq_gen_y': to_hex(g1_gen_y, limb_size), - 'fq_gen_x_re': to_hex(g2_gen_x_re, limb_size), - 'fq_gen_x_im': to_hex(g2_gen_x_im, limb_size), - 'fq_gen_y_re': to_hex(g2_gen_y_re, limb_size), - 'fq_gen_y_im': to_hex(g2_gen_y_im, limb_size) + 'nonresidue': nonresidue, + 'nonresidue_is_negative': nonresidue_is_negative } @@ -151,6 +149,18 @@ def get_fp_params(modulus, modulus_bit_count, limbs, root_of_unity, size=0): } +def get_generators(g1_gen_x, g1_gen_y, g2_gen_x_re, g2_gen_x_im, g2_gen_y_re, g2_gen_y_im, size): + + return { + 'fq_gen_x': to_hex(g1_gen_x, size), + 'fq_gen_y': to_hex(g1_gen_y, size), + 'fq_gen_x_re': to_hex(g2_gen_x_re, size), + 'fq_gen_x_im': to_hex(g2_gen_x_im, size), + 'fq_gen_y_re': to_hex(g2_gen_y_re, size), + 'fq_gen_y_im': to_hex(g2_gen_y_im, size) + } + + def get_weier_params(weierstrass_b, weierstrass_b_g2_re, weierstrass_b_g2_im, size): return { @@ -171,6 +181,7 @@ def get_params(config): bit_count_q = config["bit_count_q"] limb_q = config["limb_q"] root_of_unity = config["root_of_unity"] + nonresidue = config["nonresidue"] if root_of_unity == modulus_p: sys.exit("Invalid root_of_unity value; please update in curve parameters") @@ -194,13 +205,15 @@ def get_params(config): } fp_params = 
get_fp_params(modulus_p, bit_count_p, limb_p, root_of_unity, ntt_size) - fq_params = get_fq_params(modulus_q, bit_count_q, limb_q, g1_gen_x, g1_gen_y, g2_generator_x_re, g2_generator_x_im, g2_generator_y_re, g2_generator_y_im) + fq_params = get_fq_params(modulus_q, bit_count_q, limb_q, nonresidue) + generators = get_generators(g1_gen_x, g1_gen_y, g2_generator_x_re, g2_generator_x_im, g2_generator_y_re, g2_generator_y_im, 8*limb_q) weier_params = get_weier_params(weierstrass_b, weierstrass_b_g2_re, weierstrass_b_g2_im, 8*limb_q) return { **params, **fp_params, **fq_params, + **generators, **weier_params } diff --git a/goicicle/Makefile b/goicicle/Makefile index 0e11c9112..a12860cc0 100644 --- a/goicicle/Makefile +++ b/goicicle/Makefile @@ -5,20 +5,25 @@ LDFLAGS = -shared FEATURES = -DG2_DEFINED TARGET_BN254 = libbn254.so +TARGET_BW6761 = libbw6761.so TARGET_BLS12_381 = libbls12_381.so TARGET_BLS12_377 = libbls12_377.so -VPATH = ../icicle/curves/bn254:../icicle/curves/bls12_377:../icicle/curves/bls12_381 +VPATH = ../icicle/curves/bn254:../icicle/curves/bls12_377:../icicle/curves/bls12_381:../icicle/curves/bw6_761 SRCS_BN254 = lde.cu msm.cu projective.cu ve_mod_mult.cu +SRCS_BW6761 = lde.cu msm.cu projective.cu ve_mod_mult.cu SRCS_BLS12_381 = lde.cu msm.cu projective.cu ve_mod_mult.cu poseidon.cu SRCS_BLS12_377 = lde.cu msm.cu projective.cu ve_mod_mult.cu -all: $(TARGET_BN254) $(TARGET_BLS12_381) $(TARGET_BLS12_377) +all: $(TARGET_BN254) $(TARGET_BLS12_381) $(TARGET_BLS12_377) $(TARGET_BW6761) $(TARGET_BN254): $(NVCC) $(FEATURES) $(CFLAGS) $(LDFLAGS) $(addprefix ../icicle/curves/bn254/, $(SRCS_BN254)) -o $@ +$(TARGET_BW6761): + $(NVCC) $(FEATURES) $(CFLAGS) $(LDFLAGS) $(addprefix ../icicle/curves/bw6_761/, $(SRCS_BW6761)) -o $@ + $(TARGET_BLS12_381): $(NVCC) $(FEATURES) $(CFLAGS) $(LDFLAGS) $(addprefix ../icicle/curves/bls12_381/, $(SRCS_BLS12_381)) -o $@ @@ -26,4 +31,4 @@ $(TARGET_BLS12_377): $(NVCC) $(FEATURES) $(CFLAGS) $(LDFLAGS) $(addprefix 
../icicle/curves/bls12_377/, $(SRCS_BLS12_377)) -o $@ clean: - rm -f $(TARGET_BN254) $(TARGET_BLS12_381) $(TARGET_BLS12_377) + rm -f $(TARGET_BN254) $(TARGET_BLS12_381) $(TARGET_BLS12_377) $(TARGET_BW6761) diff --git a/goicicle/README.md b/goicicle/README.md index 82995ecb8..623b3953a 100644 --- a/goicicle/README.md +++ b/goicicle/README.md @@ -11,13 +11,13 @@ To compile the CUDA files, you will need: ## Structure of the Makefile -The Makefile is designed to compile CUDA files for three curves: BN254, BLS12_381, and BLS12_377. The source files are located in the `icicle/curves/` directory. +The Makefile is designed to compile CUDA files for four curves: BN254, BLS12_381, BLS12_377 and BW6_761. The source files are located in the `icicle/curves/` directory. ## Compiling CUDA Code 1. Navigate to the directory containing the Makefile in your terminal. -2. To compile all curve libraries, use the `make all` command. This will create three shared libraries: `libbn254.so`, `libbls12_381.so`, and `libbls12_377.so`. -3. If you want to compile a specific curve, you can do so by specifying the target. For example, to compile only the BN254 curve, use `make libbn254.so`. Replace `libbn254.so` with `libbls12_381.so` or `libbls12_377.so` to compile those curves instead. +2. To compile all curve libraries, use the `make all` command. This will create four shared libraries: `libbn254.so`, `libbls12_381.so`, `libbls12_377.so` and `libbw6761.so`. +3. If you want to compile a specific curve, you can do so by specifying the target. For example, to compile only the BN254 curve, use `make libbn254.so`. Replace `libbn254.so` with `libbls12_381.so`, `libbls12_377.so` or `libbw6761.so` to compile those curves instead. The resulting `.so` files are the compiled shared libraries for each curve. @@ -25,13 +25,13 @@ The resulting `.so` files are the compiled shared libraries for each curve.
The shared libraries produced from the CUDA code compilation are used to bind Golang to ICICLE's CUDA code. -1. These shared libraries (`libbn254.so`, `libbls12_381.so`, `libbls12_377.so`) can be imported in your Go project to leverage the GPU accelerated functionalities provided by ICICLE. +1. These shared libraries (`libbn254.so`, `libbls12_381.so`, `libbls12_377.so`, `libbw6761.so`) can be imported in your Go project to leverage the GPU accelerated functionalities provided by ICICLE. 2. In your Go project, you can use `cgo` to link these shared libraries. Here's a basic example on how you can use `cgo` to link these libraries: ```go /* -#cgo LDFLAGS: -L/path/to/shared/libs -lbn254 -lbls12_381 -lbls12_377 +#cgo LDFLAGS: -L/path/to/shared/libs -lbn254 -lbls12_381 -lbls12_377 -lbw6761 #include "icicle.h" // make sure you use the correct header file(s) */ import "C" @@ -46,7 +46,7 @@ Replace `/path/to/shared/libs` with the actual path where the shared libraries a ## Cleaning up -If you want to remove the compiled files, you can use the `make clean` command. This will remove the `libbn254.so`, `libbls12_381.so`, and `libbls12_377.so` files. +If you want to remove the compiled files, you can use the `make clean` command. This will remove the `libbn254.so`, `libbls12_381.so`, `libbls12_377.so` and `libbw6761.so` files.
## Common issues diff --git a/goicicle/curves/bls12377/g2.go b/goicicle/curves/bls12377/g2.go index 92ca068ca..837354719 100644 --- a/goicicle/curves/bls12377/g2.go +++ b/goicicle/curves/bls12377/g2.go @@ -84,17 +84,6 @@ func (f *G2Element) ToBytesLe() []byte { return bytes } -func (p *G2PointAffine) ToProjective() G2Point { - return G2Point{ - X: p.X, - Y: p.Y, - Z: ExtentionField{ - A0: G2Element{1, 0, 0, 0}, - A1: G2Element{0, 0, 0, 0}, - }, - } -} - func (p *G2PointAffine) FromProjective(projective *G2Point) *G2PointAffine { out := (*C.BLS12_377_g2_affine_t)(unsafe.Pointer(p)) in := (*C.BLS12_377_g2_projective_t)(unsafe.Pointer(projective)) diff --git a/goicicle/curves/bls12377/g2_test.go b/goicicle/curves/bls12377/g2_test.go index b1cc5fd13..f6652c9e9 100644 --- a/goicicle/curves/bls12377/g2_test.go +++ b/goicicle/curves/bls12377/g2_test.go @@ -71,7 +71,8 @@ func TestG2ShouldConvertToProjective(t *testing.T) { var pointAffine G2PointAffine pointAffine.FromProjective(&pointProjective) - proj := pointAffine.ToProjective() + var proj G2Point + proj.FromAffine(&pointAffine) assert.True(t, proj.IsOnCurve()) assert.True(t, pointProjective.Eq(&proj)) diff --git a/goicicle/curves/bls12377/msm_test.go b/goicicle/curves/bls12377/msm_test.go index f8c53ff3e..6382c755c 100644 --- a/goicicle/curves/bls12377/msm_test.go +++ b/goicicle/curves/bls12377/msm_test.go @@ -179,7 +179,7 @@ func BenchmarkCommit(b *testing.B) { e := Commit(out_d, scalars_d, points_d, msmSize, 10) if e != 0 { - panic("Error occured") + panic("Error occurred") } } }) @@ -226,7 +226,7 @@ func BenchmarkMSM(b *testing.B) { _, e := Msm(out, points, scalars, 0) if e != nil { - panic("Error occured") + panic("Error occurred") } } }) @@ -288,7 +288,7 @@ func BenchmarkMsmG2BLS12_377(b *testing.B) { _, e := MsmG2(out, points, scalars, 0) if e != nil { - panic("Error occured") + panic("Error occurred") } } }) diff --git a/goicicle/curves/bls12381/g2.go b/goicicle/curves/bls12381/g2.go index cb62eb814..3ba78cb7d 
100644 --- a/goicicle/curves/bls12381/g2.go +++ b/goicicle/curves/bls12381/g2.go @@ -84,17 +84,6 @@ func (f *G2Element) ToBytesLe() []byte { return bytes } -func (p *G2PointAffine) ToProjective() G2Point { - return G2Point{ - X: p.X, - Y: p.Y, - Z: ExtentionField{ - A0: G2Element{1, 0, 0, 0}, - A1: G2Element{0, 0, 0, 0}, - }, - } -} - func (p *G2PointAffine) FromProjective(projective *G2Point) *G2PointAffine { out := (*C.BLS12_381_g2_affine_t)(unsafe.Pointer(p)) in := (*C.BLS12_381_g2_projective_t)(unsafe.Pointer(projective)) diff --git a/goicicle/curves/bls12381/g2_test.go b/goicicle/curves/bls12381/g2_test.go index c3b3035d1..38311b458 100644 --- a/goicicle/curves/bls12381/g2_test.go +++ b/goicicle/curves/bls12381/g2_test.go @@ -71,7 +71,8 @@ func TestG2ShouldConvertToProjective(t *testing.T) { var pointAffine G2PointAffine pointAffine.FromProjective(&pointProjective) - proj := pointAffine.ToProjective() + var proj G2Point + proj.FromAffine(&pointAffine) assert.True(t, proj.IsOnCurve()) assert.True(t, pointProjective.Eq(&proj)) diff --git a/goicicle/curves/bls12381/msm_test.go b/goicicle/curves/bls12381/msm_test.go index 6a12db32d..15e1f0971 100644 --- a/goicicle/curves/bls12381/msm_test.go +++ b/goicicle/curves/bls12381/msm_test.go @@ -179,7 +179,7 @@ func BenchmarkCommit(b *testing.B) { e := Commit(out_d, scalars_d, points_d, msmSize, 10) if e != 0 { - panic("Error occured") + panic("Error occurred") } } }) @@ -226,7 +226,7 @@ func BenchmarkMSM(b *testing.B) { _, e := Msm(out, points, scalars, 0) if e != nil { - panic("Error occured") + panic("Error occurred") } } }) @@ -288,7 +288,7 @@ func BenchmarkMsmG2BLS12_381(b *testing.B) { _, e := MsmG2(out, points, scalars, 0) if e != nil { - panic("Error occured") + panic("Error occurred") } } }) diff --git a/goicicle/curves/bn254/g2.go b/goicicle/curves/bn254/g2.go index 2a54b493d..c48fdea63 100644 --- a/goicicle/curves/bn254/g2.go +++ b/goicicle/curves/bn254/g2.go @@ -84,17 +84,6 @@ func (f *G2Element) ToBytesLe() 
[]byte { return bytes } -func (p *G2PointAffine) ToProjective() G2Point { - return G2Point{ - X: p.X, - Y: p.Y, - Z: ExtentionField{ - A0: G2Element{1, 0, 0, 0}, - A1: G2Element{0, 0, 0, 0}, - }, - } -} - func (p *G2PointAffine) FromProjective(projective *G2Point) *G2PointAffine { out := (*C.BN254_g2_affine_t)(unsafe.Pointer(p)) in := (*C.BN254_g2_projective_t)(unsafe.Pointer(projective)) diff --git a/goicicle/curves/bn254/g2_test.go b/goicicle/curves/bn254/g2_test.go index c0198cd35..a19538714 100644 --- a/goicicle/curves/bn254/g2_test.go +++ b/goicicle/curves/bn254/g2_test.go @@ -71,7 +71,8 @@ func TestG2ShouldConvertToProjective(t *testing.T) { var pointAffine G2PointAffine pointAffine.FromProjective(&pointProjective) - proj := pointAffine.ToProjective() + var proj G2Point + proj.FromAffine(&pointAffine) assert.True(t, proj.IsOnCurve()) assert.True(t, pointProjective.Eq(&proj)) diff --git a/goicicle/curves/bn254/msm_test.go b/goicicle/curves/bn254/msm_test.go index 73cb41ab6..c8f04346e 100644 --- a/goicicle/curves/bn254/msm_test.go +++ b/goicicle/curves/bn254/msm_test.go @@ -179,7 +179,7 @@ func BenchmarkCommit(b *testing.B) { e := Commit(out_d, scalars_d, points_d, msmSize, 10) if e != 0 { - panic("Error occured") + panic("Error occurred") } } }) @@ -226,7 +226,7 @@ func BenchmarkMSM(b *testing.B) { _, e := Msm(out, points, scalars, 0) if e != nil { - panic("Error occured") + panic("Error occurred") } } }) @@ -288,7 +288,7 @@ func BenchmarkMsmG2BN254(b *testing.B) { _, e := MsmG2(out, points, scalars, 0) if e != nil { - panic("Error occured") + panic("Error occurred") } } }) diff --git a/goicicle/curves/bw6761/g1.go b/goicicle/curves/bw6761/g1.go new file mode 100644 index 000000000..4b69ba05e --- /dev/null +++ b/goicicle/curves/bw6761/g1.go @@ -0,0 +1,328 @@ +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +package bw6761 + +import ( + "unsafe" + + "encoding/binary" +) + +// #cgo CFLAGS: -I./include/ +// #cgo CFLAGS: -I/usr/local/cuda/include +// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbw6761 +// #include "projective.h" +// #include "ve_mod_mult.h" +import "C" + +const SCALAR_SIZE = 12 +const BASE_SIZE = 24 + +type G1ScalarField struct { + S [SCALAR_SIZE]uint32 +} + +type G1BaseField struct { + S [BASE_SIZE]uint32 +} + +/* + * BaseField Constructors + */ + +func (f *G1BaseField) SetZero() *G1BaseField { + var S [BASE_SIZE]uint32 + f.S = S + + return f +} + +func (f *G1BaseField) SetOne() *G1BaseField { + var S [BASE_SIZE]uint32 + + S[0] = 1 + + f.S = S + return f +} + +func (p *G1ProjectivePoint) FromAffine(affine *G1PointAffine) *G1ProjectivePoint { + out := (*C.BW6761_projective_t)(unsafe.Pointer(p)) + in := (*C.BW6761_affine_t)(unsafe.Pointer(affine)) + + C.projective_from_affine_bw6_761(out, in) + + return p +} + +func (f *G1BaseField) FromLimbs(limbs [BASE_SIZE]uint32) *G1BaseField { + copy(f.S[:], limbs[:]) + + return f +} + +/* + * BaseField methods + */ + +func (f *G1BaseField) Limbs() [BASE_SIZE]uint32 { + return f.S +} + +func (f *G1BaseField) ToBytesLe() []byte { + bytes := make([]byte, len(f.S)*4) + for i, v := range f.S { + binary.LittleEndian.PutUint32(bytes[i*4:], v) + } + + return bytes +} + +/* + * ScalarField methods + */ + +func (p *G1ScalarField) Random() *G1ScalarField { + outC := (*C.BW6761_scalar_t)(unsafe.Pointer(p)) + C.random_scalar_bw6_761(outC) + + return p +} + +func (f 
*G1ScalarField) SetZero() *G1ScalarField { + var S [SCALAR_SIZE]uint32 + f.S = S + + return f +} + +func (f *G1ScalarField) SetOne() *G1ScalarField { + var S [SCALAR_SIZE]uint32 + S[0] = 1 + f.S = S + + return f +} + +func (a *G1ScalarField) Eq(b *G1ScalarField) bool { + for i, v := range a.S { + if b.S[i] != v { + return false + } + } + return true +} + +/* + * ScalarField methods + */ + +func (f *G1ScalarField) Limbs() [SCALAR_SIZE]uint32 { + return f.S +} + +func (f *G1ScalarField) ToBytesLe() []byte { + bytes := make([]byte, len(f.S)*4) + for i, v := range f.S { + binary.LittleEndian.PutUint32(bytes[i*4:], v) + } + + return bytes +} + +/* + * PointBW6761 + */ + +type G1ProjectivePoint struct { + X, Y, Z G1BaseField +} + +func (f *G1ProjectivePoint) SetZero() *G1ProjectivePoint { + var yOne G1BaseField + yOne.SetOne() + + var xZero G1BaseField + xZero.SetZero() + + var zZero G1BaseField + zZero.SetZero() + + f.X = xZero + f.Y = yOne + f.Z = zZero + + return f +} + +func (p *G1ProjectivePoint) Eq(pCompare *G1ProjectivePoint) bool { + // Cast *G1ProjectivePoint to *C.BW6761_projective_t + // The unsafe.Pointer cast is necessary because Go doesn't allow direct casts + // between different pointer types. + // It's your responsibility to ensure that the types are compatible. + pC := (*C.BW6761_projective_t)(unsafe.Pointer(p)) + pCompareC := (*C.BW6761_projective_t)(unsafe.Pointer(pCompare)) + + // Call the C function + // The C function doesn't keep any references to the data, + // so it's fine if the Go garbage collector moves or deletes the data later. 
+ return bool(C.eq_bw6_761(pC, pCompareC)) +} + +func (p *G1ProjectivePoint) IsOnCurve() bool { + point := (*C.BW6761_projective_t)(unsafe.Pointer(p)) + res := C.projective_is_on_curve_bw6_761(point) + + return bool(res) +} + +func (p *G1ProjectivePoint) Random() *G1ProjectivePoint { + outC := (*C.BW6761_projective_t)(unsafe.Pointer(p)) + C.random_projective_bw6_761(outC) + + return p +} + +func (p *G1ProjectivePoint) StripZ() *G1PointAffine { + return &G1PointAffine{ + X: p.X, + Y: p.Y, + } +} + +func (p *G1ProjectivePoint) FromLimbs(x, y, z *[]uint32) *G1ProjectivePoint { + var _x G1BaseField + var _y G1BaseField + var _z G1BaseField + + _x.FromLimbs(GetFixedLimbs(x)) + _y.FromLimbs(GetFixedLimbs(y)) + _z.FromLimbs(GetFixedLimbs(z)) + + p.X = _x + p.Y = _y + p.Z = _z + + return p +} + +/* + * PointAffineNoInfinityBW6761 + */ + +type G1PointAffine struct { + X, Y G1BaseField +} + +func (p *G1PointAffine) FromProjective(projective *G1ProjectivePoint) *G1PointAffine { + in := (*C.BW6761_projective_t)(unsafe.Pointer(projective)) + out := (*C.BW6761_affine_t)(unsafe.Pointer(p)) + + C.projective_to_affine_bw6_761(out, in) + + return p +} + +func (p *G1PointAffine) ToProjective() *G1ProjectivePoint { + var Z G1BaseField + Z.SetOne() + + return &G1ProjectivePoint{ + X: p.X, + Y: p.Y, + Z: Z, + } +} + +func (p *G1PointAffine) FromLimbs(X, Y *[]uint32) *G1PointAffine { + var _x G1BaseField + var _y G1BaseField + + _x.FromLimbs(GetFixedLimbs(X)) + _y.FromLimbs(GetFixedLimbs(Y)) + + p.X = _x + p.Y = _y + + return p +} + +/* + * Multiplication + */ + +func MultiplyVec(a []G1ProjectivePoint, b []G1ScalarField, deviceID int) { + if len(a) != len(b) { + panic("a and b have different lengths") + } + + pointsC := (*C.BW6761_projective_t)(unsafe.Pointer(&a[0])) + scalarsC := (*C.BW6761_scalar_t)(unsafe.Pointer(&b[0])) + deviceIdC := C.size_t(deviceID) + nElementsC := C.size_t(len(a)) + + C.vec_mod_mult_point_bw6_761(pointsC, scalarsC, nElementsC, deviceIdC) +} + +func 
MultiplyScalar(a []G1ScalarField, b []G1ScalarField, deviceID int) { + if len(a) != len(b) { + panic("a and b have different lengths") + } + + aC := (*C.BW6761_scalar_t)(unsafe.Pointer(&a[0])) + bC := (*C.BW6761_scalar_t)(unsafe.Pointer(&b[0])) + deviceIdC := C.size_t(deviceID) + nElementsC := C.size_t(len(a)) + + C.vec_mod_mult_scalar_bw6_761(aC, bC, nElementsC, deviceIdC) +} + +// Multiply a matrix by a vector: +// +// `a` - flattened matrix; +// `b` - vector to multiply `a` by; +func MultiplyMatrix(a []G1ScalarField, b []G1ScalarField, deviceID int) { + c := make([]G1ScalarField, len(b)) + for i := range c { + var p G1ScalarField + p.SetZero() + + c[i] = p + } + + aC := (*C.BW6761_scalar_t)(unsafe.Pointer(&a[0])) + bC := (*C.BW6761_scalar_t)(unsafe.Pointer(&b[0])) + cC := (*C.BW6761_scalar_t)(unsafe.Pointer(&c[0])) + deviceIdC := C.size_t(deviceID) + nElementsC := C.size_t(len(a)) + + C.matrix_vec_mod_mult_bw6_761(aC, bC, cC, nElementsC, deviceIdC) +} + +/* + * Utils + */ + +func GetFixedLimbs(slice *[]uint32) [BASE_SIZE]uint32 { + if len(*slice) <= BASE_SIZE { + limbs := [BASE_SIZE]uint32{} + copy(limbs[:len(*slice)], *slice) + return limbs + } + + panic("slice has too many elements") +} diff --git a/goicicle/curves/bw6761/g1_test.go b/goicicle/curves/bw6761/g1_test.go new file mode 100644 index 000000000..b530c7a69 --- /dev/null +++ b/goicicle/curves/bw6761/g1_test.go @@ -0,0 +1,212 @@ +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +package bw6761 + +import ( + "encoding/binary" + "fmt" + "testing" + + "github.com/stretchr/testify/assert" +) + +func generateUint32Array(length int, isZero bool) []uint32 { + arr := make([]uint32, length) + for i := 0; i < length; i++ { + if isZero { + arr[i] = 0x0 + } else { + arr[i] = uint32(i + 1) // You can modify this line to fill the array as needed + } + } + return arr +} + +func TestNewFieldBW6761One(t *testing.T) { + var oneField G1BaseField + oneField.SetOne() + + rawOneField := [24]uint32([24]uint32{0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}) + + assert.Equal(t, oneField.S, rawOneField) +} + +func TestNewFieldBW6761Zero(t *testing.T) { + var zeroField G1BaseField + zeroField.SetZero() + + rawZeroField := [24]uint32([24]uint32{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}) + + assert.Equal(t, zeroField.S, rawZeroField) +} + +func TestFieldBW6761ToBytesLe(t *testing.T) { + var p G1ProjectivePoint + p.Random() + + expected := make([]byte, len(p.X.S)*4) // each uint32 takes 4 bytes + for i, v := range p.X.S { + binary.LittleEndian.PutUint32(expected[i*4:], v) + } + + assert.Equal(t, p.X.ToBytesLe(), expected) + assert.Equal(t, len(p.X.ToBytesLe()), 96) +} + +func TestNewPointBW6761Zero(t *testing.T) { + var pointZero G1ProjectivePoint + pointZero.SetZero() + + var baseOne G1BaseField + baseOne.SetOne() + + var zeroSanity G1BaseField + zeroSanity.SetZero() + + assert.Equal(t, pointZero.X, zeroSanity) + assert.Equal(t, pointZero.Y, baseOne) + assert.Equal(t, pointZero.Z, zeroSanity) +} + +func TestFromProjectiveToAffine(t *testing.T) { + fmt.Print() // this prevents the test from hanging. 
TODO: figure out why + var projective G1ProjectivePoint + var affine G1PointAffine + + projective.Random() + + affine.FromProjective(&projective) + var projective2 G1ProjectivePoint + projective2.FromAffine(&affine) + + assert.True(t, projective.IsOnCurve()) + assert.True(t, projective2.IsOnCurve()) + assert.True(t, projective.Eq(&projective2)) +} + +func TestBW6761Eq(t *testing.T) { + var p1 G1ProjectivePoint + p1.Random() + var p2 G1ProjectivePoint + p2.Random() + + assert.Equal(t, p1.Eq(&p1), true) + assert.Equal(t, p1.Eq(&p2), false) +} + +func TestBW6761StripZ(t *testing.T) { + var p1 G1ProjectivePoint + p1.Random() + + p2ZLess := p1.StripZ() + + assert.IsType(t, G1PointAffine{}, *p2ZLess) + assert.Equal(t, p1.X, p2ZLess.X) + assert.Equal(t, p1.Y, p2ZLess.Y) +} + +func TestPointBW6761fromLimbs(t *testing.T) { + var p G1ProjectivePoint + p.Random() + + x := p.X.Limbs() + y := p.Y.Limbs() + z := p.Z.Limbs() + + xSlice := x[:] + ySlice := y[:] + zSlice := z[:] + + var pFromLimbs G1ProjectivePoint + pFromLimbs.FromLimbs(&xSlice, &ySlice, &zSlice) + + assert.Equal(t, pFromLimbs, p) +} + +func TestNewPointAffineNoInfinityBW6761Zero(t *testing.T) { + var zeroP G1PointAffine + + var zeroSanity G1BaseField + zeroSanity.SetZero() + + assert.Equal(t, zeroP.X, zeroSanity) + assert.Equal(t, zeroP.Y, zeroSanity) +} + +func TestPointAffineNoInfinityBW6761FromLimbs(t *testing.T) { + // Initialize your test values + x := [24]uint32{1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8} + y := [24]uint32{1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8} + xSlice := x[:] + ySlice := y[:] + + // Execute your function + var result G1PointAffine + result.FromLimbs(&xSlice, &ySlice) + + var xBase G1BaseField + var yBase G1BaseField + xBase.FromLimbs(x) + yBase.FromLimbs(y) + + // Define your expected result + expected := G1PointAffine{ + X: xBase, + Y: yBase, + } + + // Test if result is as expected + assert.Equal(t, expected, result) +} + 
+func TestGetFixedLimbs(t *testing.T) { + t.Run("case of valid input of length less than 8", func(t *testing.T) { + slice := []uint32{1, 2, 3, 4, 5, 6, 7} + expected := [24]uint32{1, 2, 3, 4, 5, 6, 7, 0} + + result := GetFixedLimbs(&slice) + assert.Equal(t, result, expected) + }) + + t.Run("case of valid input of length 24", func(t *testing.T) { + slice := generateUint32Array(24, false) + expected := [24]uint32(generateUint32Array(24, false)) + + result := GetFixedLimbs(&slice) + assert.Equal(t, result, expected) + }) + + t.Run("case of empty input", func(t *testing.T) { + slice := []uint32{} + expected := [24]uint32(generateUint32Array(24, true)) + + result := GetFixedLimbs(&slice) + assert.Equal(t, result, expected) + }) + + t.Run("case of input length greater than 24", func(t *testing.T) { + slice := generateUint32Array(25, false) + + defer func() { + if r := recover(); r == nil { + t.Errorf("the code did not panic") + } + }() + + GetFixedLimbs(&slice) + }) +} diff --git a/goicicle/curves/bw6761/g2.go b/goicicle/curves/bw6761/g2.go new file mode 100644 index 000000000..6cbc7d2a7 --- /dev/null +++ b/goicicle/curves/bw6761/g2.go @@ -0,0 +1,98 @@ +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Code generated by Ingonyama DO NOT EDIT + +package bw6761 + +import ( + "encoding/binary" + "unsafe" +) + +// #cgo CFLAGS: -I./include/ +// #cgo CFLAGS: -I/usr/local/cuda/include +// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbw6761 +// #include "projective.h" +// #include "ve_mod_mult.h" +import "C" + +// G2 extension field + +type G2Element [12]uint64 + +type G2PointAffine struct { + X, Y G2Element +} + +type G2Point struct { + X, Y, Z G2Element +} + +func (p *G2Point) Random() *G2Point { + outC := (*C.BW6761_g2_projective_t)(unsafe.Pointer(p)) + C.random_g2_projective_bw6_761(outC) + + return p +} + +func (p *G2Point) FromAffine(affine *G2PointAffine) *G2Point { + out := (*C.BW6761_g2_projective_t)(unsafe.Pointer(p)) + in := (*C.BW6761_g2_affine_t)(unsafe.Pointer(affine)) + + C.g2_projective_from_affine_bw6_761(out, in) + + return p +} + +func (p *G2Point) Eq(pCompare *G2Point) bool { + // Cast *PointBW6761 to *C.BW6761_projective_t + // The unsafe.Pointer cast is necessary because Go doesn't allow direct casts + // between different pointer types. + // It's your responsibility to ensure that the types are compatible. + pC := (*C.BW6761_g2_projective_t)(unsafe.Pointer(p)) + pCompareC := (*C.BW6761_g2_projective_t)(unsafe.Pointer(pCompare)) + + // Call the C function + // The C function doesn't keep any references to the data, + // so it's fine if the Go garbage collector moves or deletes the data later. + return bool(C.eq_g2_bw6_761(pC, pCompareC)) +} + +func (f *G2Element) ToBytesLe() []byte { + var bytes []byte + for _, val := range f { + buf := make([]byte, 8) // 8 bytes because uint64 is 64-bit + binary.LittleEndian.PutUint64(buf, val) + bytes = append(bytes, buf...) 
+ } + return bytes +} + +func (p *G2PointAffine) FromProjective(projective *G2Point) *G2PointAffine { + out := (*C.BW6761_g2_affine_t)(unsafe.Pointer(p)) + in := (*C.BW6761_g2_projective_t)(unsafe.Pointer(projective)) + + C.g2_projective_to_affine_bw6_761(out, in) + + return p +} + +func (p *G2Point) IsOnCurve() bool { + // Reinterpret the Go struct pointer as the C struct pointer (no copy is made) + point := (*C.BW6761_g2_projective_t)(unsafe.Pointer(p)) + res := C.g2_projective_is_on_curve_bw6_761(point) + + return bool(res) +} diff --git a/goicicle/curves/bw6761/g2_test.go b/goicicle/curves/bw6761/g2_test.go new file mode 100644 index 000000000..bc7ebf845 --- /dev/null +++ b/goicicle/curves/bw6761/g2_test.go @@ -0,0 +1,83 @@ +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +package bw6761 + +import ( + "fmt" + "testing" + "unsafe" + + "github.com/stretchr/testify/assert" +) + +func TestG2Eqg2(t *testing.T) { + var point G2Point + + point.Random() + + assert.True(t, point.Eq(&point)) +} + +func TestG2FromProjectiveToAffine(t *testing.T) { + fmt.Print() // this prevents the test from hanging. 
TODO: figure out why + var projective G2Point + projective.Random() + + var affine G2PointAffine + affine.FromProjective(&projective) + + var projective2 G2Point + projective2.FromAffine(&affine) + + assert.True(t, projective.IsOnCurve()) + assert.True(t, projective2.IsOnCurve()) + assert.True(t, projective.Eq(&projective2)) +} + +func TestG2Eqg2NotEqual(t *testing.T) { + var point G2Point + point.Random() + + var point2 G2Point + point2.Random() + + assert.False(t, point.Eq(&point2)) +} + +func TestG2ToBytes(t *testing.T) { + var point G2Point + var element G2Element + point.Random() + bytes := point.X.ToBytesLe() + + assert.Equal(t, len(bytes), int(unsafe.Sizeof(element))) +} + +func TestG2ShouldConvertToProjective(t *testing.T) { + fmt.Print() // this prevents the test from hanging. TODO: figure out why + var pointProjective G2Point + pointProjective.Random() + + var pointAffine G2PointAffine + pointAffine.FromProjective(&pointProjective) + + var proj G2Point + proj.FromAffine(&pointAffine) + + assert.True(t, proj.IsOnCurve()) + assert.True(t, pointProjective.Eq(&proj)) +} diff --git a/goicicle/curves/bw6761/include/msm.h b/goicicle/curves/bw6761/include/msm.h new file mode 100644 index 000000000..03901c147 --- /dev/null +++ b/goicicle/curves/bw6761/include/msm.h @@ -0,0 +1,101 @@ + + // Copyright 2023 Ingonyama + // + // Licensed under the Apache License, Version 2.0 (the "License"); + // you may not use this file except in compliance with the License. + // You may obtain a copy of the License at + // + // http://www.apache.org/licenses/LICENSE-2.0 + // + // Unless required by applicable law or agreed to in writing, software + // distributed under the License is distributed on an "AS IS" BASIS, + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + // See the License for the specific language governing permissions and + // limitations under the License. 
+ +// Code generated by Ingonyama DO NOT EDIT + +#include +#include +#include +// msm.h + +#ifndef _BW6761_MSM_H +#define _BW6761_MSM_H + +#ifdef __cplusplus +extern "C" { +#endif + +// Incomplete declaration of BW6761 projective and affine structs +typedef struct BW6761_projective_t BW6761_projective_t; +typedef struct BW6761_g2_projective_t BW6761_g2_projective_t; +typedef struct BW6761_affine_t BW6761_affine_t; +typedef struct BW6761_g2_affine_t BW6761_g2_affine_t; +typedef struct BW6761_scalar_t BW6761_scalar_t; +typedef cudaStream_t CudaStream_t; + +int msm_cuda_bw6_761( + BW6761_projective_t* out, BW6761_affine_t* points, BW6761_scalar_t* scalars, size_t count, size_t device_id); + +int msm_batch_cuda_bw6_761( + BW6761_projective_t* out, + BW6761_affine_t* points, + BW6761_scalar_t* scalars, + size_t batch_size, + size_t msm_size, + size_t device_id); + +int commit_cuda_bw6_761( + BW6761_projective_t* d_out, + BW6761_scalar_t* d_scalars, + BW6761_affine_t* d_points, + size_t count, + unsigned large_bucket_factor, + size_t device_id); + +int commit_batch_cuda_bw6_761( + BW6761_projective_t* d_out, + BW6761_scalar_t* d_scalars, + BW6761_affine_t* d_points, + size_t count, + size_t batch_size, + size_t device_id); + +int msm_g2_cuda_bw6_761( + BW6761_g2_projective_t* out, + BW6761_g2_affine_t* points, + BW6761_scalar_t* scalars, + size_t count, + size_t device_id); + +int msm_batch_g2_cuda_bw6_761( + BW6761_g2_projective_t* out, + BW6761_g2_affine_t* points, + BW6761_scalar_t* scalars, + size_t batch_size, + size_t msm_size, + size_t device_id); + +int commit_g2_cuda_bw6_761( + BW6761_g2_projective_t* d_out, + BW6761_scalar_t* d_scalars, + BW6761_g2_affine_t* d_points, + size_t count, + unsigned large_bucket_factor, + size_t device_id); + +int commit_batch_g2_cuda_bw6_761( + BW6761_g2_projective_t* d_out, + BW6761_scalar_t* d_scalars, + BW6761_g2_affine_t* d_points, + size_t count, + size_t batch_size, + size_t device_id, + cudaStream_t stream); + +#ifdef 
__cplusplus +} +#endif + +#endif /* _BW6761_MSM_H */ diff --git a/goicicle/curves/bw6761/include/ntt.h b/goicicle/curves/bw6761/include/ntt.h new file mode 100644 index 000000000..61f371427 --- /dev/null +++ b/goicicle/curves/bw6761/include/ntt.h @@ -0,0 +1,198 @@ + + // Copyright 2023 Ingonyama + // + // Licensed under the Apache License, Version 2.0 (the "License"); + // you may not use this file except in compliance with the License. + // You may obtain a copy of the License at + // + // http://www.apache.org/licenses/LICENSE-2.0 + // + // Unless required by applicable law or agreed to in writing, software + // distributed under the License is distributed on an "AS IS" BASIS, + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + // See the License for the specific language governing permissions and + // limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +#include +#include +// ntt.h + +#ifndef _BW6761_NTT_H +#define _BW6761_NTT_H + +#ifdef __cplusplus +extern "C" { +#endif + +// Incomplete declaration of BW6761 projective and affine structs +typedef struct BW6761_projective_t BW6761_projective_t; +typedef struct BW6761_affine_t BW6761_affine_t; +typedef struct BW6761_scalar_t BW6761_scalar_t; + +typedef struct BW6761_g2_projective_t BW6761_g2_projective_t; +typedef struct BW6761_g2_affine_t BW6761_g2_affine_t; + +int ntt_cuda_bw6_761(BW6761_scalar_t* arr, uint32_t n, bool inverse, size_t device_id); +int ntt_batch_cuda_bw6_761( + BW6761_scalar_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id); + +int ecntt_cuda_bw6_761(BW6761_projective_t* arr, uint32_t n, bool inverse, size_t device_id); +int ecntt_batch_cuda_bw6_761( + BW6761_projective_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id); + +BW6761_scalar_t* +build_domain_cuda_bw6_761(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id, size_t stream); + +int 
interpolate_scalars_cuda_bw6_761( + BW6761_scalar_t* d_out, + BW6761_scalar_t* d_evaluations, + BW6761_scalar_t* d_domain, + unsigned n, + unsigned device_id, + size_t stream); +int interpolate_scalars_batch_cuda_bw6_761( + BW6761_scalar_t* d_out, + BW6761_scalar_t* d_evaluations, + BW6761_scalar_t* d_domain, + unsigned n, + unsigned batch_size, + size_t device_id, + size_t stream); +int interpolate_points_cuda_bw6_761( + BW6761_projective_t* d_out, + BW6761_projective_t* d_evaluations, + BW6761_scalar_t* d_domain, + unsigned n, + size_t device_id, + size_t stream); +int interpolate_points_batch_cuda_bw6_761( + BW6761_projective_t* d_out, + BW6761_projective_t* d_evaluations, + BW6761_scalar_t* d_domain, + unsigned n, + unsigned batch_size, + size_t device_id, + size_t stream); +int interpolate_scalars_on_coset_cuda_bw6_761( + BW6761_scalar_t* d_out, + BW6761_scalar_t* d_evaluations, + BW6761_scalar_t* d_domain, + unsigned n, + BW6761_scalar_t* coset_powers, + size_t device_id, + size_t stream); +int interpolate_scalars_batch_on_coset_cuda_bw6_761( + BW6761_scalar_t* d_out, + BW6761_scalar_t* d_evaluations, + BW6761_scalar_t* d_domain, + unsigned n, + unsigned batch_size, + BW6761_scalar_t* coset_powers, + size_t device_id, + size_t stream); + +int evaluate_scalars_cuda_bw6_761( + BW6761_scalar_t* d_out, + BW6761_scalar_t* d_coefficients, + BW6761_scalar_t* d_domain, + unsigned domain_size, + unsigned n, + unsigned device_id, + size_t stream); +int evaluate_scalars_batch_cuda_bw6_761( + BW6761_scalar_t* d_out, + BW6761_scalar_t* d_coefficients, + BW6761_scalar_t* d_domain, + unsigned domain_size, + unsigned n, + unsigned batch_size, + size_t device_id, + size_t stream); +int evaluate_points_cuda_bw6_761( + BW6761_projective_t* d_out, + BW6761_projective_t* d_coefficients, + BW6761_scalar_t* d_domain, + unsigned domain_size, + unsigned n, + size_t device_id, + size_t stream); +int evaluate_points_batch_cuda_bw6_761( + BW6761_projective_t* d_out, + 
BW6761_projective_t* d_coefficients, + BW6761_scalar_t* d_domain, + unsigned domain_size, + unsigned n, + unsigned batch_size, + size_t device_id, + size_t stream); +int evaluate_scalars_on_coset_cuda_bw6_761( + BW6761_scalar_t* d_out, + BW6761_scalar_t* d_coefficients, + BW6761_scalar_t* d_domain, + unsigned domain_size, + unsigned n, + BW6761_scalar_t* coset_powers, + unsigned device_id, + size_t stream); +int evaluate_scalars_on_coset_batch_cuda_bw6_761( + BW6761_scalar_t* d_out, + BW6761_scalar_t* d_coefficients, + BW6761_scalar_t* d_domain, + unsigned domain_size, + unsigned n, + unsigned batch_size, + BW6761_scalar_t* coset_powers, + size_t device_id, + size_t stream); +int evaluate_points_on_coset_cuda_bw6_761( + BW6761_projective_t* d_out, + BW6761_projective_t* d_coefficients, + BW6761_scalar_t* d_domain, + unsigned domain_size, + unsigned n, + BW6761_scalar_t* coset_powers, + size_t device_id, + size_t stream); +int evaluate_points_on_coset_batch_cuda_bw6_761( + BW6761_projective_t* d_out, + BW6761_projective_t* d_coefficients, + BW6761_scalar_t* d_domain, + unsigned domain_size, + unsigned n, + unsigned batch_size, + BW6761_scalar_t* coset_powers, + size_t device_id, + size_t stream); + +int reverse_order_scalars_cuda_bw6_761(BW6761_scalar_t* arr, int n, size_t device_id, size_t stream); +int reverse_order_scalars_batch_cuda_bw6_761( + BW6761_scalar_t* arr, int n, int batch_size, size_t device_id, size_t stream); +int reverse_order_points_cuda_bw6_761(BW6761_projective_t* arr, int n, size_t device_id, size_t stream); +int reverse_order_points_batch_cuda_bw6_761( + BW6761_projective_t* arr, int n, int batch_size, size_t device_id, size_t stream); +int add_scalars_cuda_bw6_761( + BW6761_scalar_t* d_out, BW6761_scalar_t* d_in1, BW6761_scalar_t* d_in2, unsigned n, size_t stream); +int sub_scalars_cuda_bw6_761( + BW6761_scalar_t* d_out, BW6761_scalar_t* d_in1, BW6761_scalar_t* d_in2, unsigned n, size_t stream); +int 
to_montgomery_scalars_cuda_bw6_761(BW6761_scalar_t* d_inout, unsigned n, size_t stream); +int from_montgomery_scalars_cuda_bw6_761(BW6761_scalar_t* d_inout, unsigned n, size_t stream); + +// points g1 +int to_montgomery_proj_points_cuda_bw6_761(BW6761_projective_t* d_inout, unsigned n, size_t stream); +int from_montgomery_proj_points_cuda_bw6_761(BW6761_projective_t* d_inout, unsigned n, size_t stream); +int to_montgomery_aff_points_cuda_bw6_761(BW6761_affine_t* d_inout, unsigned n, size_t stream); +int from_montgomery_aff_points_cuda_bw6_761(BW6761_affine_t* d_inout, unsigned n, size_t stream); + +// points g2 +int to_montgomery_proj_points_g2_cuda_bw6_761(BW6761_g2_projective_t* d_inout, unsigned n, size_t stream); +int from_montgomery_proj_points_g2_cuda_bw6_761(BW6761_g2_projective_t* d_inout, unsigned n, size_t stream); +int to_montgomery_aff_points_g2_cuda_bw6_761(BW6761_g2_affine_t* d_inout, unsigned n, size_t stream); +int from_montgomery_aff_points_g2_cuda_bw6_761(BW6761_g2_affine_t* d_inout, unsigned n, size_t stream); + +#ifdef __cplusplus +} +#endif + +#endif /* _BW6761_NTT_H */ diff --git a/goicicle/curves/bw6761/include/projective.h b/goicicle/curves/bw6761/include/projective.h new file mode 100644 index 000000000..74f347d24 --- /dev/null +++ b/goicicle/curves/bw6761/include/projective.h @@ -0,0 +1,50 @@ + + // Copyright 2023 Ingonyama + // + // Licensed under the Apache License, Version 2.0 (the "License"); + // you may not use this file except in compliance with the License. + // You may obtain a copy of the License at + // + // http://www.apache.org/licenses/LICENSE-2.0 + // + // Unless required by applicable law or agreed to in writing, software + // distributed under the License is distributed on an "AS IS" BASIS, + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + // See the License for the specific language governing permissions and + // limitations under the License. 
+ +// Code generated by Ingonyama DO NOT EDIT + +#include +#include +// projective.h + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct BW6761_projective_t BW6761_projective_t; +typedef struct BW6761_g2_projective_t BW6761_g2_projective_t; +typedef struct BW6761_affine_t BW6761_affine_t; +typedef struct BW6761_g2_affine_t BW6761_g2_affine_t; +typedef struct BW6761_scalar_t BW6761_scalar_t; + +bool projective_is_on_curve_bw6_761(BW6761_projective_t* point1); + +int random_scalar_bw6_761(BW6761_scalar_t* out); +int random_projective_bw6_761(BW6761_projective_t* out); +BW6761_projective_t* projective_zero_bw6_761(); +int projective_to_affine_bw6_761(BW6761_affine_t* out, BW6761_projective_t* point1); +int projective_from_affine_bw6_761(BW6761_projective_t* out, BW6761_affine_t* point1); + +int random_g2_projective_bw6_761(BW6761_g2_projective_t* out); +int g2_projective_to_affine_bw6_761(BW6761_g2_affine_t* out, BW6761_g2_projective_t* point1); +int g2_projective_from_affine_bw6_761(BW6761_g2_projective_t* out, BW6761_g2_affine_t* point1); +bool g2_projective_is_on_curve_bw6_761(BW6761_g2_projective_t* point1); + +bool eq_bw6_761(BW6761_projective_t* point1, BW6761_projective_t* point2); +bool eq_g2_bw6_761(BW6761_g2_projective_t* point1, BW6761_g2_projective_t* point2); + +#ifdef __cplusplus +} +#endif diff --git a/goicicle/curves/bw6761/include/ve_mod_mult.h b/goicicle/curves/bw6761/include/ve_mod_mult.h new file mode 100644 index 000000000..fbc2b5a8d --- /dev/null +++ b/goicicle/curves/bw6761/include/ve_mod_mult.h @@ -0,0 +1,49 @@ + + // Copyright 2023 Ingonyama + // + // Licensed under the Apache License, Version 2.0 (the "License"); + // you may not use this file except in compliance with the License. 
+ // You may obtain a copy of the License at + // + // http://www.apache.org/licenses/LICENSE-2.0 + // + // Unless required by applicable law or agreed to in writing, software + // distributed under the License is distributed on an "AS IS" BASIS, + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + // See the License for the specific language governing permissions and + // limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +#include +#include +// ve_mod_mult.h + +#ifndef _BW6761_VEC_MULT_H +#define _BW6761_VEC_MULT_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct BW6761_projective_t BW6761_projective_t; +typedef struct BW6761_scalar_t BW6761_scalar_t; + +int32_t vec_mod_mult_point_bw6_761( + BW6761_projective_t* inout, BW6761_scalar_t* scalar_vec, size_t n_elments, size_t device_id); +int32_t vec_mod_mult_scalar_bw6_761( + BW6761_scalar_t* inout, BW6761_scalar_t* scalar_vec, size_t n_elments, size_t device_id); +int32_t vec_mod_mult_device_scalar_bw6_761( + BW6761_scalar_t* inout, BW6761_scalar_t* scalar_vec, size_t n_elements, size_t device_id); +int32_t matrix_vec_mod_mult_bw6_761( + BW6761_scalar_t* matrix_flattened, + BW6761_scalar_t* input, + BW6761_scalar_t* output, + size_t n_elments, + size_t device_id); + +#ifdef __cplusplus +} +#endif + +#endif /* _BW6761_VEC_MULT_H */ diff --git a/goicicle/curves/bw6761/msm.go b/goicicle/curves/bw6761/msm.go new file mode 100644 index 000000000..c0a39ffcd --- /dev/null +++ b/goicicle/curves/bw6761/msm.go @@ -0,0 +1,209 @@ +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +package bw6761 + +import ( + "errors" + "fmt" + "unsafe" +) + +// #cgo CFLAGS: -I./include/ +// #cgo CFLAGS: -I/usr/local/cuda/include +// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbw6761 +// #include "msm.h" +import "C" + +func Msm(out *G1ProjectivePoint, points []G1PointAffine, scalars []G1ScalarField, device_id int) (*G1ProjectivePoint, error) { + if len(points) != len(scalars) { + return nil, errors.New("error on: len(points) != len(scalars)") + } + + pointsC := (*C.BW6761_affine_t)(unsafe.Pointer(&points[0])) + scalarsC := (*C.BW6761_scalar_t)(unsafe.Pointer(&scalars[0])) + outC := (*C.BW6761_projective_t)(unsafe.Pointer(out)) + ret := C.msm_cuda_bw6_761(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id)) + + if ret != 0 { + return nil, fmt.Errorf("msm_cuda_bw6_761 returned error code: %d", ret) + } + + return out, nil +} + +func MsmG2(out *G2Point, points []G2PointAffine, scalars []G1ScalarField, device_id int) (*G2Point, error) { + if len(points) != len(scalars) { + return nil, errors.New("error on: len(points) != len(scalars)") + } + + pointsC := (*C.BW6761_g2_affine_t)(unsafe.Pointer(&points[0])) + scalarsC := (*C.BW6761_scalar_t)(unsafe.Pointer(&scalars[0])) + outC := (*C.BW6761_g2_projective_t)(unsafe.Pointer(out)) + + ret := C.msm_g2_cuda_bw6_761(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id)) + + if ret != 0 { + return nil, fmt.Errorf("msm_g2_cuda_bw6_761 returned error code: %d", ret) + } + + return out, nil +} + +func MsmBatch(points *[]G1PointAffine, 
scalars *[]G1ScalarField, batchSize, deviceId int) ([]G1ProjectivePoint, error) { + // Check for nil pointers + if points == nil || scalars == nil { + return nil, errors.New("points or scalars is nil") + } + + if len(*points) != len(*scalars) { + return nil, errors.New("error on: len(points) != len(scalars)") + } + + // Check for empty slices + if len(*points) == 0 || len(*scalars) == 0 { + return nil, errors.New("points or scalars is empty") + } + + // Check for zero batchSize + if batchSize <= 0 { + return nil, errors.New("error on: batchSize must be greater than zero") + } + + out := make([]G1ProjectivePoint, batchSize) + + for i := 0; i < len(out); i++ { + var p G1ProjectivePoint + p.SetZero() + + out[i] = p + } + + outC := (*C.BW6761_projective_t)(unsafe.Pointer(&out[0])) + pointsC := (*C.BW6761_affine_t)(unsafe.Pointer(&(*points)[0])) + scalarsC := (*C.BW6761_scalar_t)(unsafe.Pointer(&(*scalars)[0])) + msmSizeC := C.size_t(len(*points) / batchSize) + deviceIdC := C.size_t(deviceId) + batchSizeC := C.size_t(batchSize) + + ret := C.msm_batch_cuda_bw6_761(outC, pointsC, scalarsC, batchSizeC, msmSizeC, deviceIdC) + if ret != 0 { + return nil, fmt.Errorf("msm_batch_cuda_bw6_761 returned error code: %d", ret) + } + + return out, nil +} + +func MsmG2Batch(points *[]G2PointAffine, scalars *[]G1ScalarField, batchSize, deviceId int) ([]G2Point, error) { + // Check for nil pointers + if points == nil || scalars == nil { + return nil, errors.New("points or scalars is nil") + } + + if len(*points) != len(*scalars) { + return nil, errors.New("error on: len(points) != len(scalars)") + } + + // Check for empty slices + if len(*points) == 0 || len(*scalars) == 0 { + return nil, errors.New("points or scalars is empty") + } + + // Check for zero batchSize + if batchSize <= 0 { + return nil, errors.New("error on: batchSize must be greater than zero") + } + + out := make([]G2Point, batchSize) + + outC := (*C.BW6761_g2_projective_t)(unsafe.Pointer(&out[0])) + pointsC := 
(*C.BW6761_g2_affine_t)(unsafe.Pointer(&(*points)[0])) + scalarsC := (*C.BW6761_scalar_t)(unsafe.Pointer(&(*scalars)[0])) + msmSizeC := C.size_t(len(*points) / batchSize) + deviceIdC := C.size_t(deviceId) + batchSizeC := C.size_t(batchSize) + + ret := C.msm_batch_g2_cuda_bw6_761(outC, pointsC, scalarsC, batchSizeC, msmSizeC, deviceIdC) + if ret != 0 { + return nil, fmt.Errorf("msm_batch_g2_cuda_bw6_761 returned error code: %d", ret) + } + + return out, nil +} + +func Commit(d_out, d_scalars, d_points unsafe.Pointer, count, bucketFactor int) int { + d_outC := (*C.BW6761_projective_t)(d_out) + scalarsC := (*C.BW6761_scalar_t)(d_scalars) + pointsC := (*C.BW6761_affine_t)(d_points) + countC := (C.size_t)(count) + largeBucketFactorC := C.uint(bucketFactor) + + ret := C.commit_cuda_bw6_761(d_outC, scalarsC, pointsC, countC, largeBucketFactorC, 0) + + if ret != 0 { + return -1 + } + + return 0 +} + +func CommitG2(d_out, d_scalars, d_points unsafe.Pointer, count, bucketFactor int) int { + d_outC := (*C.BW6761_g2_projective_t)(d_out) + scalarsC := (*C.BW6761_scalar_t)(d_scalars) + pointsC := (*C.BW6761_g2_affine_t)(d_points) + countC := (C.size_t)(count) + largeBucketFactorC := C.uint(bucketFactor) + + ret := C.commit_g2_cuda_bw6_761(d_outC, scalarsC, pointsC, countC, largeBucketFactorC, 0) + + if ret != 0 { + return -1 + } + + return 0 +} + +func CommitBatch(d_out, d_scalars, d_points unsafe.Pointer, count, batch_size int) int { + d_outC := (*C.BW6761_projective_t)(d_out) + scalarsC := (*C.BW6761_scalar_t)(d_scalars) + pointsC := (*C.BW6761_affine_t)(d_points) + countC := (C.size_t)(count) + batch_sizeC := (C.size_t)(batch_size) + + ret := C.commit_batch_cuda_bw6_761(d_outC, scalarsC, pointsC, countC, batch_sizeC, 0) + + if ret != 0 { + return -1 + } + + return 0 +} + +func CommitG2Batch(d_out, d_scalars, d_points unsafe.Pointer, count, batch_size int) int { + d_outC := (*C.BW6761_g2_projective_t)(d_out) + scalarsC := (*C.BW6761_scalar_t)(d_scalars) + pointsC := 
(*C.BW6761_g2_affine_t)(d_points) + countC := (C.size_t)(count) + batch_sizeC := (C.size_t)(batch_size) + + ret := C.msm_batch_g2_cuda_bw6_761(d_outC, pointsC, scalarsC, countC, batch_sizeC, 0) + + if ret != 0 { + return -1 + } + + return 0 +} diff --git a/goicicle/curves/bw6761/msm_test.go b/goicicle/curves/bw6761/msm_test.go new file mode 100644 index 000000000..53b70a4fc --- /dev/null +++ b/goicicle/curves/bw6761/msm_test.go @@ -0,0 +1,367 @@ +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +package bw6761 + +import ( + "fmt" + "math" + "testing" + "time" + "unsafe" + + "github.com/ingonyama-zk/icicle/goicicle" + "github.com/stretchr/testify/assert" +) + +func GeneratePoints(count int) []G1PointAffine { + // Declare a slice of affine points + var points []G1PointAffine + + // populate the slice + for i := 0; i < 10; i++ { + var pointProjective G1ProjectivePoint + pointProjective.Random() + + var pointAffine G1PointAffine + pointAffine.FromProjective(&pointProjective) + + points = append(points, pointAffine) + } + + log2_10 := math.Log2(10) + log2Count := math.Log2(float64(count)) + log2Size := int(math.Ceil(log2Count - log2_10)) + + for i := 0; i < log2Size; i++ { + points = append(points, points...) 
+ } + + return points[:count] +} + +func GeneratePointsProj(count int) []G1ProjectivePoint { + // Declare a slice of projective points + var points []G1ProjectivePoint + // Use a loop to populate the slice + for i := 0; i < count; i++ { + var p G1ProjectivePoint + p.Random() + + points = append(points, p) + } + + return points +} + +func GenerateScalars(count int, skewed bool) []G1ScalarField { + // Declare a slice of scalars + var scalars []G1ScalarField + + var rand G1ScalarField + var zero G1ScalarField + var one G1ScalarField + var randLarge G1ScalarField + + zero.SetZero() + one.SetOne() + randLarge.Random() + + if skewed && count > 1_200_000 { + for i := 0; i < count-1_200_000; i++ { + rand.Random() + scalars = append(scalars, rand) + } + + for i := 0; i < 600_000; i++ { + scalars = append(scalars, randLarge) + } + for i := 0; i < 400_000; i++ { + scalars = append(scalars, zero) + } + for i := 0; i < 200_000; i++ { + scalars = append(scalars, one) + } + } else { + for i := 0; i < count; i++ { + rand.Random() + scalars = append(scalars, rand) + } + } + + return scalars[:count] +} + +func TestMSM(t *testing.T) { + fmt.Print() // this prevents the test from hanging. 
TODO: figure out why + for _, v := range []int{8} { + count := 1 << v + + points := GeneratePoints(count) + fmt.Print("Finished generating points\n") + scalars := GenerateScalars(count, false) + fmt.Print("Finished generating scalars\n") + + out := new(G1ProjectivePoint) + startTime := time.Now() + _, e := Msm(out, points, scalars, 0) // non mont + fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds()) + + assert.Equal(t, e, nil, "error should be nil") + + assert.True(t, out.IsOnCurve()) + } +} + +func TestCommitMSM(t *testing.T) { + for _, v := range []int{8} { + count := 1< int get_optimal_c(int bitsize) { - return ceil(log2(bitsize)) - 4; + return max((int)ceil(log2(bitsize)) - 4, 1); } template @@ -904,13 +904,9 @@ namespace msm { } // namespace - MSMConfig DefaultMSMConfig() + extern "C" MSMConfig DefaultMSMConfig() { - device_context::DeviceContext ctx = { - 0, // device_id - (cudaStream_t)0, // stream - 0, // mempool - }; + device_context::DeviceContext ctx = device_context::get_default_device_context(); MSMConfig config = { false, // are_scalars_on_device false, // are_scalars_montgomery_form @@ -925,7 +921,7 @@ namespace msm { false, // is_big_triangle 10, // large_bucket_factor false, // is_async - ctx, // DeviceContext + ctx, // ctx }; return config; } @@ -950,13 +946,7 @@ namespace msm { } /** - * Extern version of [DefaultMSMConfig](@ref DefaultMSMConfig) function. - * @return Default value of [MSMConfig](@ref MSMConfig). 
- */ - extern "C" MSMConfig GetDefaultMSMConfig() { return DefaultMSMConfig(); } - - /** - * Extern version of [MSM](@ref MSM) function with the following values of template parameters + * Extern "C" version of [MSM](@ref MSM) function with the following values of template parameters * (where the curve is given by `-DCURVE` env variable during build): * - `S` is the [scalar field](@ref scalar_t) of the curve; * - `A` is the [affine representation](@ref affine_t) of curve points; @@ -977,7 +967,7 @@ namespace msm { #if defined(G2_DEFINED) /** - * Extern version of [MSM](@ref MSM) function with the following values of template parameters + * Extern "C" version of [MSM](@ref MSM) function with the following values of template parameters * (where the curve is given by `-DCURVE` env variable during build): * - `S` is the [scalar field](@ref scalar_t) of the curve; * - `A` is the [affine representation](@ref g2_affine_t) of G2 curve points; diff --git a/icicle/appUtils/msm/msm.cuh b/icicle/appUtils/msm/msm.cuh index 356033b1a..e1fcc9cce 100644 --- a/icicle/appUtils/msm/msm.cuh +++ b/icicle/appUtils/msm/msm.cuh @@ -33,47 +33,50 @@ namespace msm { /** * @struct MSMConfig - * Struct that encodes MSM parameters to be passed into the [msm](@ref msm) function. + * Struct that encodes MSM parameters to be passed into the [MSM](@ref MSM) function. The intended use of this struct + * is to create it using [DefaultMSMConfig](@ref DefaultMSMConfig) function and then you'll hopefully only need to + * change a small number of default values for each of your MSMs. */ struct MSMConfig { - bool - are_scalars_on_device; /**< True if scalars are on device and false if they're on host. Default value: false. */ + bool are_scalars_on_device; /**< True if scalars are on device and false if they're on host. Default value: + * false. */ bool are_scalars_montgomery_form; /**< True if scalars are in Montgomery form and false otherwise. Default value: - true. 
*/ - int points_size; /**< Number of points in the MSM. If a batch of MSMs needs to be computed, this should be a number - * of different points. So, if each MSM re-uses the same set of points, this variable is set - * equal to the MSM size. And if every MSM uses a distinct set of points, it should be set to the - * product of MSM size and [batch_size](@ref batch_size). Default value: 0 (meaning it's equal to - * the MSM size). */ + * true. */ + int points_size; /**< Number of points in the MSM. If a batch of MSMs needs to be computed, this should be + * a number of different points. So, if each MSM re-uses the same set of points, this + * variable is set equal to the MSM size. And if every MSM uses a distinct set of + * points, it should be set to the product of MSM size and [batch_size](@ref + * batch_size). Default value: 0 (meaning it's equal to the MSM size). */ int precompute_factor; /**< The number of extra points to pre-compute for each point. Larger values decrease the - * number of computations to make, on-line memory footprint, but increase the static - * memory footprint. Default value: 1 (i.e. don't pre-compute). */ + * number of computations to make, on-line memory footprint, but increase the static + * memory footprint. Default value: 1 (i.e. don't pre-compute). */ bool are_points_on_device; /**< True if points are on device and false if they're on host. Default value: false. */ bool are_points_montgomery_form; /**< True if coordinates of points are in Montgomery form and false otherwise. - Default value: true. */ + * Default value: true. */ int batch_size; /**< The number of MSMs to compute. Default value: 1. */ bool are_results_on_device; /**< True if the results should be on device and false if they should be on host. If set - * to false, `is_async` won't take effect because a synchronization is needed to - * transfer results to the host. Default value: false. 
*/ - int c; /**< \f$ c \f$ value, or "window bitsize" which is the main parameter of the "bucket method" - * that we use to solve the MSM problem. As a rule of thumb, larger value means more on-line memory - * footprint but also more parallelism and less computational complexity (up to a certain point). - * Default value: 0 (the optimal value of \f$ c \f$ is chosen automatically). */ - int bitsize; /**< Number of bits of the largest scalar. Typically equals the bitsize of scalar field, but if a - * different (better) upper bound is known, it should be reflected in this variable. Default value: 0 - * (set to the bitsize of scalar field). */ - bool is_big_triangle; /**< Whether to do "bucket accumulation" serially. Decreases computational complexity, but - * also greatly decreases parallelism, so only suitable for large batches of MSMs. Default - * value: false. */ - int large_bucket_factor; /**< Variable that controls how sensitive the algorithm is to the buckets that occur very - * frequently. Useful for efficient treatment of non-uniform distributions of scalars and - * "top windows" with few bits. Can be set to 0 to disable separate treatment of large - * buckets altogether. Default value: 10. */ - int is_async; /**< Whether to run the MSM asyncronously. If set to `true`, the MSM function will be non-blocking - * and you'd need to synchronize it explicitly by running `cudaStreamSynchronize` or - * `cudaDeviceSynchronize`. If set to false, the MSM function will block the current CPU thread. */ - device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. See - [DeviceContext](@ref `device_context::DeviceContext`). */ + * to false, `is_async` won't take effect because a synchronization is needed to + * transfer results to the host. Default value: false. */ + int c; /**< \f$ c \f$ value, or "window bitsize" which is the main parameter of the "bucket + * method" that we use to solve the MSM problem. 
As a rule of thumb, larger value + * means more on-line memory footprint but also more parallelism and less computational + * complexity (up to a certain point). Default value: 0 (the optimal value of \f$ c \f$ + * is chosen automatically). */ + int bitsize; /**< Number of bits of the largest scalar. Typically equals the bitsize of scalar field, + * but if a different (better) upper bound is known, it should be reflected in this + * variable. Default value: 0 (set to the bitsize of scalar field). */ + bool is_big_triangle; /**< Whether to do "bucket accumulation" serially. Decreases computational complexity + * but also greatly decreases parallelism, so only suitable for large batches of MSMs. + * Default value: false. */ + int large_bucket_factor; /**< Variable that controls how sensitive the algorithm is to the buckets that occur + * very frequently. Useful for efficient treatment of non-uniform distributions of + * scalars and "top windows" with few bits. Can be set to 0 to disable separate + * treatment of large buckets altogether. Default value: 10. */ + bool is_async; /**< Whether to run the MSM asynchronously. If set to true, the MSM function will be + * non-blocking and you'd need to synchronize it explicitly by running + * `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the MSM + * function will block the current CPU thread. */ + device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. */ }; /** @@ -89,8 +92,8 @@ namespace msm { * So, if for example all MSMs share the same base points, they can be repeated only once. * @param msm_size MSM size \f$ N \f$. If a batch of MSMs (which all need to have the same size) is computed, this is * the size of 1 MSM. - * @param results Result (or results in the case of batch MSM). * @param config [MSMConfig](@ref MSMConfig) used in this MSM. + * @param results Buffer for the result (or results in the case of batch MSM). * @tparam S Scalar field type. 
* @tparam A The type of points \f$ \{P_i\} \f$ which is typically an [affine * Weierstrass](https://hyperelliptic.org/EFD/g1p/auto-shortw.html) point. @@ -98,10 +101,6 @@ namespace msm { * Weierstrass](https://hyperelliptic.org/EFD/g1p/auto-shortw-projective.html) point in our codebase. * @return `cudaSuccess` if the execution was successful and an error code otherwise. * - * This function is asyncronous, and to sync it with host, you need to call `cudaDeviceSyncronize()`. To syncronize - * with a different stream `stream1`, call `cudaStreamSynchronize(config.stream)` and - * `cudaStreamSynchronize(stream1)`. - * * **Note:** this function is still WIP and the following [MSMConfig](@ref MSMConfig) members do not yet have any * effect: `points_size` (it's always equal to the msm size currenly), `precompute_factor` (always equals 1) and * `ctx.device_id` (0 device is always used). Also, it's currently better to use `batch_size=1` in most cases (expept diff --git a/icicle/appUtils/ntt/ntt.cu b/icicle/appUtils/ntt/ntt.cu index 66016f208..e2acbfec1 100644 --- a/icicle/appUtils/ntt/ntt.cu +++ b/icicle/appUtils/ntt/ntt.cu @@ -1,5 +1,8 @@ #include "ntt.cuh" +#include +#include + #include "../../curves/curve_config.cuh" #include "../../utils/sharedmem.cuh" #include "../../utils/utils_kernels.cuh" @@ -42,59 +45,59 @@ namespace ntt { } /** - * Bit-reverses a batch of input arrays in-place inside GPU. + * Bit-reverses a batch of input arrays out-of-place inside GPU. * for example: on input array ([a[0],a[1],a[2],a[3]], 4, 2) it returns * [a[0],a[3],a[2],a[1]] (elements at indices 3 and 1 swhich places). - * @param arr batch of arrays of some object of type T. Should be on GPU. + * @param arr_in batch of arrays of some object of type T. Should be on GPU. * @param n length of `arr`. * @param logn log(n). * @param batch_size the size of the batch. + * @param arr_out buffer of the same size as `arr_in` on the GPU to write the bit-permuted array into. 
*/ template - void reverse_order_batch(E* arr, uint32_t n, uint32_t logn, uint32_t batch_size, cudaStream_t stream) + void reverse_order_batch(E* arr_in, uint32_t n, uint32_t logn, uint32_t batch_size, cudaStream_t stream, E* arr_out) { - E* arr_reversed; - cudaMallocAsync(&arr_reversed, n * batch_size * sizeof(E), stream); int number_of_threads = MAX_THREADS_BATCH; int number_of_blocks = (n * batch_size + number_of_threads - 1) / number_of_threads; - reverse_order_kernel<<>>(arr, arr_reversed, n, logn, batch_size); - cudaMemcpyAsync(arr, arr_reversed, n * batch_size * sizeof(E), cudaMemcpyDefault, stream); - cudaFreeAsync(arr_reversed, stream); + reverse_order_kernel<<>>(arr_in, arr_out, n, logn, batch_size); } /** - * Bit-reverses an input array in-place inside GPU. + * Bit-reverses an input array out-of-place inside GPU. * for example: on array ([a[0],a[1],a[2],a[3]], 4, 2) it returns * [a[0],a[3],a[2],a[1]] (elements at indices 3 and 1 swhich places). - * @param arr array of some object of type T of size which is a power of 2. Should be on GPU. + * @param arr_in array of some object of type T of size which is a power of 2. Should be on GPU. * @param n length of `arr`. * @param logn log(n). + * @param arr_out buffer of the same size as `arr_in` on the GPU to write the bit-permuted array into. */ template - void reverse_order(E* arr, uint32_t n, uint32_t logn, cudaStream_t stream) + void reverse_order(E* arr_in, uint32_t n, uint32_t logn, cudaStream_t stream, E* arr_out) { - reverse_order_batch(arr, n, logn, 1, stream); + reverse_order_batch(arr_in, n, logn, 1, stream, arr_out); } /** * Cooley-Tuckey NTT. * NOTE! this function assumes that d_twiddles are located in the device memory. - * @param arr input array of type E (elements). + * @param arr_in input array of type E (elements). * @param n length of d_arr. * @param twiddles twiddle factors of type S (scalars) array allocated on the device memory (must be a power of 2). 
- * @param n_twiddles length of twiddles. + * @param n_twiddles length of twiddles, should be negative for intt. * @param max_task max count of parallel tasks. * @param s log2(n) loop index. + * @param arr_out buffer for the output. */ template __global__ void ntt_template_kernel_shared_rev( - E* __restrict__ arr_g, - uint32_t n, + E* __restrict__ arr_in, + int n, const S* __restrict__ r_twiddles, - uint32_t n_twiddles, - uint32_t max_task, - uint32_t ss, - uint32_t logn) + int n_twiddles, + int max_task, + int ss, + int logn, + E* __restrict__ arr_out) { SharedMemory smem; E* arr = smem.getPointer(); @@ -128,13 +131,13 @@ namespace ntt { uint32_t oij = i + j; uint32_t k = oij + shift_s; - S tw = r_twiddles[j * n_twiddles_div]; + S tw = *(r_twiddles + (int)(j * n_twiddles_div)); - E u = is_beginning ? arr_g[offset + oij] : arr[oij]; - E v = is_beginning ? arr_g[offset + k] : arr[k]; + E u = is_beginning ? arr_in[offset + oij] : arr[oij]; + E v = is_beginning ? arr_in[offset + k] : arr[k]; if (is_end) { - arr_g[offset + oij] = u + v; - arr_g[offset + k] = tw * (u - v); + arr_out[offset + oij] = u + v; + arr_out[offset + k] = tw * (u - v); } else { arr[oij] = u + v; arr[k] = tw * (u - v); @@ -149,22 +152,24 @@ namespace ntt { /** * Cooley-Tuckey NTT. * NOTE! this function assumes that d_twiddles are located in the device memory. - * @param arr input array of type E (elements). + * @param arr_in input array of type E (elements). * @param n length of d_arr. * @param twiddles twiddle factors of type S (scalars) array allocated on the device memory (must be a power of 2). - * @param n_twiddles length of twiddles. + * @param n_twiddles length of twiddles, should be negative for intt. * @param max_task max count of parallel tasks. * @param s log2(n) loop index. + * @param arr_out buffer for the output. 
*/ template __global__ void ntt_template_kernel_shared( - E* __restrict__ arr_g, - uint32_t n, + E* __restrict__ arr_in, + int n, const S* __restrict__ r_twiddles, - uint32_t n_twiddles, - uint32_t max_task, - uint32_t s, - uint32_t logn) + int n_twiddles, + int max_task, + int s, + int logn, + E* __restrict__ arr_out) { SharedMemory smem; E* arr = smem.getPointer(); @@ -194,14 +199,14 @@ namespace ntt { uint32_t i = ((l >> s) * shift2_s) & (n - 1); // (..) % n (assuming n is power of 2) uint32_t oij = i + j; uint32_t k = oij + shift_s; - S tw = r_twiddles[j * n_twiddles_div]; + S tw = *(r_twiddles + (int)(j * n_twiddles_div)); - E u = s == 0 ? arr_g[offset + oij] : arr[oij]; - E v = s == 0 ? arr_g[offset + k] : arr[k]; + E u = s == 0 ? arr_in[offset + oij] : arr[oij]; + E v = s == 0 ? arr_in[offset + k] : arr[k]; v = tw * v; if (s == (logn - 1)) { - arr_g[offset + oij] = u + v; - arr_g[offset + k] = u - v; + arr_out[offset + oij] = u + v; + arr_out[offset + k] = u - v; } else { arr[oij] = u + v; arr[k] = u - v; @@ -219,13 +224,13 @@ namespace ntt { * @param arr input array of type E (elements). * @param n length of d_arr. * @param twiddles twiddle factors of type S (scalars) array allocated on the device memory (must be a power of 2). - * @param n_twiddles length of twiddles. + * @param n_twiddles length of twiddles, should be negative for intt. * @param max_task max count of parallel tasks. * @param s log2(n) loop index. */ template __global__ void - ntt_template_kernel(E* arr, uint32_t n, S* twiddles, uint32_t n_twiddles, uint32_t max_task, uint32_t s, bool rev) + ntt_template_kernel(E* arr_in, int n, S* twiddles, int n_twiddles, int max_task, int s, bool rev, E* arr_out) { int task = blockIdx.x; int chunks = n / (blockDim.x * 2); @@ -248,15 +253,15 @@ namespace ntt { uint32_t i = ((l >> s) * shift2_s) & (n - 1); // (..) 
% n (assuming n is power of 2) uint32_t k = i + j + shift_s; - S tw = twiddles[j * n_twiddles_div]; + S tw = *(twiddles + (int)(j * n_twiddles_div)); uint32_t offset = (task / chunks) * n; - E u = arr[offset + i + j]; - E v = arr[offset + k]; + E u = arr_in[offset + i + j]; + E v = arr_in[offset + k]; if (!rev) v = tw * v; - arr[offset + i + j] = u + v; + arr_out[offset + i + j] = u + v; v = u - v; - arr[offset + k] = rev ? tw * v : v; + arr_out[offset + k] = rev ? tw * v : v; } } } @@ -264,29 +269,32 @@ namespace ntt { /** * NTT/INTT inplace batch * Note: this function does not preform any bit-reverse permutations on its inputs or outputs. - * @param d_inout Array for inplace processing - * @param d_twiddles - * @param n Length of `d_twiddles` array + * @param d_input Input array + * @param n Size of `d_input` + * @param d_twiddles Twiddles + * @param n_twiddles Size of `d_twiddles` * @param batch_size The size of the batch; the length of `d_inout` is `n` * `batch_size`. * @param inverse true for iNTT - * @param is_coset true for multiplication by coset - * @param coset should be array of lenght n - or in case of lesser than n, right-padded with zeroes + * @param coset_gen_index the NTT is treated as evaluated on a coset only when this is positive; it is then passed to `BatchMulKernel` together with the twiddles * @param stream CUDA stream - * @param is_sync_needed do perform sync of the supplied CUDA stream at the end of processing + * @param is_async if false, perform sync of the supplied CUDA stream at the end of processing + * @param d_output Output array */ template void ntt_inplace_batch_template( - E* d_inout, + E* d_input, + int n, S* d_twiddles, - unsigned n, - unsigned batch_size, + int n_twiddles, + int batch_size, + int logn, bool inverse, - bool is_coset, - S* coset, + bool ct_buttterfly, + int coset_gen_index, cudaStream_t stream, - bool is_sync_needed) + bool is_async, + E* d_output) { - const int logn = int(log(n) / log(2)); bool is_shared_mem_enabled = sizeof(E) <= MAX_SHARED_MEM_ELEMENT_SIZE; const int 
log2_shmem_elems = is_shared_mem_enabled ? int(log(int(MAX_SHARED_MEM / sizeof(E))) / log(2)) : logn; int num_threads = max(min(min(n / 2, MAX_THREADS_BATCH), 1 << (log2_shmem_elems - 1)), 1); @@ -297,215 +305,231 @@ namespace ntt { // less then max to allow more concurrent blocks on SM const int logn_shmem = is_shared_mem_enabled ? int(log(2 * num_threads) / log(2)) : 0; // TODO: shared memory support only for types <= 32 bytes + int num_threads_coset = max(min(n / 2, MAX_NUM_THREADS), 1); + int num_blocks_coset = (n * batch_size + num_threads_coset - 1) / num_threads_coset; if (inverse) { + d_twiddles = d_twiddles + n_twiddles; + n_twiddles = -n_twiddles; + } + + bool is_on_coset = (coset_gen_index > 0); + bool direct_coset = (!inverse && is_on_coset); + if (direct_coset) + utils_internal::BatchMulKernel + <<>>(d_input, n, batch_size, d_twiddles, coset_gen_index, n_twiddles, d_output); + + if (ct_buttterfly) { if (is_shared_mem_enabled) ntt_template_kernel_shared<<>>( - d_inout, 1 << logn_shmem, d_twiddles, n, total_tasks, 0, logn_shmem); + direct_coset ? d_output : d_input, 1 << logn_shmem, d_twiddles, n_twiddles, total_tasks, 0, logn_shmem, d_output); for (int s = logn_shmem; s < logn; s++) // TODO: this loop also can be unrolled { - ntt_template_kernel - <<>>(d_inout, n, d_twiddles, n, total_tasks, s, false); + ntt_template_kernel<<>>( + (direct_coset && (s == 0)) ? 
d_input : d_output, n, d_twiddles, n_twiddles, total_tasks, s, false, d_output); } - - if (is_coset) - utils_internal::BatchMulKernel<<>>(d_inout, coset, n, batch_size); - - num_threads = max(min(n / 2, MAX_NUM_THREADS), 1); - num_blocks = (n * batch_size + num_threads - 1) / num_threads; - utils_internal::NormalizeKernel - <<>>(d_inout, S::inv_log_size(logn), n * batch_size); } else { - if (is_coset) - utils_internal::BatchMulKernel<<>>(d_inout, coset, n, batch_size); - for (int s = logn - 1; s >= logn_shmem; s--) // TODO: this loop also can be unrolled { - ntt_template_kernel<<>>(d_inout, n, d_twiddles, n, total_tasks, s, true); + ntt_template_kernel<<>>( + (direct_coset || (s < logn - 1)) ? d_output : d_input, n, d_twiddles, n_twiddles, total_tasks, s, true, d_output); } if (is_shared_mem_enabled) ntt_template_kernel_shared_rev<<>>( - d_inout, 1 << logn_shmem, d_twiddles, n, total_tasks, 0, logn_shmem); + (direct_coset || (logn > logn_shmem)) ? d_output : d_input, 1 << logn_shmem, d_twiddles, + n_twiddles, total_tasks, 0, logn_shmem, d_output); } - if (!is_sync_needed) return; + if (inverse) { + if (is_on_coset) + utils_internal::BatchMulKernel + <<>>(d_output, n, batch_size, d_twiddles, -coset_gen_index, -n_twiddles, d_output); + + utils_internal::NormalizeKernel + <<>>(d_output, S::inv_log_size(logn), n * batch_size); + } + + if (is_async) return; cudaStreamSynchronize(stream); } } // namespace + /** + * @struct Domain + * Struct containing information about the domain on which (i)NTT is evaluated i.e. twiddle factors. + * Twiddle factors are private, static and can only be set using [InitDomain](@ref InitDomain) function. + * The internal representation of twiddles is prone to change in accordance with changing [NTT](@ref NTT) algorithm. + * @tparam S The type of twiddle factors \f$ \{ \omega^i \} \f$. Must be a field. 
+ */ + template + class Domain { + static int max_size; + static S* twiddles; + static std::unordered_map coset_index; + + public: + template + friend cudaError_t InitDomain(U primitive_root, device_context::DeviceContext& ctx); + + template + friend cudaError_t NTT(E* input, int size, bool is_inverse, NTTConfig& config, E* output); + }; + + template int Domain::max_size = 0; + template S* Domain::twiddles = nullptr; + template std::unordered_map Domain::coset_index = {}; + template - cudaError_t GenerateTwiddleFactors(S* d_twiddles, int n_twiddles, S omega, device_context::DeviceContext ctx) + cudaError_t InitDomain(S primitive_root, device_context::DeviceContext& ctx) { - twiddle_factors_kernel<<<1, 1, 0, ctx.stream>>>(d_twiddles, n_twiddles, omega); - cudaStreamSynchronize(ctx.stream); + // only generate twiddles if they haven't been generated yet (TODO: thread safety) + if (!Domain::twiddles) { + // TODO DmytroTym: the following line is just a temporary patch to make it work, + // having issues creating default stream on rust side + device_context::DeviceContext ctx = device_context::get_default_device_context(); + std::vector h_twiddles; + h_twiddles.push_back(S::one()); + int n = 1; + do { + Domain::coset_index[h_twiddles.at(n - 1)] = n - 1; + h_twiddles.push_back(h_twiddles.at(n - 1) * primitive_root); + } while (h_twiddles.at(n++) != S::one()); + cudaMallocAsync(&Domain::twiddles, n * sizeof(S), ctx.stream); + cudaMemcpyAsync(Domain::twiddles, &h_twiddles.front(), n * sizeof(S), cudaMemcpyHostToDevice, ctx.stream); + Domain::max_size = n - 1; + } return cudaSuccess; } - template - cudaError_t NTT(NTTConfig* config) + template + cudaError_t NTT(E* input, int size, bool is_inverse, NTTConfig& config, E* output) { CHECK_LAST_CUDA_ERROR(); - cudaStream_t stream = config->ctx.stream; - int size = config->size; - int batch_size = config->batch_size; - bool is_inverse = config->is_inverse; - int n_twiddles = size; + cudaStream_t stream = config.ctx.stream; + int 
batch_size = config.batch_size; int logn = int(log(size) / log(2)); int input_size_bytes = size * batch_size * sizeof(E); - bool is_input_on_device = config->are_inputs_on_device; - bool is_output_on_device = config->is_output_on_device; - bool is_forward_twiddle_empty = config->twiddles == nullptr; - bool is_inverse_twiddle_empty = config->inv_twiddles == nullptr; - bool is_generating_twiddles = (is_forward_twiddle_empty && is_inverse_twiddle_empty) || - (is_forward_twiddle_empty && !is_inverse) || (is_inverse_twiddle_empty && is_inverse); - - S* d_twiddles; - if (is_generating_twiddles) { - cudaMallocAsync(&d_twiddles, n_twiddles * sizeof(S), stream); - S omega = is_inverse ? S::omega_inv(logn) : S::omega(logn); - GenerateTwiddleFactors(d_twiddles, n_twiddles, omega, config->ctx); + bool is_input_on_device = config.are_inputs_on_device; + bool is_output_on_device = config.are_outputs_on_device; + + E* d_input; + if (is_input_on_device) { + d_input = input; } else { - d_twiddles = is_inverse ? 
config->inv_twiddles : config->twiddles; + cudaMallocAsync(&d_input, input_size_bytes, stream); + cudaMemcpyAsync(d_input, input, input_size_bytes, cudaMemcpyHostToDevice, stream); } - - E* d_inout; + E* d_output; if (is_input_on_device) { - d_inout = config->inout; + d_output = output; } else { - cudaMallocAsync(&d_inout, input_size_bytes, stream); - cudaMemcpyAsync(d_inout, config->inout, input_size_bytes, cudaMemcpyHostToDevice, stream); + cudaMallocAsync(&d_output, input_size_bytes, stream); } - bool reverse_input; - bool reverse_output; - switch (config->ordering) { + bool ct_butterfly = true; + bool reverse_input = false; + switch (config.ordering) { case Ordering::kNN: - reverse_input = is_inverse; - reverse_output = !is_inverse; + reverse_input = true; break; case Ordering::kNR: - reverse_input = is_inverse; - reverse_output = is_inverse; - break; - case Ordering::kRN: - reverse_input = !is_inverse; - reverse_output = !is_inverse; + ct_butterfly = false; break; case Ordering::kRR: - reverse_input = !is_inverse; - reverse_output = is_inverse; + reverse_input = true; + ct_butterfly = false; break; } CHECK_LAST_CUDA_ERROR(); - if (reverse_input) reverse_order_batch(d_inout, size, logn, config->batch_size, stream); + if (reverse_input) reverse_order_batch(d_input, size, logn, batch_size, stream, d_output); CHECK_LAST_CUDA_ERROR(); ntt_inplace_batch_template( - d_inout, d_twiddles, size, batch_size, is_inverse, config->is_coset, config->coset_gen, stream, false); - CHECK_LAST_CUDA_ERROR(); - - if (reverse_output) reverse_order_batch(d_inout, size, logn, batch_size, stream); + reverse_input ? d_output : d_input, size, Domain::twiddles, Domain::max_size, batch_size, logn, + is_inverse, ct_butterfly, Domain::coset_index[config.coset_gen], stream, !config.is_async, d_output); CHECK_LAST_CUDA_ERROR(); if (is_output_on_device) { // free(config->inout); // TODO: ? 
or callback?+ - config->inout = d_inout; + output = d_output; } else { - if (is_input_on_device) { - E* h_output = (E*)malloc(input_size_bytes); // TODO: caller responsible for memory management - cudaMemcpyAsync(h_output, d_inout, input_size_bytes, cudaMemcpyDeviceToHost, stream); - config->inout = h_output; - CHECK_LAST_CUDA_ERROR(); - } else { - cudaMemcpyAsync(config->inout, d_inout, input_size_bytes, cudaMemcpyDeviceToHost, stream); - CHECK_LAST_CUDA_ERROR(); - } - cudaFreeAsync(d_inout, stream); // TODO: make it optional? so can be reused + cudaMemcpyAsync(output, d_output, input_size_bytes, cudaMemcpyDeviceToHost, stream); + CHECK_LAST_CUDA_ERROR(); } CHECK_LAST_CUDA_ERROR(); - if (is_generating_twiddles && !config->is_preserving_twiddles) { cudaFreeAsync(d_twiddles, stream); } - - if (config->is_preserving_twiddles) { - if (is_inverse) - config->inv_twiddles = d_twiddles; - else { - config->twiddles = d_twiddles; - } - } - - cudaStreamSynchronize(stream); + if (!config.is_async) cudaStreamSynchronize(stream); CHECK_LAST_CUDA_ERROR(); return cudaSuccess; } + template + NTTConfig DefaultNTTConfig() { + device_context::DeviceContext ctx = device_context::get_default_device_context(); + NTTConfig config = { + S::one(), // coset_gen + Ordering::kNN, // ordering + false, // are_inputs_on_device + false, // are_outputs_on_device + 1, // batch_size + false, // is_async + ctx, // ctx + }; + return config; + } + /** - * Extern version of [ntt](@ref ntt) function with the following values of template parameters - * (where the curve is given by `-DCURVE` env variable during build): - * - `S` and `E` are both the [scalar field](@ref scalar_t) of the curve; - * @return `cudaSuccess` if the execution was successful and an error code otherwise. 
+ * Extern "C" version of [DefaultNTTConfig](@ref DefaultNTTConfig) function with the following + * value of template parameter (where the curve is given by `-DCURVE` env variable during build): + * - `S` is the [scalar field](@ref scalar_t) of the curve; + * @return Default [NTTConfig](@ref NTTConfig). */ - extern "C" cudaError_t NTTCuda(NTTConfig* config) - { - return NTT(config); + extern "C" NTTConfig GetDefaultNTTConfig() { + return DefaultNTTConfig(); } /** - * Extern version of [ntt](@ref ntt) function with the following values of template parameters - * (where the curve is given by `-DCURVE` env variable during build): - * - `S` and `E` are both the [scalar field](@ref scalar_t) of the curve; - * @return `cudaSuccess` if the execution was successful and an error code otherwise. + * Extern "C" version of [InitDomain](@ref InitDomain) function with the following + * value of template parameter (where the curve is given by `-DCURVE` env variable during build): + * - `S` is the [scalar field](@ref scalar_t) of the curve; */ - template - cudaError_t NTTDefaultContext(NTTConfig* config) - { - // TODO: if empty - create default - cudaMemPool_t mempool; - cudaDeviceGetDefaultMemPool(&mempool, config->ctx.device_id); - - device_context::DeviceContext context = { - config->ctx.device_id, - 0, // default stream - mempool}; - - config->ctx = context; - - return NTT(config); + extern "C" cudaError_t InitializeDomain(curve_config::scalar_t primitive_root, device_context::DeviceContext& ctx) { + return InitDomain(primitive_root, ctx); } /** - * Extern version of [ntt](@ref ntt) function with the following values of template parameters + * Extern "C" version of [NTT](@ref NTT) function with the following values of template parameters * (where the curve is given by `-DCURVE` env variable during build): * - `S` and `E` are both the [scalar field](@ref scalar_t) of the curve; * @return `cudaSuccess` if the execution was successful and an error code otherwise. 
*/ - extern "C" cudaError_t NTTDefaultContextCuda(NTTConfig* config) + extern "C" cudaError_t NTTCuda(curve_config::scalar_t* input, int size, bool is_inverse, NTTConfig& config, curve_config::scalar_t* output) { - return NTTDefaultContext(config); + return NTT(input, size, is_inverse, config, output); } #if defined(ECNTT_DEFINED) /** - * Extern version of [NTT](@ref NTT) function with the following values of template parameters + * Extern "C" version of [NTT](@ref NTT) function with the following values of template parameters * (where the curve is given by `-DCURVE` env variable during build): * - `S` is the [projective representation](@ref projective_t) of the curve (i.e. EC NTT is computed); * - `E` is the [scalar field](@ref scalar_t) of the curve; * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ - extern "C" cudaError_t ECNTTCuda(NTTConfig* config) + extern "C" cudaError_t ECNTTCuda(curve_config::projective_t* input, int size, bool is_inverse, NTTConfig& config, curve_config::projective_t* output) { - return NTT(config); + return NTT(input, size, is_inverse, config, output); } #endif -} // namespace ntt +} // namespace ntt \ No newline at end of file diff --git a/icicle/appUtils/ntt/ntt.cuh b/icicle/appUtils/ntt/ntt.cuh index 1bc976d6b..1eb57aee3 100644 --- a/icicle/appUtils/ntt/ntt.cuh +++ b/icicle/appUtils/ntt/ntt.cuh @@ -15,16 +15,36 @@ * Number Theoretic Transform, or NTT is a version of [fast Fourier * transform](https://en.wikipedia.org/wiki/Fast_Fourier_transform) where instead of real or complex numbers, inputs and * outputs belong to certain finite groups or fields. 
NTT computes the values of a polynomial \f$ p(x) = p_0 + p_1 \cdot - * x + \dots + p_{n-1} \cdot x^{n-1} \f$ on special subfields called "roots of unity", or "twiddle factors": \f[ NTT(p) - * = \{ p(\omega^0), p(\omega^1), \dots, p(\omega^{n-1}) \} \f] Inverse NTT, or iNTT solves the inverse problem of - * computing coefficients of \f$ p(x) \f$ from evaluations \f$ \{ p(\omega^0), p(\omega^1), \dots, p(\omega^{n-1}) \} - * \f$. If not specified otherwise, \f$ n \f$ is a power of 2. + * x + \dots + p_{n-1} \cdot x^{n-1} \f$ on special subfields called "roots of unity", or "twiddle factors" (optionally + * shifted by an additional element called "coset generator"): \f[ NTT(p) = \{ p(\omega^0), p(\omega^1), \dots, + * p(\omega^{n-1}) \} \f] Inverse NTT, or iNTT solves the inverse problem of computing coefficients of \f$ p(x) \f$ + * given evaluations \f$ \{ p(\omega^0), p(\omega^1), \dots, p(\omega^{n-1}) \} \f$. If not specified otherwise, + * \f$ n \f$ is a power of 2. */ namespace ntt { + /** + * Generate a domain that supports all NTTs of sizes under a certain threshold. Note that this function might + * be expensive, so if possible it should be called before all time-critical operations. + * It's assumed that during program execution only the coset generator might change, but twiddles stay fixed, so + * they are initialized at the first call of this function and don't change afterwards. + * @param primitive_root Primitive root in field `S` of order \f$ 2^s \f$. This should be the smallest power-of-2 + * order that's large enough to support any NTT you might want to perform. + * @param ctx Details related to the device such as its id and stream id. + * @return `cudaSuccess` if the execution was successful and an error code otherwise. + */ + template + cudaError_t InitDomain(S primitive_root, device_context::DeviceContext& ctx); + /** * @enum Ordering - * How to order inputs and outputs of the NTT: + * How to order inputs and outputs of the NTT.
If needed, use this field to specify decimation: decimation in time + * (DIT) corresponds to `Ordering::kRN` while decimation in frequency (DIF) to `Ordering::kNR`. Also, to specify + * butterfly to be used, select `Ordering::kRN` for Cooley-Tukey and `Ordering::kNR` for Gentleman-Sande. There's + * no implication that a certain decimation or butterfly will actually be used under the hood, this is just for + * compatibility with codebases that use "decimation" and "butterfly" to denote ordering of inputs and outputs. + * + * Ordering options are: * - kNN: inputs and outputs are natural-order (example of natural ordering: \f$ \{a_0, a_1, a_2, a_3, a_4, a_5, a_6, * a_7\} \f$). * - kNR: inputs are natural-order and outputs are bit-reversed-order (example of bit-reversed ordering: \f$ \{a_0, @@ -34,100 +54,50 @@ namespace ntt { */ enum class Ordering { kNN, kNR, kRN, kRR }; - /** - * @enum Decimation - * Decimation of the NTT algorithm: - * - kDIT: decimation in time. - * - kDIF: decimation in frequency. - */ - enum class Decimation { kDIT, kDIF }; - - /** - * @enum Butterfly - * [Butterfly](https://en.wikipedia.org/wiki/Butterfly_diagram) used in the NTT algorithm (i.e. what happens to each - * pair of inputs on every iteration): - * - kCooleyTukey: Cooley-Tukey butterfly. - * - kGentlemanSande: Gentleman-Sande butterfly. - */ - enum class Butterfly { kCooleyTukey, kGentlemanSande }; - /** * @struct NTTConfig - * Struct that encodes NTT parameters to be passed into the [ntt](@ref ntt) function. + * Struct that encodes NTT parameters to be passed into the [NTT](@ref NTT) function. */ - template + template struct NTTConfig { - E* inout; /**< Input that's mutated in-place by this function. Length of this array needs to be \f$ size \cdot - * config.batch_size \f$. 
Note that if inputs are in Montgomery form, the outputs will be as well and - * vice-verse: non-Montgomery inputs produce non-Montgomety outputs.*/ - bool are_inputs_on_device; /**< True if inputs/outputs are on device and false if they're on host. Default value: - false. */ - bool is_inverse; /**< True if true . Default value: false. */ - Ordering - ordering; /**< Ordering of inputs and outputs. See [Ordering](@ref Ordering). Default value: `Ordering::kNN`. */ - Decimation - decimation; /**< Decimation of the algorithm, see [Decimation](@ref Decimation). Default value: - * `Decimation::kDIT`. - * __Note:__ this variable exists mainly for compatibility with codebases that use similar notation. - * If [ordering](@ref ordering) is `Ordering::kRN`, the value of this variable will be overridden to - * `Decimation::kDIT` and if ordering is `Ordering::kNR` — to `Decimation::kDIF`. */ - Butterfly - butterfly; /**< Butterfly used by the NTT. See [Butterfly](@ref Butterfly). Default value: - * `Butterfly::kCooleyTukey`. - * __Note:__ this variable exists mainly for compatibility with codebases that use similar notation. - * If [ordering](@ref ordering) is `Ordering::kRN`, the value of this variable will be overridden to - * `Butterfly::kCooleyTukey` and if ordering is `Ordering::kNR` — to `Butterfly::kGentlemanSande`. */ - bool is_coset; /**< If false, NTT is computed on a subfield given by [twiddles](@ref twiddles). If true, NTT is - * computed on a coset of [twiddles](@ref twiddles) given by [the coset generator](@ref coset_gen), - * so: \f$ \{coset\_gen\cdot\omega^0, coset\_gen\cdot\omega^1, \dots, coset\_gen\cdot\omega^{n-1}\} - * \f$. Default value: false. */ - S* coset_gen; /**< The field element that generates a coset if [is_coset](@ref is_coset) is true. - * Otherwise should be set to `nullptr`. Default value: `nullptr`. */ - S* twiddles; /**< "Twiddle factors", (or "domain", or "roots of unity") on which the NTT is evaluated. 
- * This pointer is expected to live on device. The order is as follows: - * \f$ \{\omega^0=1, \omega^1, \dots, \omega^{n-1}\} \f$. If this pointer is `nullptr`, twiddle - * factors are generated online using the default generator (TODO: link to twiddle gen here) and - * function [GenerateTwiddleFactors](@ref GenerateTwiddleFactors). Default value: `nullptr`. */ - S* inv_twiddles; /**< "Inverse twiddle factors", (or "domain", or "roots of unity") on which the iNTT is evaluated. - * This pointer is expected to live on device. The order is as follows: - * \f$ \{\omega^0=1, \omega^1, \dots, \omega^{n-1}\} \f$. If this pointer is `nullptr`, twiddle - * factors are generated online using the default generator (TODO: link to twiddle gen here) and - * function [GenerateTwiddleFactors](@ref GenerateTwiddleFactors). Default value: `nullptr`. */ - int size; /**< NTT size \f$ n \f$. If a batch of NTTs (which all need to have the same size) is computed, this is - the size of 1 NTT. */ - int batch_size; /**< The number of NTTs to compute. Default value: 1. */ - bool is_preserving_twiddles; /**< If true, twiddle factors are preserved on device for subsequent use in config and - not freed after calculation. Default value: false. */ - bool is_output_on_device; /**< If true, output is preserved on device for subsequent use in config and not freed - after calculation. Default value: false. */ - device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. See - [DeviceContext](@ref device_context::DeviceContext). */ + S coset_gen; /**< Coset generator. Used to perform coset (i)NTTs. Default value: `S::one()` + * (corresponding to no coset being used). */ + Ordering ordering; /**< Ordering of inputs and outputs. See [Ordering](@ref Ordering). Default value: + * `Ordering::kNN`. */ + bool are_inputs_on_device; /**< True if inputs are on device and false if they're on host. Default value: false. 
*/ + bool are_outputs_on_device; /**< If true, output is preserved on device, otherwise on host. Default value: false. */ + int batch_size; /**< The number of NTTs to compute. Default value: 1. */ + bool is_async; /**< Whether to run the NTT asynchronously. If set to `true`, the NTT function will be + * non-blocking and you'd need to synchronize it explicitly by running + * `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the NTT + * function will block the current CPU thread. */ + device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream. */ }; + /** + * A function that returns the default value of [NTTConfig](@ref NTTConfig) for the [NTT](@ref NTT) function. + * @return Default value of [NTTConfig](@ref NTTConfig). + */ + template + NTTConfig DefaultNTTConfig(); + /** * A function that computes NTT or iNTT in-place. + * @param input Input of the NTT. Length of this array needs to be \f$ size \cdot config.batch\_size \f$. Note + * that if inputs are in Montgomery form, the outputs will be as well and vice-versa: non-Montgomery inputs produce + * non-Montgomery outputs. + * @param size NTT size. If a batch of NTTs (which all need to have the same size) is computed, this is the size + * of 1 NTT, so it must equal the size of `input` divided by `config.batch_size`. + * @param is_inverse True for inverse NTT and false for direct NTT. Default value: false. * @param config [NTTConfig](@ref NTTConfig) used in this NTT. + * @param output Buffer for the output of the NTT. Should be of the same size as `input`. * @tparam E The type of inputs and outputs (i.e. coefficients \f$ \{p_i\} \f$ and values \f$ p(x) \f$). Must be a * group. * @tparam S The type of "twiddle factors" \f$ \{ \omega^i \} \f$. Must be a field. Often (but not always) `S=E`. * @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/ - template - cudaError_t NTT(NTTConfig* config); - - /** - * Generates twiddles \f$ \{\omega^0=1, \omega^1, \dots, \omega^{n-1}\} \f$ from root of unity \f$ \omega \f$ and - * stores them on device. - * @param d_twiddles Input empty array on device to which twiddles are to be written. - * @param n_twiddles Number of twiddle \f$ n \f$ factors to generate. - * @param omega Root of unity \f$ \omega \f$. - * @param ctx Details related to the device such as its id and stream id. See [DeviceContext](@ref - * device_context::DeviceContext). - * @tparam S The type of twiddle factors \f$ \{ \omega^i \} \f$. - * @return `cudaSuccess` if the execution was successful and an error code otherwise. - */ - template - cudaError_t GenerateTwiddleFactors(S* d_twiddles, int n_twiddles, S omega, device_context::DeviceContext ctx); + template + cudaError_t NTT(E* input, int size, bool is_inverse, NTTConfig& config, E* output); } // namespace ntt diff --git a/icicle/curves/bls12_377_params.cuh b/icicle/curves/bls12_377_params.cuh index 596dc6ff8..f6a8e457b 100644 --- a/icicle/curves/bls12_377_params.cuh +++ b/icicle/curves/bls12_377_params.cuh @@ -7,8 +7,9 @@ namespace bls12_377 { struct fp_config { static constexpr unsigned limbs_count = 8; - static constexpr unsigned omegas_count = 32; + static constexpr unsigned omegas_count = 47; static constexpr unsigned modulus_bit_count = 253; + static constexpr unsigned num_of_reductions = 1; static constexpr storage modulus = {0x00000001, 0x0a118000, 0xd0000001, 0x59aa76fe, 0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e}; @@ -16,6 +17,8 @@ namespace bls12_377 { 0xb86f6002, 0xc1689a3c, 0x34594aac, 0x2556cabd}; static constexpr storage modulus_4 = {0x00000004, 0x28460000, 0x40000004, 0x66a9dbfb, 0x70dec005, 0x82d13479, 0x68b29559, 0x4aad957a}; + static constexpr storage neg_modulus = {0xffffffff, 0xf5ee7fff, 0x2ffffffe, 0xa6558901, + 0xa3c84ffe, 0x9f4bb2e1, 0x65d35aa9, 0xed549aa1}; static constexpr storage<2 * limbs_count> modulus_wide = { 
0x00000001, 0x0a118000, 0xd0000001, 0x59aa76fe, 0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; @@ -41,72 +44,102 @@ namespace bls12_377 { 0x27b28e2f, 0x838557e2, 0x2290c02c, 0x07b30191}; static constexpr storage_array omega = { - {{0xec2a895e, 0x476ef4a4, 0x63e3f04a, 0x9b506ee3, 0xd1a8a12f, 0x60c69477, 0x0cb92cc1, 0x11d4b7f6}, - {0x01ab3a4d, 0x006f60fa, 0x814ba450, 0xe6600e15, 0xdf9eb147, 0xbde4df36, 0x33760d7b, 0x055d58fa}, - {0xfdacff58, 0x8215b91d, 0x98331645, 0xd8d9177d, 0x439e803c, 0xe85223ad, 0xcca42c1f, 0x04aa8ef0}, - {0x293f8481, 0xd52cc17a, 0x6f133205, 0x041178fb, 0xb2961832, 0xbbc70d18, 0x481760cd, 0x073d34d1}, - {0x5e9020dd, 0xade9d4b4, 0x87db8813, 0x489259d2, 0x25051238, 0x5ddce740, 0xb5bc4d11, 0x0c775db1}, - {0xd5fba57b, 0x90684fea, 0xe0defe98, 0xed237883, 0x030ae924, 0xc502b692, 0xe7a1ec2c, 0x08aa58e8}, - {0x44ddbbdc, 0xbafb92a6, 0x26b01974, 0x63c7a02d, 0x5f28a274, 0x0ff86e13, 0x867f2e29, 0x0a7b462a}, - {0x355dd694, 0x4258374d, 0x44c76a20, 0x5c31e8ac, 0xaa5fd062, 0x9b473969, 0x1a37b6b4, 0x0a693d77}, - {0x22df9f13, 0x56313de8, 0x599e7536, 0xe2e75200, 0x6d163e50, 0xa1b4fce7, 0xc8111763, 0x0aec2172}, - {0xf32d6bac, 0xa0b973d4, 0xf0d81b72, 0xae951889, 0x2e2daa0a, 0x51dbe098, 0x40d9af8f, 0x04679474}, - {0x1b29736e, 0x8f267f19, 0x1d5a0c3a, 0xa2e04d58, 0x1ae99514, 0x76803064, 0x57f7c806, 0x12129439}, - {0xbd83a3da, 0xd3b69b29, 0xe02ce197, 0x9543950f, 0xc2f87783, 0x80799665, 0xc15be215, 0x11ce8199}, - {0xf284f768, 0xdeee484b, 0xe26a0475, 0x2a02e015, 0x88d968c2, 0xf0eb4925, 0x82a391c9, 0x0620ce9e}, - {0xa90a2740, 0xfe3ca4f0, 0x512a7c7a, 0xd259ff36, 0xb41fe696, 0xbca3176a, 0xf33132ce, 0x05bd5ea3}, - {0xb14361d4, 0x7f1db43f, 0x25ab6d51, 0x7927e578, 0x383bf21e, 0xb43e52a5, 0xd27fa99f, 0x077595e9}, - {0xa9966ac4, 0x1ae0ea67, 0xda83fb3b, 0x4e2dbb1c, 0x0b51380e, 0xf77cf749, 0xb28a7670, 0x048b4b0e}, - {0xa0234d2d, 0xe943054c, 0xe5f5be5e, 0x673b0ee0, 0x5048a19a, 
0xcdd48e41, 0xabc3cb99, 0x0997d277}, - {0x1912f7fa, 0x77d7da1d, 0x299fd7d6, 0xbcb7a5b2, 0x142a4480, 0x705e45dd, 0xb492dbd8, 0x0dc835fd}, - {0x20b7298a, 0xd7652451, 0x65013b06, 0xc7c9a0b7, 0xad0d8457, 0x479b82a9, 0x0c99f5ce, 0x0bef1e5a}, - {0xe5f8848a, 0x270a2326, 0xa727567d, 0x97d14afa, 0x48746fc7, 0x1a3a5a4e, 0xa42f077a, 0x0044e4b1}, - {0x4dd87a5e, 0xf423a283, 0xd9a4c364, 0x1fe46601, 0xbfdc7e9b, 0xda4addbf, 0x3bf94b2b, 0x0a7f2bd8}, - {0xf02ba42c, 0x553085d9, 0x1119b10d, 0x59662159, 0x6b8ea03f, 0xaa670958, 0x7ce92983, 0x066f6f5f}, - {0xedc626c3, 0xf30e312d, 0xcf1f3a94, 0x8367a7ca, 0x917a1b28, 0x621e15e1, 0xf2e93b82, 0x07cd59f8}, - {0xafeb494b, 0x97319dcd, 0x1d78404c, 0xab30c83e, 0xf26ffe90, 0x452d8a48, 0xa36452c7, 0x0bfc2e92}, - {0xcc943028, 0xed2576ad, 0xfa4c6090, 0x846e49bc, 0x0049d8e6, 0xc74c1865, 0x665d7be5, 0x0e9c5a12}, - {0xf45b9621, 0x102fbfb0, 0xf04faac0, 0xe80f4241, 0x7ca61177, 0x0b830bfd, 0x7033169d, 0x10521892}, - {0x3358eb25, 0xdbc547bc, 0x722037db, 0x8909d398, 0x5e705b6d, 0x8b7075b5, 0x9bdaf407, 0x02694bb2}, - {0x66a16869, 0x50c487c1, 0xd1fd4525, 0x380a66ab, 0x265e8539, 0xd455a01a, 0x064b5334, 0x0cd62875}, - {0x4637701d, 0x0848f958, 0x4c8353af, 0x8a750076, 0x0ef6174a, 0x485f4e4f, 0xf38db632, 0x078d97a1}, - {0x3d766f80, 0x1b4b71cf, 0x1069012d, 0x47d21195, 0x9151ebec, 0x5635235f, 0x2b13c808, 0x093f7d91}, + {{0x00000000, 0x0a118000, 0xd0000001, 0x59aa76fe, 0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e}, + {0x00000001, 0x8f1a4000, 0xb0000001, 0xcf664765, 0x970dec00, 0x23ed1347, 0x00000000, 0x00000000}, + {0xfbfa0a01, 0x0f830f7e, 0xd75769a0, 0x20f8b46c, 0xf05d5033, 0x7108bd18, 0x0788de01, 0x07405e08}, + {0x60b9bdae, 0xc78085a6, 0x789094f5, 0x3116ec22, 0xce87d660, 0x0a02a81d, 0xc2a94856, 0x0ead8236}, + {0x3e83a7cc, 0x6ffc39d9, 0x958a0a74, 0x117d996e, 0x0b92e8c9, 0xc242289d, 0x29d977d6, 0x0484efb4}, + {0x0111ec3f, 0x15455b00, 0xc5f6be6f, 0x6b62d7af, 0x337f2d07, 0xfcba0365, 0x43fccd26, 0x0f151842}, + {0xc31ec69b, 0x57951b2e, 0x2a37ce1f, 0x3e0a4be7, 
0xcf3b198a, 0x960aeb4a, 0x341fd5cd, 0x04fb0673}, + {0xa921851f, 0x71c1b78e, 0x7808f239, 0x3c26340c, 0x976fb990, 0xbcc8f69b, 0xe880dc71, 0x06a5edb2}, + {0xc0f5679e, 0x7619eab5, 0x0dc0b9cd, 0x1f4cd10e, 0xbf6a480a, 0x7e1b70aa, 0x7f5461bb, 0x0ffc66da}, + {0xec5cbab2, 0x8159806d, 0x498264a3, 0x14ea1333, 0xe3abfaa6, 0x56bbe1d8, 0x02aa031f, 0x09d2b5c4}, + {0xc010c48a, 0xd2aa9562, 0x3b004b60, 0x447e5c11, 0x11e243bb, 0xd5a21c13, 0x0ab418b1, 0x01eab23e}, + {0xacff6986, 0x08715ee8, 0xa93924d0, 0xab01878a, 0x6e9ae5c4, 0xbfbc5e71, 0x26b08d6e, 0x0f8000bf}, + {0x3ddbc679, 0x06bc13b0, 0x615256ce, 0x7269a1f1, 0x1f5221a2, 0xf7716fbf, 0x8c66c14f, 0x0fa1f02c}, + {0x906f531f, 0xdd40f131, 0x30728eff, 0xb06b29c7, 0x88839294, 0xc891fd19, 0x646978e8, 0x04e88447}, + {0x6e259cdc, 0xb1e4b769, 0x00514e5e, 0xbcb0b709, 0x05113e7f, 0x74edb7c0, 0xe92e22af, 0x10c88511}, + {0x240ede5b, 0xebb2e898, 0x42cd84c6, 0xc2639185, 0x9408f956, 0xf79e8391, 0x94e87a7d, 0x06872fa1}, {0x260678ff, 0xf8522249, 0xa8de9973, 0x6148cb16, 0x5a4e8d56, 0x5750f3f4, 0xbaeaf0c3, 0x0e805156}, - {0x240ede5b, 0xebb2e898, 0x42cd84c6, 0xc2639185, 0x9408f956, 0xf79e8391, 0x94e87a7d, 0x06872fa1}}}; + {0x3d766f80, 0x1b4b71cf, 0x1069012d, 0x47d21195, 0x9151ebec, 0x5635235f, 0x2b13c808, 0x093f7d91}, + {0x4637701d, 0x0848f958, 0x4c8353af, 0x8a750076, 0x0ef6174a, 0x485f4e4f, 0xf38db632, 0x078d97a1}, + {0x66a16869, 0x50c487c1, 0xd1fd4525, 0x380a66ab, 0x265e8539, 0xd455a01a, 0x064b5334, 0x0cd62875}, + {0x3358eb25, 0xdbc547bc, 0x722037db, 0x8909d398, 0x5e705b6d, 0x8b7075b5, 0x9bdaf407, 0x02694bb2}, + {0xf45b9621, 0x102fbfb0, 0xf04faac0, 0xe80f4241, 0x7ca61177, 0x0b830bfd, 0x7033169d, 0x10521892}, + {0xcc943028, 0xed2576ad, 0xfa4c6090, 0x846e49bc, 0x0049d8e6, 0xc74c1865, 0x665d7be5, 0x0e9c5a12}, + {0xafeb494b, 0x97319dcd, 0x1d78404c, 0xab30c83e, 0xf26ffe90, 0x452d8a48, 0xa36452c7, 0x0bfc2e92}, + {0xedc626c3, 0xf30e312d, 0xcf1f3a94, 0x8367a7ca, 0x917a1b28, 0x621e15e1, 0xf2e93b82, 0x07cd59f8}, + {0xf02ba42c, 0x553085d9, 0x1119b10d, 0x59662159, 
0x6b8ea03f, 0xaa670958, 0x7ce92983, 0x066f6f5f}, + {0x4dd87a5e, 0xf423a283, 0xd9a4c364, 0x1fe46601, 0xbfdc7e9b, 0xda4addbf, 0x3bf94b2b, 0x0a7f2bd8}, + {0xe5f8848a, 0x270a2326, 0xa727567d, 0x97d14afa, 0x48746fc7, 0x1a3a5a4e, 0xa42f077a, 0x0044e4b1}, + {0x20b7298a, 0xd7652451, 0x65013b06, 0xc7c9a0b7, 0xad0d8457, 0x479b82a9, 0x0c99f5ce, 0x0bef1e5a}, + {0x1912f7fa, 0x77d7da1d, 0x299fd7d6, 0xbcb7a5b2, 0x142a4480, 0x705e45dd, 0xb492dbd8, 0x0dc835fd}, + {0xa0234d2d, 0xe943054c, 0xe5f5be5e, 0x673b0ee0, 0x5048a19a, 0xcdd48e41, 0xabc3cb99, 0x0997d277}, + {0xa9966ac4, 0x1ae0ea67, 0xda83fb3b, 0x4e2dbb1c, 0x0b51380e, 0xf77cf749, 0xb28a7670, 0x048b4b0e}, + {0xb14361d4, 0x7f1db43f, 0x25ab6d51, 0x7927e578, 0x383bf21e, 0xb43e52a5, 0xd27fa99f, 0x077595e9}, + {0xa90a2740, 0xfe3ca4f0, 0x512a7c7a, 0xd259ff36, 0xb41fe696, 0xbca3176a, 0xf33132ce, 0x05bd5ea3}, + {0xf284f768, 0xdeee484b, 0xe26a0475, 0x2a02e015, 0x88d968c2, 0xf0eb4925, 0x82a391c9, 0x0620ce9e}, + {0xbd83a3da, 0xd3b69b29, 0xe02ce197, 0x9543950f, 0xc2f87783, 0x80799665, 0xc15be215, 0x11ce8199}, + {0x1b29736e, 0x8f267f19, 0x1d5a0c3a, 0xa2e04d58, 0x1ae99514, 0x76803064, 0x57f7c806, 0x12129439}, + {0xf32d6bac, 0xa0b973d4, 0xf0d81b72, 0xae951889, 0x2e2daa0a, 0x51dbe098, 0x40d9af8f, 0x04679474}, + {0x22df9f13, 0x56313de8, 0x599e7536, 0xe2e75200, 0x6d163e50, 0xa1b4fce7, 0xc8111763, 0x0aec2172}, + {0x355dd694, 0x4258374d, 0x44c76a20, 0x5c31e8ac, 0xaa5fd062, 0x9b473969, 0x1a37b6b4, 0x0a693d77}, + {0x44ddbbdc, 0xbafb92a6, 0x26b01974, 0x63c7a02d, 0x5f28a274, 0x0ff86e13, 0x867f2e29, 0x0a7b462a}, + {0xd5fba57b, 0x90684fea, 0xe0defe98, 0xed237883, 0x030ae924, 0xc502b692, 0xe7a1ec2c, 0x08aa58e8}, + {0x5e9020dd, 0xade9d4b4, 0x87db8813, 0x489259d2, 0x25051238, 0x5ddce740, 0xb5bc4d11, 0x0c775db1}, + {0x293f8481, 0xd52cc17a, 0x6f133205, 0x041178fb, 0xb2961832, 0xbbc70d18, 0x481760cd, 0x073d34d1}, + {0xfdacff58, 0x8215b91d, 0x98331645, 0xd8d9177d, 0x439e803c, 0xe85223ad, 0xcca42c1f, 0x04aa8ef0}, + {0x01ab3a4d, 0x006f60fa, 0x814ba450, 0xe6600e15, 
0xdf9eb147, 0xbde4df36, 0x33760d7b, 0x055d58fa}, + {0xec2a895e, 0x476ef4a4, 0x63e3f04a, 0x9b506ee3, 0xd1a8a12f, 0x60c69477, 0x0cb92cc1, 0x11d4b7f6}}}; static constexpr storage_array omega_inv = { - {{0xb9112c51, 0x2542c2b2, 0x6e23b3ce, 0x36ead8da, 0x76476754, 0x9a268d13, 0xa1ad7cf1, 0x121f44ad}, - {0x3b3625b6, 0x1e62401f, 0x28471e5a, 0xd0692164, 0x5cad6b77, 0xb85aa9ec, 0xaa95acf2, 0x063e4b66}, - {0x4ffa086a, 0xecc89610, 0xca06afc6, 0x4db82291, 0x8f3a6426, 0x9ae7c68c, 0x2a874432, 0x0b3dae8c}, - {0xe3b4dc56, 0xa0594a67, 0x91b698e1, 0xc8e6b582, 0x8df78057, 0x711cadbf, 0x396466f8, 0x0049abdf}, - {0x6464580f, 0x33e6c8c0, 0x3c4aa09f, 0x9d560eb3, 0xcc98f404, 0xb3f1a899, 0x8ca24b48, 0x012c1ea5}, - {0xaf858193, 0x2b955be2, 0x5fb5e378, 0xa513d8be, 0xa326aeb9, 0x88c4ebeb, 0xf3d45990, 0x00c378e2}, - {0x33bf2a1c, 0x842b0c9c, 0xa29b9236, 0x1fd43c95, 0xc06795d3, 0x6b37a603, 0x0c1b712a, 0x00017b17}, - {0x526bf9fc, 0x023031cc, 0x79c209ba, 0x0e4136c0, 0x3ec42e5c, 0xe5234df1, 0x1d455234, 0x00cb9592}, - {0xef01ed78, 0xf2828212, 0xf103c9ca, 0xa66094ac, 0x7a2d5573, 0xdceb481d, 0x8af46aab, 0x0190fcde}, - {0x89b0ca6f, 0xb4d938e2, 0x2c897570, 0x0214eb59, 0x2d4cf27a, 0x56c45327, 0x3ed546a4, 0x10a2f358}, - {0x78500f1a, 0x98310dd7, 0x735ccb27, 0x1c6050bf, 0xb2081df4, 0x07b6fa7f, 0xfa0f1e20, 0x003edf24}, - {0xa39b02a3, 0x8a3de898, 0xdc94422c, 0x068b2992, 0xf493db31, 0x1c5f019a, 0x11b0f668, 0x066b1790}, - {0xdddb58ec, 0x41f8042f, 0x10886d85, 0x7dd54384, 0x622ff4b4, 0x19544f90, 0x050cc539, 0x02f0b49a}, - {0x7998b62c, 0xbb53132b, 0x22c9b4aa, 0x064a9186, 0x71d61334, 0xd56de253, 0x04e416f6, 0x10fcf25f}, - {0xdf80223d, 0x55f432c9, 0x11a2fed9, 0x23daf2f6, 0x41ae8c34, 0x9e43e003, 0x95f22373, 0x0d51533b}, - {0x78fd3239, 0xaf29730b, 0x40c3e723, 0xbd907ac9, 0x77f214f7, 0x5dcc0aad, 0xb05fb3a1, 0x02d958da}, - {0x498fb549, 0xd5993cd5, 0x09da9272, 0x718adcee, 0x72bd5bc0, 0x9e03cbb4, 0xc592813f, 0x07206942}, - {0xe978594b, 0x4ddd3320, 0x3abe3f79, 0xe5f36fbe, 0xe4dcff8e, 0x5dba9ef2, 0x7105148f, 0x0bfc27e2}, - 
{0x3e47b53f, 0x50380ce2, 0x3a9613fc, 0x6ea3c2d3, 0x4c87ab50, 0xfe743105, 0xd192221c, 0x07871979}, - {0x49c6284a, 0x9ba6aa00, 0xeacbdc63, 0x0b8429fb, 0xedafdf37, 0x9b9c6c5b, 0xad0c78c6, 0x009907e8}, - {0x5d4e643c, 0x3da791ea, 0x85bff013, 0xb6a956ef, 0xd73de6a3, 0x86c629a8, 0x6b8c48a9, 0x0a5a5f55}, - {0x4b9ac952, 0x3d29f5ba, 0xc8ea8f94, 0x7c7f2662, 0xcefc3052, 0x736ccb63, 0x0981f3cb, 0x04bfce2f}, - {0x930cee0b, 0x432d3626, 0xf26e8ba3, 0x55ed3efb, 0x14c5457f, 0x802eebcc, 0xe2310f22, 0x00d300e3}, - {0x60cf1330, 0x840f913b, 0x1df5ed87, 0x5610cde6, 0x72b36ddf, 0x858381b0, 0x6f64e0b7, 0x109bf66c}, - {0x03ad3139, 0x01d3f431, 0xa137ce16, 0xe56f6002, 0x1deb42e8, 0x97f53369, 0xaa37cddd, 0x033fa9ac}, - {0xc161761f, 0x271d7caf, 0xc369a371, 0xf1001d6f, 0x00e60f51, 0x65286415, 0xb74d14b8, 0x00b918f9}, - {0xa26c8c12, 0xa6f4e1d1, 0xf6610f7e, 0x13571553, 0x56701caf, 0xd95e5df6, 0x2263d69d, 0x050e7b89}, - {0x1d75bec9, 0xe29ef6c0, 0xd4b0183b, 0xead287a2, 0xedfd3795, 0x75a017cf, 0x64427c8e, 0x107f8d0f}, - {0x00db2b48, 0xa43c0e02, 0x933d10ee, 0x76585489, 0xc0ba6a80, 0x12d64af1, 0x2fad8d8e, 0x01940f43}, - {0x4b1b63a9, 0x12998cbc, 0xcf420c9f, 0x0f780c6c, 0x129289ad, 0xa5e48723, 0x240a141d, 0x0a3a1223}, + {{0x00000000, 0x0a118000, 0xd0000001, 0x59aa76fe, 0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e}, + {0x00000000, 0x7af74000, 0x1fffffff, 0x8a442f99, 0xc529c400, 0x3cc739d6, 0x9a2ca556, 0x12ab655e}, + {0xd60fb046, 0xc9fa190c, 0xc5b4674e, 0xdb5c179b, 0xbc7b8726, 0x2b2bce0b, 0xbf6e69bf, 0x0e4eb338}, + {0x8ffc4ed5, 0x74732d1f, 0xb7f2eefc, 0x42d9f590, 0xa24dd4dd, 0xf70461e5, 0xef64676f, 0x03b6eba4}, + {0x102bbab0, 0x5a21f98a, 0x8d8e2efb, 0xa6a147a9, 0x7612906f, 0x0eb4f005, 0x47d8d2e3, 0x0e1a5481}, + {0xd01e5aa8, 0x6e509add, 0x6e3f123d, 0xe1582468, 0x8274db24, 0xbd6313ee, 0xd173a634, 0x05d5836e}, + {0xe975c0cf, 0x6aab3344, 0x6f1dc38e, 0xca362e0e, 0x1dd1743a, 0x2fe72cda, 0xc1b4c4c2, 0x0c1c956e}, + {0xec89a64f, 0x59fe97a0, 0xe8de5d4c, 0x579617d7, 0xc9c1ea7b, 0x256a305b, 0x53fa131b, 0x01ffae4e}, 
+ {0x29bcb088, 0x463a73ff, 0xe1438e80, 0xee9e9a5e, 0x3c9369e4, 0x2a00951f, 0x80a32052, 0x09711183}, + {0x4bec8dd2, 0xa36899db, 0x96393687, 0x2946872e, 0x842df3c8, 0xd4b5734f, 0x5f5cd8fb, 0x0834098f}, + {0xe3c711b9, 0x4bc485f6, 0x648d1d7e, 0xf43a2598, 0xee88abaa, 0x7f981a0e, 0xec6a3f27, 0x0c88c9c3}, + {0x49046b52, 0x42bcc6c2, 0x56ab9ecc, 0xcc77294a, 0xe4df3ddd, 0x02ecb41a, 0x67f76726, 0x0e567d22}, + {0x91c64fc2, 0x1cc56cc3, 0xd16a490b, 0x8cb71e65, 0x14fac366, 0x984be37e, 0xa25d7ba5, 0x0a08e032}, + {0xd4f5941e, 0x966d9739, 0xe5772a73, 0x5805deb6, 0x5c1f970c, 0xe4eb0d33, 0xbdf35409, 0x039715db}, + {0xcc6518ac, 0x8419686c, 0x9c7a2366, 0x96dec3a8, 0x71724384, 0xefbfcac6, 0xaf34c239, 0x0c44b99a}, + {0xc18ff4fd, 0xcb66fe1b, 0x86c8d586, 0x588e18b3, 0x1dfab57c, 0xc6e6d2a3, 0x7d7d4efd, 0x10918ad2}, {0x97a18f58, 0x56d6cf22, 0xd0d7abd9, 0x11710758, 0x5eb7a9c5, 0xd1a6608b, 0xc4937e38, 0x04059bdb}, - {0xc18ff4fd, 0xcb66fe1b, 0x86c8d586, 0x588e18b3, 0x1dfab57c, 0xc6e6d2a3, 0x7d7d4efd, 0x10918ad2}}}; + {0x4b1b63a9, 0x12998cbc, 0xcf420c9f, 0x0f780c6c, 0x129289ad, 0xa5e48723, 0x240a141d, 0x0a3a1223}, + {0x00db2b48, 0xa43c0e02, 0x933d10ee, 0x76585489, 0xc0ba6a80, 0x12d64af1, 0x2fad8d8e, 0x01940f43}, + {0x1d75bec9, 0xe29ef6c0, 0xd4b0183b, 0xead287a2, 0xedfd3795, 0x75a017cf, 0x64427c8e, 0x107f8d0f}, + {0xa26c8c12, 0xa6f4e1d1, 0xf6610f7e, 0x13571553, 0x56701caf, 0xd95e5df6, 0x2263d69d, 0x050e7b89}, + {0xc161761f, 0x271d7caf, 0xc369a371, 0xf1001d6f, 0x00e60f51, 0x65286415, 0xb74d14b8, 0x00b918f9}, + {0x03ad3139, 0x01d3f431, 0xa137ce16, 0xe56f6002, 0x1deb42e8, 0x97f53369, 0xaa37cddd, 0x033fa9ac}, + {0x60cf1330, 0x840f913b, 0x1df5ed87, 0x5610cde6, 0x72b36ddf, 0x858381b0, 0x6f64e0b7, 0x109bf66c}, + {0x930cee0b, 0x432d3626, 0xf26e8ba3, 0x55ed3efb, 0x14c5457f, 0x802eebcc, 0xe2310f22, 0x00d300e3}, + {0x4b9ac952, 0x3d29f5ba, 0xc8ea8f94, 0x7c7f2662, 0xcefc3052, 0x736ccb63, 0x0981f3cb, 0x04bfce2f}, + {0x5d4e643c, 0x3da791ea, 0x85bff013, 0xb6a956ef, 0xd73de6a3, 0x86c629a8, 0x6b8c48a9, 0x0a5a5f55}, 
+ {0x49c6284a, 0x9ba6aa00, 0xeacbdc63, 0x0b8429fb, 0xedafdf37, 0x9b9c6c5b, 0xad0c78c6, 0x009907e8}, + {0x3e47b53f, 0x50380ce2, 0x3a9613fc, 0x6ea3c2d3, 0x4c87ab50, 0xfe743105, 0xd192221c, 0x07871979}, + {0xe978594b, 0x4ddd3320, 0x3abe3f79, 0xe5f36fbe, 0xe4dcff8e, 0x5dba9ef2, 0x7105148f, 0x0bfc27e2}, + {0x498fb549, 0xd5993cd5, 0x09da9272, 0x718adcee, 0x72bd5bc0, 0x9e03cbb4, 0xc592813f, 0x07206942}, + {0x78fd3239, 0xaf29730b, 0x40c3e723, 0xbd907ac9, 0x77f214f7, 0x5dcc0aad, 0xb05fb3a1, 0x02d958da}, + {0xdf80223d, 0x55f432c9, 0x11a2fed9, 0x23daf2f6, 0x41ae8c34, 0x9e43e003, 0x95f22373, 0x0d51533b}, + {0x7998b62c, 0xbb53132b, 0x22c9b4aa, 0x064a9186, 0x71d61334, 0xd56de253, 0x04e416f6, 0x10fcf25f}, + {0xdddb58ec, 0x41f8042f, 0x10886d85, 0x7dd54384, 0x622ff4b4, 0x19544f90, 0x050cc539, 0x02f0b49a}, + {0xa39b02a3, 0x8a3de898, 0xdc94422c, 0x068b2992, 0xf493db31, 0x1c5f019a, 0x11b0f668, 0x066b1790}, + {0x78500f1a, 0x98310dd7, 0x735ccb27, 0x1c6050bf, 0xb2081df4, 0x07b6fa7f, 0xfa0f1e20, 0x003edf24}, + {0x89b0ca6f, 0xb4d938e2, 0x2c897570, 0x0214eb59, 0x2d4cf27a, 0x56c45327, 0x3ed546a4, 0x10a2f358}, + {0xef01ed78, 0xf2828212, 0xf103c9ca, 0xa66094ac, 0x7a2d5573, 0xdceb481d, 0x8af46aab, 0x0190fcde}, + {0x526bf9fc, 0x023031cc, 0x79c209ba, 0x0e4136c0, 0x3ec42e5c, 0xe5234df1, 0x1d455234, 0x00cb9592}, + {0x33bf2a1c, 0x842b0c9c, 0xa29b9236, 0x1fd43c95, 0xc06795d3, 0x6b37a603, 0x0c1b712a, 0x00017b17}, + {0xaf858193, 0x2b955be2, 0x5fb5e378, 0xa513d8be, 0xa326aeb9, 0x88c4ebeb, 0xf3d45990, 0x00c378e2}, + {0x6464580f, 0x33e6c8c0, 0x3c4aa09f, 0x9d560eb3, 0xcc98f404, 0xb3f1a899, 0x8ca24b48, 0x012c1ea5}, + {0xe3b4dc56, 0xa0594a67, 0x91b698e1, 0xc8e6b582, 0x8df78057, 0x711cadbf, 0x396466f8, 0x0049abdf}, + {0x4ffa086a, 0xecc89610, 0xca06afc6, 0x4db82291, 0x8f3a6426, 0x9ae7c68c, 0x2a874432, 0x0b3dae8c}, + {0x3b3625b6, 0x1e62401f, 0x28471e5a, 0xd0692164, 0x5cad6b77, 0xb85aa9ec, 0xaa95acf2, 0x063e4b66}, + {0xb9112c51, 0x2542c2b2, 0x6e23b3ce, 0x36ead8da, 0x76476754, 0x9a268d13, 0xa1ad7cf1, 
0x121f44ad}}}; static constexpr storage_array inv = { {{0x00000001, 0x8508c000, 0x68000000, 0xacd53b7f, 0x2e1bd800, 0x305a268f, 0x4d1652ab, 0x0955b2af}, @@ -140,12 +173,29 @@ namespace bls12_377 { {0xaf740001, 0x8a117ff7, 0x02ac480a, 0x77ecf6f4, 0x5695470e, 0x8f4f226b, 0x04d17a61, 0x12ab655e}, {0xd7ba0001, 0xca117ffb, 0x69562405, 0xe8cbb6f9, 0xd9667b87, 0xf801b7c4, 0x4f7f0fdb, 0x12ab655e}, {0xebdd0001, 0x6a117ffd, 0x1cab1203, 0xa13b16fc, 0x9acf15c4, 0x2c5b0271, 0x74d5da99, 0x12ab655e}, - {0xf5ee8001, 0x3a117ffe, 0x76558902, 0xfd72c6fd, 0xfb8362e2, 0xc687a7c7, 0x87813ff7, 0x12ab655e}}}; + {0xf5ee8001, 0x3a117ffe, 0x76558902, 0xfd72c6fd, 0xfb8362e2, 0xc687a7c7, 0x87813ff7, 0x12ab655e}, + {0x7af74001, 0xa2117fff, 0x232ac481, 0x2b8e9efe, 0x2bdd8972, 0x139dfa73, 0x90d6f2a7, 0x12ab655e}, + {0xbd7ba001, 0x56117fff, 0x79956241, 0xc29c8afe, 0xc40a9cb9, 0xba2923c8, 0x9581cbfe, 0x12ab655e}, + {0xdebdd001, 0x30117fff, 0xa4cab121, 0x8e2380fe, 0x9021265d, 0x8d6eb873, 0x97d738aa, 0x12ab655e}, + {0xef5ee801, 0x1d117fff, 0xba655891, 0x73e6fbfe, 0xf62c6b2f, 0x771182c8, 0x9901ef00, 0x12ab655e}, + {0xf7af7401, 0x13917fff, 0xc532ac49, 0x66c8b97e, 0xa9320d98, 0x6be2e7f3, 0x99974a2b, 0x12ab655e}, + {0xfbd7ba01, 0x0ed17fff, 0xca995625, 0xe039983e, 0x02b4decc, 0xe64b9a89, 0x99e1f7c0, 0x12ab655e}, + {0xfdebdd01, 0x0c717fff, 0xcd4cab13, 0x1cf2079e, 0xaf764767, 0xa37ff3d3, 0x9a074e8b, 0x12ab655e}, + {0xfef5ee81, 0x0b417fff, 0xcea6558a, 0x3b4e3f4e, 0x05d6fbb4, 0x021a2079, 0x9a19f9f1, 0x12ab655e}, + {0xff7af741, 0x8aa97fff, 0xcf532ac5, 0xca7c5b26, 0xb10755da, 0xb16736cb, 0x9a234fa3, 0x12ab655e}, + {0xffbd7ba1, 0x4a5d7fff, 0xcfa99563, 0x12136912, 0x069f82ee, 0x090dc1f5, 0x9a27fa7d, 0x12ab655e}, + {0xffdebdd1, 0x2a377fff, 0xcfd4cab2, 0xb5def008, 0xb16b9977, 0xb4e10789, 0x9a2a4fe9, 0x12ab655e}, + {0xffef5ee9, 0x9a247fff, 0xcfea6559, 0x87c4b383, 0x06d1a4bc, 0x0acaaa54, 0x9a2b7aa0, 0x12ab655e}, + {0xfff7af75, 0x521affff, 0x4ff532ad, 0xf0b79541, 0x3184aa5e, 0x35bf7bb9, 0x9a2c0ffb, 0x12ab655e}, + 
{0xfffbd7bb, 0x2e163fff, 0x0ffa9957, 0x25310620, 0xc6de2d30, 0xcb39e46b, 0x9a2c5aa8, 0x12ab655e}, + {0xfffdebde, 0x1c13dfff, 0x6ffd4cac, 0xbf6dbe8f, 0x118aee98, 0x95f718c5, 0x9a2c7fff, 0x12ab655e}}}; }; struct fq_config { static constexpr unsigned limbs_count = 12; + static constexpr unsigned omegas_count = 48; static constexpr unsigned modulus_bit_count = 377; + static constexpr unsigned num_of_reductions = 1; static constexpr storage modulus = {0x00000001, 0x8508c000, 0x30000000, 0x170b5d44, 0xba094800, 0x1ef3622f, 0x00f5138f, 0x1a22d9f3, 0x6ca1493b, 0xc63b05c0, 0x17c510ea, 0x01ae3a46}; @@ -155,6 +205,9 @@ namespace bls12_377 { static constexpr storage modulus_4 = {0x00000004, 0x14230000, 0xc0000002, 0x5c2d7510, 0xe8252000, 0x7bcd88be, 0x03d44e3c, 0x688b67cc, 0xb28524ec, 0x18ec1701, 0x5f1443ab, 0x06b8e918}; + static constexpr storage neg_modulus = {0xffffffff, 0x7af73fff, 0xcfffffff, 0xe8f4a2bb, + 0x45f6b7ff, 0xe10c9dd0, 0xff0aec70, 0xe5dd260c, + 0x935eb6c4, 0x39c4fa3f, 0xe83aef15, 0xfe51c5b9}; static constexpr storage<2 * limbs_count> modulus_wide = { 0x00000001, 0x8508c000, 0x30000000, 0x170b5d44, 0xba094800, 0x1ef3622f, 0x00f5138f, 0x1a22d9f3, 0x6ca1493b, 0xc63b05c0, 0x17c510ea, 0x01ae3a46, 0x00000000, 0x00000000, 0x00000000, 0x00000000, @@ -179,37 +232,321 @@ namespace bls12_377 { static constexpr storage zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; - static constexpr storage montgomery_r = {0xffffff, 0xf73fffff, 0xffffff7a, 0xf4a2bbcf, - 0xf6b7ffe8, 0x0c9dd045, 0x0aec70e1, 0xdd260cff, - 0x5eb6c4e5, 0xc4fa3f93, 0x3aef1539, 0x51c5b9e8}; - static constexpr storage montgomery_r_inv = {0x934f3a1, 0xb0909a28, 0xc1cfac62, 0x3264aa55, - 0x2a491ae8, 0xaccd49ca, 0xe80e9a61, 0x28b2dce9, - 0x26f7c08a, 0x4d313ea1, 0x36254563, 0x161de1ee}; + static constexpr storage montgomery_r = {0xffffff68, 0x02cdffff, 0x7fffffb1, 0x51409f83, + 0x8a7d3ff2, 0x9f7db3a9, 0x6e7c6305, 
0x7b4e97b7, + 0x803c84e8, 0x4cf495bf, 0xe2fdf49a, 0x008d6661}; + static constexpr storage montgomery_r_inv = {0x451269e8, 0xef129093, 0xe65839f5, 0x6e20bbcd, + 0xa5582c93, 0x852e3c88, 0xf7f2e657, 0xeeaaf41d, + 0xa4c49351, 0xeb89746c, 0x436b0736, 0x014212fc}; + + static constexpr storage_array omega = { + {{0x00000000, 0x8508c000, 0x30000000, 0x170b5d44, 0xba094800, 0x1ef3622f, 0x00f5138f, 0x1a22d9f3, 0x6ca1493b, + 0xc63b05c0, 0x17c510ea, 0x01ae3a46}, + {0xf1391c63, 0x6e76d5ec, 0xbff27d8e, 0x99588459, 0x436b0f62, 0xbce649cf, 0x0ad1dec1, 0x400398f5, 0x1a79beb1, + 0xc0c534db, 0x796537ca, 0x01680a40}, + {0x554c85ba, 0x6cbff0e3, 0x0be8ff9d, 0xc07c7a91, 0x9dde4fa2, 0xc3c79f67, 0xb5726bde, 0x44bc6d1a, 0x76d6d607, + 0xad812919, 0x95e8fd0e, 0x001bc0c2}, + {0x6d5db237, 0xb8c206b0, 0xcabde6ba, 0x08fed85d, 0xcd92eb6f, 0xf2f54ffc, 0xe39c1788, 0xee81121f, 0x88e82edb, + 0x852def4d, 0xb95fdb80, 0x00bf1268}, + {0x192bf14f, 0x3663c26a, 0xe6351854, 0x99c859be, 0x159361b8, 0xf9430828, 0xfbe33d7d, 0x478ed715, 0xdb79c984, + 0x41e220cf, 0xd961f2be, 0x00cedb38}, + {0xcc724685, 0xb99caa69, 0x1388a46d, 0xc24087ba, 0x08f03491, 0xeb13a05a, 0x98fb0ff7, 0x558ab21e, 0x86bbd802, + 0x0166d08d, 0xf5b5728a, 0x00d1dec9}, + {0x92db32a2, 0x2e3951fe, 0x6014b201, 0x8f5a16c9, 0xa91fbb38, 0xa9e942b9, 0x17b4dbd2, 0xf7bf5b43, 0x81325c7d, + 0x57f3934a, 0x615ad019, 0x012be78e}, + {0xdce33f04, 0xb42b84a2, 0x0db0b91c, 0x7a0c1423, 0x88d9f8c8, 0xaed11a0c, 0xd484c501, 0x712d6bc0, 0xfa3f7633, + 0x50aca1e5, 0xb90f34d0, 0x01002f29}, + {0xf012f6a0, 0xbc3db054, 0x0d332ea7, 0x00d66897, 0xfd416167, 0x8278ef44, 0x20268e84, 0x1a1a3c4d, 0x4b11d215, + 0x7c976aa6, 0x63b6e925, 0x00949581}, + {0x339637c6, 0x9d73cf29, 0xa5642677, 0x8257d1a2, 0xcafd597c, 0xcb48f07f, 0x081435a3, 0x7a505010, 0xacbb9c39, + 0xaaa45ce1, 0x7431b9c8, 0x013f2b13}, + {0xd4710c0b, 0x9ef8bddb, 0x85047671, 0xb4c73188, 0x134695ba, 0x87a51d65, 0x022416dd, 0x67f3bc43, 0xcb2a157b, + 0x21d965b2, 0x5ce4195d, 0x013a57e4}, + {0xd2461368, 0xf2db3a9f, 0x3802aef2, 
0x0595c232, 0x5ea85bd6, 0xa53d621a, 0xa34ee943, 0xce930fbc, 0x6b372bee, + 0x1d216665, 0xa4535740, 0x009f0159}, + {0x656bf68d, 0x73cf953a, 0xeac5c1d7, 0x50a5a5b5, 0xaa5355a9, 0x2697b2e1, 0x08de37d2, 0x6be70306, 0x44c5afab, + 0x907f6976, 0xd4ec46b1, 0x0155cfa2}, + {0x090e3e20, 0x034160c4, 0xf77a6fbb, 0xbc73cc59, 0x188e54f6, 0x437cd23b, 0x17e42614, 0x5a788edd, 0xebdc8eae, + 0xf1ad4f54, 0x2f129bcd, 0x005d1440}, + {0x4e269ee5, 0x5626c031, 0x0d1501ec, 0x5f97673e, 0x86d31c18, 0x4fe089bd, 0x62d1259a, 0x3e9fffcb, 0x1ff89d01, + 0xe1898f32, 0x59d01a38, 0x00fa1331}, + {0x38d427b1, 0xda80661b, 0xa814f14b, 0x1913027d, 0xcda4061d, 0xd3f61e24, 0x5da8fcb2, 0x9509e69d, 0x1f05e6d3, + 0x0e7493a5, 0xa5c6bd06, 0x00dcb8db}, + {0x61cff9ed, 0x88499d0a, 0x53718444, 0x0b317da2, 0x4b7eec5f, 0xc1624bfd, 0x5af10e6f, 0x6ffc3241, 0xd6c66ff2, + 0x27d0edf3, 0x73ab0f4a, 0x013019b5}, + {0x06027b24, 0x42dc7673, 0x3341b9e7, 0x018f8bbd, 0xa435f7e2, 0xd3b389d9, 0xea031176, 0x279739a5, 0x74c35801, + 0x3555ca51, 0x049dcf87, 0x00748c30}, + {0x81fe14de, 0x731b16f0, 0x333cc61a, 0x528d6ada, 0x5736dc15, 0x7ae87278, 0xc8bfd40c, 0xa94b9fd2, 0x299b0487, + 0x714dd8ed, 0xf1a53233, 0x00642b62}, + {0x5bc45170, 0x31270ddf, 0x7f72c758, 0x7efb6b06, 0xcf4973a8, 0x2eb9f2aa, 0xe556d234, 0xdcb534c9, 0x0e043fef, + 0xf0b1a210, 0x54dda04e, 0x00e79c44}, + {0x2d5f1bc2, 0x213b3f52, 0xfd933428, 0x9e115ba7, 0x434c9e2a, 0x7f77d57e, 0xcdb944ef, 0x47a78418, 0x699aa559, + 0x8cb01cbb, 0xb064c4d7, 0x0075bf81}, + {0x3fbfc66c, 0x0b6c2e65, 0x6fcab2f8, 0x7bece031, 0xb79dcd4d, 0x2ba7e325, 0xa5c6881b, 0x8c18f66a, 0x7283805a, + 0x4d893e5a, 0xfc296bfe, 0x0107d3c5}, + {0x948c881a, 0x53fbdbb4, 0x16803d18, 0xf27a9c14, 0xeddfafef, 0x8490f6c5, 0x3e57fa15, 0xfe068e1d, 0xd26b296b, + 0xbe923119, 0x9fa377a1, 0x00d56016}, + {0x6f5b2ad1, 0xb3bbaeb3, 0x11886a1c, 0x0efd4ba9, 0xdedb7083, 0x5911498f, 0x5bd0a90f, 0x0921fe19, 0x83d379cb, + 0x38e05d4e, 0xb7ba3c73, 0x006b39e2}, + {0xa55550ba, 0x61b560e4, 0xe7288461, 0xd9ac545b, 0xc6e3e282, 0xde8d2826, 0x7e49dd2c, 
0x9e87a310, 0xc43080b7, + 0xf2edfc44, 0x95b7d300, 0x012b4875}, + {0x27591e60, 0x4048ddc3, 0xc5d21791, 0xb77c9738, 0x49826bea, 0xf2f82033, 0x42f97e95, 0xf60bb703, 0x5966139d, + 0xef8f6f16, 0xc0e95e39, 0x00327618}, + {0x441e395f, 0xf9059c8f, 0xbd087238, 0x29eab35f, 0x7dee5ff1, 0x5d4abeff, 0x771e60e9, 0x7222499b, 0x7ac324a2, + 0xb70c1ea3, 0x0da51ce8, 0x015b3af9}, + {0xe9a70026, 0xf7aa576b, 0x01c4a126, 0xb28733ef, 0xa3307647, 0x06b8e768, 0xe12588ce, 0x115500e1, 0x6c9f9b1d, + 0x7e8dd6b9, 0x6ec020b3, 0x014d091e}, + {0x8e5bbc8d, 0xd318265d, 0x141bee9b, 0x70b460ba, 0x1aa9df5b, 0x145dd6a6, 0xe3478cb3, 0xd9da2548, 0x7b509387, + 0x47250509, 0xe967973c, 0x00de53d3}, + {0xd2aa57b8, 0x5ff4399c, 0xa6ae9b07, 0x90360194, 0x6cfcdb7a, 0x68979991, 0x64e56abb, 0xf517467c, 0xad7a6573, + 0x44227491, 0xa35ebf55, 0x0001da0b}, + {0x4d80f6da, 0xd8b22d5a, 0x10ee1a06, 0x6e7b2bfb, 0x17faeac0, 0xac8d97e5, 0x7a12c923, 0x8b75540b, 0x5b42ce02, + 0xa2787368, 0xe98d9998, 0x008d30a5}, + {0x9dc292bb, 0xee29c02a, 0xc5b7e1c9, 0x9e7ea016, 0x9a908e5f, 0x62daf95d, 0x3e98eae9, 0x80a71c61, 0xfdda3bba, + 0x2d514723, 0x068ef829, 0x00f65844}, + {0x185b1ad6, 0xf62fdfa4, 0xf90ccbe6, 0x2ae7f104, 0x972ce78e, 0xfa435fb6, 0x45e59f91, 0x53a75d3c, 0x2f320b7a, + 0x7290cac2, 0xe7cb5108, 0x01a2022a}, + {0xd59dda24, 0xcf0a15be, 0xf2ec72b4, 0xbc77f6d4, 0x96c31202, 0xa8df0caf, 0xbb4f8842, 0xb95429c0, 0xd0087306, + 0xb989b210, 0x5571e9f0, 0x002b1694}, + {0x67ae536e, 0x7e84d4b5, 0xc8fb9b80, 0x3a920871, 0x1948ee86, 0x1a82df2b, 0xb3c66ed3, 0xdef79467, 0xef64d05a, + 0x58fd84f2, 0xd999f400, 0x00c6d5b7}, + {0x81ee0d53, 0x7639f9a2, 0xb5747565, 0x8ade807d, 0xe6235609, 0xfd9d6266, 0x53730f18, 0xea1948a3, 0xd890142e, + 0xa356108a, 0xe3e8a723, 0x00a48ac6}, + {0xd0ca5e04, 0x531c4b83, 0x2ba0a328, 0xff35ced6, 0xa4e563aa, 0x01613079, 0x1442dcd1, 0x6f52b3a3, 0x9e19b0a6, + 0x813b4616, 0x9536db26, 0x004828c5}, + {0x0bce1b4e, 0x8a9321a9, 0xae85d6ff, 0xb9759dbe, 0x5cb206e0, 0x1ce1d522, 0x35a1607a, 0x87df044f, 0x94e1329a, + 0x2ebabee7, 0x73586cc9, 
0x01a73170}, + {0x3dd667f3, 0x69824754, 0x28fd63a2, 0x61a081a7, 0x99499385, 0x0b9f6d2e, 0x5c253e16, 0x6d45622b, 0x765a7f5f, + 0xcd672e4d, 0x7150d847, 0x01182798}, + {0x2742d2f6, 0x0af0bfd2, 0x3a02631d, 0x93616956, 0xac8a2203, 0x32dae751, 0x85cf4e2d, 0xea4ffbe7, 0x7dba6eb9, + 0x673424f4, 0x61f4060d, 0x002ec230}, + {0x5a5b5c2b, 0x226293ca, 0x0684dbc9, 0xbc0ca23e, 0x7d637c4f, 0x4510cf3a, 0x9b2f4a52, 0x7869c488, 0x2fd73a53, + 0xec009b90, 0xa8c99cca, 0x003499d6}, + {0xfd745afc, 0x9da60b0a, 0x41c5362e, 0xff0769ec, 0xfa9fd8ee, 0x487621e9, 0xab04558f, 0x138910d1, 0xc1ed03ce, + 0x870903cf, 0xed3ffb51, 0x002c1cfa}, + {0x42870c46, 0x271b1ff3, 0x13b4b491, 0x1e0a9cd1, 0x3c55c65e, 0x2d58cb1a, 0x74756f6e, 0xa6e12c32, 0x2e313bc4, + 0xf774a43d, 0xcc386ffc, 0x00ca156d}, + {0x4a67741c, 0x588f79b6, 0xc3590b63, 0xc0ae78b5, 0xc3576385, 0xad0bb97d, 0xb8473137, 0x0583dd49, 0x515d8604, + 0xb31d9631, 0xd3ba3b12, 0x015337bc}, + {0x8a458e8c, 0x976a14f5, 0xc3a26ae8, 0xc90809b4, 0x089acf15, 0x270a1575, 0x5013d4b1, 0x614a0d25, 0x6d09901e, + 0x1314e076, 0xf208945e, 0x0022f414}, + {0xc563b9a1, 0x7eca603c, 0x06fe0bc3, 0x06df0a43, 0x0ddff8c6, 0xb44d994a, 0x4512a3d4, 0x40fbe05b, 0x8aeffc9b, + 0x30f15248, 0x05198a80, 0x0036a92e}}}; + + static constexpr storage_array omega_inv = { + {{0x00000000, 0x8508c000, 0x30000000, 0x170b5d44, 0xba094800, 0x1ef3622f, 0x00f5138f, 0x1a22d9f3, 0x6ca1493b, + 0xc63b05c0, 0x17c510ea, 0x01ae3a46}, + {0x0ec6e39e, 0x1691ea13, 0x700d8272, 0x7db2d8ea, 0x769e389d, 0x620d1860, 0xf62334cd, 0xda1f40fd, 0x52278a89, + 0x0575d0e5, 0x9e5fd920, 0x00463005}, + {0x93997f11, 0x9403412c, 0xdfb2323f, 0x845557b3, 0x2d50c7fc, 0x66f2eaaa, 0xc103f92f, 0x992358fb, 0x5d7a3179, + 0x01d60217, 0xd2af5da0, 0x0077b354}, + {0xc1000ea4, 0x7ac2ca7a, 0x7f8d9495, 0x937db751, 0x0de62931, 0x401b3873, 0x980129ba, 0x59be7529, 0xa545a303, + 0x2ba8f85d, 0xb6705512, 0x00573e3a}, + {0x2c1b22e6, 0xb55712f9, 0x0f91cddd, 0x66cfc0f3, 0x8bb345d8, 0x8d5fcd42, 0x86c0abc3, 0x61e4cf98, 0x432fe8f3, + 0x93556354, 
0xad005fb6, 0x00ff87d5}, + {0x7aba560e, 0x05065a97, 0x7918b9db, 0x333ff005, 0xdf6be708, 0x03938ae1, 0x7410a77b, 0x922d3376, 0x03a15063, + 0xa5aeaa56, 0x4aea89e5, 0x01542cb6}, + {0xe4d6a772, 0x61a6a2d6, 0x6e6239a7, 0xc18c9ef7, 0x04cac70f, 0x8772bb3f, 0x16c5916b, 0x8bbb4185, 0x46335dc0, + 0x4aa656e2, 0x842c1664, 0x008187ac}, + {0xdd4e93c5, 0xa002ea0a, 0x07458704, 0xb40a45e8, 0xbaa65f2a, 0xee9ee3ea, 0x8f3b8a87, 0xeffa4f9e, 0x95b5feba, + 0xb6e03897, 0x81751c63, 0x003c41de}, + {0x13043a4a, 0x50221a3b, 0xda73331a, 0x6537fca8, 0x8e85077c, 0x8b74cef4, 0x0e5bbe67, 0x65705341, 0xefa22d23, + 0xf0f56caa, 0xd1865d98, 0x001f8eb5}, + {0x3e26a605, 0xd9af8944, 0x6970166f, 0xad0efb6e, 0x2c7464ec, 0xc16d7972, 0xf788281b, 0xe0de4b04, 0xaa878b0e, + 0x0c049e55, 0x63e2e7cd, 0x0135383a}, + {0x6f6893f7, 0x6b12c42e, 0x44bbbf63, 0x831f38c0, 0x191be6c9, 0xa57797d4, 0x447475cb, 0x6af7f695, 0x4b8be189, + 0x3295e9e7, 0x350d0aad, 0x00a9a32b}, + {0x7656ef1d, 0xc2243f86, 0xf4211219, 0x3e4c3bc3, 0x3c9a3d21, 0xaa4db6e0, 0xe8a4c946, 0x29ac638a, 0xa4cf856e, + 0x21449f8b, 0x7d4c9c67, 0x018cf097}, + {0x6a8e0139, 0x18e472a2, 0xd6b1c835, 0xcc7c80fd, 0x6546fc0a, 0x1f760883, 0x4ea3417c, 0x5bcfc1fb, 0xe9acb8b0, + 0x52c9a29b, 0xd9f265a2, 0x01a6d8b2}, + {0xebb83ac0, 0x95eb1dc8, 0x9f390cf2, 0x1e8d70f5, 0xb0d85145, 0xf9e4955d, 0x89720ee1, 0xe9690d30, 0x50fc879f, + 0x629972a5, 0x69ccd670, 0x00456e23}, + {0x83f38be4, 0xfbfb11a1, 0x388e6726, 0xb90a19b9, 0xc860d62c, 0x3fc10bc7, 0xc3c4e575, 0xc9fe043e, 0x7396d780, + 0x67aeff74, 0x01cadaee, 0x019059fa}, + {0xfd581be8, 0x43506d6e, 0x018b1b76, 0xf09563e6, 0xe87f9d80, 0x5cd193b2, 0x0a933402, 0x18ba3260, 0x50524c77, + 0x4de839d9, 0xd90315ce, 0x0018c2ed}, + {0xa737701d, 0xf900eb81, 0x995e6672, 0x6874c90e, 0xa495900b, 0x69ade94a, 0xd07bd4b1, 0xd5f358e7, 0x6f88e8e4, + 0xbd437e9d, 0x1d6b88cf, 0x0130d706}, + {0xfc29b95f, 0x064629bd, 0xb546585c, 0x0a897bff, 0x54a80d9a, 0x856c8d4f, 0x944568ff, 0x85410cc4, 0x59fc4370, + 0xc1978c65, 0xc668dc52, 0x017c86c8}, + {0xf6109131, 
0x65cecd55, 0x7d2f52e5, 0x6d7e892e, 0xb90b2403, 0xe9a09007, 0xae0a060d, 0x92ca9aac, 0xa22b1e96, + 0x5ce1cc4f, 0x45201e6f, 0x012eb33c}, + {0x20d1aac5, 0x9d2cb4cf, 0xded22997, 0x3e4a1e77, 0x07fae2e2, 0x09d692f7, 0xd49bdcbe, 0x6a6aa4f8, 0x09c01cab, + 0xa8e21ead, 0x6b03b72e, 0x01a19e81}, + {0x935650ca, 0xf3d94623, 0x2ffd937e, 0x4a688a46, 0xa622b139, 0xf55fd53a, 0x7a1a1e40, 0x227406aa, 0x9a3fea60, + 0x40dd4504, 0x1edbb584, 0x00fc2332}, + {0xf28db3fc, 0x9707402f, 0xc28593f1, 0x3d898bd7, 0xb30effcd, 0xcaee2dfd, 0x4fb6ec9d, 0xff1b0790, 0x09ed1120, + 0x9cb0597e, 0xb78d15e9, 0x005c73a5}, + {0xb0a8a3b9, 0x739a4c2e, 0xc57196ae, 0x083bde21, 0xba602f29, 0x247eb070, 0x1c2c7132, 0x4ba1dd6a, 0xe2187c6c, + 0x4ce59fb6, 0x606880b1, 0x0014a7b5}, + {0x484baf56, 0xdd0eccab, 0x4541b101, 0xe6c80eaf, 0xf7964f64, 0x35b8a558, 0xc50ccf94, 0xb3b824d4, 0x21c71aeb, + 0xe1f6b4c8, 0x23031df0, 0x01a8a647}, + {0x592a9620, 0x5338dc01, 0xd94a401b, 0xb217f96d, 0xf830b00e, 0xfefb6601, 0xafd3dee4, 0x1ec061b5, 0x05a199bd, + 0x0d5d4d3c, 0xc8489913, 0x0196c768}, + {0x1f980ca0, 0x4acb430e, 0x71c6821c, 0x8973a3cc, 0xb3e9aa75, 0x74414c20, 0x0c13f042, 0x79212a5f, 0x375c705b, + 0x5c44d226, 0x29439af2, 0x000a2fdd}, + {0xa387b60c, 0xf01901e6, 0x4561ff3d, 0xa7b1b7dc, 0x0558e085, 0x5d82d374, 0xf2bc1d29, 0x519298e5, 0x3d332207, + 0x0ad719a8, 0xea19a807, 0x0150a138}, + {0x9deb8e06, 0x7c6b3eb1, 0x28206b6c, 0x3a8f53c4, 0x7fed1065, 0x039f575f, 0x40c1f898, 0x31be74ba, 0x790ac003, + 0x76db938e, 0x5508c5e4, 0x0096d5e1}, + {0xb83f8358, 0x3e940e0e, 0x372a4b8b, 0x204d80e0, 0xa820b2ec, 0x956454b2, 0x2cc8078c, 0x8e2cb3d4, 0xc6f81363, + 0xdd0d3e12, 0x49041a64, 0x0052f327}, + {0x2aec0be2, 0x37ca2eb7, 0x555cc652, 0x05093570, 0xd2588d31, 0xe62f1adb, 0x798be240, 0x2fd2518e, 0x0ff6b579, + 0x9302d4e3, 0x6ee95e5d, 0x0025ca57}, + {0x233eed68, 0xcc664858, 0xece3a327, 0x600ca1ac, 0x93a2e34f, 0x330d1102, 0xdb5e3bb4, 0xc84ab55f, 0xe4d5576e, + 0x5179c101, 0x0938f714, 0x00efb20e}, + {0xfdddaf5c, 0x907f96e7, 0x1ffe49da, 0x348dab77, 0xc14ab779, 
0x3eca44ad, 0x4cdc5d98, 0xe9b10b2e, 0xa95c5a36, + 0x65a25d16, 0x6e616518, 0x00c9f759}, + {0x7a5aff62, 0x9497d331, 0xb57cd01d, 0x21896195, 0x6c7ba745, 0xe09e22f7, 0x5a7acff0, 0xcc9f1064, 0xc93c46b0, + 0x7b867cdf, 0x23eba5ae, 0x01a05dcb}, + {0x4dcc71f4, 0xa56a8e33, 0xcbebdba2, 0xc480b083, 0x36ea43af, 0x748448fa, 0xe7859f3c, 0xee9b4b0e, 0x5af41919, + 0x9ab2bb09, 0x65caa0ea, 0x0127262d}, + {0x352a05cc, 0x77c7d12f, 0xdc7160c9, 0xb91ca5be, 0x5a3feda0, 0x245106da, 0x7669f7cd, 0xfd45012d, 0xdc5489fa, + 0xc4774629, 0x2872daa0, 0x00241273}, + {0x0d3e0b0b, 0x1838ae6f, 0xff67fc2c, 0x7fcc9b21, 0x23956100, 0xaedca59e, 0x1e79aa4b, 0x572ed634, 0xc7f0673c, + 0xaeeda160, 0xc8047256, 0x00360e2c}, + {0xe05044f9, 0xec5e4514, 0x7ec9b4ef, 0xe915b7e7, 0x9c4bec48, 0x9fb78cd8, 0xa38d95a3, 0xd7b84113, 0xb86fd119, + 0x7be64440, 0xe4f9e70a, 0x009e3a60}, + {0xc7435591, 0xc61cc546, 0xe5e94dc4, 0xea99a96f, 0xdb8ff17d, 0x5b10e2b4, 0x3dd0ff10, 0x13f8fb9d, 0xe118b9e9, + 0xcbb1c0ce, 0x7ebf8a0d, 0x00b37258}, + {0xce5943e7, 0xd44fdb9d, 0x79fa927a, 0xcb7d41ea, 0xdcee72ca, 0x9a4bcebf, 0x11634905, 0x2317799d, 0x584055ac, + 0x3f1c302e, 0xdc2d0017, 0x013ef021}, + {0xa78a1578, 0x345cb052, 0x5961b8fe, 0x1ed4d48a, 0x74a5e2af, 0x5858e93c, 0x0fd17e9f, 0xaf643f0a, 0x79d94009, + 0x61530753, 0xde7b2f53, 0x010a3393}, + {0x813925df, 0x548b1d28, 0xca3e79b6, 0xabab3a4e, 0x7e51071a, 0xb3c9c068, 0x6c5fcedb, 0x8014e879, 0x95d9facc, + 0x3ba5db77, 0x7f5c3d2f, 0x0105c419}, + {0x26bc1104, 0xbb9cbd28, 0xe03cc852, 0x27f09abb, 0x22e5be61, 0x02763b4a, 0xb94fa254, 0xa3940542, 0xff34c35f, + 0xcf058850, 0x1482533c, 0x019f538f}, + {0xb3f42de9, 0xf2126047, 0xbeb0a1b8, 0xdb0451c4, 0x9aabc291, 0x1a945bc0, 0x7fe3a6f2, 0x13d08312, 0x390e1c07, + 0xd8fb13f1, 0x6b30562b, 0x005a41c4}, + {0xe8b3d5dd, 0x1c60fcc5, 0x75b3a464, 0x5d7babba, 0xf3989910, 0x0d9f52c7, 0x9beec571, 0x464a2840, 0x79689d4b, + 0x139c496f, 0x099e64c4, 0x0022c6a3}, + {0x023e0cd1, 0x9df6c2d5, 0xa6b747de, 0x8e23def9, 0x90da6876, 0x7bc83eee, 0xc88bb007, 0xdaeac352, 0x68bb6a7f, + 
0x45cabb6f, 0x94697b34, 0x001e7154}, + {0x0203d905, 0xffcee91d, 0xc99df56d, 0xd878ee01, 0x210d754c, 0xa0e882f9, 0x7d0aec6a, 0x26c96db8, 0x8ff7afe4, + 0x46e2e145, 0x54749283, 0x015cd1b0}}}; + + static constexpr storage_array inv = { + {{0x00000001, 0x42846000, 0x18000000, 0x0b85aea2, 0xdd04a400, 0x8f79b117, 0x807a89c7, 0x8d116cf9, 0x3650a49d, + 0x631d82e0, 0x0be28875, 0x00d71d23}, + {0x00000001, 0x63c69000, 0x24000000, 0x114885f3, 0xcb86f600, 0x573689a3, 0x40b7ceab, 0x539a2376, 0x5178f6ec, + 0x14ac4450, 0x91d3ccb0, 0x0142abb4}, + {0x00000001, 0x7467a800, 0xaa000000, 0x1429f19b, 0xc2c81f00, 0x3b14f5e9, 0xa0d6711d, 0xb6de7eb4, 0x5f0d2013, + 0x6d73a508, 0x54cc6ecd, 0x017872fd}, + {0x00000001, 0x7cb83400, 0xed000000, 0x159aa76f, 0xbe68b380, 0x2d042c0c, 0xd0e5c256, 0x6880ac53, 0x65d734a7, + 0x19d75564, 0xb648bfdc, 0x019356a1}, + {0x00000001, 0x80e07a00, 0x0e800000, 0x1653025a, 0x3c38fdc0, 0xa5fbc71e, 0x68ed6af2, 0x4151c323, 0x693c3ef1, + 0x70092d92, 0xe706e863, 0x01a0c873}, + {0x00000001, 0x82f49d00, 0x1f400000, 0x16af2fcf, 0xfb2122e0, 0xe27794a6, 0x34f13f40, 0x2dba4e8b, 0x6aeec416, + 0x1b2219a9, 0xff65fca7, 0x01a7815c}, + {0x00000001, 0x83feae80, 0xa7a00000, 0x16dd4689, 0x5a953570, 0x00b57b6b, 0x1af32968, 0xa3ee943f, 0xebc806a8, + 0xf0ae8fb4, 0x8b9586c8, 0x01aaddd1}, + {0x00000001, 0x8483b740, 0xebd00000, 0x16f451e6, 0x8a4f3eb8, 0x8fd46ecd, 0x0df41e7b, 0xdf08b719, 0xac34a7f1, + 0xdb74caba, 0xd1ad4bd9, 0x01ac8c0b}, + {0x00000001, 0x84c63ba0, 0x8de80000, 0x16ffd795, 0xa22c435c, 0x5763e87e, 0x07749905, 0x7c95c886, 0x8c6af896, + 0x50d7e83d, 0xf4b92e62, 0x01ad6328}, + {0x00000001, 0x84e77dd0, 0xdef40000, 0x17059a6c, 0x2e1ac5ae, 0x3b2ba557, 0x8434d64a, 0xcb5c513c, 0xfc8620e8, + 0x8b8976fe, 0x863f1fa6, 0x01adceb7}, + {0x00000001, 0x84f81ee8, 0x877a0000, 0x17087bd8, 0x741206d7, 0xad0f83c3, 0xc294f4ec, 0xf2bf9597, 0xb493b511, + 0xa8e23e5f, 0xcf021848, 0x01ae047e}, + {0x00000001, 0x85006f74, 0x5bbd0000, 0x9709ec8e, 0x970da76b, 0xe60172f9, 0x61c5043d, 0x867137c5, 0x109a7f26, + 
0xb78ea210, 0x73639499, 0x01ae1f62}, + {0x00000001, 0x850497ba, 0x45de8000, 0xd70aa4e9, 0xa88b77b5, 0x827a6a94, 0x315d0be6, 0xd04a08dc, 0x3e9de430, + 0x3ee4d3e8, 0x459452c2, 0x01ae2cd4}, + {0x00000001, 0x8506abdd, 0xbaef4000, 0xf70b0116, 0x314a5fda, 0xd0b6e662, 0x99290fba, 0xf5367167, 0x559f96b5, + 0x828fecd4, 0x2eacb1d6, 0x01ae338d}, + {0x80000001, 0x8507b5ee, 0x7577a000, 0x870b2f2d, 0xf5a9d3ed, 0xf7d52448, 0x4d0f11a4, 0x87aca5ad, 0x61206ff8, + 0xa465794a, 0xa338e160, 0x01ae36e9}, + {0x40000001, 0x85083af7, 0xd2bbd000, 0xcf0b4638, 0x57d98df6, 0x0b64433c, 0x2702129a, 0xd0e7bfd0, 0x66e0dc99, + 0xb5503f85, 0xdd7ef925, 0x01ae3897}, + {0xa0000001, 0x85087d7b, 0x815de800, 0x730b51be, 0x08f16afb, 0x952bd2b6, 0x93fb9314, 0x75854ce1, 0xe9c112ea, + 0x3dc5a2a2, 0xfaa20508, 0x01ae396e}, + {0xd0000001, 0x85089ebd, 0x58aef400, 0xc50b5781, 0xe17d597d, 0xda0f9a72, 0x4a785351, 0xc7d4136a, 0xab312e12, + 0x82005431, 0x89338af9, 0x01ae39da}, + {0xe8000001, 0x8508af5e, 0xc4577a00, 0xee0b5a62, 0x4dc350be, 0x7c817e51, 0xa5b6b370, 0xf0fb76ae, 0x0be93ba6, + 0x241dacf9, 0x507c4df2, 0x01ae3a10}, + {0x74000001, 0x8508b7af, 0x7a2bbd00, 0x828b5bd3, 0x83e64c5f, 0xcdba7040, 0xd355e37f, 0x058f2850, 0xbc454271, + 0x752c595c, 0x3420af6e, 0x01ae3a2b}, + {0xba000001, 0x8508bbd7, 0xd515de80, 0xcccb5c8b, 0x1ef7ca2f, 0x7656e938, 0xea257b87, 0x0fd90121, 0x947345d6, + 0x9db3af8e, 0xa5f2e02c, 0x01ae3a38}, + {0xdd000001, 0x8508bdeb, 0x028aef40, 0xf1eb5ce8, 0xec808917, 0x4aa525b3, 0x758d478b, 0x94fded8a, 0x808a4788, + 0xb1f75aa7, 0x5edbf88b, 0x01ae3a3f}, + {0xee800001, 0x8508bef5, 0x194577a0, 0x047b5d16, 0xd344e88c, 0x34cc43f1, 0xbb412d8d, 0xd79063be, 0xf695c861, + 0x3c193033, 0xbb5084bb, 0x01ae3a42}, + {0xf7400001, 0x8508bf7a, 0x24a2bbd0, 0x0dc35d2d, 0xc6a71846, 0x29dfd310, 0xde1b208e, 0x78d99ed8, 0x319b88ce, + 0x012a1afa, 0x698acad3, 0x01ae3a44}, + {0x7ba00001, 0x8508bfbd, 0xaa515de8, 0x12675d38, 0x40583023, 0xa4699aa0, 0xef881a0e, 0xc97e3c65, 0x4f1e6904, + 0xe3b2905d, 0x40a7edde, 0x01ae3a45}, + 
{0xbdd00001, 0x8508bfde, 0x6d28aef4, 0x94b95d3e, 0xfd30bc11, 0xe1ae7e67, 0x783e96ce, 0xf1d08b2c, 0xdddfd91f, + 0xd4f6cb0e, 0xac367f64, 0x01ae3a45}, + {0x5ee80001, 0x8508bfef, 0x4e94577a, 0xd5e25d41, 0xdb9d0208, 0x0050f04b, 0xbc99d52f, 0x85f9b28f, 0xa540912d, + 0xcd98e867, 0xe1fdc827, 0x01ae3a45}, + {0xaf740001, 0x8508bff7, 0xbf4a2bbd, 0x7676dd42, 0xcad32504, 0x0fa2293d, 0x5ec7745f, 0x500e4641, 0x08f0ed34, + 0x49e9f714, 0xfce16c89, 0x01ae3a45}, + {0xd7ba0001, 0x0508bffb, 0x77a515df, 0x46c11d43, 0xc26e3682, 0x174ac5b6, 0x2fde43f7, 0xb518901a, 0x3ac91b37, + 0x08127e6a, 0x0a533eba, 0x01ae3a46}, + {0xebdd0001, 0xc508bffd, 0xd3d28aef, 0x2ee63d43, 0x3e3bbf41, 0x1b1f13f3, 0x9869abc3, 0x679db506, 0x53b53239, + 0x6726c215, 0x110c27d2, 0x01ae3a46}, + {0xf5ee8001, 0x2508bffe, 0x01e94578, 0xa2f8cd44, 0x7c2283a0, 0x1d093b11, 0xccaf5fa9, 0x40e0477c, 0xe02b3dba, + 0x96b0e3ea, 0x14689c5e, 0x01ae3a46}, + {0x7af74001, 0x5508bfff, 0x18f4a2bc, 0x5d021544, 0x9b15e5d0, 0x1dfe4ea0, 0xe6d2399c, 0xad8190b7, 0xa666437a, + 0xae75f4d5, 0x1616d6a4, 0x01ae3a46}, + {0xbd7ba001, 0x6d08bfff, 0x247a515e, 0x3a06b944, 0x2a8f96e8, 0x9e78d868, 0x73e3a695, 0xe3d23555, 0x0983c65a, + 0xba587d4b, 0x16edf3c7, 0x01ae3a46}, + {0xdebdd001, 0x7908bfff, 0x2a3d28af, 0x28890b44, 0xf24c6f74, 0x5eb61d4b, 0x3a6c5d12, 0xfefa87a4, 0xbb1287ca, + 0x4049c185, 0x17598259, 0x01ae3a46}, + {0xef5ee801, 0xff08bfff, 0x2d1e9457, 0x1fca3444, 0xd62adbba, 0xbed4bfbd, 0x9db0b850, 0x0c8eb0cb, 0x13d9e883, + 0x034263a3, 0x178f49a2, 0x01ae3a46}, + {0xf7af7401, 0x4208bfff, 0x2e8f4a2c, 0x1b6ac8c4, 0xc81a11dd, 0xeee410f6, 0x4f52e5ef, 0x1358c55f, 0xc03d98df, + 0x64beb4b1, 0x17aa2d46, 0x01ae3a46}, + {0xfbd7ba01, 0x6388bfff, 0x2f47a516, 0x993b1304, 0x4111acee, 0x86ebb993, 0x2823fcbf, 0x16bdcfa9, 0x166f710d, + 0x957cdd39, 0x17b79f18, 0x01ae3a46}, + {0xfdebdd01, 0x7448bfff, 0x2fa3d28b, 0x58233824, 0x7d8d7a77, 0x52ef8de1, 0x148c8827, 0x187054ce, 0xc1885d24, + 0xaddbf17c, 0x17be5801, 0x01ae3a46}, + {0xfef5ee81, 0xfca8bfff, 0x2fd1e945, 0xb7974ab4, 
0x9bcb613b, 0x38f17808, 0x8ac0cddb, 0x99499760, 0x9714d32f, + 0x3a0b7b9e, 0x17c1b476, 0x01ae3a46}, + {0xff7af741, 0x40d8bfff, 0x2fe8f4a3, 0xe75153fc, 0x2aea549d, 0x2bf26d1c, 0xc5daf0b5, 0x59b638a9, 0x81db0e35, + 0x802340af, 0x17c362b0, 0x01ae3a46}, + {0xffbd7ba1, 0xe2f0bfff, 0x2ff47a51, 0xff2e58a0, 0xf279ce4e, 0x2572e7a5, 0x63680222, 0x39ec894e, 0xf73e2bb8, + 0xa32f2337, 0x17c439cd, 0x01ae3a46}, + {0xffdebdd1, 0x33fcbfff, 0x2ffa3d29, 0x8b1cdaf2, 0xd6418b27, 0xa23324ea, 0xb22e8ad8, 0xaa07b1a0, 0x31efba79, + 0x34b5147c, 0x17c4a55c, 0x01ae3a46}, + {0xffef5ee9, 0xdc82bfff, 0x2ffd1e94, 0xd1141c1b, 0x48256993, 0xe093438d, 0xd991cf33, 0x621545c9, 0x4f4881da, + 0x7d780d1e, 0x17c4db23, 0x01ae3a46}, + {0xfff7af75, 0xb0c5bfff, 0xaffe8f4a, 0xf40fbcaf, 0x811758c9, 0x7fc352de, 0x6d437161, 0xbe1c0fde, 0x5df4e58a, + 0x21d9896f, 0x17c4f607, 0x01ae3a46}, + {0xfffbd7bb, 0x9ae73fff, 0xefff47a5, 0x058d8cf9, 0x1d905065, 0x4f5b5a87, 0xb71c4278, 0xec1f74e8, 0xe54b1762, + 0xf40a4797, 0x17c50378, 0x01ae3a46}, + {0xfffdebde, 0x0ff7ffff, 0x0fffa3d3, 0x8e4c751f, 0x6bcccc32, 0xb7275e5b, 0xdc08ab03, 0x0321276d, 0x28f6304f, + 0xdd22a6ac, 0x17c50a31, 0x01ae3a46}}}; + // i^2, the square of the imaginary unit for the extension field static constexpr uint32_t i_squared = 5; // true if i^2 is negative static constexpr bool i_squared_is_negative = true; - // G1 and G2 generators - static constexpr storage g1_gen_x = {0xb21be9ef, 0xeab9b16e, 0xffcd394e, 0xd5481512, - 0xbd37cb5c, 0x188282c8, 0xaa9d41bb, 0x85951e2c, - 0xbf87ff54, 0xc8fc6225, 0xfe740a67, 0x008848de}; - static constexpr storage g1_gen_y = {0x559c8ea6, 0xfd82de55, 0x34a9591a, 0xc2fe3d36, - 0x4fb82305, 0x6d182ad4, 0xca3e52d9, 0xbd7fb348, - 0x30afeec4, 0x1f674f5d, 0xc5102eff, 0x01914a69}; - static constexpr storage g2_gen_x_re = {0x7c005196, 0x74e3e48f, 0xbb535402, 0x71889f52, - 0x57db6b9b, 0x7ea501f5, 0x203e5031, 0xc565f071, - 0xa3841d01, 0xc89630a2, 0x71c785fe, 0x018480be}; - static constexpr storage g2_gen_x_im = {0x6ea16afe, 0xb26bfefa, 
0xbff76fe6, 0x5cf89984, - 0x0799c9de, 0xe7223ece, 0x6651cecb, 0x532777ee, - 0xb1b140d5, 0x70dc5a51, 0xe7004031, 0x00ea6040}; - static constexpr storage g2_gen_y_re = {0x09fd4ddf, 0xf0940944, 0x6d8c7c2e, 0xf2cf8888, - 0xf832d204, 0xe458c282, 0x74b49a58, 0xde03ed72, - 0xcbb2efb4, 0xd960736b, 0x5d446f7b, 0x00690d66}; - static constexpr storage g2_gen_y_im = {0x85eb8f93, 0xd9a1cdd1, 0x5e52270b, 0x4279b83f, - 0xcee304c2, 0x2463b01a, 0x3d591bf1, 0x61ef11ac, - 0x151a70aa, 0x9e549da3, 0xd2835518, 0x00f8169f}; }; + // G1 and G2 generators + static constexpr storage g1_gen_x = {0xb21be9ef, 0xeab9b16e, 0xffcd394e, 0xd5481512, + 0xbd37cb5c, 0x188282c8, 0xaa9d41bb, 0x85951e2c, + 0xbf87ff54, 0xc8fc6225, 0xfe740a67, 0x008848de}; + static constexpr storage g1_gen_y = {0x559c8ea6, 0xfd82de55, 0x34a9591a, 0xc2fe3d36, + 0x4fb82305, 0x6d182ad4, 0xca3e52d9, 0xbd7fb348, + 0x30afeec4, 0x1f674f5d, 0xc5102eff, 0x01914a69}; + static constexpr storage g2_gen_x_re = {0x7c005196, 0x74e3e48f, 0xbb535402, 0x71889f52, + 0x57db6b9b, 0x7ea501f5, 0x203e5031, 0xc565f071, + 0xa3841d01, 0xc89630a2, 0x71c785fe, 0x018480be}; + static constexpr storage g2_gen_x_im = {0x6ea16afe, 0xb26bfefa, 0xbff76fe6, 0x5cf89984, + 0x0799c9de, 0xe7223ece, 0x6651cecb, 0x532777ee, + 0xb1b140d5, 0x70dc5a51, 0xe7004031, 0x00ea6040}; + static constexpr storage g2_gen_y_re = {0x09fd4ddf, 0xf0940944, 0x6d8c7c2e, 0xf2cf8888, + 0xf832d204, 0xe458c282, 0x74b49a58, 0xde03ed72, + 0xcbb2efb4, 0xd960736b, 0x5d446f7b, 0x00690d66}; + static constexpr storage g2_gen_y_im = {0x85eb8f93, 0xd9a1cdd1, 0x5e52270b, 0x4279b83f, + 0xcee304c2, 0x2463b01a, 0x3d591bf1, 0x61ef11ac, + 0x151a70aa, 0x9e549da3, 0xd2835518, 0x00f8169f}; + static constexpr storage weierstrass_b = {0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; diff --git a/icicle/curves/bls12_381_params.cuh b/icicle/curves/bls12_381_params.cuh index 1b60c0c94..102590975 100644 --- 
a/icicle/curves/bls12_381_params.cuh +++ b/icicle/curves/bls12_381_params.cuh @@ -6,38 +6,34 @@ namespace bls12_381 { struct fp_config { - // field structure size = 8 * 32 bit static constexpr unsigned limbs_count = 8; static constexpr unsigned omegas_count = 32; - // modulus = 52435875175126190479447740508185965837690552500527637822603658699938581184513 + static constexpr unsigned modulus_bit_count = 255; + static constexpr unsigned num_of_reductions = 2; + static constexpr storage modulus = {0x00000001, 0xffffffff, 0xfffe5bfe, 0x53bda402, 0x09a1d805, 0x3339d808, 0x299d7d48, 0x73eda753}; - // modulus*2 = 104871750350252380958895481016371931675381105001055275645207317399877162369026 static constexpr storage modulus_2 = {0x00000002, 0xfffffffe, 0xfffcb7fd, 0xa77b4805, 0x1343b00a, 0x6673b010, 0x533afa90, 0xe7db4ea6}; - static constexpr storage modulus_4 = {0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000}; - + static constexpr storage modulus_4 = {0x00000004, 0xfffffffc, 0xfff96ffb, 0x4ef6900b, + 0x26876015, 0xcce76020, 0xa675f520, 0xcfb69d4c}; + static constexpr storage neg_modulus = {0xffffffff, 0x00000000, 0x0001a401, 0xac425bfd, + 0xf65e27fa, 0xccc627f7, 0xd66282b7, 0x8c1258ac}; static constexpr storage<2 * limbs_count> modulus_wide = { 0x00000001, 0xffffffff, 0xfffe5bfe, 0x53bda402, 0x09a1d805, 0x3339d808, 0x299d7d48, 0x73eda753, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; - // modulus^2 static constexpr storage<2 * limbs_count> modulus_squared = { 0x00000001, 0xfffffffe, 0xfffcb7fe, 0xa77e9007, 0x1cdbb005, 0x698ae002, 0x5433f7b8, 0x48aa415e, 0x4aa9c661, 0xc2611f6f, 0x59934a1d, 0x0e9593f9, 0xef2cc20f, 0x520c13db, 0xf4bc2778, 0x347f60f3}; - // 2*modulus^2 static constexpr storage<2 * limbs_count> modulus_squared_2 = { 0x00000002, 0xfffffffc, 0xfff96ffd, 0x4efd200f, 0x39b7600b, 0xd315c004, 0xa867ef70, 0x915482bc, 0x95538cc2, 0x84c23ede, 0xb326943b, 0x1d2b27f2, 
0xde59841e, 0xa41827b7, 0xe9784ef0, 0x68fec1e7}; - // note: doesnt actually fit into 384 bits, and shouldnt be used! is added for compilation static constexpr storage<2 * limbs_count> modulus_squared_4 = { - 0x00000002, 0xfffffffc, 0xfff96ffd, 0x4efd200f, 0x39b7600b, 0xd315c004, 0xa867ef70, 0x915482bc, - 0x95538cc2, 0x84c23ede, 0xb326943b, 0x1d2b27f2, 0xde59841e, 0xa41827b7, 0xe9784ef0, 0x68fec1e7}; - static constexpr unsigned modulus_bit_count = 255; - // m = floor(2^(2*modulus_bit_count) / modulus) - static constexpr storage m = {0x830358e4, 0x509cde80, 0x2f92eb5c, 0xd9410fad, - 0xc1f823b4, 0xe2d772d, 0x7fb78ddf, 0x8d54253b}; + 0x00000004, 0xfffffff8, 0xfff2dffb, 0x9dfa401f, 0x736ec016, 0xa62b8008, 0x50cfdee1, 0x22a90579, + 0x2aa71985, 0x09847dbd, 0x664d2877, 0x3a564fe5, 0xbcb3083c, 0x48304f6f, 0xd2f09de1, 0xd1fd83cf}; + static constexpr storage m = {0x830358e4, 0x509cde80, 0x2f92eb5c, 0xd9410fad, + 0xc1f823b4, 0x0e2d772d, 0x7fb78ddf, 0x8d54253b}; static constexpr storage one = {0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; static constexpr storage zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000, @@ -47,322 +43,141 @@ namespace bls12_381 { static constexpr storage montgomery_r_inv = {0xfe75c040, 0x13f75b69, 0x09dc705f, 0xab6fca8f, 0x4f77266a, 0x7204078a, 0x30009d57, 0x1bbe8693}; - // static constexpr storage omega[32]= { {0x00000000, 0xffffffff, 0xfffe5bfe, 0x53bda402, 0x09a1d805, - // 0x3339d808, 0x299d7d48, 0x73eda753}, {0x00000000, 0x00010000, 0x76030000, 0xec030002, 0x760304d0, 0x8d51ccce, - // 0x00000000, 0x00000000}, {0x688bc087, 0x8dd702cb, 0x78eaa4fe, 0xa0328240, 0x98ca5b22, 0xa733b23a, 0x25a31660, - // 0x3f96405d}, {0x0411fe73, 0x95df4b36, 0xebc1e1bb, 0x1ef4e672, 0x60afca4a, 0x6e92a9c4, 0x753e4fcc, 0x4f2c596e}, - // {0xba60eaa6, 0x9733f3a6, 0x77487ae7, 0xbd7fdf9c, 0xc8b6cc00, 0xd84f8612, 0x6162ffab, 0x476fa2fb}, {0xac5db47f, - // 0xd2fc5e69, 0x15d0b8e4, 0xa12a70a6, 0xbc8de5d9, 0x293b1d67, 
0x57f86f5e, 0x0e4840ac}, {0xab28e208, 0xb750da4c, - // 0x3be95635, 0x501dff64, 0xf0b4b276, 0x8cbe2437, 0xa94a946e, 0x07d0c802}, {0x2fe322b8, 0x2cabadec, 0x15412560, - // 0x752c84f3, 0x1a3b0aef, 0x32a732ae, 0xa33dcbf2, 0x2e95da59}, {0xfe0c65f4, 0x33811ea1, 0x687f28a2, 0x15c1ad4c, - // 0x42dee7f4, 0xecfbede3, 0x9a5d88b1, 0x1bb46667}, {0x2d010ff9, 0xd58a5af4, 0x570bf109, 0x79efd6b0, 0x6350721d, - // 0x3ed6d55a, 0x58f43cef, 0x2f27b098}, {0x8c130477, 0x74a1f671, 0xb61e0abe, 0xa534af14, 0x620890d7, 0xeb674a1a, - // 0xca252472, 0x43527a8b}, {0x7ea8ee05, 0x450d9f97, 0x37d56fc0, 0x565af171, 0x93f9e9ac, 0xe155cb48, 0xc8e9101b, - // 0x110cebd0}, {0x59a0be92, 0x23c91599, 0x7a027759, 0x87d188ce, 0xcab3c3cc, 0x70491431, 0xb3f7f8da, 0x0ac00eb8}, - // {0x69583404, 0x13e96ade, 0x5306243d, 0x82c05727, 0x29ca9f2a, 0x77e48bf5, 0x1fe19595, 0x50646ac8}, {0xa97eccd4, - // 0xe6a354dd, 0x88fbbc57, 0x39929d2e, 0xd6e7b1c8, 0xa22ba63d, 0xf5f07f43, 0x42c22911}, {0xcfc35f7a, 0x137b458a, - // 0x29c01b06, 0x0caba63a, 0x7a02402c, 0x0409ee98, 0x56aa725b, 0x6709c6cd}, {0x8831e03e, 0x10251f7d, 0x7ff858ec, - // 0x77d85a93, 0x4fb9ac5c, 0xebe905bd, 0xf8727901, 0x05deb333}, {0xb9009408, 0xbf87b689, 0xdd3ccc96, 0x4f730e7d, - // 0x4610300c, 0xfd7f05ba, 0x0b8ac903, 0x5ef5e8db}, {0x17cd0c14, 0x64996884, 0x68812f7f, 0xa6728673, 0x22cc3253, - // 0x2e1d9a19, 0xaa0a1d80, 0x3a689e83}, {0x41144dea, 0x20b53cbe, 0xc2f0fcbd, 0x870c46fa, 0x537d6971, 0x556c35f6, - // 0x5f686d91, 0x3436287f}, {0x436ba2e7, 0x007e082a, 0x9116e877, 0x67c6630f, 0xfb4460f7, 0x36f8f165, 0x7e7046e0, - // 0x6eee34d5}, {0xa53a56d1, 0xc5b670ee, 0x53037d7b, 0x127d1f42, 0xa722c2e2, 0x57d4257e, 0x33cbd838, 0x03ae26a3}, - // {0x76504cf8, 0x1e914848, 0xb63edd02, 0x55bbbf1e, 0x4e55aa02, 0xbcdafec8, 0x2dc0beb0, 0x5145c4cd}, {0x1ab70e2c, - // 0x5b90153a, 0x75fb0ab8, 0x8deffa31, 0x46900c95, 0xc553ae23, 0x6bd3118c, 0x1d31dcdc}, {0x59a2e8eb, 0x801c894c, - // 0xe12fc974, 0xbc535c5c, 0x47d39803, 0x95508d27, 0xac5d094f, 0x16d9d3cd}, {0xcca1d8be, 0x810fa372, 
0x82e0bfa7, - // 0xc67b8c28, 0xe2d35bc2, 0xdbb4edf0, 0x5087c995, 0x712d1580}, {0xfd88f133, 0xeb162203, 0xf010ea74, 0xac96c38f, - // 0xe64cfc70, 0x4307987f, 0x37b7a114, 0x350fe98d}, {0x42f2a254, 0xaba2f518, 0xa71efc0c, 0x4d7f3c3a, 0xd274a80a, - // 0x97ae418d, 0x5e3e7682, 0x2967385d}, {0x575a0b79, 0x75c55c7b, 0x74a7ded1, 0x3ba4a157, 0xa04fccf3, 0xc3974d73, - // 0x4a939684, 0x705aba4f}, {0x14ebb608, 0x8409a9ea, 0x66bac611, 0xfad0084e, 0x811c1dfb, 0x04287254, 0x23b30c29, - // 0x086d072b}, {0x67e4756a, 0xb427c9b3, 0x02ebc38d, 0xc7537fb9, 0xcd6a205f, 0x51de21be, 0x7923597d, 0x6064ab72}, - // {0x0b912f1f, 0x1b788f50, 0x70b3e094, 0xc4024ff2, 0xd168d6c0, 0x0fd56dc8, 0x5b416b6f, 0x0212d79e}}; Quick fix for - // linking issue - static constexpr storage omega1 = {0x00000000, 0xffffffff, 0xfffe5bfe, 0x53bda402, - 0x09a1d805, 0x3339d808, 0x299d7d48, 0x73eda753}; - static constexpr storage omega2 = {0x00000000, 0x00010000, 0x76030000, 0xec030002, - 0x760304d0, 0x8d51ccce, 0x00000000, 0x00000000}; - static constexpr storage omega3 = {0x688bc087, 0x8dd702cb, 0x78eaa4fe, 0xa0328240, - 0x98ca5b22, 0xa733b23a, 0x25a31660, 0x3f96405d}; - static constexpr storage omega4 = {0x0411fe73, 0x95df4b36, 0xebc1e1bb, 0x1ef4e672, - 0x60afca4a, 0x6e92a9c4, 0x753e4fcc, 0x4f2c596e}; - static constexpr storage omega5 = {0xba60eaa6, 0x9733f3a6, 0x77487ae7, 0xbd7fdf9c, - 0xc8b6cc00, 0xd84f8612, 0x6162ffab, 0x476fa2fb}; - static constexpr storage omega6 = {0xac5db47f, 0xd2fc5e69, 0x15d0b8e4, 0xa12a70a6, - 0xbc8de5d9, 0x293b1d67, 0x57f86f5e, 0x0e4840ac}; - static constexpr storage omega7 = {0xab28e208, 0xb750da4c, 0x3be95635, 0x501dff64, - 0xf0b4b276, 0x8cbe2437, 0xa94a946e, 0x07d0c802}; - static constexpr storage omega8 = {0x2fe322b8, 0x2cabadec, 0x15412560, 0x752c84f3, - 0x1a3b0aef, 0x32a732ae, 0xa33dcbf2, 0x2e95da59}; - static constexpr storage omega9 = {0xfe0c65f4, 0x33811ea1, 0x687f28a2, 0x15c1ad4c, - 0x42dee7f4, 0xecfbede3, 0x9a5d88b1, 0x1bb46667}; - static constexpr storage omega10 = {0x2d010ff9, 
0xd58a5af4, 0x570bf109, 0x79efd6b0, - 0x6350721d, 0x3ed6d55a, 0x58f43cef, 0x2f27b098}; - static constexpr storage omega11 = {0x8c130477, 0x74a1f671, 0xb61e0abe, 0xa534af14, - 0x620890d7, 0xeb674a1a, 0xca252472, 0x43527a8b}; - static constexpr storage omega12 = {0x7ea8ee05, 0x450d9f97, 0x37d56fc0, 0x565af171, - 0x93f9e9ac, 0xe155cb48, 0xc8e9101b, 0x110cebd0}; - static constexpr storage omega13 = {0x59a0be92, 0x23c91599, 0x7a027759, 0x87d188ce, - 0xcab3c3cc, 0x70491431, 0xb3f7f8da, 0x0ac00eb8}; - static constexpr storage omega14 = {0x69583404, 0x13e96ade, 0x5306243d, 0x82c05727, - 0x29ca9f2a, 0x77e48bf5, 0x1fe19595, 0x50646ac8}; - static constexpr storage omega15 = {0xa97eccd4, 0xe6a354dd, 0x88fbbc57, 0x39929d2e, - 0xd6e7b1c8, 0xa22ba63d, 0xf5f07f43, 0x42c22911}; - static constexpr storage omega16 = {0xcfc35f7a, 0x137b458a, 0x29c01b06, 0x0caba63a, - 0x7a02402c, 0x0409ee98, 0x56aa725b, 0x6709c6cd}; - static constexpr storage omega17 = {0x8831e03e, 0x10251f7d, 0x7ff858ec, 0x77d85a93, - 0x4fb9ac5c, 0xebe905bd, 0xf8727901, 0x05deb333}; - static constexpr storage omega18 = {0xb9009408, 0xbf87b689, 0xdd3ccc96, 0x4f730e7d, - 0x4610300c, 0xfd7f05ba, 0x0b8ac903, 0x5ef5e8db}; - static constexpr storage omega19 = {0x17cd0c14, 0x64996884, 0x68812f7f, 0xa6728673, - 0x22cc3253, 0x2e1d9a19, 0xaa0a1d80, 0x3a689e83}; - static constexpr storage omega20 = {0x41144dea, 0x20b53cbe, 0xc2f0fcbd, 0x870c46fa, - 0x537d6971, 0x556c35f6, 0x5f686d91, 0x3436287f}; - static constexpr storage omega21 = {0x436ba2e7, 0x007e082a, 0x9116e877, 0x67c6630f, - 0xfb4460f7, 0x36f8f165, 0x7e7046e0, 0x6eee34d5}; - static constexpr storage omega22 = {0xa53a56d1, 0xc5b670ee, 0x53037d7b, 0x127d1f42, - 0xa722c2e2, 0x57d4257e, 0x33cbd838, 0x03ae26a3}; - static constexpr storage omega23 = {0x76504cf8, 0x1e914848, 0xb63edd02, 0x55bbbf1e, - 0x4e55aa02, 0xbcdafec8, 0x2dc0beb0, 0x5145c4cd}; - static constexpr storage omega24 = {0x1ab70e2c, 0x5b90153a, 0x75fb0ab8, 0x8deffa31, - 0x46900c95, 0xc553ae23, 0x6bd3118c, 
0x1d31dcdc}; - static constexpr storage omega25 = {0x59a2e8eb, 0x801c894c, 0xe12fc974, 0xbc535c5c, - 0x47d39803, 0x95508d27, 0xac5d094f, 0x16d9d3cd}; - static constexpr storage omega26 = {0xcca1d8be, 0x810fa372, 0x82e0bfa7, 0xc67b8c28, - 0xe2d35bc2, 0xdbb4edf0, 0x5087c995, 0x712d1580}; - static constexpr storage omega27 = {0xfd88f133, 0xeb162203, 0xf010ea74, 0xac96c38f, - 0xe64cfc70, 0x4307987f, 0x37b7a114, 0x350fe98d}; - static constexpr storage omega28 = {0x42f2a254, 0xaba2f518, 0xa71efc0c, 0x4d7f3c3a, - 0xd274a80a, 0x97ae418d, 0x5e3e7682, 0x2967385d}; - static constexpr storage omega29 = {0x575a0b79, 0x75c55c7b, 0x74a7ded1, 0x3ba4a157, - 0xa04fccf3, 0xc3974d73, 0x4a939684, 0x705aba4f}; - static constexpr storage omega30 = {0x14ebb608, 0x8409a9ea, 0x66bac611, 0xfad0084e, - 0x811c1dfb, 0x04287254, 0x23b30c29, 0x086d072b}; - static constexpr storage omega31 = {0x67e4756a, 0xb427c9b3, 0x02ebc38d, 0xc7537fb9, - 0xcd6a205f, 0x51de21be, 0x7923597d, 0x6064ab72}; - static constexpr storage omega32 = {0x0b912f1f, 0x1b788f50, 0x70b3e094, 0xc4024ff2, - 0xd168d6c0, 0x0fd56dc8, 0x5b416b6f, 0x0212d79e}; - static constexpr storage_array omega = { - omega1, omega2, omega3, omega4, omega5, omega6, omega7, omega8, omega9, omega10, omega11, - omega12, omega13, omega14, omega15, omega16, omega17, omega18, omega19, omega20, omega21, omega22, - omega23, omega24, omega25, omega26, omega27, omega28, omega29, omega30, omega31, omega32, - }; - - // static constexpr storage omega_inv[32]={ {0x00000000, 0xffffffff, 0xfffe5bfe, 0x53bda402, - // 0x09a1d805, 0x3339d808, 0x299d7d48, 0x73eda753}, {0x00000001, 0xfffeffff, 0x89fb5bfe, 0x67baa400, 0x939ed334, - // 0xa5e80b39, 0x299d7d47, 0x73eda753}, {0xae99502e, 0x6037fe81, 0x94b04fd8, 0x8e749036, 0xca86bf65, 0xbabc5aff, - // 0x5ce11044, 0x1333b22e}, {0x7dc08d74, 0x7f847ee4, 0x04eeaf5a, 0xbd433896, 0x1832fc60, 0xd66c91d6, 0x607e449b, - // 0x551115b4}, {0x4e7773cb, 0xee5bcecc, 0xf6dab086, 0x45593d6f, 0x4016e2bd, 0xa3a95d2d, 0xaf96816f, 0x047cb16c}, 
- // {0x982b68c5, 0xb891fa3f, 0x1d426b52, 0xa41e8501, 0x882952d6, 0x566009b5, 0x7b3c79d6, 0x199cdaee}, {0xcf28601b, - // 0x571ba2fc, 0xac74db12, 0x166fb582, 0x3501370b, 0x51420be4, 0x52f970ba, 0x1996fa8d}, {0x6a2f777a, 0xe9561c17, - // 0x2393991b, 0xc03cae03, 0x5a5bfd4f, 0x91b00023, 0x272e58ee, 0x6d64ed25}, {0xf02a116e, 0xfb350dbe, 0xb4543a3e, - // 0x1c510ebf, 0x37ad4eca, 0xf675522e, 0x80f82b2d, 0x1907a56e}, {0x4eb71aa6, 0xb0ad8003, 0xaa67e0be, 0x50a32c41, - // 0x19141f44, 0x105f0672, 0xa3dad316, 0x2bcd9508}, {0x0f6fb2ac, 0x3dc9e560, 0x9aa58ff5, 0x3cc5bb32, 0x36f376e1, - // 0xdeae67bc, 0x65ba213e, 0x394fda0d}, {0x60b82267, 0x09f239f7, 0x8b24f123, 0x14180e0e, 0x45625d95, 0xad5a5340, - // 0x6d174692, 0x58c3ba63}, {0x348b416f, 0x0acf21c2, 0xbc086439, 0x798b6bf6, 0xb1ca111d, 0x222d411f, 0x30ba1e0f, - // 0x044107b7}, {0x014abe84, 0xa3b861b8, 0x427ed008, 0x37c017e4, 0xae0ff4f5, 0xae51f613, 0xcb1218d3, 0x1a2d00e1}, - // {0x4de7eb2b, 0x48aaa3bf, 0x6772057d, 0x4a58d54d, 0x7093b551, 0xce25f16c, 0xd206337c, 0x242150ac}, {0x9ed57ae5, - // 0xdf3ec9ae, 0x7166577f, 0xea7df73a, 0x022fbbe4, 0x6ca8d281, 0x151e3f6b, 0x5850c003}, {0x645e1cfa, 0x903a0a0c, - // 0x34788c37, 0xfbac54cb, 0x8cf73d78, 0xdc127d11, 0x975d3c82, 0x6d0b5c7c}, {0x14b1ba04, 0xb49d6b05, 0xf00b84f2, - // 0x56e466b4, 0x0b904f22, 0x30c390cf, 0x3ee254cc, 0x3e11cfb7}, {0xbe8201ab, 0x84dfa547, 0x530715d2, 0x3887ce8b, - // 0x3eed4ed7, 0xa4c719c6, 0x8f8007b4, 0x18c44950}, {0x7d813cd1, 0xdaf0346d, 0xf755beb1, 0xeccf6f9a, 0xe08143e3, - // 0x167fce38, 0x6f5d6dfa, 0x545ad9b2}, {0x577605de, 0x973f5466, 0x974f953c, 0x0ce8986e, 0x074382f9, 0x8941cf4b, - // 0x6fa2672c, 0x156cd7f6}, {0x33b66141, 0x24315404, 0x1992f584, 0x5d1375ab, 0x8b20ca1a, 0xf193ffa6, 0x2701a503, - // 0x47880cd5}, {0xe9f7b9af, 0xf7b6847d, 0x62c83ce2, 0x9a339673, 0x6e5e6f79, 0xfabf4537, 0x35af33a3, 0x0975acd9}, - // {0x0eddd248, 0x4fb4204a, 0xc9e509b3, 0x8c98706a, 0x2bb27eb1, 0xd0be8987, 0xc831438b, 0x6ec5f960}, {0x20238f62, - // 0xa13c95b7, 0x83b476b9, 
0x130aa097, 0x14860881, 0x758a04e0, 0x97066493, 0x58e2f8d6}, {0xe8bff41e, 0x65b09c73, - // 0x37f1c6a3, 0x8b3280e8, 0x2846fb21, 0xe17b82ce, 0xb1ae27df, 0x476534bf}, {0xd5fdb757, 0x8480c0e7, 0x365bf9fd, - // 0x3644eea0, 0xb776be86, 0x4ca116ca, 0x8b58390c, 0x17b6395f}, {0x252eb0db, 0x2c811e9a, 0x7479e161, 0x1b7d960d, - // 0xb0a89a26, 0xb3afc7c1, 0x32b5e793, 0x6a2f9533}, {0x08b8a7ad, 0xe877b2c4, 0x341652b4, 0x68b0e8f0, 0xe8b6a2d9, - // 0x2d44da3b, 0xfd09be59, 0x092778ff}, {0x7988f244, 0x84a1aa6f, 0x24faf63f, 0xa164b3d9, 0xc1bbb915, 0x7aae9724, - // 0xf386c0d2, 0x24e5d287}, {0x41a1b30c, 0xa70a7efd, 0x39f0e511, 0xc49c55a5, 0x033bb323, 0xab307a8f, 0x17acbd7f, - // 0x0158abd6}, {0x0f642025, 0x2c228b30, 0x01bd882b, 0xb0878e8d, 0xd7377fea, 0xd862b255, 0xf0490536, 0x18ac3666}}; - // Quick fix for linking issue - static constexpr storage omega_inv1 = {0x00000000, 0xffffffff, 0xfffe5bfe, 0x53bda402, - 0x09a1d805, 0x3339d808, 0x299d7d48, 0x73eda753}; - static constexpr storage omega_inv2 = {0x00000001, 0xfffeffff, 0x89fb5bfe, 0x67baa400, - 0x939ed334, 0xa5e80b39, 0x299d7d47, 0x73eda753}; - static constexpr storage omega_inv3 = {0xae99502e, 0x6037fe81, 0x94b04fd8, 0x8e749036, - 0xca86bf65, 0xbabc5aff, 0x5ce11044, 0x1333b22e}; - static constexpr storage omega_inv4 = {0x7dc08d74, 0x7f847ee4, 0x04eeaf5a, 0xbd433896, - 0x1832fc60, 0xd66c91d6, 0x607e449b, 0x551115b4}; - static constexpr storage omega_inv5 = {0x4e7773cb, 0xee5bcecc, 0xf6dab086, 0x45593d6f, - 0x4016e2bd, 0xa3a95d2d, 0xaf96816f, 0x047cb16c}; - static constexpr storage omega_inv6 = {0x982b68c5, 0xb891fa3f, 0x1d426b52, 0xa41e8501, - 0x882952d6, 0x566009b5, 0x7b3c79d6, 0x199cdaee}; - static constexpr storage omega_inv7 = {0xcf28601b, 0x571ba2fc, 0xac74db12, 0x166fb582, - 0x3501370b, 0x51420be4, 0x52f970ba, 0x1996fa8d}; - static constexpr storage omega_inv8 = {0x6a2f777a, 0xe9561c17, 0x2393991b, 0xc03cae03, - 0x5a5bfd4f, 0x91b00023, 0x272e58ee, 0x6d64ed25}; - static constexpr storage omega_inv9 = {0xf02a116e, 0xfb350dbe, 
0xb4543a3e, 0x1c510ebf, - 0x37ad4eca, 0xf675522e, 0x80f82b2d, 0x1907a56e}; - static constexpr storage omega_inv10 = {0x4eb71aa6, 0xb0ad8003, 0xaa67e0be, 0x50a32c41, - 0x19141f44, 0x105f0672, 0xa3dad316, 0x2bcd9508}; - static constexpr storage omega_inv11 = {0x0f6fb2ac, 0x3dc9e560, 0x9aa58ff5, 0x3cc5bb32, - 0x36f376e1, 0xdeae67bc, 0x65ba213e, 0x394fda0d}; - static constexpr storage omega_inv12 = {0x60b82267, 0x09f239f7, 0x8b24f123, 0x14180e0e, - 0x45625d95, 0xad5a5340, 0x6d174692, 0x58c3ba63}; - static constexpr storage omega_inv13 = {0x348b416f, 0x0acf21c2, 0xbc086439, 0x798b6bf6, - 0xb1ca111d, 0x222d411f, 0x30ba1e0f, 0x044107b7}; - static constexpr storage omega_inv14 = {0x014abe84, 0xa3b861b8, 0x427ed008, 0x37c017e4, - 0xae0ff4f5, 0xae51f613, 0xcb1218d3, 0x1a2d00e1}; - static constexpr storage omega_inv15 = {0x4de7eb2b, 0x48aaa3bf, 0x6772057d, 0x4a58d54d, - 0x7093b551, 0xce25f16c, 0xd206337c, 0x242150ac}; - static constexpr storage omega_inv16 = {0x9ed57ae5, 0xdf3ec9ae, 0x7166577f, 0xea7df73a, - 0x022fbbe4, 0x6ca8d281, 0x151e3f6b, 0x5850c003}; - static constexpr storage omega_inv17 = {0x645e1cfa, 0x903a0a0c, 0x34788c37, 0xfbac54cb, - 0x8cf73d78, 0xdc127d11, 0x975d3c82, 0x6d0b5c7c}; - static constexpr storage omega_inv18 = {0x14b1ba04, 0xb49d6b05, 0xf00b84f2, 0x56e466b4, - 0x0b904f22, 0x30c390cf, 0x3ee254cc, 0x3e11cfb7}; - static constexpr storage omega_inv19 = {0xbe8201ab, 0x84dfa547, 0x530715d2, 0x3887ce8b, - 0x3eed4ed7, 0xa4c719c6, 0x8f8007b4, 0x18c44950}; - static constexpr storage omega_inv20 = {0x7d813cd1, 0xdaf0346d, 0xf755beb1, 0xeccf6f9a, - 0xe08143e3, 0x167fce38, 0x6f5d6dfa, 0x545ad9b2}; - static constexpr storage omega_inv21 = {0x577605de, 0x973f5466, 0x974f953c, 0x0ce8986e, - 0x074382f9, 0x8941cf4b, 0x6fa2672c, 0x156cd7f6}; - static constexpr storage omega_inv22 = {0x33b66141, 0x24315404, 0x1992f584, 0x5d1375ab, - 0x8b20ca1a, 0xf193ffa6, 0x2701a503, 0x47880cd5}; - static constexpr storage omega_inv23 = {0xe9f7b9af, 0xf7b6847d, 0x62c83ce2, 0x9a339673, - 
0x6e5e6f79, 0xfabf4537, 0x35af33a3, 0x0975acd9}; - static constexpr storage omega_inv24 = {0x0eddd248, 0x4fb4204a, 0xc9e509b3, 0x8c98706a, - 0x2bb27eb1, 0xd0be8987, 0xc831438b, 0x6ec5f960}; - static constexpr storage omega_inv25 = {0x20238f62, 0xa13c95b7, 0x83b476b9, 0x130aa097, - 0x14860881, 0x758a04e0, 0x97066493, 0x58e2f8d6}; - static constexpr storage omega_inv26 = {0xe8bff41e, 0x65b09c73, 0x37f1c6a3, 0x8b3280e8, - 0x2846fb21, 0xe17b82ce, 0xb1ae27df, 0x476534bf}; - static constexpr storage omega_inv27 = {0xd5fdb757, 0x8480c0e7, 0x365bf9fd, 0x3644eea0, - 0xb776be86, 0x4ca116ca, 0x8b58390c, 0x17b6395f}; - static constexpr storage omega_inv28 = {0x252eb0db, 0x2c811e9a, 0x7479e161, 0x1b7d960d, - 0xb0a89a26, 0xb3afc7c1, 0x32b5e793, 0x6a2f9533}; - static constexpr storage omega_inv29 = {0x08b8a7ad, 0xe877b2c4, 0x341652b4, 0x68b0e8f0, - 0xe8b6a2d9, 0x2d44da3b, 0xfd09be59, 0x092778ff}; - static constexpr storage omega_inv30 = {0x7988f244, 0x84a1aa6f, 0x24faf63f, 0xa164b3d9, - 0xc1bbb915, 0x7aae9724, 0xf386c0d2, 0x24e5d287}; - static constexpr storage omega_inv31 = {0x41a1b30c, 0xa70a7efd, 0x39f0e511, 0xc49c55a5, - 0x033bb323, 0xab307a8f, 0x17acbd7f, 0x0158abd6}; - static constexpr storage omega_inv32 = {0x0f642025, 0x2c228b30, 0x01bd882b, 0xb0878e8d, - 0xd7377fea, 0xd862b255, 0xf0490536, 0x18ac3666}; + {{0x00000000, 0xffffffff, 0xfffe5bfe, 0x53bda402, 0x09a1d805, 0x3339d808, 0x299d7d48, 0x73eda753}, + {0x00000000, 0x00010000, 0x76030000, 0xec030002, 0x760304d0, 0x8d51ccce, 0x00000000, 0x00000000}, + {0x688bc087, 0x8dd702cb, 0x78eaa4fe, 0xa0328240, 0x98ca5b22, 0xa733b23a, 0x25a31660, 0x3f96405d}, + {0x0411fe73, 0x95df4b36, 0xebc1e1bb, 0x1ef4e672, 0x60afca4a, 0x6e92a9c4, 0x753e4fcc, 0x4f2c596e}, + {0xba60eaa6, 0x9733f3a6, 0x77487ae7, 0xbd7fdf9c, 0xc8b6cc00, 0xd84f8612, 0x6162ffab, 0x476fa2fb}, + {0xac5db47f, 0xd2fc5e69, 0x15d0b8e4, 0xa12a70a6, 0xbc8de5d9, 0x293b1d67, 0x57f86f5e, 0x0e4840ac}, + {0xab28e208, 0xb750da4c, 0x3be95635, 0x501dff64, 0xf0b4b276, 0x8cbe2437, 
0xa94a946e, 0x07d0c802}, + {0x2fe322b8, 0x2cabadec, 0x15412560, 0x752c84f3, 0x1a3b0aef, 0x32a732ae, 0xa33dcbf2, 0x2e95da59}, + {0xfe0c65f4, 0x33811ea1, 0x687f28a2, 0x15c1ad4c, 0x42dee7f4, 0xecfbede3, 0x9a5d88b1, 0x1bb46667}, + {0x2d010ff9, 0xd58a5af4, 0x570bf109, 0x79efd6b0, 0x6350721d, 0x3ed6d55a, 0x58f43cef, 0x2f27b098}, + {0x8c130477, 0x74a1f671, 0xb61e0abe, 0xa534af14, 0x620890d7, 0xeb674a1a, 0xca252472, 0x43527a8b}, + {0x7ea8ee05, 0x450d9f97, 0x37d56fc0, 0x565af171, 0x93f9e9ac, 0xe155cb48, 0xc8e9101b, 0x110cebd0}, + {0x59a0be92, 0x23c91599, 0x7a027759, 0x87d188ce, 0xcab3c3cc, 0x70491431, 0xb3f7f8da, 0x0ac00eb8}, + {0x69583404, 0x13e96ade, 0x5306243d, 0x82c05727, 0x29ca9f2a, 0x77e48bf5, 0x1fe19595, 0x50646ac8}, + {0xa97eccd4, 0xe6a354dd, 0x88fbbc57, 0x39929d2e, 0xd6e7b1c8, 0xa22ba63d, 0xf5f07f43, 0x42c22911}, + {0xcfc35f7a, 0x137b458a, 0x29c01b06, 0x0caba63a, 0x7a02402c, 0x0409ee98, 0x56aa725b, 0x6709c6cd}, + {0x8831e03e, 0x10251f7d, 0x7ff858ec, 0x77d85a93, 0x4fb9ac5c, 0xebe905bd, 0xf8727901, 0x05deb333}, + {0xb9009408, 0xbf87b689, 0xdd3ccc96, 0x4f730e7d, 0x4610300c, 0xfd7f05ba, 0x0b8ac903, 0x5ef5e8db}, + {0x17cd0c14, 0x64996884, 0x68812f7f, 0xa6728673, 0x22cc3253, 0x2e1d9a19, 0xaa0a1d80, 0x3a689e83}, + {0x41144dea, 0x20b53cbe, 0xc2f0fcbd, 0x870c46fa, 0x537d6971, 0x556c35f6, 0x5f686d91, 0x3436287f}, + {0x436ba2e7, 0x007e082a, 0x9116e877, 0x67c6630f, 0xfb4460f7, 0x36f8f165, 0x7e7046e0, 0x6eee34d5}, + {0xa53a56d1, 0xc5b670ee, 0x53037d7b, 0x127d1f42, 0xa722c2e2, 0x57d4257e, 0x33cbd838, 0x03ae26a3}, + {0x76504cf8, 0x1e914848, 0xb63edd02, 0x55bbbf1e, 0x4e55aa02, 0xbcdafec8, 0x2dc0beb0, 0x5145c4cd}, + {0x1ab70e2c, 0x5b90153a, 0x75fb0ab8, 0x8deffa31, 0x46900c95, 0xc553ae23, 0x6bd3118c, 0x1d31dcdc}, + {0x59a2e8eb, 0x801c894c, 0xe12fc974, 0xbc535c5c, 0x47d39803, 0x95508d27, 0xac5d094f, 0x16d9d3cd}, + {0xcca1d8be, 0x810fa372, 0x82e0bfa7, 0xc67b8c28, 0xe2d35bc2, 0xdbb4edf0, 0x5087c995, 0x712d1580}, + {0xfd88f133, 0xeb162203, 0xf010ea74, 0xac96c38f, 0xe64cfc70, 0x4307987f, 
0x37b7a114, 0x350fe98d}, + {0x42f2a254, 0xaba2f518, 0xa71efc0c, 0x4d7f3c3a, 0xd274a80a, 0x97ae418d, 0x5e3e7682, 0x2967385d}, + {0x575a0b79, 0x75c55c7b, 0x74a7ded1, 0x3ba4a157, 0xa04fccf3, 0xc3974d73, 0x4a939684, 0x705aba4f}, + {0x14ebb608, 0x8409a9ea, 0x66bac611, 0xfad0084e, 0x811c1dfb, 0x04287254, 0x23b30c29, 0x086d072b}, + {0x67e4756a, 0xb427c9b3, 0x02ebc38d, 0xc7537fb9, 0xcd6a205f, 0x51de21be, 0x7923597d, 0x6064ab72}, + {0x0b912f1f, 0x1b788f50, 0x70b3e094, 0xc4024ff2, 0xd168d6c0, 0x0fd56dc8, 0x5b416b6f, 0x0212d79e}}}; static constexpr storage_array omega_inv = { - omega_inv1, omega_inv2, omega_inv3, omega_inv4, omega_inv5, omega_inv6, omega_inv7, omega_inv8, - omega_inv9, omega_inv10, omega_inv11, omega_inv12, omega_inv13, omega_inv14, omega_inv15, omega_inv16, - omega_inv17, omega_inv18, omega_inv19, omega_inv20, omega_inv21, omega_inv22, omega_inv23, omega_inv24, - omega_inv25, omega_inv26, omega_inv27, omega_inv28, omega_inv29, omega_inv30, omega_inv31, omega_inv32, - }; - - // Quick fix for linking issue - static constexpr storage inv1 = {0x80000001, 0x7fffffff, 0x7fff2dff, 0xa9ded201, - 0x04d0ec02, 0x199cec04, 0x94cebea4, 0x39f6d3a9}; - static constexpr storage inv2 = {0x40000001, 0x3fffffff, 0x3ffec4ff, 0xfece3b02, - 0x07396203, 0x266b6206, 0x5f361df6, 0x56f23d7e}; - static constexpr storage inv3 = {0x20000001, 0x1fffffff, 0x9ffe907f, 0xa945ef82, - 0x086d9d04, 0x2cd29d07, 0xc469cd9f, 0x656ff268}; - static constexpr storage inv4 = {0x10000001, 0x0fffffff, 0xcffe763f, 0xfe81c9c2, - 0x8907ba84, 0xb0063a87, 0xf703a573, 0x6caeccdd}; - static constexpr storage inv5 = {0x08000001, 0x07ffffff, 0xe7fe691f, 0x291fb6e2, - 0xc954c945, 0xf1a00947, 0x9050915d, 0x704e3a18}; - static constexpr storage inv6 = {0x04000001, 0x03ffffff, 0xf3fe628f, 0x3e6ead72, - 0xe97b50a5, 0x126cf0a7, 0xdcf70753, 0x721df0b5}; - static constexpr storage inv7 = {0x02000001, 0x01ffffff, 0xf9fe5f47, 0x491628ba, - 0xf98e9455, 0xa2d36457, 0x834a424d, 0x7305cc04}; - static constexpr storage inv8 = 
{0x01000001, 0x00ffffff, 0xfcfe5da3, 0x4e69e65e, - 0x0198362d, 0xeb069e30, 0xd673dfca, 0x7379b9ab}; - static constexpr storage inv9 = {0x00800001, 0x007fffff, 0xfe7e5cd1, 0x5113c530, - 0x059d0719, 0x8f203b1c, 0x8008ae89, 0x73b3b07f}; - static constexpr storage inv10 = {0x00400001, 0x003fffff, 0xff3e5c68, 0x5268b499, - 0x079f6f8f, 0xe12d0992, 0x54d315e8, 0x73d0abe9}; - static constexpr storage inv11 = {0x00200001, 0x801fffff, 0x7f9e5c33, 0x53132c4e, - 0x08a0a3ca, 0x8a3370cd, 0x3f384998, 0x73df299e}; - static constexpr storage inv12 = {0x00100001, 0x400fffff, 0xbfce5c19, 0xd3686828, - 0x89213de7, 0x5eb6a46a, 0xb46ae370, 0x73e66878}; - static constexpr storage inv13 = {0x00080001, 0x2007ffff, 0xdfe65c0c, 0x93930615, - 0x49618af6, 0x48f83e39, 0xef04305c, 0x73ea07e5}; - static constexpr storage inv14 = {0x00040001, 0x9003ffff, 0x6ff25c05, 0xf3a8550c, - 0xa981b17d, 0x3e190b20, 0x8c50d6d2, 0x73ebd79c}; - static constexpr storage inv15 = {0x00020001, 0x4801ffff, 0xb7f85c02, 0xa3b2fc87, - 0x5991c4c1, 0x38a97194, 0xdaf72a0d, 0x73ecbf77}; - static constexpr storage inv16 = {0x00010001, 0xa400ffff, 0x5bfb5c00, 0x7bb85045, - 0x3199ce63, 0xb5f1a4ce, 0x824a53aa, 0x73ed3365}; - static constexpr storage inv17 = {0x00008001, 0xd2007fff, 0x2dfcdbff, 0x67bafa24, - 0x1d9dd334, 0x7495be6b, 0x55f3e879, 0x73ed6d5c}; - static constexpr storage inv18 = {0x00004001, 0x69003fff, 0x96fd9bff, 0xddbc4f13, - 0x939fd59c, 0xd3e7cb39, 0xbfc8b2e0, 0x73ed8a57}; - static constexpr storage inv19 = {0x00002001, 0x34801fff, 0x4b7dfbff, 0x18bcf98b, - 0xcea0d6d1, 0x8390d1a0, 0x74b31814, 0x73ed98d5}; - static constexpr storage inv20 = {0x00001001, 0x1a400fff, 0x25be2bff, 0x363d4ec7, - 0x6c21576b, 0x5b6554d4, 0x4f284aae, 0x73eda014}; - static constexpr storage inv21 = {0x00000801, 0x0d2007ff, 0x12de43ff, 0x44fd7965, - 0x3ae197b8, 0x474f966e, 0xbc62e3fb, 0x73eda3b3}; - static constexpr storage inv22 = {0x00000401, 0x069003ff, 0x096e4fff, 0xcc5d8eb4, - 0x2241b7de, 0xbd44b73b, 0x730030a1, 0x73eda583}; - static 
constexpr storage inv23 = {0x00000201, 0x034801ff, 0x84b655ff, 0x100d995b, - 0x95f1c7f2, 0xf83f47a1, 0x4e4ed6f4, 0x73eda66b}; - static constexpr storage inv24 = {0x00000101, 0x01a400ff, 0x425a58ff, 0xb1e59eaf, - 0xcfc9cffb, 0x95bc8fd4, 0x3bf62a1e, 0x73eda6df}; - static constexpr storage inv25 = {0x00000081, 0x00d2007f, 0x212c5a7f, 0x82d1a159, - 0x6cb5d400, 0x647b33ee, 0x32c9d3b3, 0x73eda719}; - static constexpr storage inv26 = {0x00000041, 0x0069003f, 0x10955b3f, 0xeb47a2ae, - 0x3b2bd602, 0xcbda85fb, 0x2e33a87d, 0x73eda736}; - static constexpr storage inv27 = {0x00000021, 0x0034801f, 0x8849db9f, 0x1f82a358, - 0xa266d704, 0xff8a2f01, 0xabe892e2, 0x73eda744}; - static constexpr storage inv28 = {0x00000011, 0x001a400f, 0xc4241bcf, 0xb9a023ad, - 0xd6045784, 0x99620384, 0xeac30815, 0x73eda74b}; - static constexpr storage inv29 = {0x00000009, 0x000d2007, 0x62113be7, 0x06aee3d8, - 0x6fd317c5, 0xe64dedc6, 0x8a3042ae, 0x73eda74f}; - static constexpr storage inv30 = {0x00000005, 0x00069003, 0xb107cbf3, 0x2d3643ed, - 0x3cba77e5, 0x8cc3e2e7, 0x59e6dffb, 0x73eda751}; - static constexpr storage inv31 = {0x00000003, 0x00034801, 0x588313f9, 0x4079f3f8, - 0xa32e27f5, 0xdffedd77, 0x41c22ea1, 0x73eda752}; - static constexpr storage inv32 = {0x00000002, 0x0001a400, 0xac40b7fc, 0x4a1bcbfd, - 0xd667fffd, 0x099c5abf, 0xb5afd5f5, 0x73eda752}; + {{0x00000000, 0xffffffff, 0xfffe5bfe, 0x53bda402, 0x09a1d805, 0x3339d808, 0x299d7d48, 0x73eda753}, + {0x00000001, 0xfffeffff, 0x89fb5bfe, 0x67baa400, 0x939ed334, 0xa5e80b39, 0x299d7d47, 0x73eda753}, + {0xae99502e, 0x6037fe81, 0x94b04fd8, 0x8e749036, 0xca86bf65, 0xbabc5aff, 0x5ce11044, 0x1333b22e}, + {0x7dc08d74, 0x7f847ee4, 0x04eeaf5a, 0xbd433896, 0x1832fc60, 0xd66c91d6, 0x607e449b, 0x551115b4}, + {0x4e7773cb, 0xee5bcecc, 0xf6dab086, 0x45593d6f, 0x4016e2bd, 0xa3a95d2d, 0xaf96816f, 0x047cb16c}, + {0x982b68c5, 0xb891fa3f, 0x1d426b52, 0xa41e8501, 0x882952d6, 0x566009b5, 0x7b3c79d6, 0x199cdaee}, + {0xcf28601b, 0x571ba2fc, 0xac74db12, 0x166fb582, 
0x3501370b, 0x51420be4, 0x52f970ba, 0x1996fa8d}, + {0x6a2f777a, 0xe9561c17, 0x2393991b, 0xc03cae03, 0x5a5bfd4f, 0x91b00023, 0x272e58ee, 0x6d64ed25}, + {0xf02a116e, 0xfb350dbe, 0xb4543a3e, 0x1c510ebf, 0x37ad4eca, 0xf675522e, 0x80f82b2d, 0x1907a56e}, + {0x4eb71aa6, 0xb0ad8003, 0xaa67e0be, 0x50a32c41, 0x19141f44, 0x105f0672, 0xa3dad316, 0x2bcd9508}, + {0x0f6fb2ac, 0x3dc9e560, 0x9aa58ff5, 0x3cc5bb32, 0x36f376e1, 0xdeae67bc, 0x65ba213e, 0x394fda0d}, + {0x60b82267, 0x09f239f7, 0x8b24f123, 0x14180e0e, 0x45625d95, 0xad5a5340, 0x6d174692, 0x58c3ba63}, + {0x348b416f, 0x0acf21c2, 0xbc086439, 0x798b6bf6, 0xb1ca111d, 0x222d411f, 0x30ba1e0f, 0x044107b7}, + {0x014abe84, 0xa3b861b8, 0x427ed008, 0x37c017e4, 0xae0ff4f5, 0xae51f613, 0xcb1218d3, 0x1a2d00e1}, + {0x4de7eb2b, 0x48aaa3bf, 0x6772057d, 0x4a58d54d, 0x7093b551, 0xce25f16c, 0xd206337c, 0x242150ac}, + {0x9ed57ae5, 0xdf3ec9ae, 0x7166577f, 0xea7df73a, 0x022fbbe4, 0x6ca8d281, 0x151e3f6b, 0x5850c003}, + {0x645e1cfa, 0x903a0a0c, 0x34788c37, 0xfbac54cb, 0x8cf73d78, 0xdc127d11, 0x975d3c82, 0x6d0b5c7c}, + {0x14b1ba04, 0xb49d6b05, 0xf00b84f2, 0x56e466b4, 0x0b904f22, 0x30c390cf, 0x3ee254cc, 0x3e11cfb7}, + {0xbe8201ab, 0x84dfa547, 0x530715d2, 0x3887ce8b, 0x3eed4ed7, 0xa4c719c6, 0x8f8007b4, 0x18c44950}, + {0x7d813cd1, 0xdaf0346d, 0xf755beb1, 0xeccf6f9a, 0xe08143e3, 0x167fce38, 0x6f5d6dfa, 0x545ad9b2}, + {0x577605de, 0x973f5466, 0x974f953c, 0x0ce8986e, 0x074382f9, 0x8941cf4b, 0x6fa2672c, 0x156cd7f6}, + {0x33b66141, 0x24315404, 0x1992f584, 0x5d1375ab, 0x8b20ca1a, 0xf193ffa6, 0x2701a503, 0x47880cd5}, + {0xe9f7b9af, 0xf7b6847d, 0x62c83ce2, 0x9a339673, 0x6e5e6f79, 0xfabf4537, 0x35af33a3, 0x0975acd9}, + {0x0eddd248, 0x4fb4204a, 0xc9e509b3, 0x8c98706a, 0x2bb27eb1, 0xd0be8987, 0xc831438b, 0x6ec5f960}, + {0x20238f62, 0xa13c95b7, 0x83b476b9, 0x130aa097, 0x14860881, 0x758a04e0, 0x97066493, 0x58e2f8d6}, + {0xe8bff41e, 0x65b09c73, 0x37f1c6a3, 0x8b3280e8, 0x2846fb21, 0xe17b82ce, 0xb1ae27df, 0x476534bf}, + {0xd5fdb757, 0x8480c0e7, 0x365bf9fd, 0x3644eea0, 
0xb776be86, 0x4ca116ca, 0x8b58390c, 0x17b6395f}, + {0x252eb0db, 0x2c811e9a, 0x7479e161, 0x1b7d960d, 0xb0a89a26, 0xb3afc7c1, 0x32b5e793, 0x6a2f9533}, + {0x08b8a7ad, 0xe877b2c4, 0x341652b4, 0x68b0e8f0, 0xe8b6a2d9, 0x2d44da3b, 0xfd09be59, 0x092778ff}, + {0x7988f244, 0x84a1aa6f, 0x24faf63f, 0xa164b3d9, 0xc1bbb915, 0x7aae9724, 0xf386c0d2, 0x24e5d287}, + {0x41a1b30c, 0xa70a7efd, 0x39f0e511, 0xc49c55a5, 0x033bb323, 0xab307a8f, 0x17acbd7f, 0x0158abd6}, + {0x0f642025, 0x2c228b30, 0x01bd882b, 0xb0878e8d, 0xd7377fea, 0xd862b255, 0xf0490536, 0x18ac3666}}}; static constexpr storage_array inv = { - inv1, inv2, inv3, inv4, inv5, inv6, inv7, inv8, inv9, inv10, inv11, inv12, inv13, inv14, inv15, inv16, - inv17, inv18, inv19, inv20, inv21, inv22, inv23, inv24, inv25, inv26, inv27, inv28, inv29, inv30, inv31, inv32, - }; + {{0x80000001, 0x7fffffff, 0x7fff2dff, 0xa9ded201, 0x04d0ec02, 0x199cec04, 0x94cebea4, 0x39f6d3a9}, + {0x40000001, 0x3fffffff, 0x3ffec4ff, 0xfece3b02, 0x07396203, 0x266b6206, 0x5f361df6, 0x56f23d7e}, + {0x20000001, 0x1fffffff, 0x9ffe907f, 0xa945ef82, 0x086d9d04, 0x2cd29d07, 0xc469cd9f, 0x656ff268}, + {0x10000001, 0x0fffffff, 0xcffe763f, 0xfe81c9c2, 0x8907ba84, 0xb0063a87, 0xf703a573, 0x6caeccdd}, + {0x08000001, 0x07ffffff, 0xe7fe691f, 0x291fb6e2, 0xc954c945, 0xf1a00947, 0x9050915d, 0x704e3a18}, + {0x04000001, 0x03ffffff, 0xf3fe628f, 0x3e6ead72, 0xe97b50a5, 0x126cf0a7, 0xdcf70753, 0x721df0b5}, + {0x02000001, 0x01ffffff, 0xf9fe5f47, 0x491628ba, 0xf98e9455, 0xa2d36457, 0x834a424d, 0x7305cc04}, + {0x01000001, 0x00ffffff, 0xfcfe5da3, 0x4e69e65e, 0x0198362d, 0xeb069e30, 0xd673dfca, 0x7379b9ab}, + {0x00800001, 0x007fffff, 0xfe7e5cd1, 0x5113c530, 0x059d0719, 0x8f203b1c, 0x8008ae89, 0x73b3b07f}, + {0x00400001, 0x003fffff, 0xff3e5c68, 0x5268b499, 0x079f6f8f, 0xe12d0992, 0x54d315e8, 0x73d0abe9}, + {0x00200001, 0x801fffff, 0x7f9e5c33, 0x53132c4e, 0x08a0a3ca, 0x8a3370cd, 0x3f384998, 0x73df299e}, + {0x00100001, 0x400fffff, 0xbfce5c19, 0xd3686828, 0x89213de7, 0x5eb6a46a, 
0xb46ae370, 0x73e66878}, + {0x00080001, 0x2007ffff, 0xdfe65c0c, 0x93930615, 0x49618af6, 0x48f83e39, 0xef04305c, 0x73ea07e5}, + {0x00040001, 0x9003ffff, 0x6ff25c05, 0xf3a8550c, 0xa981b17d, 0x3e190b20, 0x8c50d6d2, 0x73ebd79c}, + {0x00020001, 0x4801ffff, 0xb7f85c02, 0xa3b2fc87, 0x5991c4c1, 0x38a97194, 0xdaf72a0d, 0x73ecbf77}, + {0x00010001, 0xa400ffff, 0x5bfb5c00, 0x7bb85045, 0x3199ce63, 0xb5f1a4ce, 0x824a53aa, 0x73ed3365}, + {0x00008001, 0xd2007fff, 0x2dfcdbff, 0x67bafa24, 0x1d9dd334, 0x7495be6b, 0x55f3e879, 0x73ed6d5c}, + {0x00004001, 0x69003fff, 0x96fd9bff, 0xddbc4f13, 0x939fd59c, 0xd3e7cb39, 0xbfc8b2e0, 0x73ed8a57}, + {0x00002001, 0x34801fff, 0x4b7dfbff, 0x18bcf98b, 0xcea0d6d1, 0x8390d1a0, 0x74b31814, 0x73ed98d5}, + {0x00001001, 0x1a400fff, 0x25be2bff, 0x363d4ec7, 0x6c21576b, 0x5b6554d4, 0x4f284aae, 0x73eda014}, + {0x00000801, 0x0d2007ff, 0x12de43ff, 0x44fd7965, 0x3ae197b8, 0x474f966e, 0xbc62e3fb, 0x73eda3b3}, + {0x00000401, 0x069003ff, 0x096e4fff, 0xcc5d8eb4, 0x2241b7de, 0xbd44b73b, 0x730030a1, 0x73eda583}, + {0x00000201, 0x034801ff, 0x84b655ff, 0x100d995b, 0x95f1c7f2, 0xf83f47a1, 0x4e4ed6f4, 0x73eda66b}, + {0x00000101, 0x01a400ff, 0x425a58ff, 0xb1e59eaf, 0xcfc9cffb, 0x95bc8fd4, 0x3bf62a1e, 0x73eda6df}, + {0x00000081, 0x00d2007f, 0x212c5a7f, 0x82d1a159, 0x6cb5d400, 0x647b33ee, 0x32c9d3b3, 0x73eda719}, + {0x00000041, 0x0069003f, 0x10955b3f, 0xeb47a2ae, 0x3b2bd602, 0xcbda85fb, 0x2e33a87d, 0x73eda736}, + {0x00000021, 0x0034801f, 0x8849db9f, 0x1f82a358, 0xa266d704, 0xff8a2f01, 0xabe892e2, 0x73eda744}, + {0x00000011, 0x001a400f, 0xc4241bcf, 0xb9a023ad, 0xd6045784, 0x99620384, 0xeac30815, 0x73eda74b}, + {0x00000009, 0x000d2007, 0x62113be7, 0x06aee3d8, 0x6fd317c5, 0xe64dedc6, 0x8a3042ae, 0x73eda74f}, + {0x00000005, 0x00069003, 0xb107cbf3, 0x2d3643ed, 0x3cba77e5, 0x8cc3e2e7, 0x59e6dffb, 0x73eda751}, + {0x00000003, 0x00034801, 0x588313f9, 0x4079f3f8, 0xa32e27f5, 0xdffedd77, 0x41c22ea1, 0x73eda752}, + {0x00000002, 0x0001a400, 0xac40b7fc, 0x4a1bcbfd, 0xd667fffd, 0x099c5abf, 
0xb5afd5f5, 0x73eda752}}}; }; struct fq_config { - // field structure size = 12 * 32 bit static constexpr unsigned limbs_count = 12; - // modulus = - // 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787 + static constexpr unsigned modulus_bit_count = 381; + static constexpr unsigned num_of_reductions = 1; static constexpr storage modulus = {0xffffaaab, 0xb9feffff, 0xb153ffff, 0x1eabfffe, 0xf6b0f624, 0x6730d2a0, 0xf38512bf, 0x64774b84, 0x434bacd7, 0x4b1ba7b6, 0x397fe69a, 0x1a0111ea}; - // modulus*2 = - // 8004819110443334786835579651471808313113765639878015770664116272248063300981675728885375258258031328075788545119574 static constexpr storage modulus_2 = {0xffff5556, 0x73fdffff, 0x62a7ffff, 0x3d57fffd, 0xed61ec48, 0xce61a541, 0xe70a257e, 0xc8ee9709, 0x869759ae, 0x96374f6c, 0x72ffcd34, 0x340223d4}; - // modulus*4 = - // 16009638220886669573671159302943616626227531279756031541328232544496126601963351457770750516516062656151577090239148 static constexpr storage modulus_4 = {0xfffeaaac, 0xe7fbffff, 0xc54ffffe, 0x7aaffffa, 0xdac3d890, 0x9cc34a83, 0xce144afd, 0x91dd2e13, - 0xd2eb35d, 0x2c6e9ed9, 0xe5ff9a69, 0x680447a8}; - + 0x0d2eb35d, 0x2c6e9ed9, 0xe5ff9a69, 0x680447a8}; + static constexpr storage neg_modulus = {0x00005555, 0x46010000, 0x4eac0000, 0xe1540001, + 0x094f09db, 0x98cf2d5f, 0x0c7aed40, 0x9b88b47b, + 0xbcb45328, 0xb4e45849, 0xc6801965, 0xe5feee15}; static constexpr storage<2 * limbs_count> modulus_wide = { 0xffffaaab, 0xb9feffff, 0xb153ffff, 0x1eabfffe, 0xf6b0f624, 0x6730d2a0, 0xf38512bf, 0x64774b84, 0x434bacd7, 0x4b1ba7b6, 0x397fe69a, 0x1a0111ea, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; - - // modulus^2 static constexpr storage<2 * limbs_count> modulus_squared = { 0x1c718e39, 0x26aa0000, 0x76382eab, 0x7ced6b1d, 0x62113cfd, 0x162c3383, 0x3e71b743, 0x66bf91ed, 0x7091a049, 0x292e85a8, 
0x86185c7b, 0x1d68619c, 0x0978ef01, 0xf5314933, 0x16ddca6e, 0x50a62cfd, 0x349e8bd0, 0x66e59e49, 0x0e7046b4, 0xe2dc90e5, 0xa22f25e9, 0x4bd278ea, 0xb8c35fc7, 0x02a437a4}; - // 2*modulus^2 static constexpr storage<2 * limbs_count> modulus_squared_2 = { 0x38e31c72, 0x4d540000, 0xec705d56, 0xf9dad63a, 0xc42279fa, 0x2c586706, 0x7ce36e86, 0xcd7f23da, 0xe1234092, 0x525d0b50, 0x0c30b8f6, 0x3ad0c339, 0x12f1de02, 0xea629266, 0x2dbb94dd, 0xa14c59fa, 0x693d17a0, 0xcdcb3c92, 0x1ce08d68, 0xc5b921ca, 0x445e4bd3, 0x97a4f1d5, 0x7186bf8e, 0x05486f49}; - // 4*modulus^2 static constexpr storage<2 * limbs_count> modulus_squared_4 = { 0x71c638e4, 0x9aa80000, 0xd8e0baac, 0xf3b5ac75, 0x8844f3f5, 0x58b0ce0d, 0xf9c6dd0c, 0x9afe47b4, 0xc2468125, 0xa4ba16a1, 0x186171ec, 0x75a18672, 0x25e3bc04, 0xd4c524cc, 0x5b7729bb, 0x4298b3f4, 0xd27a2f41, 0x9b967924, 0x39c11ad1, 0x8b724394, 0x88bc97a7, 0x2f49e3aa, 0xe30d7f1d, 0x0a90de92}; - static constexpr unsigned modulus_bit_count = 381; - // m = floor(2^(2*modulus_bit_count) / modulus) static constexpr storage m = {0xd59646e8, 0xec4f881f, 0x8163c701, 0x4e65c59e, 0x80a19de7, 0x2f7d1dc7, 0x7fda82a5, 0xa46e09d0, 0x331e9ae8, 0x38a0406c, 0xcf327917, 0x2760d74b}; static constexpr storage one = {0x00000001, 0x00000000, 0x00000000, 0x00000000, @@ -371,37 +186,38 @@ namespace bls12_381 { static constexpr storage zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; - static constexpr storage montgomery_r = {0x0005555, 0x60100000, 0xeac00004, 0x15400014, - 0x94f09dbe, 0x8cf2d5f0, 0xc7aed409, 0xb88b47b0, - 0xcb453289, 0x4e45849b, 0x6801965b, 0x5feee15c}; - static constexpr storage montgomery_r_inv = {0x05c40fe, 0xaa212c9c, 0xccfd7e14, 0x70093ae9, - 0xc85a96b4, 0x6d05c02d, 0x025fecd3, 0x1f193851, - 0xeb48f4c6, 0x84d32f44, 0xed8ffb1a, 0xbefcc91e}; + static constexpr storage montgomery_r = {0x0002fffd, 0x76090000, 0xc40c0002, 0xebf4000b, + 0x53c758ba, 0x5f489857, 
0x70525745, 0x77ce5853, + 0xa256ec6d, 0x5c071a97, 0xfa80e493, 0x15f65ec3}; + static constexpr storage montgomery_r_inv = {0x380b4820, 0xf4d38259, 0xd898fafb, 0x7fe11274, + 0x14956dc8, 0x343ea979, 0x58a88de9, 0x1797ab14, + 0x3c4f538b, 0xed5e6427, 0xe8fb0ce9, 0x14fec701}; // i^2, the square of the imaginary unit for the extension field static constexpr uint32_t i_squared = 1; // true if i^2 is negative static constexpr bool i_squared_is_negative = true; - // G1 and G2 generators - static constexpr storage g1_gen_x = {0xdb22c6bb, 0xfb3af00a, 0xf97a1aef, 0x6c55e83f, - 0x171bac58, 0xa14e3a3f, 0x9774b905, 0xc3688c4f, - 0x4fa9ac0f, 0x2695638c, 0x3197d794, 0x17f1d3a7}; - static constexpr storage g1_gen_y = {0x46c5e7e1, 0x0caa2329, 0xa2888ae4, 0xd03cc744, - 0x2c04b3ed, 0x00db18cb, 0xd5d00af6, 0xfcf5e095, - 0x741d8ae4, 0xa09e30ed, 0xe3aaa0f1, 0x08b3f481}; - static constexpr storage g2_gen_x_re = {0xc121bdb8, 0xd48056c8, 0xa805bbef, 0x0bac0326, - 0x7ae3d177, 0xb4510b64, 0xfa403b02, 0xc6e47ad4, - 0x2dc51051, 0x26080527, 0xf08f0a91, 0x024aa2b2}; - static constexpr storage g2_gen_x_im = {0x5d042b7e, 0xe5ac7d05, 0x13945d57, 0x334cf112, - 0xdc7f5049, 0xb5da61bb, 0x9920b61a, 0x596bd0d0, - 0x88274f65, 0x7dacd3a0, 0x52719f60, 0x13e02b60}; - static constexpr storage g2_gen_y_re = {0x08b82801, 0xe1935486, 0x3baca289, 0x923ac9cc, - 0x5160d12c, 0x6d429a69, 0x8cbdd3a7, 0xadfd9baa, - 0xda2e351a, 0x8cc9cdc6, 0x727d6e11, 0x0ce5d527}; - static constexpr storage g2_gen_y_im = {0xf05f79be, 0xaaa9075f, 0x5cec1da1, 0x3f370d27, - 0x572e99ab, 0x267492ab, 0x85a763af, 0xcb3e287e, - 0x2bc28b99, 0x32acd2b0, 0x2ea734cc, 0x0606c4a0}; }; + // G1 and G2 generators + static constexpr storage g1_gen_x = {0xdb22c6bb, 0xfb3af00a, 0xf97a1aef, 0x6c55e83f, + 0x171bac58, 0xa14e3a3f, 0x9774b905, 0xc3688c4f, + 0x4fa9ac0f, 0x2695638c, 0x3197d794, 0x17f1d3a7}; + static constexpr storage g1_gen_y = {0x46c5e7e1, 0x0caa2329, 0xa2888ae4, 0xd03cc744, + 0x2c04b3ed, 0x00db18cb, 0xd5d00af6, 0xfcf5e095, + 0x741d8ae4, 
0xa09e30ed, 0xe3aaa0f1, 0x08b3f481}; + static constexpr storage g2_gen_x_re = {0xc121bdb8, 0xd48056c8, 0xa805bbef, 0x0bac0326, + 0x7ae3d177, 0xb4510b64, 0xfa403b02, 0xc6e47ad4, + 0x2dc51051, 0x26080527, 0xf08f0a91, 0x024aa2b2}; + static constexpr storage g2_gen_x_im = {0x5d042b7e, 0xe5ac7d05, 0x13945d57, 0x334cf112, + 0xdc7f5049, 0xb5da61bb, 0x9920b61a, 0x596bd0d0, + 0x88274f65, 0x7dacd3a0, 0x52719f60, 0x13e02b60}; + static constexpr storage g2_gen_y_re = {0x08b82801, 0xe1935486, 0x3baca289, 0x923ac9cc, + 0x5160d12c, 0x6d429a69, 0x8cbdd3a7, 0xadfd9baa, + 0xda2e351a, 0x8cc9cdc6, 0x727d6e11, 0x0ce5d527}; + static constexpr storage g2_gen_y_im = {0xf05f79be, 0xaaa9075f, 0x5cec1da1, 0x3f370d27, + 0x572e99ab, 0x267492ab, 0x85a763af, 0xcb3e287e, + 0x2bc28b99, 0x32acd2b0, 0x2ea734cc, 0x0606c4a0}; + static constexpr storage weierstrass_b = {0x00000004, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; diff --git a/icicle/curves/bn254_params.cuh b/icicle/curves/bn254_params.cuh index 0708a15dd..d06474a38 100644 --- a/icicle/curves/bn254_params.cuh +++ b/icicle/curves/bn254_params.cuh @@ -9,6 +9,7 @@ namespace bn254 { static constexpr unsigned limbs_count = 8; static constexpr unsigned omegas_count = 28; static constexpr unsigned modulus_bit_count = 254; + static constexpr unsigned num_of_reductions = 1; static constexpr storage modulus = {0xf0000001, 0x43e1f593, 0x79b97091, 0x2833e848, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72}; @@ -16,6 +17,8 @@ namespace bn254 { 0x0302b0ba, 0x70a08b6d, 0xc2634053, 0x60c89ce5}; static constexpr storage modulus_4 = {0xc0000004, 0x0f87d64f, 0xe6e5c245, 0xa0cfa121, 0x06056174, 0xe14116da, 0x84c680a6, 0xc19139cb}; + static constexpr storage neg_modulus = {0x0fffffff, 0xbc1e0a6c, 0x86468f6e, 0xd7cc17b7, + 0x7e7ea7a2, 0x47afba49, 0x1ece5fd6, 0xcf9bb18d}; static constexpr storage<2 * limbs_count> modulus_wide = { 0xf0000001, 0x43e1f593, 0x79b97091, 0x2833e848, 
0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; @@ -36,8 +39,8 @@ namespace bn254 { static constexpr storage zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; static constexpr storage montgomery_r = {0x4ffffffb, 0xac96341c, 0x9f60cd29, 0x36fc7695, - 0x7879462e, 0x666ea36f, 0x9a07df2f, 0xe0a77c1}; - static constexpr storage montgomery_r_inv = {0x6db1194e, 0xdc5ba005, 0xe111ec87, 0x90ef5a9, + 0x7879462e, 0x666ea36f, 0x9a07df2f, 0x0e0a77c1}; + static constexpr storage montgomery_r_inv = {0x6db1194e, 0xdc5ba005, 0xe111ec87, 0x090ef5a9, 0xaeb85d5d, 0xc8260de4, 0x82c5551c, 0x15ebf951}; static constexpr storage_array omega = { @@ -134,12 +137,15 @@ namespace bn254 { struct fq_config { static constexpr unsigned limbs_count = 8; static constexpr unsigned modulus_bit_count = 254; + static constexpr unsigned num_of_reductions = 1; static constexpr storage modulus = {0xd87cfd47, 0x3c208c16, 0x6871ca8d, 0x97816a91, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72}; static constexpr storage modulus_2 = {0xb0f9fa8e, 0x7841182d, 0xd0e3951a, 0x2f02d522, 0x0302b0bb, 0x70a08b6d, 0xc2634053, 0x60c89ce5}; static constexpr storage modulus_4 = {0x61f3f51c, 0xf082305b, 0xa1c72a34, 0x5e05aa45, 0x06056176, 0xe14116da, 0x84c680a6, 0xc19139cb}; + static constexpr storage neg_modulus = {0x278302b9, 0xc3df73e9, 0x978e3572, 0x687e956e, + 0x7e7ea7a2, 0x47afba49, 0x1ece5fd6, 0xcf9bb18d}; static constexpr storage<2 * limbs_count> modulus_wide = { 0xd87cfd47, 0x3c208c16, 0x6871ca8d, 0x97816a91, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; @@ -158,30 +164,30 @@ namespace bn254 { 0x00000000, 0x00000000, 0x00000000, 0x00000000}; static constexpr storage zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; 
- static constexpr storage montgomery_r = {0xc58f0d9d, 0xd35d438d, 0xf5c70b3d, 0xa78eb28, - 0x7879462c, 0x666ea36f, 0x9a07df2f, 0xe0a77c1}; - static constexpr storage montgomery_r_inv = {0x14afa37, 0xed84884a, 0x278edf8, 0xeb202285, + static constexpr storage montgomery_r = {0xc58f0d9d, 0xd35d438d, 0xf5c70b3d, 0x0a78eb28, + 0x7879462c, 0x666ea36f, 0x9a07df2f, 0x0e0a77c1}; + static constexpr storage montgomery_r_inv = {0x014afa37, 0xed84884a, 0x0278edf8, 0xeb202285, 0xb74492d9, 0xcf63e9cf, 0x59e5c639, 0x2e671571}; - // i^2, the square of the imaginary unit for the extension field static constexpr uint32_t i_squared = 1; // true if i^2 is negative static constexpr bool i_squared_is_negative = true; - // G1 and G2 generators - static constexpr storage g1_gen_x = {0x00000001, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000}; - static constexpr storage g1_gen_y = {0x00000002, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000}; - static constexpr storage g2_gen_x_re = {0xd992f6ed, 0x46debd5c, 0xf75edadd, 0x674322d4, - 0x5e5c4479, 0x426a0066, 0x121f1e76, 0x1800deef}; - static constexpr storage g2_gen_x_im = {0xaef312c2, 0x97e485b7, 0x35a9e712, 0xf1aa4933, - 0x31fb5d25, 0x7260bfb7, 0x920d483a, 0x198e9393}; - static constexpr storage g2_gen_y_re = {0x66fa7daa, 0x4ce6cc01, 0x0c43d37b, 0xe3d1e769, - 0x8dcb408f, 0x4aab7180, 0xdb8c6deb, 0x12c85ea5}; - static constexpr storage g2_gen_y_im = {0xd122975b, 0x55acdadc, 0x70b38ef3, 0xbc4b3133, - 0x690c3395, 0xec9e99ad, 0x585ff075, 0x090689d0}; }; + // G1 and G2 generators + static constexpr storage g1_gen_x = {0x00000001, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + static constexpr storage g1_gen_y = {0x00000002, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + static constexpr storage g2_gen_x_re = {0xd992f6ed, 0x46debd5c, 0xf75edadd, 0x674322d4, + 0x5e5c4479, 0x426a0066, 
0x121f1e76, 0x1800deef}; + static constexpr storage g2_gen_x_im = {0xaef312c2, 0x97e485b7, 0x35a9e712, 0xf1aa4933, + 0x31fb5d25, 0x7260bfb7, 0x920d483a, 0x198e9393}; + static constexpr storage g2_gen_y_re = {0x66fa7daa, 0x4ce6cc01, 0x0c43d37b, 0xe3d1e769, + 0x8dcb408f, 0x4aab7180, 0xdb8c6deb, 0x12c85ea5}; + static constexpr storage g2_gen_y_im = {0xd122975b, 0x55acdadc, 0x70b38ef3, 0xbc4b3133, + 0x690c3395, 0xec9e99ad, 0x585ff075, 0x090689d0}; + static constexpr storage weierstrass_b = {0x00000003, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; static constexpr storage weierstrass_b_g2_re = { diff --git a/icicle/curves/bw6_761_params.cuh b/icicle/curves/bw6_761_params.cuh new file mode 100644 index 000000000..d9ab08b3b --- /dev/null +++ b/icicle/curves/bw6_761_params.cuh @@ -0,0 +1,106 @@ +#pragma once +#ifndef BW6_761_PARAMS_H +#define BW6_761_PARAMS_H + +#include "../utils/storage.cuh" + +namespace bw6_761 { + struct fq_config { + static constexpr unsigned limbs_count = 24; + static constexpr unsigned modulus_bit_count = 761; + static constexpr unsigned num_of_reductions = 1; + static constexpr storage modulus = { + 0x0000008b, 0xf49d0000, 0x70000082, 0xe6913e68, 0xeaf0a437, 0x160cf8ae, 0x5667a8f8, 0x98a116c2, + 0x73ebff2e, 0x71dcd3dc, 0x12f9fd90, 0x8689c8ed, 0x25b42304, 0x03cebaff, 0xe584e919, 0x707ba638, + 0x8087be41, 0x528275ef, 0x81d14688, 0xb926186a, 0x04faff3e, 0xd187c940, 0xfb83ce0a, 0x0122e824}; + static constexpr storage modulus_2 = { + 0x00000116, 0xe93a0000, 0xe0000105, 0xcd227cd0, 0xd5e1486f, 0x2c19f15d, 0xaccf51f0, 0x31422d84, + 0xe7d7fe5d, 0xe3b9a7b8, 0x25f3fb20, 0x0d1391da, 0x4b684609, 0x079d75fe, 0xcb09d232, 0xe0f74c71, + 0x010f7c82, 0xa504ebdf, 0x03a28d10, 0x724c30d5, 0x09f5fe7d, 0xa30f9280, 0xf7079c15, 0x0245d049}; + static constexpr storage modulus_4 = { + 0x0000022c, 0xd2740000, 0xc000020b, 0x9a44f9a1, 0xabc290df, 0x5833e2bb, 0x599ea3e0, 0x62845b09, + 0xcfaffcba, 0xc7734f71, 0x4be7f641, 0x1a2723b4, 
0x96d08c12, 0x0f3aebfc, 0x9613a464, 0xc1ee98e3, + 0x021ef905, 0x4a09d7be, 0x07451a21, 0xe49861aa, 0x13ebfcfa, 0x461f2500, 0xee0f382b, 0x048ba093}; + static constexpr storage neg_modulus = { + 0xffffff75, 0x0b62ffff, 0x8fffff7d, 0x196ec197, 0x150f5bc8, 0xe9f30751, 0xa9985707, 0x675ee93d, + 0x8c1400d1, 0x8e232c23, 0xed06026f, 0x79763712, 0xda4bdcfb, 0xfc314500, 0x1a7b16e6, 0x8f8459c7, + 0x7f7841be, 0xad7d8a10, 0x7e2eb977, 0x46d9e795, 0xfb0500c1, 0x2e7836bf, 0x047c31f5, 0xfedd17db}; + static constexpr storage<2 * limbs_count> modulus_wide = { + 0x0000008b, 0xf49d0000, 0x70000082, 0xe6913e68, 0xeaf0a437, 0x160cf8ae, 0x5667a8f8, 0x98a116c2, + 0x73ebff2e, 0x71dcd3dc, 0x12f9fd90, 0x8689c8ed, 0x25b42304, 0x03cebaff, 0xe584e919, 0x707ba638, + 0x8087be41, 0x528275ef, 0x81d14688, 0xb926186a, 0x04faff3e, 0xd187c940, 0xfb83ce0a, 0x0122e824, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + static constexpr storage<2 * limbs_count> modulus_squared = { + 0x00004b79, 0xa27e0000, 0xa0008e35, 0xbae96db2, 0x82ebf7b1, 0x4aaf1d22, 0x7224cb3d, 0x7908fd92, + 0x29b17ed1, 0x6fe68290, 0xafc968db, 0xfe1b7282, 0x9028bbf0, 0xe1e548cb, 0x3a8ffc03, 0x09094ed6, + 0x61e9cf95, 0xd63ea631, 0x54918abf, 0xe834ca62, 0x52aa651e, 0xe52594ed, 0xb4c46a4f, 0xe2423252, + 0x6c09aae4, 0xa8cf17d8, 0xc5f5cee5, 0x2d80ffb0, 0x55bbc10d, 0x2dede100, 0xe2360382, 0x1f4e7a7c, + 0xae2fe433, 0x586c3847, 0x78eadae1, 0x915c56e1, 0x69a5ce00, 0xa35b2945, 0x767c08ca, 0x9d66e7fe, + 0xd8b88c77, 0x7e44cf6a, 0x67c9c873, 0xb29bfc93, 0xbbc80af9, 0x6a24005a, 0xc64ce3d5, 0x00014a92}; + static constexpr storage<2 * limbs_count> modulus_squared_2 = { + 0x000096f2, 0x44fc0000, 0x40011c6b, 0x75d2db65, 0x05d7ef63, 0x955e3a45, 0xe449967a, 0xf211fb24, + 0x5362fda2, 0xdfcd0520, 0x5f92d1b6, 
0xfc36e505, 0x205177e1, 0xc3ca9197, 0x751ff807, 0x12129dac, + 0xc3d39f2a, 0xac7d4c62, 0xa923157f, 0xd06994c4, 0xa554ca3d, 0xca4b29da, 0x6988d49f, 0xc48464a5, + 0xd81355c9, 0x519e2fb0, 0x8beb9dcb, 0x5b01ff61, 0xab77821a, 0x5bdbc200, 0xc46c0704, 0x3e9cf4f9, + 0x5c5fc866, 0xb0d8708f, 0xf1d5b5c2, 0x22b8adc2, 0xd34b9c01, 0x46b6528a, 0xecf81195, 0x3acdcffc, + 0xb17118ef, 0xfc899ed5, 0xcf9390e6, 0x6537f926, 0x779015f3, 0xd44800b5, 0x8c99c7aa, 0x00029525}; + static constexpr storage<2 * limbs_count> modulus_squared_4 = { + 0x00012de4, 0x89f80000, 0x800238d6, 0xeba5b6ca, 0x0bafdec6, 0x2abc748a, 0xc8932cf5, 0xe423f649, + 0xa6c5fb45, 0xbf9a0a40, 0xbf25a36d, 0xf86dca0a, 0x40a2efc3, 0x8795232e, 0xea3ff00f, 0x24253b58, + 0x87a73e54, 0x58fa98c5, 0x52462aff, 0xa0d32989, 0x4aa9947b, 0x949653b5, 0xd311a93f, 0x8908c94a, + 0xb026ab93, 0xa33c5f61, 0x17d73b96, 0xb603fec3, 0x56ef0434, 0xb7b78401, 0x88d80e08, 0x7d39e9f3, + 0xb8bf90cc, 0x61b0e11e, 0xe3ab6b85, 0x45715b85, 0xa6973802, 0x8d6ca515, 0xd9f0232a, 0x759b9ff9, + 0x62e231de, 0xf9133dab, 0x9f2721cd, 0xca6ff24d, 0xef202be6, 0xa890016a, 0x19338f55, 0x00052a4b}; + static constexpr storage m = {0x2507e899, 0x11629ccd, 0x2e4424dd, 0xab1eef5b, 0x481d2cfa, 0xb82146a9, + 0x34e4227b, 0xf3182afa, 0xbeb25621, 0xf615fdb5, 0xccc261d6, 0xc4d8988c, + 0xaaf4fab0, 0x3590d652, 0x2ab9ff30, 0x9c5d0a04, 0x6ec3f460, 0xf6e8534f, + 0x88075ab4, 0xe8d78b06, 0x6f3fc8fe, 0xa8d3675b, 0x7bc5cd4b, 0x03852086}; + static constexpr storage one = { + 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + static constexpr storage zero = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + static constexpr storage montgomery_r = { + 0xffff85d5, 0x0202ffff, 0x8fff8ce7, 0x5a582635, 0x827faade, 0x9e996e43, 0x0ee47df4, 0xda6aff32, + 0x1d94b80b, 0xece9cb3e, 0x5248240b, 0xc0e667a2, 0xdcad3905, 0xa74da5bf, 0x462f2103, 0x2352e7fe, + 0x08b1c87c, 0x7b565880, 0xe711022f, 0x45848a63, 0x9f65a9df, 0xd7a81ebb, 0xf127e87d, 0x0051f77e}; + static constexpr storage montgomery_r_inv = { + 0x181fa3f1, 0x27c2b2a0, 0x25a0e1b8, 0x7d9ca9f9, 0x0a004a5d, 0x35a910f0, 0xdb6b8539, 0x54655b3f, + 0x7695ef18, 0x5e763565, 0x4fae56bb, 0x226022c2, 0xb70d7652, 0x80e7f067, 0x72116b89, 0x435a8b4a, + 0x5d84e0d4, 0xac258fd6, 0x4427c7b2, 0x47ee8ac5, 0xd04e621b, 0x478c4048, 0x2add3e93, 0x00e0aa7d}; + }; + + // G1 and G2 generators + static constexpr storage g1_gen_x = { + 0x66e5b43d, 0x4088f3af, 0xa6af603f, 0x055928ac, 0x56133e82, 0x6750dd03, 0x280ca27f, 0x03758f9a, + 0xc9ea0971, 0x5bd71fa0, 0x47729b90, 0xa17a54ce, 0x94c2e746, 0x11dbfcd2, 0xc15520ac, 0x79017ffa, + 0x85f56fc7, 0xee05c54b, 0x551b27f0, 0xe6a0cfb7, 0xa477beae, 0xb277ce98, 0x0ea190c8, 0x01075b02}; + static constexpr storage g1_gen_y = { + 0xb4e95363, 0xbafc8f2d, 0x0b20d2a1, 0xad1cb2be, 0xcad0fb93, 0xb2b08119, 0xb3053253, 0x9f9df141, + 0x6fc2cdd4, 0xbe3fb90b, 0x717a4c55, 0xcc685d31, 0x71b5b806, 0xc5b8fa17, 0xaf7e0dba, 0x265909f1, + 0xa2e573a3, 0x1a7348d2, 0x884c9ec6, 0x0f952589, 0x45cc2a42, 0xe6fd637b, 0x0a6fc574, 0x0058b84e}; + static constexpr storage g2_gen_x = { + 0xcd025f1c, 0xa830c194, 0xe1bf995b, 0x6410cf4f, 0xc2ad54b0, 0x00e96efb, 0x3cd208d7, 0xce6948cb, + 0x00e1b6ba, 0x963317a3, 0xac70e7c7, 0xc5bbcae9, 0xf09feb58, 0x734ec3f1, 0xab3da268, 0x26b41c5d, + 0x13890f6d, 0x4c062010, 0xc5a7115f, 0xd61053aa, 0x69d660f9, 0xc852a82e, 0x41d9b816, 0x01101332}; + static constexpr storage g2_gen_y = { + 0x28c73b61, 0xeb70a167, 0xf9eac689, 0x91ec0594, 0x3c5a02a5, 0x58aa2d3a, 0x504affc7, 0x3ea96fcd, + 0xffa82300, 0x8906c170, 0xd2c712b8, 
0x64f293db, 0x33293fef, 0x94c97eb7, 0x0b95a59c, 0x0a1d86c8, + 0x53ffe316, 0x81a78e27, 0xcec2181c, 0x26b7cf9a, 0xe4b6d2dc, 0x8179eb10, 0x7761369f, 0x0017c335}; + + static constexpr storage weierstrass_b = { + 0x0000008a, 0xf49d0000, 0x70000082, 0xe6913e68, 0xeaf0a437, 0x160cf8ae, 0x5667a8f8, 0x98a116c2, + 0x73ebff2e, 0x71dcd3dc, 0x12f9fd90, 0x8689c8ed, 0x25b42304, 0x03cebaff, 0xe584e919, 0x707ba638, + 0x8087be41, 0x528275ef, 0x81d14688, 0xb926186a, 0x04faff3e, 0xd187c940, 0xfb83ce0a, 0x0122e824}; + static constexpr storage g2_weierstrass_b = { + 0x00000004, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; +} // namespace bw6_761 + +#endif diff --git a/icicle/curves/curve_config.cuh b/icicle/curves/curve_config.cuh index 2600db954..510e75db5 100644 --- a/icicle/curves/curve_config.cuh +++ b/icicle/curves/curve_config.cuh @@ -5,6 +5,7 @@ #define BN254 1 #define BLS12_381 2 #define BLS12_377 3 +#define BW6_761 4 #include "../primitives/field.cuh" #include "../primitives/projective.cuh" @@ -21,21 +22,41 @@ using namespace bls12_381; #elif CURVE_ID == BLS12_377 #include "bls12_377_params.cuh" using namespace bls12_377; +#elif CURVE_ID == BW6_761 +#include "bls12_377_params.cuh" +#include "bw6_761_params.cuh" +using namespace bw6_761; #endif namespace curve_config { +#if CURVE_ID == BW6_761 + typedef bls12_377::fq_config fp_config; +#endif typedef Field scalar_t; typedef Field point_field_t; + static constexpr point_field_t generator_x = point_field_t{g1_gen_x}; + static constexpr point_field_t generator_y = point_field_t{g1_gen_y}; static constexpr point_field_t b = point_field_t{weierstrass_b}; - typedef Projective projective_t; + typedef Projective projective_t; typedef Affine affine_t; #if defined(G2_DEFINED) +#if CURVE_ID == BW6_761 + 
typedef point_field_t g2_point_field_t; + static constexpr g2_point_field_t g2_generator_x = g2_point_field_t{g2_gen_x}; + static constexpr g2_point_field_t g2_generator_y = g2_point_field_t{g2_gen_y}; + static constexpr g2_point_field_t g2_b = g2_point_field_t{g2_weierstrass_b}; +#else typedef ExtensionField g2_point_field_t; - static constexpr g2_point_field_t b_g2 = - g2_point_field_t{point_field_t{weierstrass_b_g2_re}, point_field_t{weierstrass_b_g2_im}}; - typedef Projective g2_projective_t; + static constexpr g2_point_field_t g2_generator_x = g2_point_field_t{ + point_field_t{g2_gen_x_re}, point_field_t{g2_gen_x_im}}; + static constexpr g2_point_field_t g2_generator_y = g2_point_field_t{ + point_field_t{g2_gen_y_re}, point_field_t{g2_gen_y_im}}; + static constexpr g2_point_field_t g2_b = g2_point_field_t{ + point_field_t{weierstrass_b_g2_re}, point_field_t{weierstrass_b_g2_im}}; +#endif + typedef Projective g2_projective_t; typedef Affine g2_affine_t; #endif diff --git a/icicle/primitives/field.cuh b/icicle/primitives/field.cuh index f7b4d4150..d4e3affda 100644 --- a/icicle/primitives/field.cuh +++ b/icicle/primitives/field.cuh @@ -1,3 +1,21 @@ +/** + * This file contains methods for working with elements of a prime field. It is based on and evolved from Matter Labs' + * [Zprize + * submission](https://github.com/matter-labs/z-prize-msm-gpu/blob/main/bellman-cuda-rust/bellman-cuda-sys/native/ff_dispatch_st.cuh). + * + * TODO: DmytroTym: current version needs refactoring (e.g. there's no reason to have different classes Field and + * ff_storage among other issues). But because this is an internal file and correctness and performance are unaffected, + * refactoring it is low in the priority list. + * + * Documentation of methods is intended to explain inner workings to developers working on icicle. In its current state + * it mostly explains modular mutliplication and related methods. 
One important quirk of modern CUDA that's affecting + * most methods is explained by [Niall Emmart](https://youtu.be/KAWlySN7Hm8?si=h7nzDujnvubWXeDX&t=4039). In short, when + * 64-bit MAD (`r = a * b + c`) instructions get compiled down to SASS (CUDA assembly) they require two-register values + * `r` and `c` to start from even register (e.g. `r` can live in registers 20 and 21, or 14 and 15, but not 15 and 16). + * This complicates implementations forcing us to segregate terms into two categories depending on their alignment. + * Which is where `even` and `odd` arrays across the codebase come from. + */ + #pragma once #include "../utils/host_math.cuh" @@ -34,10 +52,6 @@ public: return Field{scalar}; } - static constexpr HOST_DEVICE_INLINE Field generator_x() { return Field{CONFIG::g1_gen_x}; } - - static constexpr HOST_DEVICE_INLINE Field generator_y() { return Field{CONFIG::g1_gen_y}; } - static HOST_INLINE Field omega(uint32_t logn) { if (logn == 0) { return Field{CONFIG::one}; } @@ -67,12 +81,20 @@ public: return Field{inv.storages[logn - 1]}; } - static constexpr HOST_DEVICE_INLINE Field modulus() { return Field{CONFIG::modulus}; } - // private: typedef storage ff_storage; typedef storage<2 * TLC> ff_wide_storage; + /** + * A new addition to the config file - \f$ 2^{32 \cdot num\_limbs} - p \f$. + */ + static constexpr HOST_DEVICE_INLINE ff_storage get_neg_modulus() { return CONFIG::neg_modulus; } + + /** + * A new addition to the config file - the number of times to reduce in [reduce](@ref reduce) function. 
+ */ + static constexpr HOST_DEVICE_INLINE unsigned num_of_reductions() { return CONFIG::num_of_reductions; } + static constexpr unsigned slack_bits = 32 * TLC - NBITS; struct Wide { @@ -89,6 +111,17 @@ public: return out; } + static constexpr Field HOST_DEVICE_INLINE get_higher(const Wide& xs) + { + Field out{}; +#ifdef __CUDA_ARCH__ +#pragma unroll +#endif + for (unsigned i = 0; i < TLC; i++) + out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i + TLC]; + return out; + } + static constexpr Field HOST_DEVICE_INLINE get_higher_with_slack(const Wide& xs) { Field out{}; @@ -98,10 +131,10 @@ public: for (unsigned i = 0; i < TLC; i++) { #ifdef __CUDA_ARCH__ out.limbs_storage.limbs[i] = - __funnelshift_lc(xs.limbs_storage.limbs[i + TLC - 1], xs.limbs_storage.limbs[i + TLC], slack_bits); + __funnelshift_lc(xs.limbs_storage.limbs[i + TLC - 1], xs.limbs_storage.limbs[i + TLC], 2 * slack_bits); #else - out.limbs_storage.limbs[i] = - (xs.limbs_storage.limbs[i + TLC] << slack_bits) + (xs.limbs_storage.limbs[i + TLC - 1] >> (32 - slack_bits)); + out.limbs_storage.limbs[i] = (xs.limbs_storage.limbs[i + TLC] << 2 * slack_bits) + + (xs.limbs_storage.limbs[i + TLC - 1] >> (32 - 2 * slack_bits)); #endif } return out; @@ -143,7 +176,7 @@ public: } }; - // return modulus + // return modulus multiplied by 1, 2 or 4 template static constexpr HOST_DEVICE_INLINE ff_storage get_modulus() { @@ -184,27 +217,31 @@ public: } } - // add or subtract limbs template - static constexpr DEVICE_INLINE uint32_t - add_sub_limbs_device(const ff_storage& xs, const ff_storage& ys, ff_storage& rs) + static constexpr __device__ __forceinline__ uint32_t + add_sub_u32_device(const uint32_t* x, const uint32_t* y, uint32_t* r, size_t n = (TLC >> 1)) { - const uint32_t* x = xs.limbs; - const uint32_t* y = ys.limbs; - uint32_t* r = rs.limbs; r[0] = SUBTRACT ? ptx::sub_cc(x[0], y[0]) : ptx::add_cc(x[0], y[0]); -#ifdef __CUDA_ARCH__ -#pragma unroll -#endif - for (unsigned i = 1; i < (CARRY_OUT ? 
TLC : TLC - 1); i++) + for (unsigned i = 1; i < (CARRY_OUT ? n : n - 1); i++) r[i] = SUBTRACT ? ptx::subc_cc(x[i], y[i]) : ptx::addc_cc(x[i], y[i]); if (!CARRY_OUT) { - r[TLC - 1] = SUBTRACT ? ptx::subc(x[TLC - 1], y[TLC - 1]) : ptx::addc(x[TLC - 1], y[TLC - 1]); + r[n - 1] = SUBTRACT ? ptx::subc(x[n - 1], y[n - 1]) : ptx::addc(x[n - 1], y[n - 1]); return 0; } return SUBTRACT ? ptx::subc(0, 0) : ptx::addc(0, 0); } + // add or subtract limbs + template + static constexpr DEVICE_INLINE uint32_t + add_sub_limbs_device(const ff_storage& xs, const ff_storage& ys, ff_storage& rs) + { + const uint32_t* x = xs.limbs; + const uint32_t* y = ys.limbs; + uint32_t* r = rs.limbs; + return add_sub_u32_device(x, y, r, TLC); + } + template static constexpr DEVICE_INLINE uint32_t add_sub_limbs_device(const ff_wide_storage& xs, const ff_wide_storage& ys, ff_wide_storage& rs) @@ -212,17 +249,7 @@ public: const uint32_t* x = xs.limbs; const uint32_t* y = ys.limbs; uint32_t* r = rs.limbs; - r[0] = SUBTRACT ? ptx::sub_cc(x[0], y[0]) : ptx::add_cc(x[0], y[0]); -#ifdef __CUDA_ARCH__ -#pragma unroll -#endif - for (unsigned i = 1; i < (CARRY_OUT ? 2 * TLC : 2 * TLC - 1); i++) - r[i] = SUBTRACT ? ptx::subc_cc(x[i], y[i]) : ptx::addc_cc(x[i], y[i]); - if (!CARRY_OUT) { - r[2 * TLC - 1] = SUBTRACT ? ptx::subc(x[2 * TLC - 1], y[2 * TLC - 1]) : ptx::addc(x[2 * TLC - 1], y[2 * TLC - 1]); - return 0; - } - return SUBTRACT ? ptx::subc(0, 0) : ptx::addc(0, 0); + return add_sub_u32_device(x, y, r, 2 * TLC); } template @@ -252,16 +279,6 @@ public: return CARRY_OUT ? 
carry : 0; } - static constexpr HOST_INLINE uint32_t - sub_limbs_partial_host(uint32_t* x, uint32_t* y, uint32_t* r, uint32_t num_limbs) - { - uint32_t carry = 0; - host_math::carry_chain<2 * TLC, false, true> chain; - for (unsigned i = 0; i < num_limbs; i++) - r[i] = chain.sub(x[i], y[i], carry); - return carry; - } - template static constexpr HOST_DEVICE_INLINE uint32_t add_limbs(const T& xs, const T& ys, T& rs) { @@ -300,12 +317,14 @@ public: } } - static DEVICE_INLINE void cmad_n(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC) + template + static __device__ __forceinline__ void + cmad_n(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC, uint32_t optional_carry = 0) { - // multiply scalar by vector - // acc = acc + bi*A[::2] - acc[0] = ptx::mad_lo_cc(a[0], bi, acc[0]); + if (CARRY_IN) ptx::add_cc(UINT32_MAX, optional_carry); + acc[0] = CARRY_IN ? ptx::madc_lo_cc(a[0], bi, acc[0]) : ptx::mad_lo_cc(a[0], bi, acc[0]); acc[1] = ptx::madc_hi_cc(a[0], bi, acc[1]); + #pragma unroll for (size_t i = 2; i < n; i += 2) { acc[i] = ptx::madc_lo_cc(a[i], bi, acc[i]); @@ -313,320 +332,277 @@ public: } } - static DEVICE_INLINE void - cmad_n_msb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC, size_t a_start_idx = 0) + template + static __device__ __forceinline__ void cmad_n_msb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC) { - // multiply scalar by vector - // acc = acc + bi*A[::2] - acc[a_start_idx] = ptx::mad_lo_cc(a[a_start_idx], bi, acc[a_start_idx]); - acc[a_start_idx + 1] = ptx::madc_hi_cc(a[a_start_idx], bi, acc[a_start_idx + 1]); + if (EVEN_PHASE) { + acc[0] = ptx::mad_lo_cc(a[0], bi, acc[0]); + acc[1] = ptx::madc_hi_cc(a[0], bi, acc[1]); + } else { + acc[1] = ptx::mad_hi_cc(a[0], bi, acc[1]); + } + #pragma unroll - for (size_t i = a_start_idx + 2; i < n; i += 2) { + for (size_t i = 2; i < n; i += 2) { acc[i] = ptx::madc_lo_cc(a[i], bi, acc[i]); acc[i + 1] = ptx::madc_hi_cc(a[i], bi, acc[i + 1]); } } - static 
DEVICE_INLINE void mad_row(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC) + static __device__ __forceinline__ void cmad_n_lsb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC) { - // odd = odd + bi*A - // even = even + bi*A - cmad_n(odd, a + 1, bi, n - 2); - odd[n - 2] = ptx::madc_lo_cc(a[n - 1], bi, 0); - odd[n - 1] = ptx::madc_hi(a[n - 1], bi, 0); + if (n > 1) + acc[0] = ptx::mad_lo_cc(a[0], bi, acc[0]); + else + acc[0] = ptx::mad_lo(a[0], bi, acc[0]); + + size_t i; +#pragma unroll + for (i = 1; i < n - 1; i += 2) { + acc[i] = ptx::madc_hi_cc(a[i - 1], bi, acc[i]); + if (i == n - 2) + acc[i + 1] = ptx::madc_lo(a[i + 1], bi, acc[i + 1]); + else + acc[i + 1] = ptx::madc_lo_cc(a[i + 1], bi, acc[i + 1]); + } + if (i == n - 1) acc[i] = ptx::madc_hi(a[i - 1], bi, acc[i]); + } + + template + static __device__ __forceinline__ uint32_t mad_row( + uint32_t* odd, + uint32_t* even, + const uint32_t* a, + uint32_t bi, + size_t n = TLC, + uint32_t ci = 0, + uint32_t di = 0, + uint32_t carry_for_high = 0, + uint32_t carry_for_low = 0) + { + cmad_n(odd, a + 1, bi, n - 2, carry_for_low); + odd[n - 2] = ptx::madc_lo_cc(a[n - 1], bi, ci); + odd[n - 1] = CARRY_OUT ? ptx::madc_hi_cc(a[n - 1], bi, di) : ptx::madc_hi(a[n - 1], bi, di); + uint32_t cr = CARRY_OUT ? 
ptx::addc(0, 0) : 0; cmad_n(even, a, bi, n); - odd[n - 1] = ptx::addc(odd[n - 1], 0); + if (CARRY_OUT) { + odd[n - 1] = ptx::addc_cc(odd[n - 1], carry_for_high); + cr = ptx::addc(cr, 0); + } else + odd[n - 1] = ptx::addc(odd[n - 1], carry_for_high); + return cr; } - static DEVICE_INLINE void - mad_row_msb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC, size_t a_start_idx = 0) + template + static __device__ __forceinline__ void + mad_row_msb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC) { - // odd = odd + bi*A - // even = even + bi*A - cmad_n_msb(odd, a + 1, bi, n - 2, a_start_idx - 1); - odd[n - 2] = ptx::madc_lo_cc(a[n - 1], bi, 0); - odd[n - 1] = ptx::madc_hi(a[n - 1], bi, 0); - cmad_n_msb(even, a, bi, n, a_start_idx); - odd[n - 1] = ptx::addc(odd[n - 1], 0); + cmad_n_msb(odd, EVEN_PHASE ? a : (a + 1), bi, n - 2); + odd[EVEN_PHASE ? (n - 1) : (n - 2)] = ptx::madc_lo_cc(a[n - 1], bi, 0); + odd[EVEN_PHASE ? n : (n - 1)] = ptx::madc_hi(a[n - 1], bi, 0); + cmad_n_msb(even, EVEN_PHASE ? (a + 1) : a, bi, n - 1); + odd[EVEN_PHASE ? n : (n - 1)] = ptx::addc(odd[EVEN_PHASE ? 
n : (n - 1)], 0); } - static DEVICE_INLINE void multiply_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs) + static __device__ __forceinline__ void + mad_row_lsb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC) { - const uint32_t* a = as.limbs; - const uint32_t* b = bs.limbs; - uint32_t* even = rs.limbs; - __align__(8) uint32_t odd[2 * TLC - 2]; - mul_n(even, a, b[0]); - mul_n(odd, a + 1, b[0]); - mad_row(&even[2], &odd[0], a, b[1]); - size_t i; -#pragma unroll - for (i = 2; i < TLC - 1; i += 2) { - mad_row(&odd[i], &even[i], a, b[i]); - mad_row(&even[i + 2], &odd[i], a, b[i + 1]); + // bi here is constant so we can do a compile-time check for zero (which does happen once for bls12-381 scalar field + // modulus) + if (bi != 0) { + if (n > 1) cmad_n_lsb(odd, a + 1, bi, n - 1); + cmad_n_lsb(even, a, bi, n); } - // merge |even| and |odd| - even[1] = ptx::add_cc(even[1], odd[0]); - for (i = 1; i < 2 * TLC - 2; i++) - even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]); - even[i + 1] = ptx::addc(even[i + 1], 0); + return; } - static DEVICE_INLINE void mult_no_carry(uint32_t a, uint32_t b, uint32_t* r) + static __device__ __forceinline__ uint32_t + mul_n_and_add(uint32_t* acc, const uint32_t* a, uint32_t bi, uint32_t* extra, size_t n = (TLC >> 1)) { - r[0] = ptx::mul_lo(a, b); - r[1] = ptx::mul_hi(a, b); - } + acc[0] = ptx::mad_lo_cc(a[0], bi, extra[0]); - static DEVICE_INLINE void ingo_multiply_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs) +#pragma unroll + for (size_t i = 1; i < n - 1; i += 2) { + acc[i] = ptx::madc_hi_cc(a[i - 1], bi, extra[i]); + acc[i + 1] = ptx::madc_lo_cc(a[i + 1], bi, extra[i + 1]); + } + + acc[n - 1] = ptx::madc_hi_cc(a[n - 2], bi, extra[n - 1]); + return ptx::addc(0, 0); + } + + /** + * A function that computes wide product \f$ rs = as \cdot bs \f$ that's correct for the higher TLC + 1 limbs with a + * small maximum error. 
+ * + * The way this function saves computations (as compared to regular school-book multiplication) is by not including + * terms that are too small. Namely, limb product \f$ a_i \cdot b_j \f$ is excluded if \f$ i + j < TLC - 2 \f$ and + * only the higher half is included if \f$ i + j = TLC - 2 \f$. All other limb products are included. So, the error + * i.e. difference between true product and the result of this function written to `rs` is exactly the sum of all + * dropped limbs products, which we can bound: \f$ a_0 \cdot b_0 + 2^{32}(a_0 \cdot b_1 + a_1 \cdot b_0) + \dots + + * 2^{32(TLC - 3)}(a_{TLC - 3} \cdot b_0 + \dots + a_0 \cdot b_{TLC - 3}) + 2^{32(TLC - 2)}(\floor{\frac{a_{TLC - 2} + * \cdot b_0}{2^{32}}} + \dots + \floor{\frac{a_0 \cdot b_{TLC - 2}}{2^{32}}}) \leq 2^{64} + 2\cdot 2^{96} + \dots + + * (TLC - 2) \cdot 2^{32(TLC - 1)} + (TLC - 1) \cdot 2^{32(TLC - 1)} \leq 2(TLC - 1) \cdot 2^{32(TLC - 1)}\f$. + */ + static __device__ __forceinline__ void + multiply_msb_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs) { const uint32_t* a = as.limbs; const uint32_t* b = bs.limbs; - uint32_t* r = rs.limbs; - uint32_t i, j; uint32_t* even = rs.limbs; - __align__(8) uint32_t odd[2 * TLC]; - for (uint32_t i = 0; i < 2 * TLC; i++) { - even[i] = 0; - odd[i] = 0; - } - // first row special case, no carry in no carry out. split to non parts, even and odd. 
- for (i = 0; i < TLC - 1; i += 2) { - mult_no_carry(b[0], a[i], &even[i]); - mult_no_carry(b[0], a[i + 1], &odd[i]); - } - - // doing two rows at one loop - for (i = 1; i < TLC - 1; i += 2) { - // odd bi's - // multiply accumulate even part of new row with odd part prev row (needs a carry) - // // j = 0, no carry in, only carry out - odd[i - 1] = ptx::mad_lo_cc(a[0], b[i], odd[i - 1]); - odd[i] = ptx::madc_hi_cc(a[0], b[i], odd[i]); - // for loop carry in carry out - for (j = 2; j < TLC; j += 2) // 2, 4, 6 - { - odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]); - odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]); - } - odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry - - // multiply accumulate odd part of new row with even part prev row (doesnt need a carry) - // j = 1, no carry in, only carry out - even[i + 1] = ptx::mad_lo_cc(a[1], b[i], even[i + 1]); - even[i + 2] = ptx::madc_hi_cc(a[1], b[i], even[i + 2]); - // for loop carry in carry out - for (j = 3; j < TLC; j += 2) { - even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]); - even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]); - } - - // even bi's - // multiply accumulate even part of new row with even part of prev row // needs a carry - // j = 0, no carry in, only carry out - even[i + 1] = ptx::mad_lo_cc(a[0], b[i + 1], even[i + 1]); - even[i + 2] = ptx::madc_hi_cc(a[0], b[i + 1], even[i + 2]); - // for loop, carry in, carry out. - for (j = 2; j < TLC; j += 2) { - even[i + j + 1] = ptx::madc_lo_cc(a[j], b[i + 1], even[i + j + 1]); - even[i + j + 2] = ptx::madc_hi_cc(a[j], b[i + 1], even[i + j + 2]); - } - even[i + j + 1] = ptx::addc(even[i + j + 1], 0); // handling last carry - - // multiply accumulate odd part of new row with odd part of prev row - // j = 1, no carry in, only carry out - odd[i + 1] = ptx::mad_lo_cc(a[1], b[i + 1], odd[i + 1]); - odd[i + 2] = ptx::madc_hi_cc(a[1], b[i + 1], odd[i + 2]); - // for loop, carry in, carry out. 
- for (j = 3; j < TLC; j += 2) { - odd[i + j] = ptx::madc_lo_cc(a[j], b[i + 1], odd[i + j]); - odd[i + j + 1] = ptx::madc_hi_cc(a[j], b[i + 1], odd[i + j + 1]); - } - } + __align__(16) uint32_t odd[2 * TLC - 2]; - odd[i - 1] = ptx::mad_lo_cc(a[0], b[i], odd[i - 1]); - odd[i] = ptx::madc_hi_cc(a[0], b[i], odd[i]); - // for loop carry in carry out - for (j = 2; j < TLC; j += 2) { - odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]); - odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]); - } - odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry - - // multiply accumulate odd part of new row with even part prev row - // j = 1, no carry in, only carry out - even[i + 1] = ptx::mad_lo_cc(a[1], b[i], even[i + 1]); - even[i + 2] = ptx::madc_hi_cc(a[1], b[i], even[i + 2]); - // for loop carry in carry out - for (j = 3; j < TLC; j += 2) { - even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]); - even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]); + even[TLC - 1] = ptx::mul_hi(a[TLC - 2], b[0]); + odd[TLC - 2] = ptx::mul_lo(a[TLC - 1], b[0]); + odd[TLC - 1] = ptx::mul_hi(a[TLC - 1], b[0]); + size_t i; +#pragma unroll + for (i = 2; i < TLC - 1; i += 2) { + mad_row_msb(&even[TLC - 2], &odd[TLC - 2], &a[TLC - i - 1], b[i - 1], i + 1); + mad_row_msb(&odd[TLC - 2], &even[TLC - 2], &a[TLC - i - 2], b[i], i + 2); } + mad_row(&even[TLC], &odd[TLC - 2], a, b[TLC - 1]); - // add even and odd parts - even[1] = ptx::add_cc(even[1], odd[0]); - for (i = 1; i < 2 * TLC - 2; i++) + // merge |even| and |odd| + ptx::add_cc(even[TLC - 1], odd[TLC - 2]); + for (i = TLC - 1; i < 2 * TLC - 2; i++) even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]); even[i + 1] = ptx::addc(even[i + 1], 0); } - static DEVICE_INLINE void - ingo_msb_multiply_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs) + /** + * A function that computes the low half of the fused multiply-and-add \f$ rs = as \cdot bs + cs \f$. 
+ * + * For efficiency, this method does not include terms that are too large. Namely, limb product \f$ a_i \cdot b_j \f$ + * is excluded if \f$ i + j > TLC - 1 \f$ and only the lower half is included if \f$ i + j = TLC - 1 \f$. All other + * limb products are included. + */ + static __device__ __forceinline__ void + multiply_and_add_lsb_raw_device(const ff_storage& as, const ff_storage& bs, ff_storage& cs, ff_storage& rs) { const uint32_t* a = as.limbs; const uint32_t* b = bs.limbs; - uint32_t* r = rs.limbs; - uint32_t i, j; uint32_t* even = rs.limbs; - __align__(8) uint32_t odd[2 * TLC]; - for (uint32_t i = 0; i < 2 * TLC; i++) { - even[i] = 0; - odd[i] = 0; - } - // only last element from first row. - mult_no_carry(b[0], a[TLC - 1], &odd[TLC - 2]); - -// doing two rows at one loop -#pragma unroll - for (i = 1; i < TLC - 1; i += 2) { - const uint32_t first_active_j = TLC - 1 - i; - const uint32_t first_active_j_odd = first_active_j + (1 - (first_active_j % 2)); - const uint32_t first_active_j_even = first_active_j + first_active_j % 2; - // odd bi's - // multiply accumulate even part of new row with odd part prev row (needs a carry) - // j = 0, no carry in, only carry out - odd[first_active_j_even + i - 1] = ptx::mad_lo_cc(a[first_active_j_even], b[i], odd[first_active_j_even + i - 1]); - odd[first_active_j_even + i] = ptx::madc_hi_cc(a[first_active_j_even], b[i], odd[first_active_j_even + i]); -// for loop carry in carry out -#pragma unroll - for (j = first_active_j_even + 2; j < TLC; j += 2) { - odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]); - odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]); - } - odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry - - // multiply accumulate odd part of new row with even part prev row (doesnt need a carry) - // j = 1, no carry in, only carry out - even[i + first_active_j_odd] = ptx::mad_lo_cc(a[first_active_j_odd], b[i], even[i + first_active_j_odd]); - even[i + first_active_j_odd + 1] = 
ptx::madc_hi_cc(a[first_active_j_odd], b[i], even[i + first_active_j_odd + 1]); -// for loop carry in carry out -#pragma unroll - for (j = first_active_j_odd + 2; j < TLC; j += 2) { - even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]); - even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]); - } - - // even bi's - uint32_t const first_active_j1 = TLC - 1 - (i + 1); - uint32_t const first_active_j_odd1 = first_active_j1 + (1 - (first_active_j1 % 2)); - uint32_t const first_active_j_even1 = first_active_j1 + first_active_j1 % 2; - // multiply accumulate even part of new row with even part of prev row // needs a carry - // j = 0, no carry in, only carry out - even[first_active_j_even1 + i + 1] = - ptx::mad_lo_cc(a[first_active_j_even1], b[i + 1], even[first_active_j_even1 + i + 1]); - even[first_active_j_even1 + i + 2] = - ptx::madc_hi_cc(a[first_active_j_even1], b[i + 1], even[first_active_j_even1 + i + 2]); -// for loop, carry in, carry out. -#pragma unroll - for (j = first_active_j_even1 + 2; j < TLC; j += 2) { - even[i + j + 1] = ptx::madc_lo_cc(a[j], b[i + 1], even[i + j + 1]); - even[i + j + 2] = ptx::madc_hi_cc(a[j], b[i + 1], even[i + j + 2]); - } - even[i + j + 1] = ptx::addc(even[i + j + 1], 0); // handling last carry - - // multiply accumulate odd part of new row with odd part of prev row - // j = 1, no carry in, only carry out - odd[first_active_j_odd1 + i] = ptx::mad_lo_cc(a[first_active_j_odd1], b[i + 1], odd[first_active_j_odd1 + i]); - odd[first_active_j_odd1 + i + 1] = - ptx::madc_hi_cc(a[first_active_j_odd1], b[i + 1], odd[first_active_j_odd1 + i + 1]); -// for loop, carry in, carry out. + __align__(16) uint32_t odd[TLC - 1]; + size_t i; + // `b[0]` is \f$ 2^{32} \f$ minus the last limb of prime modulus. Because most scalar (and some base) primes + // are neccessarily NTT-friendly, `b[0]` often turns out to be \f$ 2^{32} - 1 \f$. This actually leads to + // less efficient SASS generated by nvcc, so this case needed separate handling. 
+ if (b[0] == UINT32_MAX) { + add_sub_u32_device(cs.limbs, a, even, TLC); + for (i = 0; i < TLC - 1; i++) + odd[i] = a[i]; + } else { + mul_n_and_add(even, a, b[0], cs.limbs, TLC); + mul_n(odd, a + 1, b[0], TLC - 1); + } + mad_row_lsb(&even[2], &odd[0], a, b[1], TLC - 1); #pragma unroll - for (j = first_active_j_odd1 + 2; j < TLC; j += 2) { - odd[i + j] = ptx::madc_lo_cc(a[j], b[i + 1], odd[i + j]); - odd[i + j + 1] = ptx::madc_hi_cc(a[j], b[i + 1], odd[i + j + 1]); - } + for (i = 2; i < TLC - 1; i += 2) { + mad_row_lsb(&odd[i], &even[i], a, b[i], TLC - i); + mad_row_lsb(&even[i + 2], &odd[i], a, b[i + 1], TLC - i - 1); } - // last round, i = TLC - 1 - odd[i - 1] = ptx::mad_lo_cc(a[0], b[i], odd[i - 1]); - odd[i] = ptx::madc_hi_cc(a[0], b[i], odd[i]); -// for loop carry in carry out -#pragma unroll - for (j = 2; j < TLC; j += 2) { - odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]); - odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]); - } - odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry + // merge |even| and |odd| + even[1] = ptx::add_cc(even[1], odd[0]); + for (i = 1; i < TLC - 2; i++) + even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]); + even[i + 1] = ptx::addc(even[i + 1], odd[i]); + } - // multiply accumulate odd part of new row with even part prev row - // j = 1, no carry in, only carry out - even[i + 1] = ptx::mad_lo_cc(a[1], b[i], even[i + 1]); - even[i + 2] = ptx::madc_hi_cc(a[1], b[i], even[i + 2]); -// for loop carry in carry out -#pragma unroll - for (j = 3; j < TLC; j += 2) { - even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]); - even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]); - } + /** + * This method multiplies `a` and `b` (both assumed to have TLC / 2 limbs) and adds `in1` and `in2` (TLC limbs each) + * to the result which is written to `even`. 
+ * + * It is used to compute the "middle" part of Karatsuba: \f$ a_{lo} \cdot b_{hi} + b_{lo} \cdot a_{hi} = + * (a_{hi} - a_{lo})(b_{lo} - b_{hi}) + a_{lo} \cdot b_{lo} + a_{hi} \cdot b_{hi} \f$. Currently this method assumes + * that the top bit of \f$ a_{hi} \f$ and \f$ b_{hi} \f$ are unset. This ensures correctness by allowing to keep the + * result inside TLC limbs and ignore the carries from the highest limb. + */ + static __device__ __forceinline__ void + multiply_and_add_short_raw_device(const uint32_t* a, const uint32_t* b, uint32_t* even, uint32_t* in1, uint32_t* in2) + { + __align__(16) uint32_t odd[TLC - 2]; + uint32_t first_row_carry = mul_n_and_add(even, a, b[0], in1); + uint32_t carry = mul_n_and_add(odd, a + 1, b[0], &in2[1]); - // add even and odd parts - even[1] = ptx::add_cc(even[1], odd[0]); + size_t i; #pragma unroll - for (i = 1; i < 2 * TLC - 2; i++) + for (i = 2; i < ((TLC >> 1) - 1); i += 2) { + carry = mad_row( + &even[i], &odd[i - 2], a, b[i - 1], TLC >> 1, in1[(TLC >> 1) + i - 2], in1[(TLC >> 1) + i - 1], carry); + carry = + mad_row(&odd[i], &even[i], a, b[i], TLC >> 1, in2[(TLC >> 1) + i - 1], in2[(TLC >> 1) + i], carry); + } + mad_row( + &even[TLC >> 1], &odd[(TLC >> 1) - 2], a, b[(TLC >> 1) - 1], TLC >> 1, in1[TLC - 2], in1[TLC - 1], carry, + first_row_carry); + // merge |even| and |odd| plus the parts of `in2` we haven't added yet (first and last limbs) + even[0] = ptx::add_cc(even[0], in2[0]); + for (i = 0; i < (TLC - 2); i++) even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]); - even[i + 1] = ptx::addc(even[i + 1], 0); + even[i + 1] = ptx::addc(even[i + 1], in2[i + 1]); } - static DEVICE_INLINE void multiply_lsb_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs) + /** + * This method multiplies `a` and `b` and writes the result into `even`. It assumes that `a` and `b` are TLC/2 limbs + * long. The usual schoolbook algorithm is used. 
+ */ + static __device__ __forceinline__ void multiply_short_raw_device(const uint32_t* a, const uint32_t* b, uint32_t* even) { - // r = a * b is correcrt for the first TLC + 1 digits. (not computing from TLC + 1 to 2*TLC - 2). - const uint32_t* a = as.limbs; - const uint32_t* b = bs.limbs; - uint32_t* even = rs.limbs; - __align__(8) uint32_t odd[2 * TLC - 2]; - mul_n(even, a, b[0]); - mul_n(odd, a + 1, b[0]); - mad_row(&even[2], &odd[0], a, b[1]); + __align__(16) uint32_t odd[TLC - 2]; + mul_n(even, a, b[0], TLC >> 1); + mul_n(odd, a + 1, b[0], TLC >> 1); + mad_row(&even[2], &odd[0], a, b[1], TLC >> 1); + size_t i; #pragma unroll - for (i = 2; i < TLC - 1; i += 2) { - mad_row(&odd[i], &even[i], a, b[i], TLC - i + 2); - mad_row(&even[i + 2], &odd[i], a, b[i + 1], TLC - i + 2); + for (i = 2; i < ((TLC >> 1) - 1); i += 2) { + mad_row(&odd[i], &even[i], a, b[i], TLC >> 1); + mad_row(&even[i + 2], &odd[i], a, b[i + 1], TLC >> 1); } - // merge |even| and |odd| even[1] = ptx::add_cc(even[1], odd[0]); - for (i = 1; i < TLC + 1; i++) + for (i = 1; i < TLC - 2; i++) even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]); even[i + 1] = ptx::addc(even[i + 1], 0); } - static DEVICE_INLINE void multiply_msb_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs) + /** + * This method multiplies `as` and `bs` and writes the (wide) result into `rs`. + * + * It is assumed that the highest bits of `as` and `bs` are unset which is true for all the numbers icicle had to deal + * with so far. This method implements [subtractive + * Karatsuba](https://en.wikipedia.org/wiki/Karatsuba_algorithm#Implementation). 
+ */ + static DEVICE_INLINE void multiply_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs) { const uint32_t* a = as.limbs; const uint32_t* b = bs.limbs; - uint32_t* even = rs.limbs; - __align__(8) uint32_t odd[2 * TLC - 2]; - for (int i = 0; i < 2 * TLC - 1; i++) { - even[i] = 0; - odd[i] = 0; - } - uint32_t min_indexes_sum = TLC - 1; - // only diagonal - mul_n_msb(even, a, b[0], TLC, min_indexes_sum); - mul_n_msb(odd, a + 1, b[0], TLC, min_indexes_sum - 1); - mad_row_msb(&even[2], &odd[0], a, b[1], TLC, min_indexes_sum - 1); - size_t i; -#pragma unroll - for (i = 2; i < TLC - 1; i += 2) { - mad_row(&odd[i], &even[i], a, b[i]); - mad_row(&even[i + 2], &odd[i], a, b[i + 1]); - } - // merge |even| and |odd| - even[1] = ptx::add_cc(even[1], odd[0]); - for (i = 1; i < 2 * TLC - 2; i++) - even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]); - even[i + 1] = ptx::addc(even[i + 1], 0); + uint32_t* r = rs.limbs; + // Next two lines multiply high and low halves of operands (\f$ a_{lo} \cdot b_{lo}; a_{hi} \cdot b_{hi} \$f) and + // write the results into `r`. + multiply_short_raw_device(a, b, r); + multiply_short_raw_device(&a[TLC >> 1], &b[TLC >> 1], &r[TLC]); + __align__(16) uint32_t middle_part[TLC]; + __align__(16) uint32_t diffs[TLC]; + // Differences of halves \f$ a_{hi} - a_{lo}; b_{lo} - b_{hi} \$f are written into `diffs`, signs written to + // `carry1` and `carry2`. + uint32_t carry1 = add_sub_u32_device(&a[TLC >> 1], a, diffs); + uint32_t carry2 = add_sub_u32_device(b, &b[TLC >> 1], &diffs[TLC >> 1]); + // Compute the "middle part" of Karatsuba: \f$ a_{lo} \cdot b_{hi} + b_{lo} \cdot a_{hi} \f$. + // This is where the assumption about unset high bit of `a` and `b` is relevant. + multiply_and_add_short_raw_device(diffs, &diffs[TLC >> 1], middle_part, r, &r[TLC]); + // Corrections that need to be performed when differences are negative. + // Again, carry doesn't need to be propagated due to unset high bits of `a` and `b`. 
+ if (carry1) add_sub_u32_device(&middle_part[TLC >> 1], &diffs[TLC >> 1], &middle_part[TLC >> 1]); + if (carry2) add_sub_u32_device(&middle_part[TLC >> 1], diffs, &middle_part[TLC >> 1]); + // Now that middle part is fully correct, it can be added to the result. + add_sub_u32_device(&r[TLC >> 1], middle_part, &r[TLC >> 1], TLC); + + // Carry from adding middle part has to be propagated to the highest limb. + for (size_t i = TLC + (TLC >> 1); i < 2 * TLC; i++) + r[i] = ptx::addc_cc(r[i], 0); } static HOST_INLINE void multiply_raw_host(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs) @@ -651,19 +627,23 @@ public: #endif } - static HOST_DEVICE_INLINE void multiply_raw_lsb(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs) + static HOST_DEVICE_INLINE void + multiply_and_add_lsb_raw(const ff_storage& as, const ff_storage& bs, ff_storage& cs, ff_storage& rs) { #ifdef __CUDA_ARCH__ - return multiply_lsb_raw_device(as, bs, rs); + return multiply_and_add_lsb_raw_device(as, bs, cs, rs); #else - return multiply_raw_host(as, bs, rs); + Wide r_wide = {}; + multiply_raw_host(as, bs, r_wide.limbs_storage); + Field r = Wide::get_lower(r_wide); + add_limbs(cs, r.limbs_storage, rs); #endif } - static HOST_DEVICE_INLINE void multiply_raw_msb(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs) + static HOST_DEVICE_INLINE void multiply_msb_raw(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs) { #ifdef __CUDA_ARCH__ - return multiply_raw_device(as, bs, rs); + return multiply_msb_raw_device(as, bs, rs); #else return multiply_raw_host(as, bs, rs); #endif @@ -694,8 +674,8 @@ public: Field value{}; for (unsigned i = 0; i < TLC; i++) value.limbs_storage.limbs[i] = distribution(generator); - while (lt(modulus(), value)) - value = value - modulus(); + while (lt(Field{get_modulus()}, value)) + value = value - Field{get_modulus()}; return value; } @@ -752,55 +732,64 @@ public: return rs; } - static constexpr DEVICE_INLINE uint32_t - 
sub_limbs_partial_device(uint32_t* x, uint32_t* y, uint32_t* r, uint32_t num_limbs) - { - r[0] = ptx::sub_cc(x[0], y[0]); -#pragma unroll - for (unsigned i = 1; i < num_limbs; i++) - r[i] = ptx::subc_cc(x[i], y[i]); - return ptx::subc(0, 0); - } + static constexpr HOST_DEVICE_INLINE Field to_montgomery(const Field& xs) { return xs * Field{CONFIG::montgomery_r}; } - static constexpr HOST_DEVICE_INLINE uint32_t - sub_limbs_partial(uint32_t* x, uint32_t* y, uint32_t* r, uint32_t num_limbs) + static constexpr HOST_DEVICE_INLINE Field from_montgomery(const Field& xs) { -#ifdef __CUDA_ARCH__ - return sub_limbs_partial_device(x, y, r, num_limbs); -#else - return sub_limbs_partial_host(x, y, r, num_limbs); -#endif + return xs * Field{CONFIG::montgomery_r_inv}; } + /** + * This method reduces a Wide number `xs` modulo `p` and returns the result as a Field element. + * + * It is assumed that the high `2 * slack_bits` bits of `xs` are unset which is always the case for the product of 2 + * numbers with thier high `slack_bits` unset. Larger Wide numbers should be reduced by subtracting an appropriate + * factor of `modulus_squared` first. + * + * This function implements ["multi-precision Barrett"](https://github.com/ingonyama-zk/modular_multiplication). As + * opposed to Montgomery reduction, it doesn't require numbers to have a special representation but lets us work with + * them as-is. The general idea of Barrett reduction is to estimate the quotient \f$ l \approx \floor{\frac{xs}{p}} + * \f$ and return \f$ xs - l \cdot p \f$. But since \f$ l \f$ is inevitably computed with an error (it's always less + * or equal than the real quotient). So the modulus `p` might need to be subtracted several times before the result is + * in the desired range \f$ [0;p-1] \f$. 
The estimate of the error is as follows: \f[ \frac{xs}{p} - l = \frac{xs}{p} + * - \frac{xs \cdot m}{2^{2n}} + \frac{xs \cdot m}{2^{2n}} - \floor{\frac{xs}{2^k}}\frac{m}{2^{2n-k}} + * + \floor{\frac{xs}{2^k}}\frac{m}{2^{2n-k}} - l \leq p^2(\frac{1}{p}-\frac{m}{2^{2n}}) + \frac{m}{2^{2n-k}} + 2(TLC + * - 1) \cdot 2^{-32} \f] Here \f$ l \f$ is the result of [multiply_msb_raw](@ref multiply_msb_raw) function and the + * last term in the error is due to its approximation. \f$ n \f$ is the number of bits in \f$ p \f$ and \f$ k = 2n - + * 32\cdot TLC \f$. Overall, the error is always less than 2 so at most 2 reductions are needed. However, in most + * cases it's less than 1, so setting the [num_of_reductions](@ref num_of_reductions) variable for a field equal to 1 + * will cause only 1 reduction to be performed. + */ template static constexpr HOST_DEVICE_INLINE Field reduce(const Wide& xs) { - Field xs_hi = Wide::get_higher_with_slack(xs); // xy << slack_bits + // `xs` is left-shifted by `2 * slack_bits` and higher half is written to `xs_hi` + Field xs_hi = Wide::get_higher_with_slack(xs); Wide l = {}; - multiply_raw_msb(xs_hi.limbs_storage, get_m(), l.limbs_storage); // MSB mult - Field l_hi = Wide::get_higher_with_slack(l); - Wide lp = {}; - multiply_raw_lsb(l_hi.limbs_storage, get_modulus(), lp.limbs_storage); // LSB mult - Wide r_wide = xs - lp; - Wide r_wide_reduced = {}; - for (unsigned i = 0; i < TLC + 1; i++) { - uint32_t carry = sub_limbs_partial( - r_wide.limbs_storage.limbs, modulus_wide().limbs, r_wide_reduced.limbs_storage.limbs, TLC + 1); - if (carry == 0) // continue to reduce - r_wide = r_wide_reduced; - else // done - break; - } + multiply_msb_raw(xs_hi.limbs_storage, get_m(), l.limbs_storage); // MSB mult by `m` + Field l_hi = Wide::get_higher(l); + Field r = {}; + Field xs_lo = Wide::get_lower(xs); + // Here we need to compute the lsb of `xs - l \cdot p` and to make use of fused multiply-and-add, we rewrite it as + // `xs + l \cdot (2^{32 \cdot 
TLC}-p)` which is the same as original (up to higher limbs which we don't care about). + multiply_and_add_lsb_raw(l_hi.limbs_storage, get_neg_modulus(), xs_lo.limbs_storage, r.limbs_storage); + ff_storage r_reduced = {}; + uint32_t carry; + // As mentioned, either 2 or 1 reduction can be performed depending on the field in question. + if (num_of_reductions() == 2) { + carry = sub_limbs(r.limbs_storage, get_modulus<2>(), r_reduced); + if (carry == 0) r = Field{r_reduced}; + } + carry = sub_limbs(r.limbs_storage, get_modulus<1>(), r_reduced); + if (carry == 0) r = Field{r_reduced}; - // number of wrap around is bounded by TLC + 1 times. - Field r = Wide::get_lower(r_wide); return r; } friend HOST_DEVICE_INLINE Field operator*(const Field& xs, const Field& ys) { Wide xy = mul_wide(xs, ys); // full mult - return reduce(xy); + return reduce(xy); // reduce mod p } friend HOST_DEVICE_INLINE bool operator==(const Field& xs, const Field& ys) @@ -949,3 +938,16 @@ public: return (u == one) ? b : c; } }; + +template +struct std::hash> +{ + std::size_t operator()(const Field& key) const + { + std::size_t hash = 0; + // boost hashing, see https://stackoverflow.com/questions/35985960/c-why-is-boosthash-combine-the-best-way-to-combine-hash-values/35991300#35991300 + for (int i = 0; i < CONFIG::limbs_count; i++) + hash ^= std::hash()(key.limbs_storage.limbs[i]) + 0x9e3779b9 + (hash<<6) + (hash>>2); + return hash; + } +}; diff --git a/icicle/primitives/projective.cuh b/icicle/primitives/projective.cuh index bc0ca067f..4aa81609b 100644 --- a/icicle/primitives/projective.cuh +++ b/icicle/primitives/projective.cuh @@ -2,7 +2,7 @@ #include "affine.cuh" -template +template class Projective { friend Affine; @@ -32,7 +32,7 @@ public: return {FF::FromMontgomery(point.x), FF::FromMontgomery(point.y), FF::FromMontgomery(point.z)}; } - static HOST_DEVICE_INLINE Projective generator() { return {FF::generator_x(), FF::generator_y(), FF::one()}; } + static HOST_DEVICE_INLINE Projective 
generator() { return {GENERATOR_X, GENERATOR_Y, FF::one()}; } static HOST_DEVICE_INLINE Projective neg(const Projective& point) { return {point.x, FF::neg(point.y), point.z}; } diff --git a/icicle/primitives/test.cu b/icicle/primitives/test.cu index d76dcc8b0..fb6cd7729 100644 --- a/icicle/primitives/test.cu +++ b/icicle/primitives/test.cu @@ -30,22 +30,22 @@ protected: projective_t* points2{}; g2_projective_t* g2_points1{}; g2_projective_t* g2_points2{}; - scalar_field_t* scalars1{}; - scalar_field_t* scalars2{}; + scalar_t* scalars1{}; + scalar_t* scalars2{}; projective_t* zero_points{}; g2_projective_t* g2_zero_points{}; - scalar_field_t* zero_scalars{}; - scalar_field_t* one_scalars{}; + scalar_t* zero_scalars{}; + scalar_t* one_scalars{}; affine_t* aff_points{}; g2_affine_t* g2_aff_points{}; projective_t* res_points1{}; projective_t* res_points2{}; g2_projective_t* g2_res_points1{}; g2_projective_t* g2_res_points2{}; - scalar_field_t* res_scalars1{}; - scalar_field_t* res_scalars2{}; - scalar_field_t::Wide* res_scalars_wide{}; - scalar_field_t::Wide* res_scalars_wide_full{}; + scalar_t* res_scalars1{}; + scalar_t* res_scalars2{}; + scalar_t::Wide* res_scalars_wide{}; + scalar_t::Wide* res_scalars_wide_full{}; PrimitivesTest() { @@ -54,22 +54,20 @@ protected: assert(!cudaMallocManaged(&points2, n * sizeof(projective_t))); assert(!cudaMallocManaged(&g2_points1, n * sizeof(g2_projective_t))); assert(!cudaMallocManaged(&g2_points2, n * sizeof(g2_projective_t))); - assert(!cudaMallocManaged(&scalars1, n * sizeof(scalar_field_t))); - assert(!cudaMallocManaged(&scalars2, n * sizeof(scalar_field_t))); + assert(!cudaMallocManaged(&scalars1, n * sizeof(scalar_t))); + assert(!cudaMallocManaged(&scalars2, n * sizeof(scalar_t))); assert(!cudaMallocManaged(&zero_points, n * sizeof(projective_t))); assert(!cudaMallocManaged(&g2_zero_points, n * sizeof(g2_projective_t))); - assert(!cudaMallocManaged(&zero_scalars, n * sizeof(scalar_field_t))); - 
assert(!cudaMallocManaged(&one_scalars, n * sizeof(scalar_field_t))); + assert(!cudaMallocManaged(&zero_scalars, n * sizeof(scalar_t))); + assert(!cudaMallocManaged(&one_scalars, n * sizeof(scalar_t))); assert(!cudaMallocManaged(&aff_points, n * sizeof(affine_t))); assert(!cudaMallocManaged(&g2_aff_points, n * sizeof(g2_affine_t))); assert(!cudaMallocManaged(&res_points1, n * sizeof(projective_t))); assert(!cudaMallocManaged(&res_points2, n * sizeof(projective_t))); assert(!cudaMallocManaged(&g2_res_points1, n * sizeof(g2_projective_t))); assert(!cudaMallocManaged(&g2_res_points2, n * sizeof(g2_projective_t))); - assert(!cudaMallocManaged(&res_scalars1, n * sizeof(scalar_field_t))); - assert(!cudaMallocManaged(&res_scalars2, n * sizeof(scalar_field_t))); - assert(!cudaMallocManaged(&res_scalars_wide, n * sizeof(scalar_field_t::Wide))); - assert(!cudaMallocManaged(&res_scalars_wide_full, n * sizeof(scalar_field_t::Wide))); + assert(!cudaMallocManaged(&res_scalars1, n * sizeof(scalar_t))); + assert(!cudaMallocManaged(&res_scalars2, n * sizeof(scalar_t))); } ~PrimitivesTest() override @@ -93,9 +91,6 @@ protected: cudaFree(res_scalars1); cudaFree(res_scalars2); - cudaFree(res_scalars_wide); - cudaFree(res_scalars_wide_full); - cudaDeviceReset(); } @@ -105,22 +100,20 @@ protected: ASSERT_EQ(device_populate_random(points2, n), cudaSuccess); ASSERT_EQ(device_populate_random(g2_points1, n), cudaSuccess); ASSERT_EQ(device_populate_random(g2_points2, n), cudaSuccess); - ASSERT_EQ(device_populate_random(scalars1, n), cudaSuccess); - ASSERT_EQ(device_populate_random(scalars2, n), cudaSuccess); + ASSERT_EQ(device_populate_random(scalars1, n), cudaSuccess); + ASSERT_EQ(device_populate_random(scalars2, n), cudaSuccess); ASSERT_EQ(device_set(zero_points, projective_t::zero(), n), cudaSuccess); ASSERT_EQ(device_set(g2_zero_points, g2_projective_t::zero(), n), cudaSuccess); - ASSERT_EQ(device_set(zero_scalars, scalar_field_t::zero(), n), cudaSuccess); - 
ASSERT_EQ(device_set(one_scalars, scalar_field_t::one(), n), cudaSuccess); + ASSERT_EQ(device_set(zero_scalars, scalar_t::zero(), n), cudaSuccess); + ASSERT_EQ(device_set(one_scalars, scalar_t::one(), n), cudaSuccess); ASSERT_EQ(cudaMemset(aff_points, 0, n * sizeof(affine_t)), cudaSuccess); ASSERT_EQ(cudaMemset(g2_aff_points, 0, n * sizeof(g2_affine_t)), cudaSuccess); ASSERT_EQ(cudaMemset(res_points1, 0, n * sizeof(projective_t)), cudaSuccess); ASSERT_EQ(cudaMemset(res_points2, 0, n * sizeof(projective_t)), cudaSuccess); ASSERT_EQ(cudaMemset(g2_res_points1, 0, n * sizeof(g2_projective_t)), cudaSuccess); ASSERT_EQ(cudaMemset(g2_res_points2, 0, n * sizeof(g2_projective_t)), cudaSuccess); - ASSERT_EQ(cudaMemset(res_scalars1, 0, n * sizeof(scalar_field_t)), cudaSuccess); - ASSERT_EQ(cudaMemset(res_scalars2, 0, n * sizeof(scalar_field_t)), cudaSuccess); - ASSERT_EQ(cudaMemset(res_scalars_wide, 0, n * sizeof(scalar_field_t::Wide)), cudaSuccess); - ASSERT_EQ(cudaMemset(res_scalars_wide_full, 0, n * sizeof(scalar_field_t::Wide)), cudaSuccess); + ASSERT_EQ(cudaMemset(res_scalars1, 0, n * sizeof(scalar_t)), cudaSuccess); + ASSERT_EQ(cudaMemset(res_scalars2, 0, n * sizeof(scalar_t)), cudaSuccess); } }; @@ -319,82 +312,6 @@ TEST_F(PrimitivesTest, ECMixedAdditionOfNegatedPointEqSubtraction) ASSERT_EQ(res_points1[i], points1[i] + res_points2[i]); } -TEST_F(PrimitivesTest, MP_LSB_MULT) -{ - // LSB multiply, check correctness of first TLC + 1 digits result. 
- ASSERT_EQ(mp_lsb_mult(scalars1, scalars2, res_scalars_wide), cudaSuccess); - std::cout << "first GPU lsb mult output = 0x"; - for (int i = 0; i < 2 * scalar_field_t::TLC; i++) { - std::cout << std::hex << res_scalars_wide[0].limbs_storage.limbs[i]; - } - std::cout << std::endl; - - ASSERT_EQ(mp_mult(scalars1, scalars2, res_scalars_wide_full), cudaSuccess); - std::cout << "first GPU full mult output = 0x"; - for (int i = 0; i < 2 * scalar_field_t::TLC; i++) { - std::cout << std::hex << res_scalars_wide_full[0].limbs_storage.limbs[i]; - } - std::cout << std::endl; - for (int j = 0; j < n; j++) { - for (int i = 0; i < scalar_field_t::TLC + 1; i++) { - ASSERT_EQ(res_scalars_wide_full[j].limbs_storage.limbs[i], res_scalars_wide[j].limbs_storage.limbs[i]); - } - } -} - -TEST_F(PrimitivesTest, MP_MSB_MULT) -{ - // MSB multiply, take n msb bits of multiplication, assert that the error is up to 1. - ASSERT_EQ(mp_msb_mult(scalars1, scalars2, res_scalars_wide), cudaSuccess); - std::cout << "first GPU msb mult output = 0x"; - for (int i = 2 * scalar_field_t::TLC - 1; i >= 0; i--) { - std::cout << std::hex << res_scalars_wide[0].limbs_storage.limbs[i] << " "; - } - std::cout << std::endl; - - ASSERT_EQ(mp_mult(scalars1, scalars2, res_scalars_wide_full), cudaSuccess); - std::cout << "first GPU full mult output = 0x"; - for (int i = 2 * scalar_field_t::TLC - 1; i >= 0; i--) { - std::cout << std::hex << res_scalars_wide_full[0].limbs_storage.limbs[i] << " "; - } - - std::cout << std::endl; - - for (int i = 0; i < 2 * scalar_field_t::TLC - 1; i++) { - if (res_scalars_wide_full[0].limbs_storage.limbs[i] == res_scalars_wide[0].limbs_storage.limbs[i]) - std::cout << "matched word idx = " << i << std::endl; - } -} - -TEST_F(PrimitivesTest, INGO_MP_MULT) -{ - // MSB multiply, take n msb bits of multiplication, assert that the error is up to 1. 
- ASSERT_EQ(ingo_mp_mult(scalars1, scalars2, res_scalars_wide), cudaSuccess); - std::cout << "INGO = 0x"; - for (int i = 0; i < 2 * scalar_field_t::TLC; i++) { - std::cout << std::hex << res_scalars_wide[0].limbs_storage.limbs[i] << " "; - } - std::cout << std::endl; - - ASSERT_EQ(mp_mult(scalars1, scalars2, res_scalars_wide_full), cudaSuccess); - std::cout << "ZKSYNC = 0x"; - for (int i = 0; i < 2 * scalar_field_t::TLC; i++) { - std::cout << std::hex << res_scalars_wide_full[0].limbs_storage.limbs[i] << " "; - } - - std::cout << std::endl; - - for (int i = 0; i < 2 * scalar_field_t::TLC - 1; i++) { - if (res_scalars_wide_full[0].limbs_storage.limbs[i] == res_scalars_wide[0].limbs_storage.limbs[i]) - std::cout << "matched word idx = " << i << std::endl; - } - for (int j = 0; j < n; j++) { - for (int i = 0; i < 2 * scalar_field_t::TLC - 1; i++) { - ASSERT_EQ(res_scalars_wide_full[j].limbs_storage.limbs[i], res_scalars_wide[j].limbs_storage.limbs[i]); - } - } -} - TEST_F(PrimitivesTest, G2ECRandomPointsAreOnCurve) { for (unsigned i = 0; i < n; i++) diff --git a/icicle/primitives/test_kernels.cuh b/icicle/primitives/test_kernels.cuh index 2555ab569..bd8d2e145 100644 --- a/icicle/primitives/test_kernels.cuh +++ b/icicle/primitives/test_kernels.cuh @@ -75,28 +75,28 @@ int vec_mul(const F* x, const G* y, G* result, const unsigned count) return error ? 
error : cudaDeviceSynchronize(); } -__global__ void inv_field_elements_kernel(const scalar_field_t* x, scalar_field_t* result, const unsigned count) +__global__ void inv_field_elements_kernel(const scalar_t* x, scalar_t* result, const unsigned count) { const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; if (gid >= count) return; - result[gid] = scalar_field_t::inverse(x[gid]); + result[gid] = scalar_t::inverse(x[gid]); } -int field_vec_inv(const scalar_field_t* x, scalar_field_t* result, const unsigned count) +int field_vec_inv(const scalar_t* x, scalar_t* result, const unsigned count) { inv_field_elements_kernel<<<(count - 1) / 32 + 1, 32>>>(x, result, count); int error = cudaGetLastError(); return error ? error : cudaDeviceSynchronize(); } -__global__ void sqr_field_elements_kernel(const scalar_field_t* x, scalar_field_t* result, const unsigned count) +__global__ void sqr_field_elements_kernel(const scalar_t* x, scalar_t* result, const unsigned count) { const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; if (gid >= count) return; - result[gid] = scalar_field_t::sqr(x[gid]); + result[gid] = scalar_t::sqr(x[gid]); } -int field_vec_sqr(const scalar_field_t* x, scalar_field_t* result, const unsigned count) +int field_vec_sqr(const scalar_t* x, scalar_t* result, const unsigned count) { sqr_field_elements_kernel<<<(count - 1) / 32 + 1, 32>>>(x, result, count); int error = cudaGetLastError(); @@ -118,81 +118,3 @@ int point_vec_to_affine(const P* x, A* result, const unsigned count) int error = cudaGetLastError(); return error ? 
error : cudaDeviceSynchronize(); } - -__global__ void mp_mult_kernel(const scalar_field_t* x, const scalar_field_t* y, scalar_field_t::Wide* result) -{ - const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; - scalar_field_t::multiply_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage); -} - -int mp_mult(const scalar_field_t* x, scalar_field_t* y, scalar_field_t::Wide* result) -{ - mp_mult_kernel<<<1, 32>>>(x, y, result); - int error = cudaGetLastError(); - return error ? error : cudaDeviceSynchronize(); -} - -__global__ void mp_lsb_mult_kernel(const scalar_field_t* x, const scalar_field_t* y, scalar_field_t::Wide* result) -{ - const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; - scalar_field_t::multiply_lsb_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage); -} - -int mp_lsb_mult(const scalar_field_t* x, scalar_field_t* y, scalar_field_t::Wide* result) -{ - mp_lsb_mult_kernel<<<1, 32>>>(x, y, result); - int error = cudaGetLastError(); - return error ? error : cudaDeviceSynchronize(); -} - -__global__ void mp_msb_mult_kernel(const scalar_field_t* x, const scalar_field_t* y, scalar_field_t::Wide* result) -{ - const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; - scalar_field_t::multiply_msb_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage); -} - -int mp_msb_mult(const scalar_field_t* x, scalar_field_t* y, scalar_field_t::Wide* result) -{ - mp_msb_mult_kernel<<<1, 1>>>(x, y, result); - int error = cudaGetLastError(); - return error ? 
error : cudaDeviceSynchronize(); -} - -__global__ void ingo_mp_mult_kernel(const scalar_field_t* x, const scalar_field_t* y, scalar_field_t::Wide* result) -{ - const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; - scalar_field_t::ingo_multiply_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage); -} - -int ingo_mp_mult(const scalar_field_t* x, scalar_field_t* y, scalar_field_t::Wide* result) -{ - ingo_mp_mult_kernel<<<1, 32>>>(x, y, result); - int error = cudaGetLastError(); - return error ? error : cudaDeviceSynchronize(); -} - -__global__ void ingo_mp_msb_mult_kernel(const scalar_field_t* x, const scalar_field_t* y, scalar_field_t::Wide* result) -{ - const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; - scalar_field_t::ingo_msb_multiply_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage); -} - -int ingo_mp_msb_mult(const scalar_field_t* x, scalar_field_t* y, scalar_field_t::Wide* result, const unsigned n) -{ - ingo_mp_msb_mult_kernel<<<1, n>>>(x, y, result); - int error = cudaGetLastError(); - return error ? error : cudaDeviceSynchronize(); -} - -__global__ void ingo_mp_mod_mult_kernel(const scalar_field_t* x, const scalar_field_t* y, scalar_field_t* result) -{ - const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; - result[gid] = x[gid] * y[gid]; -} - -int ingo_mp_mod_mult(const scalar_field_t* x, scalar_field_t* y, scalar_field_t* result, const unsigned n) -{ - ingo_mp_mod_mult_kernel<<<1, n>>>(x, y, result); - int error = cudaGetLastError(); - return error ? 
error : cudaDeviceSynchronize(); -} \ No newline at end of file diff --git a/icicle/utils/device_context.cu b/icicle/utils/device_context.cu new file mode 100644 index 000000000..7e2c42f98 --- /dev/null +++ b/icicle/utils/device_context.cu @@ -0,0 +1,9 @@ +#include "device_context.cuh" + +namespace device_context { + + extern "C" DeviceContext GetDefaultDeviceContext() { + return get_default_device_context(); + } + +} diff --git a/icicle/utils/device_context.cuh b/icicle/utils/device_context.cuh index 5ce30e485..21ac61cdf 100644 --- a/icicle/utils/device_context.cuh +++ b/icicle/utils/device_context.cuh @@ -15,6 +15,17 @@ namespace device_context { cudaMemPool_t mempool; /**< Mempool to use. Default value: 0. */ }; + /** + * Return default device context that corresponds to using the default stream of the first GPU + */ + inline DeviceContext get_default_device_context() { + return DeviceContext { + 0, // device_id + (cudaStream_t)0, // stream + 0, // mempool + }; + } + } // namespace device_context #endif diff --git a/icicle/utils/error_handler.cuh b/icicle/utils/error_handler.cuh index 58c008153..b8af83050 100644 --- a/icicle/utils/error_handler.cuh +++ b/icicle/utils/error_handler.cuh @@ -6,12 +6,32 @@ #define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__) template -void check(T err, const char* const func, const char* const file, const int line); +void inline check(T err, const char* const func, const char* const file, const int line) +{ + if (err != cudaSuccess) { + std::cerr << "CUDA Runtime Error at: " << file << ":" << line << std::endl; + std::cerr << cudaGetErrorString(err) << " " << func << std::endl; + } +} #define CHECK_LAST_CUDA_ERROR() checkLast(__FILE__, __LINE__) -void checkLast(const char* const file, const int line); +void inline checkLast(const char* const file, const int line) +{ + cudaError_t err{cudaGetLastError()}; + if (err != cudaSuccess) { + std::cerr << "CUDA Runtime Error at: " << file << ":" << line << std::endl; + 
std::cerr << cudaGetErrorString(err) << std::endl; + } +} #define CHECK_SYNC_DEVICE_ERROR() syncDevice(__FILE__, __LINE__) -void syncDevice(const char* const file, const int line); +void inline syncDevice(const char* const file, const int line) +{ + cudaError_t err{cudaDeviceSynchronize()}; + if (err != cudaSuccess) { + std::cerr << "CUDA Runtime Error at: " << file << ":" << line << std::endl; + std::cerr << cudaGetErrorString(err) << std::endl; + } +} #endif diff --git a/icicle/utils/utils_kernels.cu b/icicle/utils/utils_kernels.cu deleted file mode 100644 index e1099cc36..000000000 --- a/icicle/utils/utils_kernels.cu +++ /dev/null @@ -1,29 +0,0 @@ -#include "utils_kernels.cuh" - -namespace utils_internal { - // TODO: weird linking issue - only works in headers - // template - // __global__ void NormalizeKernel(E* arr, S scalar, unsigned n) - // { - // int tid = blockIdx.x * blockDim.x + threadIdx.x; - // if (tid < n) { arr[tid] = scalar * arr[tid]; } - // } - - template - __global__ void NormalizeKernel(E* arr, S scalar, int n) - { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < n) { arr[tid] = scalar * arr[tid]; } - } - - template - __global__ void BatchMulKernel(E* element_vec, S* scalar_vec, int n_scalars, int batch_size) - { - int tid = blockDim.x * blockIdx.x + threadIdx.x; - if (tid < n_scalars * batch_size) { - int scalar_id = tid % n_scalars; - element_vec[tid] = scalar_vec[scalar_id] * element_vec[tid]; - } - } - -} // namespace utils_internal diff --git a/icicle/utils/utils_kernels.cuh b/icicle/utils/utils_kernels.cuh index ae73da595..5ef3dd2b5 100644 --- a/icicle/utils/utils_kernels.cuh +++ b/icicle/utils/utils_kernels.cuh @@ -2,22 +2,31 @@ #ifndef UTILS_KERNELS_H #define UTILS_KERNELS_H +#include "utils_kernels.cuh" + namespace utils_internal { + // TODO: weird linking issue - only works in headers + // template + // __global__ void NormalizeKernel(E* arr, S scalar, unsigned n) + // { + // int tid = blockIdx.x * blockDim.x + 
threadIdx.x; + // if (tid < n) { arr[tid] = scalar * arr[tid]; } + // } template - __global__ void NormalizeKernel(E* arr, S scalar, unsigned n) + __global__ void NormalizeKernel(E* arr, S scalar, int n) { int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < n) { arr[tid] = scalar * arr[tid]; } } template - __global__ void BatchMulKernel(E* element_vec, S* scalar_vec, unsigned n_scalars, unsigned batch_size) + __global__ void BatchMulKernel(E* in_vec, int n_elements, int batch_size, S* scalar_vec, int step, int n_scalars, E* out_vec) { int tid = blockDim.x * blockIdx.x + threadIdx.x; - if (tid < n_scalars * batch_size) { - int scalar_id = tid % n_scalars; - element_vec[tid] = scalar_vec[scalar_id] * element_vec[tid]; + if (tid < n_elements * batch_size) { + int scalar_id = tid % n_elements; + out_vec[tid] = *(scalar_vec + ((scalar_id * step) % n_scalars)) * in_vec[tid]; } } diff --git a/scripts/hooks/pre-push b/scripts/hooks/pre-push index cbd7844d2..f26b8bbcb 100755 --- a/scripts/hooks/pre-push +++ b/scripts/hooks/pre-push @@ -3,9 +3,13 @@ status=0 # Run clang-format on CUDA, C, and CPP files # clang-format writes to stderr in dry-run mode. In order to capture the output to detect if there are changes needed we redirect stderr to stdin -if [[ $(find ./ -path ./icicle/build -prune -o -path ./target -prune -iname *.h -or -iname *.cuh -or -iname *.cu -or -iname *.c -or -iname *.cpp | xargs clang-format --dry-run -ferror-limit=1 -style=file 2>&1) ]]; +# to print list of files +unformatted_files=$(find ./ -path ./icicle/build -prune -o -path ./target -prune -iname *.h -or -iname *.cuh -or -iname *.cu -or -iname *.c -or -iname *.cpp | xargs clang-format --dry-run -ferror-limit=1 -style=file 2>&1) + +if [[ $unformatted_files ]]; then echo "🚨 There are files in Icicle Core that need formatting." 
+ echo $unformatted_files echo "Please format all .c, .cpp, .h, .cu, .cuh files using the following command:" echo "find ./ -path ./icicle/build -prune -o -path ./target -prune -iname *.h -or -iname *.cuh -or -iname *.cu -or -iname *.c -or -iname *.cpp | xargs clang-format -i -style=file" status=1 diff --git a/wrappers/rust/icicle-core/src/ntt/mod.rs b/wrappers/rust/icicle-core/src/ntt/mod.rs index d9f150e0e..426abe896 100644 --- a/wrappers/rust/icicle-core/src/ntt/mod.rs +++ b/wrappers/rust/icicle-core/src/ntt/mod.rs @@ -3,9 +3,17 @@ use std::os::raw::c_int; /** * @enum Ordering - * How to order inputs and outputs of the NTT: - * - kNN: inputs and outputs are natural-order (example of natural ordering: \f$ \{a_0, a_1, a_2, a_3, a_4, a_5, a_6, a_7\} \f$). - * - kNR: inputs are natural-order and outputs are bit-reversed-order (example of bit-reversed ordering: \f$ \{a_0, a_4, a_2, a_6, a_1, a_5, a_3, a_7\} \f$). + * How to order inputs and outputs of the NTT. If needed, use this field to specify decimation: decimation in time + * (DIT) corresponds to `Ordering::kRN` while decimation in frequency (DIF) to `Ordering::kNR`. Also, to specify + * butterfly to be used, select `Ordering::kRN` for Cooley-Tukey and `Ordering::kNR` for Gentleman-Sande. There's + * no implication that a certain decimation or butterfly will actually be used under the hood, this is just for + * compatibility with codebases that use "decimation" and "butterfly" to denote ordering of inputs and outputs. + * + * Ordering options are: + * - kNN: inputs and outputs are natural-order (example of natural ordering: \f$ \{a_0, a_1, a_2, a_3, a_4, a_5, a_6, + * a_7\} \f$). + * - kNR: inputs are natural-order and outputs are bit-reversed-order (example of bit-reversed ordering: \f$ \{a_0, + * a_4, a_2, a_6, a_1, a_5, a_3, a_7\} \f$). * - kRN: inputs are bit-reversed-order and outputs are natural-order. * - kRR: inputs and outputs are bit-reversed-order. 
*/ @@ -19,86 +27,41 @@ pub enum Ordering { kRR, } -/** - * @enum Decimation - * Decimation of the NTT algorithm: - * - kDIT: decimation in time. - * - kDIF: decimation in frequency. - */ -#[allow(non_camel_case_types)] -#[repr(C)] -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub enum Decimation { - kDIT, - kDIF, -} - -/** - * @enum Butterfly - * [Butterfly](https://en.wikipedia.org/wiki/Butterfly_diagram) used in the NTT algorithm (i.e. what happens to each pair of inputs on every iteration): - * - kCooleyTukey: Cooley-Tukey butterfly. - * - kGentlemanSande: Gentleman-Sande butterfly. - */ -#[allow(non_camel_case_types)] -#[repr(C)] -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub enum Butterfly { - kCooleyTukey, - kGentlemanSande, -} - /** * @struct NTTConfig * Struct that encodes NTT parameters to be passed into the [ntt](@ref ntt) function. */ #[repr(C)] #[derive(Debug)] -pub struct NTTConfigCuda<'a, E, S> { - pub inout: *mut E, - /**< Input that's mutated in-place by this function. Length of this array needs to be \f$ size \cdot config.batch_size \f$. - * Note that if inputs are in Montgomery form, the outputs will be as well and vice-verse: non-Montgomery inputs produce non-Montgomety outputs.*/ - pub is_input_on_device: bool, - /**< True if inputs/outputs are on device and false if they're on host. Default value: false. */ - pub is_inverse: bool, - /**< True if true . Default value: false. */ +pub struct NTTConfig<'a, S> { + /** Coset generator. Used to perform coset (i)NTTs. Default value: `S::one()` (corresponding to no coset being used). */ + pub coset_gen: S, + /** Ordering of inputs and outputs. See [Ordering](@ref Ordering). Default value: `Ordering::kNN`. */ pub ordering: Ordering, - /**< Ordering of inputs and outputs. See [Ordering](@ref Ordering). Default value: `Ordering::kNN`. */ - pub decimation: Decimation, - /**< Decimation of the algorithm, see [Decimation](@ref Decimation). Default value: `Decimation::kDIT`. 
- * __Note:__ this variable exists mainly for compatibility with codebases that use similar notation. - * If [ordering](@ref ordering) is `Ordering::kRN`, the value of this variable will be overridden to - * `Decimation::kDIT` and if ordering is `Ordering::kNR` — to `Decimation::kDIF`. */ - pub butterfly: Butterfly, - /**< Butterfly used by the NTT. See [Butterfly](@ref Butterfly). Default value: `Butterfly::kCooleyTukey`. - * __Note:__ this variable exists mainly for compatibility with codebases that use similar notation. - * If [ordering](@ref ordering) is `Ordering::kRN`, the value of this variable will be overridden to - * `Butterfly::kCooleyTukey` and if ordering is `Ordering::kNR` — to `Butterfly::kGentlemanSande`. */ - pub is_coset: bool, - /**< If false, NTT is computed on a subfield given by [twiddles](@ref twiddles). If true, NTT is computed - * on a coset of [twiddles](@ref twiddles) given by [the coset generator](@ref coset_gen), so: - * \f$ \{coset\_gen\cdot\omega^0, coset\_gen\cdot\omega^1, \dots, coset\_gen\cdot\omega^{n-1}\} \f$. Default value: false. */ - pub coset_gen: *const S, - /**< The field element that generates a coset if [is_coset](@ref is_coset) is true. - * Otherwise should be set to `nullptr`. Default value: `nullptr`. */ - pub twiddles: *const S, - /**< "Twiddle factors", (or "domain", or "roots of unity") on which the NTT is evaluated. - * This pointer is expected to live on device. The order is as follows: - * \f$ \{\omega^0=1, \omega^1, \dots, \omega^{n-1}\} \f$. If this pointer is `nullptr`, twiddle factors - * are generated online using the default generator (TODO: link to twiddle gen here) and function - * [GenerateTwiddleFactors](@ref GenerateTwiddleFactors). Default value: `nullptr`. */ - pub inv_twiddles: *const S, - /**< "Inverse twiddle factors", (or "domain", or "roots of unity") on which the iNTT is evaluated. - * This pointer is expected to live on device. 
The order is as follows: - * \f$ \{\omega^0=1, \omega^1, \dots, \omega^{n-1}\} \f$. If this pointer is `nullptr`, twiddle factors - * are generated online using the default generator (TODO: link to twiddle gen here) and function - * [GenerateTwiddleFactors](@ref GenerateTwiddleFactors). Default value: `nullptr`. */ - pub size: c_int, - /**< NTT size \f$ n \f$. If a batch of NTTs (which all need to have the same size) is computed, this is the size of 1 NTT. */ + /** True if inputs are on device and false if they're on host. Default value: false. */ + pub are_inputs_on_device: bool, + /** If true, output is preserved on device for subsequent use in config and not freed after calculation. Default value: false. */ + pub are_outputs_on_device: bool, + /** The number of NTTs to compute. Default value: 1. */ pub batch_size: c_int, - /**< The number of NTTs to compute. Default value: 1. */ - pub is_preserving_twiddles: bool, - /**< If true, twiddle factors are preserved on device for subsequent use in config and not freed after calculation. Default value: false. */ - pub is_output_on_device: bool, - /**< If true, output is preserved on device for subsequent use in config and not freed after calculation. Default value: false. */ - pub ctx: DeviceContext<'a>, /*< Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext). */ + /** Whether to run the NTT asynchronously. If set to `true`, the NTT function will be non-blocking and you'd need to synchronize + * it explicitly by running `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the NTT + * function will block the current CPU thread. */ + pub is_async: bool, + /** Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext). 
*/ + pub ctx: DeviceContext<'a>, } + +// /** +// * @struct Domain +// * Struct containing information about the domain on which (i)NTT is evaluated: twiddle factors and coset generator. +// * Twiddle factors are private, static and can only be set using [GenerateDomain](@ref GenerateDomain) function. +// * The internal representation of twiddles is prone to change in accordance with changing [NTT](@ref NTT) algorithm. +// */ +// #[repr(C)] +// #[derive(Debug)] +// pub struct Domain<'a, T> { +// /** Scalar elements that specify a coset to be used in (i)NTT. Default value: None (no coset or alternatively coset +// * generated by `S::one()` is used). */ +// pub coset_table: Option<&'a [T]>, +// } diff --git a/wrappers/rust/icicle-cuda-runtime/Cargo.toml b/wrappers/rust/icicle-cuda-runtime/Cargo.toml index 3cc5951c1..74fc7f4a5 100644 --- a/wrappers/rust/icicle-cuda-runtime/Cargo.toml +++ b/wrappers/rust/icicle-cuda-runtime/Cargo.toml @@ -8,7 +8,7 @@ homepage = "https://www.ingonyama.com" repository = "https://github.com/ingonyama-zk/icicle" [dependencies] -bitflags = "2.4" +bitflags = "1.3" [build-dependencies] bindgen = "*" \ No newline at end of file diff --git a/wrappers/rust/icicle-cuda-runtime/src/device_context.rs b/wrappers/rust/icicle-cuda-runtime/src/device_context.rs index 742d44b3a..11e72894c 100644 --- a/wrappers/rust/icicle-cuda-runtime/src/device_context.rs +++ b/wrappers/rust/icicle-cuda-runtime/src/device_context.rs @@ -1,5 +1,5 @@ use crate::memory::CudaMemPool; -use crate::stream::CudaStream; +use crate::stream::{CudaStream, CudaStreamCreateFlags}; /// Properties of the device used in icicle functions. 
#[repr(C)] diff --git a/wrappers/rust/icicle-cuda-runtime/src/stream.rs b/wrappers/rust/icicle-cuda-runtime/src/stream.rs index cad33d795..d8b474980 100644 --- a/wrappers/rust/icicle-cuda-runtime/src/stream.rs +++ b/wrappers/rust/icicle-cuda-runtime/src/stream.rs @@ -1,5 +1,6 @@ use crate::bindings::{ - cudaStreamCreate, cudaStreamDefault, cudaStreamDestroy, cudaStreamNonBlocking, cudaStreamSynchronize, cudaStream_t, + cudaStreamCreate, cudaStreamCreateWithFlags, cudaStreamDefault, cudaStreamDestroy, cudaStreamNonBlocking, + cudaStreamSynchronize, cudaStream_t, }; use crate::error::{CudaResult, CudaResultWrap}; use bitflags::bitflags; @@ -34,6 +35,15 @@ impl CudaStream { } } + pub fn create_with_flags(flags: CudaStreamCreateFlags) -> CudaResult { + let mut handle = MaybeUninit::::uninit(); + unsafe { + cudaStreamCreateWithFlags(handle.as_mut_ptr(), flags.bits) + .wrap_maybe_uninit(handle) + .map(CudaStream::from_handle) + } + } + pub fn destroy(self) -> CudaResult<()> { let handle = self.handle; forget(self); diff --git a/wrappers/rust/icicle-curves/icicle-bn254/build.rs b/wrappers/rust/icicle-curves/icicle-bn254/build.rs index baa64e81d..f65dfe5f9 100644 --- a/wrappers/rust/icicle-curves/icicle-bn254/build.rs +++ b/wrappers/rust/icicle-curves/icicle-bn254/build.rs @@ -10,11 +10,9 @@ fn main() { let target_output_dir = format!("{}/../../target/{}", cargo_dir, profile); - Config::new("./icicle") + Config::new("../../../../icicle") .define("BUILD_TESTS", "OFF") //TODO: feature - // .define("CURVE", "bls12_381") .define("CURVE", "bn254") - // .define("ECNTT_DEFINED", "") //TODO: feature .define("LIBRARY_OUTPUT_DIRECTORY", &target_output_dir) .define("CMAKE_BUILD_TYPE", "Release") .build_target("icicle") @@ -22,7 +20,6 @@ fn main() { println!("cargo:rustc-link-search={}", &target_output_dir); - // println!("cargo:rustc-link-lib=icicle"); println!("cargo:rustc-link-lib=ingo_bn254"); println!("cargo:rustc-link-lib=stdc++"); // 
println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); diff --git a/wrappers/rust/icicle-curves/icicle-bn254/src/msm/mod.rs b/wrappers/rust/icicle-curves/icicle-bn254/src/msm/mod.rs index 69b3a72cf..9885914e5 100644 --- a/wrappers/rust/icicle-curves/icicle-bn254/src/msm/mod.rs +++ b/wrappers/rust/icicle-curves/icicle-bn254/src/msm/mod.rs @@ -12,12 +12,12 @@ extern "C" { out: *mut G1Projective, ) -> CudaError; - #[link_name = "bn254GetDefaultMSMConfig"] - fn GetDefaultMSMConfig() -> MSMConfig<'static>; + #[link_name = "bn254DefaultMSMConfig"] + fn default_msm_config() -> MSMConfig<'static>; } pub fn get_default_msm_config() -> MSMConfig<'static> { - unsafe { GetDefaultMSMConfig() } + unsafe { default_msm_config() } } pub fn msm<'a>( diff --git a/wrappers/rust/icicle-curves/icicle-bn254/src/ntt/config.rs b/wrappers/rust/icicle-curves/icicle-bn254/src/ntt/config.rs deleted file mode 100644 index 0d37527ad..000000000 --- a/wrappers/rust/icicle-curves/icicle-bn254/src/ntt/config.rs +++ /dev/null @@ -1,61 +0,0 @@ -use std::os::raw::c_int; - -use crate::curve::*; -use icicle_core::ntt::{Butterfly, Decimation, NTTConfigCuda, Ordering}; -use icicle_cuda_runtime::device_context::{get_default_device_context, DeviceContext}; - -pub(super) type ECNTTConfig<'a> = NTTConfigCuda<'a, G1Projective, ScalarField>; -pub(super) type NTTConfig<'a> = NTTConfigCuda<'a, ScalarField, ScalarField>; - -pub(super) fn get_ntt_config(size: usize, ctx: DeviceContext) -> NTTConfigCuda { - //TODO: implement on CUDA side - - NTTConfigCuda:: { - inout: 0 as _, // inout as *mut _ as *mut ScalarField, - is_input_on_device: false, - is_inverse: false, - ordering: Ordering::kNN, - decimation: Decimation::kDIF, - butterfly: Butterfly::kCooleyTukey, - is_coset: false, - coset_gen: 0 as _, //TODO: ? 
- twiddles: 0 as _, //TODO: ?, - inv_twiddles: 0 as _, //TODO: ?, - size: size as i32, - batch_size: 0 as i32, - is_preserving_twiddles: true, - is_output_on_device: false, - ctx, - } -} - -pub(super) fn get_ntt_default_config(size: usize) -> NTTConfigCuda<'static, E, S> { - //TODO: implement on CUDA side - let ctx = get_default_device_context(); - - // let root_of_unity = S::default(); //TODO: implement on CUDA side - - let config = get_ntt_config(size, ctx); - - config -} - -pub(super) fn get_ntt_config_with_input(ntt_intt_result: &mut [ScalarField], size: usize, batches: usize) -> NTTConfig { - NTTConfig { - inout: ntt_intt_result as *mut _ as *mut ScalarField, - is_input_on_device: false, - is_inverse: false, - ordering: Ordering::kNN, - decimation: Decimation::kDIF, - butterfly: Butterfly::kCooleyTukey, - is_coset: false, - coset_gen: &[ScalarField::zero()] as _, //TODO: ? - twiddles: 0 as *const ScalarField, //TODO: ?, - inv_twiddles: 0 as *const ScalarField, //TODO: ?, - size: size as _, - batch_size: batches as i32, - is_preserving_twiddles: true, - is_output_on_device: true, - ctx: get_default_device_context(), - } -} diff --git a/wrappers/rust/icicle-curves/icicle-bn254/src/ntt/domain.rs b/wrappers/rust/icicle-curves/icicle-bn254/src/ntt/domain.rs deleted file mode 100644 index bd9120528..000000000 --- a/wrappers/rust/icicle-curves/icicle-bn254/src/ntt/domain.rs +++ /dev/null @@ -1,192 +0,0 @@ -use icicle_core::ntt::{Butterfly, Decimation, NTTConfigCuda, Ordering}; -use icicle_cuda_runtime::device_context::{get_default_device_context, DeviceContext}; -use icicle_cuda_runtime::memory::DeviceSlice; -use std::default; - -pub(super) type ECNTTDomain<'a> = Domain<'a, G1Projective, ScalarField>; -pub(super) type NTTDomain<'a> = Domain<'a, ScalarField, ScalarField>; - -use crate::curve::*; - -use super::{config::*, ntt_internal}; - -/// Represents the NTT domain -pub struct Domain<'a, E, S> { - config: NTTConfigCuda<'a, E, S>, -} - -impl<'a, E, S> Domain<'a, E, 
S> { - pub fn new(size: usize, ctx: DeviceContext<'a>) -> Self { - Domain { - config: get_ntt_config(size, ctx), - } - } - - pub fn get_output_on_device(&self) -> Result<*mut E, &'static str> { - if self - .config - .is_output_on_device - { - Ok(self - .config - .inout) - } else { - Err("Output should be on device.") - } - } - - pub fn get_input_on_device(&self) -> Result<*mut E, &'static str> { - if self - .config - .is_input_on_device - { - Ok(self - .config - .inout) - } else { - Err("Input should be on device.") - } - } - - pub fn get_input(&self) -> Result<*mut E, &'static str> { - if !self - .config - .is_input_on_device - { - Ok(self - .config - .inout) - } else { - Err("Output is on device.") - } - } - - pub fn get_output(&self) -> Result<*mut E, &'static str> { - if !self - .config - .is_output_on_device - { - Ok(self - .config - .inout) - } else { - Err("Output is on device.") - } - } - - pub(crate) fn new_for_default_context(size: usize) -> Self { - let ctx = get_default_device_context(); - // let default_root_of_unity = S::default(); //TODO: implement - let domain = Domain::new(size, ctx); - domain - } -} - -// Add implementations for other methods and structs as needed. - -impl<'a, E: 'static, S: 'static> Domain<'a, E, S> { - // ... previous methods ... - - // NTT methods - pub fn ntt(&mut self, inout: &mut [E]) { - let batch_size = 1; - - let size = inout.len(); - - if size - != self - .config - .size as _ - { - //TODO: test for this error - panic!( - "input lenght: {} does not match domain size: {}", - size, - self.config - .size - ) - } - - self.config - .inout = inout.as_mut_ptr(); // as *mut _ as *mut E; - self.config - .is_inverse = false; - self.config - .is_input_on_device = false; - self.config - .is_output_on_device = false; - // self.config - // .ordering = Ordering::default(); //TODO: each call? 
- self.config - .batch_size = batch_size as i32; - - ntt_internal(&mut self.config); - } - - pub fn ntt_on_device(&mut self, inout: &mut DeviceSlice) { - // Implementation for NTT on device - } - - pub fn ntt_batch(&mut self, inout: &mut [E]) { - // Implementation for batched NTT - } - - pub fn ntt_batch_on_device(&mut self, inout: &mut DeviceSlice) { - // Implementation for batched NTT on device - } - - pub fn ntt_coset(&mut self, inout: &mut [E], coset: &mut [E]) { - // Implementation for NTT with coset - } - - pub fn ntt_coset_on_device(&mut self, inout: &mut DeviceSlice, coset: &mut DeviceSlice) { - // Implementation for NTT with coset on device - } - - pub fn ntt_coset_batch(&mut self, inout: &mut [E], coset: &mut [E]) { - // Implementation for batched NTT with coset - } - - pub fn ntt_coset_batch_on_device(&mut self, inout: &mut DeviceSlice, coset: &mut DeviceSlice) { - // Implementation for batched NTT with coset on device - } - - // iNTT methods - pub fn intt(&mut self, inout: &mut [E]) { - // Implementation for iNTT - } - - pub fn intt_on_device(&mut self, inout: &mut DeviceSlice) { - // Implementation for iNTT on device - } - - pub fn intt_batch(&mut self, inout: &mut [E]) { - // Implementation for batched iNTT - } - - pub fn intt_batch_on_device(&mut self, inout: &mut DeviceSlice) { - // Implementation for batched iNTT on device - } - - pub fn intt_coset(&mut self, inout: &mut [E], coset: &mut [E]) { - // Implementation for iNTT with coset - } - - pub fn intt_coset_on_device(&mut self, inout: &mut DeviceSlice, coset: &mut DeviceSlice) { - // Implementation for iNTT with coset on device - } - - pub fn intt_coset_batch(&mut self, inout: &mut [E], coset: &mut [E]) { - // Implementation for batched iNTT with coset - } - - pub fn intt_coset_batch_on_device(&mut self, inout: &mut DeviceSlice, coset: &mut DeviceSlice) { - // Implementation for batched iNTT with coset on device - } - - // Ordering setter - pub fn set_ordering(&mut self, ordering: Ordering) { - 
self.config - .ordering = ordering; - } -} diff --git a/wrappers/rust/icicle-curves/icicle-bn254/src/ntt/mod.rs b/wrappers/rust/icicle-curves/icicle-bn254/src/ntt/mod.rs index d418917a3..1f581b0ea 100644 --- a/wrappers/rust/icicle-curves/icicle-bn254/src/ntt/mod.rs +++ b/wrappers/rust/icicle-curves/icicle-bn254/src/ntt/mod.rs @@ -1,87 +1,70 @@ -mod config; -pub mod domain; +use crate::curve::ScalarField; -use std::any::TypeId; - -use crate::curve::*; - -use self::config::*; - -use icicle_core::ntt::{Butterfly, Decimation, Ordering}; -use icicle_cuda_runtime::error::CudaError; +use icicle_core::ntt::NTTConfig; +use icicle_cuda_runtime::device_context::DeviceContext; +use icicle_cuda_runtime::error::{CudaError, CudaResult, CudaResultWrap}; extern "C" { - #[link_name = "NTTDefaultContextCuda"] - fn ntt_cuda(config: *mut NTTConfig) -> CudaError; + #[link_name = "bn254NTTCuda"] + fn ntt_cuda<'a>( + input: *const ScalarField, + size: usize, + is_inverse: bool, + config: &NTTConfig<'a, ScalarField>, + output: *mut ScalarField, + ) -> CudaError; + + #[link_name = "bn254DefaultNTTConfig"] + fn default_ntt_config() -> NTTConfig<'static, ScalarField>; + + #[link_name = "bn254InitializeDomain"] + fn initialize_ntt_domain(primitive_root: ScalarField, ctx: &DeviceContext) -> CudaError; } -pub(crate) fn ntt_wip( - inout: &mut [ScalarField], - is_inverse: bool, - is_input_on_device: bool, - ordering: Ordering, - is_output_on_device: bool, - batch_size: usize, -) { - let mut batch_size = batch_size; - if batch_size == 0 { - batch_size = 1; - } - - let size = inout.len() / batch_size; - - let mut config = get_ntt_default_config::(size); - - config.inout = inout as *mut _ as *mut ScalarField; - config.is_inverse = is_inverse; - config.is_input_on_device = is_input_on_device; - config.is_output_on_device = is_output_on_device; - config.ordering = ordering; - config.batch_size = batch_size as i32; - - ntt_internal(&mut config); +pub fn get_default_ntt_config() -> NTTConfig<'static, 
ScalarField> { + unsafe { default_ntt_config() } } -pub(self) fn ntt_internal(config: *mut TConfig) -> CudaError { - let result_code = unsafe { ntt_cuda(config as _) }; - // let typeid = TypeId::of::(); - // if typeid == TypeId::of::() { - // result_code = unsafe { ntt_cuda(config as _) }; - // } else { - // result_code = CudaError::cudaSuccess; //TODO: unsafe { ecntt_cuda(config as _) }; - // } - - // if result_code != CudaError::cudaSuccess { - // println!("_result_code = {:?}", result_code); - // } - - return CudaError::cudaSuccess; +pub fn initialize_domain(primitive_root: ScalarField, ctx: &DeviceContext) -> CudaResult<()> { + unsafe { initialize_ntt_domain(primitive_root, ctx).wrap() } } -pub(self) fn ecntt_internal(config: *mut ECNTTConfig) -> u32 { - let result_code = 0; //TODO: unsafe { ecntt_cuda(config) }; - if result_code != 0 { - println!("_result_code = {}", result_code); +pub fn ntt( + input: &[ScalarField], + is_inverse: bool, + cfg: &NTTConfig, + output: &mut [ScalarField], +) -> CudaResult<()> { + if input.len() != output.len() { + return Err(CudaError::cudaErrorInvalidValue); } - return result_code; + unsafe { + ntt_cuda( + input as *const _ as *const ScalarField, + input.len(), + is_inverse, + cfg, + output as *mut _ as *mut ScalarField, + ) + .wrap() + } } #[cfg(test)] pub(crate) mod tests { - use ark_bn254::{Fr, G1Affine as arkG1Affine, G1Projective as arkG1Projective}; - // use ark_bls12_381::{Fr, G1Projective}; - use ark_ff::PrimeField; - use ark_poly::EvaluationDomain; - use ark_poly::GeneralEvaluationDomain; - use ark_std::UniformRand; - use std::slice; - - use crate::ntt::domain::NTTDomain; - use crate::{curve::*, ntt::*}; use icicle_core::traits::ArkConvertible; + use icicle_core::ntt::Ordering; + use icicle_cuda_runtime::device_context::get_default_device_context; + + use crate::curve::generate_random_scalars; + use crate::ntt::{get_default_ntt_config, initialize_domain, ntt, ScalarField}; - pub fn reverse_bit_order(n: u32, order: u32) 
-> u32 { + use ark_bn254::Fr; + use ark_ff::FftField; + use ark_poly::{EvaluationDomain, GeneralEvaluationDomain}; + + fn reverse_bit_order(n: u32, order: u32) -> u32 { fn is_power_of_two(n: u32) -> bool { n != 0 && n & (n - 1) == 0 } @@ -95,7 +78,7 @@ pub(crate) mod tests { u32::from_str_radix(&reversed, 2).unwrap() } - pub fn list_to_reverse_bit_order(l: &[T]) -> Vec { + fn list_to_reverse_bit_order(l: &[T]) -> Vec { l.iter() .enumerate() .map(|(i, _)| l[reverse_bit_order(i as u32, l.len() as u32) as usize]) @@ -104,219 +87,98 @@ pub(crate) mod tests { #[test] fn test_ntt() { - //NTT - let test_size = 1 << 11; - let batches = 1; - - let full_test_size = test_size * batches; - let scalars_batch: Vec = generate_random_scalars(full_test_size); - - // let scalars_batch: Vec = (0..full_test_size) - // .into_iter() - // .map(|x| { - // // if x % 1 == 0 { - // if x % 2 == 0 { - // ScalarField::one() - // } else { - // ScalarField::zero() - // } - // }) - // .collect(); + let test_size = 1 << 16; + let ctx = get_default_device_context(); + // two roughly analogous calls for icicle and arkworks. one difference is that icicle call creates + // domain for all NTTs of size <= `test_size`. 
also for icicle domain is a hidden static object + initialize_domain( + ScalarField::from_ark(Fr::get_root_of_unity(test_size as u64).unwrap()), + &ctx, + ).unwrap(); + let ark_domain = GeneralEvaluationDomain::::new(test_size).unwrap(); - let mut ntt_result = scalars_batch.clone(); + let scalars: Vec = generate_random_scalars(test_size); - let ark_domain = GeneralEvaluationDomain::::new(test_size).unwrap(); - let mut domain = NTTDomain::new_for_default_context(test_size); + let config = get_default_ntt_config(); + let mut ntt_result = vec![ScalarField::zero(); test_size]; + ntt(&scalars, false, &config, &mut ntt_result).unwrap(); + assert_ne!(ntt_result, scalars); - let ark_scalars_batch = scalars_batch - .clone() + let ark_scalars = scalars .iter() .map(|v| v.to_ark()) .collect::>(); - let mut ark_ntt_result = ark_scalars_batch.clone(); - + let mut ark_ntt_result = ark_scalars.clone(); ark_domain.fft_in_place(&mut ark_ntt_result); + assert_ne!(ark_ntt_result, ark_scalars); - assert_ne!(ark_ntt_result, ark_scalars_batch); - - // do ntt - // ntt_wip(&mut ntt_result, false, false, Ordering::kNN, false, batches); - domain.ntt(&mut ntt_result); //single ntt let ntt_result_as_ark = ntt_result .iter() .map(|p| p.to_ark()) .collect::>(); - - assert_ne!(ntt_result, scalars_batch); assert_eq!(ark_ntt_result, ntt_result_as_ark); - let mut ark_intt_result = ark_ntt_result; + let mut intt_result = vec![ScalarField::zero(); test_size]; + ntt(&ntt_result, true, &config, &mut intt_result).unwrap(); - ark_domain.ifft_in_place(&mut ark_intt_result); - assert_eq!(ark_intt_result, ark_scalars_batch); - - // check that ntt output is different from input - assert_ne!(ntt_result, scalars_batch); - - // do intt - let mut intt_result = ntt_result; - - ntt_wip(&mut intt_result, true, false, Ordering::kNN, false, batches); - - assert!(ark_intt_result == ark_scalars_batch); - assert!(intt_result == scalars_batch); - - let mut ntt_intt_result = intt_result; - ntt_wip(&mut ntt_intt_result, 
false, false, Ordering::kNR, false, batches); - assert!(ntt_intt_result != scalars_batch); - ntt_wip(&mut ntt_intt_result, true, false, Ordering::kRN, false, batches); - assert!(ntt_intt_result == scalars_batch); - - let mut ntt_intt_result = list_to_reverse_bit_order(&ntt_intt_result); - ntt_wip(&mut ntt_intt_result, false, false, Ordering::kRR, false, batches); - assert!(ntt_intt_result != scalars_batch); - ntt_wip(&mut ntt_intt_result, true, false, Ordering::kRN, false, batches); - assert!(ntt_intt_result == scalars_batch); - - //// - let size = ntt_intt_result.len() / batches; - - let mut config = get_ntt_config_with_input(&mut ntt_intt_result, size, batches); - - ntt_internal(&mut config); - - //host - let mut ntt_result = scalars_batch.clone(); - ntt_wip(&mut ntt_result, false, false, Ordering::kNR, false, batches); - - // let mut buff1 = DeviceBuffer::from_slice(&scalars_batch[..]).unwrap(); - // let dev_ptr1 = buff1 - // .as_device_ptr() - // .as_raw_mut(); - - // let buff_len = buff1.len(); - - // std::mem::forget(buff1); - - // let buff_from_dev_ptr = unsafe { DeviceBuffer::from_raw_parts(DevicePointer::wrap(dev_ptr1), buff_len) }; - // let mut from_device = vec![ScalarField::zero(); scalars_batch.len()]; - // buff_from_dev_ptr - // .copy_to(&mut from_device) - // .unwrap(); - - // assert_eq!(from_device, scalars_batch); - - // host - device - device - host - let mut ntt_intt_result = scalars_batch.clone(); - - let mut config = get_ntt_config_with_input(&mut ntt_intt_result, size, batches); - - config.is_input_on_device = false; - config.is_output_on_device = true; - // config.is_preserving_twiddles = true; // TODO: same as in get_ntt_config - config.ordering = Ordering::kNR; - - ntt_internal(&mut config); //twiddles are preserved after first call - - // config.is_preserving_twiddles = true; //TODO: same as in get_ntt_config - config.is_inverse = true; - config.is_input_on_device = false; - config.is_output_on_device = true; - config.ordering = 
Ordering::kNR; - - ntt_internal(&mut config); //inv_twiddles are preserved after first call - - let ntt_intt_result = &mut scalars_batch.clone()[..]; - let raw_scalars_batch_copy = ntt_intt_result as *mut _ as *mut ScalarField; - - let config_inout2: &mut [ScalarField] = - unsafe { std::slice::from_raw_parts_mut(raw_scalars_batch_copy, config.size as usize) }; - assert_eq!(config_inout2, scalars_batch); - - config.is_preserving_twiddles = true; //TODO: same as in get_ntt_config - - config.inout = raw_scalars_batch_copy; - - config.is_inverse = false; - config.is_input_on_device = false; - config.is_output_on_device = true; - config.ordering = Ordering::kNR; - - ntt_internal(&mut config); - - config.is_inverse = true; - config.is_input_on_device = true; - config.is_output_on_device = false; - config.ordering = Ordering::kRN; - - ntt_internal(&mut config); - - let result_from_device: &mut [ScalarField] = - unsafe { std::slice::from_raw_parts_mut(config.inout, scalars_batch.len()) }; - - assert_eq!(result_from_device, &scalars_batch); + assert_eq!(intt_result, scalars); + // check that ntt_result wasn't mutated by the latest `ntt` call + assert_eq!(ntt_result_as_ark[1], ntt_result[1].to_ark()); } #[test] - fn test_batch_ntt() { - //NTT - let test_size = 1 << 11; - let batches = 2; - - let full_test_size = test_size * batches; - let scalars_batch: Vec = generate_random_scalars(full_test_size); - - let mut scalar_vec_of_vec: Vec> = Vec::new(); - - for i in 0..batches { - scalar_vec_of_vec.push(scalars_batch[i * test_size..(i + 1) * test_size].to_vec()); - } - - let mut ntt_result = scalars_batch.clone(); - - // do batch ntt - ntt_wip(&mut ntt_result, false, false, Ordering::kNN, false, batches); - - let mut ntt_result_vec_of_vec = Vec::new(); - - // do ntt for every chunk - for i in 0..batches { - ntt_result_vec_of_vec.push(scalar_vec_of_vec[i].clone()); - - ntt_wip(&mut ntt_result_vec_of_vec[i], false, false, Ordering::kNN, false, 1); - } - - // check that the ntt of 
each vec of scalars is equal to the ntt of the specific batch - for i in 0..batches { - assert_eq!(ntt_result_vec_of_vec[i], ntt_result[i * test_size..(i + 1) * test_size]); - } - - // check that ntt output is different from input - assert_ne!(ntt_result, scalars_batch); - - let mut intt_result = ntt_result.clone(); - - // do batch intt - // intt_batch(&mut intt_result, test_size, 0); - ntt_wip(&mut intt_result, true, false, Ordering::kNN, false, batches); - - let mut intt_result_vec_of_vec = Vec::new(); - - // do intt for every chunk - for i in 0..batches { - intt_result_vec_of_vec.push(ntt_result_vec_of_vec[i].clone()); - // intt(&mut intt_result_vec_of_vec[i], 0); - ntt_wip(&mut intt_result_vec_of_vec[i], true, false, Ordering::kNN, false, 1); - } + fn test_ntt_coset_from_subgroup() { + let test_size = 1 << 16; + let small_size = test_size >> 1; + let test_size_rou = Fr::get_root_of_unity(test_size as u64).unwrap(); + let ctx = get_default_device_context(); + // two roughly analogous calls for icicle and arkworks. one difference is that icicle call creates + // domain for all NTTs of size <= `test_size`. 
also for icicle domain is a hidden static object + initialize_domain(ScalarField::from_ark(test_size_rou), &ctx).unwrap(); + let ark_small_domain = GeneralEvaluationDomain::::new(small_size).unwrap().get_coset(test_size_rou).unwrap(); + let ark_large_domain = GeneralEvaluationDomain::::new(test_size).unwrap(); + + let mut scalars: Vec = generate_random_scalars(small_size); + + let mut config = get_default_ntt_config(); + config.ordering = Ordering::kNR; + let mut ntt_result = vec![ScalarField::zero(); test_size]; + ntt(&scalars, false, &config, &mut ntt_result[..small_size]).unwrap(); + assert_ne!(ntt_result[..small_size], scalars); + config.coset_gen = ScalarField::from_ark(test_size_rou); + ntt(&scalars, false, &config, &mut ntt_result[small_size..]).unwrap(); + let mut ntt_large_result = vec![ScalarField::zero(); test_size]; + // back to non-coset NTT + config.coset_gen = ScalarField::one(); + scalars.resize(test_size, ScalarField::zero()); + ntt(&scalars, false, &config, &mut ntt_large_result).unwrap(); + assert_eq!(ntt_result, ntt_large_result); + + let mut ark_scalars = scalars + .iter() + .map(|v| v.to_ark()) + .collect::>(); + let mut ark_large_scalars = ark_scalars.clone(); + ark_small_domain.fft_in_place(&mut ark_scalars); + let ntt_result_as_ark = ntt_result + .iter() + .map(|p| p.to_ark()) + .collect::>(); + assert_eq!(ark_scalars[..small_size], list_to_reverse_bit_order(&ntt_result_as_ark[small_size..])); + ark_large_domain.fft_in_place(&mut ark_large_scalars); + assert_eq!(ark_large_scalars, list_to_reverse_bit_order(&ntt_result_as_ark)); - // check that the intt of each vec of scalars is equal to the intt of the specific batch - for i in 0..batches { - assert_eq!( - intt_result_vec_of_vec[i], - intt_result[i * test_size..(i + 1) * test_size] - ); - } + config.coset_gen = ScalarField::from_ark(test_size_rou); + config.ordering = Ordering::kRN; + let mut intt_result = vec![ScalarField::zero(); small_size]; + ntt(&ntt_result[small_size..], true, 
&config, &mut intt_result).unwrap(); + assert_eq!(intt_result, scalars[..small_size]); - assert_eq!(intt_result, scalars_batch); + ark_small_domain.ifft_in_place(&mut ark_scalars); + let intt_result_as_ark = intt_result + .iter() + .map(|p| p.to_ark()) + .collect::>(); + assert_eq!(ark_scalars[..small_size], intt_result_as_ark); } }