Commit cfea6eb
Merge branch 'ingonyama-zk:main' into main
liuhao230 authored May 17, 2024
2 parents 49c7fa4 + 76a82bf commit cfea6eb
Showing 90 changed files with 1,866 additions and 190 deletions.
95 changes: 48 additions & 47 deletions docs/docs/icicle/golang-bindings/msm.md
@@ -6,52 +6,53 @@
 package main
 
 import (
 	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
 	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
-	bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
+	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
+	bn254_msm "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/msm"
 )
 
 func main() {
 	// Obtain the default MSM configuration.
-	cfg := bn254.GetDefaultMSMConfig()
+	cfg := core.GetDefaultMSMConfig()
 
 	// Define the size of the problem, here 2^18.
 	size := 1 << 18
 
 	// Generate scalars and points for the MSM operation.
 	scalars := bn254.GenerateScalars(size)
 	points := bn254.GenerateAffinePoints(size)
 
 	// Create a CUDA stream for asynchronous operations.
 	stream, _ := cr.CreateStream()
 	var p bn254.Projective
 
 	// Allocate memory on the device for the result of the MSM operation.
 	var out core.DeviceSlice
 	_, e := out.MallocAsync(p.Size(), p.Size(), stream)
 
 	if e != cr.CudaSuccess {
 		panic(e)
 	}
 
 	// Set the CUDA stream in the MSM configuration.
 	cfg.Ctx.Stream = &stream
 	cfg.IsAsync = true
 
 	// Perform the MSM operation.
-	e = bn254.Msm(scalars, points, &cfg, out)
+	e = bn254_msm.Msm(scalars, points, &cfg, out)
 
 	if e != cr.CudaSuccess {
 		panic(e)
 	}
 
 	// Allocate host memory for the results and copy the results from the device.
 	outHost := make(core.HostSlice[bn254.Projective], 1)
 	cr.SynchronizeStream(&stream)
 	outHost.CopyFromDevice(&out)
 
 	// Free the device memory allocated for the results.
 	out.Free()
 }
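The substantive change in this hunk: `GetDefaultMSMConfig` now lives in the `core` package rather than the curve package, and `Msm` has moved into the curve's dedicated `msm` subpackage. In short, the updated call pattern under the v2 bindings is:

```go
cfg := core.GetDefaultMSMConfig()              // previously bn254.GetDefaultMSMConfig()
e := bn254_msm.Msm(scalars, points, &cfg, out) // previously bn254.Msm(...)
```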
@@ -169,23 +170,23 @@ This package includes `G2Projective` and `G2Affine` points as well as a `G2Msm` m
 package main
 
 import (
 	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
 	bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
 	g2 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/g2"
 )
 
 func main() {
-	cfg := bn254.GetDefaultMSMConfig()
+	cfg := core.GetDefaultMSMConfig()
 	size := 1 << 12
 	batchSize := 3
 	totalSize := size * batchSize
 	scalars := bn254.GenerateScalars(totalSize)
 	points := g2.G2GenerateAffinePoints(totalSize)
 
 	var p g2.G2Projective
 	var out core.DeviceSlice
 	out.Malloc(batchSize*p.Size(), p.Size())
 	g2.G2Msm(scalars, points, &cfg, out)
 }
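The G2 example allocates room for `batchSize` results but ends at `G2Msm`. A hypothetical continuation (not part of this commit, mirroring the BN254 example above) would copy the batched results back to the host and release the device memory:

```go
// Hypothetical continuation, following the pattern of the BN254 example above.
outHost := make(core.HostSlice[g2.G2Projective], batchSize)
outHost.CopyFromDevice(&out) // copy all batchSize results to the host
out.Free()                   // release the device-side result buffer
```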
23 changes: 23 additions & 0 deletions examples/c++/best-practice-ntt/CMakeLists.txt
@@ -0,0 +1,23 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # supported on CMake 3.24+; earlier versions ignore this value and pass no architecture flag
endif ()
project(example LANGUAGES CUDA CXX)

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")

add_executable(
example
example.cu
)
target_include_directories(example PRIVATE "../../../icicle/include")
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
33 changes: 33 additions & 0 deletions examples/c++/best-practice-ntt/README.md
@@ -0,0 +1,33 @@
# ICICLE best practices: Concurrent Data Transfer and NTT Computation

The [Number Theoretic Transform (NTT)](https://dev.ingonyama.com/icicle/primitives/ntt) is an integral component of many cryptographic algorithms, such as polynomial multiplication in Zero-Knowledge Proofs. The performance bottleneck of NTT on GPUs is the data transfer between the host (CPU) and the device (GPU): on a typical NVIDIA GPU, this transfer dominates the total NTT execution time.
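For a rough sense of scale (illustrative numbers, not a measured benchmark): the example below runs a batch of 16 NTTs of size $2^{22}$ over a 32-byte field, so each direction transfers $16 \cdot 2^{22} \cdot 32\ \text{B} = 2\ \text{GiB}$; at an assumed effective PCIe 3.0 x16 bandwidth of roughly 12 GB/s, that is on the order of 180 ms per direction.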

## Key-Takeaway

When you have to run several NTTs, overlap data download, upload, and computation to improve data-bus (PCIe) and GPU utilization and reduce the total execution time.

Typically, you concurrently:

1. Download the output of the previous NTT back to the host
2. Upload the input of the next NTT to the device
3. Run the current NTT

> [!NOTE]
> This approach requires two on-device vectors, halving the maximum NTT size that fits in device memory.

## Best-Practices

1. Use three separate CUDA streams for the download, upload, and compute operations.
2. Use pinned (page-locked) host memory to speed up data-bus transfers; `cudaHostAlloc` allocates pinned memory.
3. Use in-place NTT to save device memory.

The `example.cu` in this directory implements all three practices.

## Running the example

To change the default curve (BN254), edit `compile.sh` and `CMakeLists.txt`:

```sh
./compile.sh
./run.sh
```

To compare against the ICICLE baseline (i.e., non-concurrent) NTT, run [this example](../ntt/README.md).
16 changes: 16 additions & 0 deletions examples/c++/best-practice-ntt/compile.sh
@@ -0,0 +1,16 @@
#!/bin/bash

# Exit immediately on error
set -e

mkdir -p build/example
mkdir -p build/icicle

# Configure and build Icicle
cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DG2=OFF -DMSM=OFF
cmake --build build/icicle

# Configure and build the example application
cmake -S . -B build/example
cmake --build build/example

142 changes: 142 additions & 0 deletions examples/c++/best-practice-ntt/example.cu
@@ -0,0 +1,142 @@
#include <stdio.h>
#include <iostream>
#include <string>
#include <chrono>

#include "curves/params/bn254.cuh"
#include "api/bn254.h"
using namespace bn254;
using namespace ntt;

const std::string curve = "BN254";

typedef scalar_t S;
typedef scalar_t E;

const unsigned max_log_ntt_size = 27;

void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E* elements) {
for (unsigned i = 0; i < ntt_size * nof_ntts; i++) {
elements[i] = E::from(i+1);
}
}

using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());

int main(int argc, char** argv) {
cudaDeviceReset();
cudaDeviceProp deviceProperties;
int deviceId=0;
cudaGetDeviceProperties(&deviceProperties, deviceId);
std::string gpu_full_name = deviceProperties.name;
std::cout << gpu_full_name << std::endl;

std::cout << "Curve: " << curve << std::endl;

S basic_root = S::omega(max_log_ntt_size);

// change these parameters to match the desired NTT size and batch size
const unsigned log_ntt_size = 22;
const unsigned nof_ntts = 16;

std::cout << "log NTT size: " << log_ntt_size << std::endl;
const unsigned ntt_size = 1 << log_ntt_size;

std::cout << "Batch size: " << nof_ntts << std::endl;

// Create separate CUDA streams for overlapping data transfers and kernel execution.
cudaStream_t stream_compute, stream_h2d, stream_d2h;
cudaStreamCreate(&stream_compute);
cudaStreamCreate(&stream_h2d);
cudaStreamCreate(&stream_d2h);

// Create device context for NTT computation
auto ctx_compute = device_context::DeviceContext{
stream_compute, // stream
0, // device_id
0, // mempool
};

// Initialize NTT domain and configuration
bn254_initialize_domain(&basic_root, ctx_compute, /* fast twiddles */ true);
NTTConfig<S> config_compute = default_ntt_config<S>(ctx_compute);
config_compute.ntt_algorithm = NttAlgorithm::MixedRadix;
config_compute.batch_size = nof_ntts;
config_compute.are_inputs_on_device = true;
config_compute.are_outputs_on_device = true;
config_compute.is_async = true;

std::cout << "Concurrent Download, Upload, and Compute In-place NTT" << std::endl;
int nof_blocks = 32;
std::cout << "Number of blocks: " << nof_blocks << std::endl;
int block_size = ntt_size*nof_ntts/nof_blocks;

// on-host pinned data
E * h_inp[2];
E * h_out[2];
for (int i = 0; i < 2; i++) {
cudaHostAlloc((void**)&h_inp[i], sizeof(E)*ntt_size*nof_ntts, cudaHostAllocDefault);
cudaHostAlloc((void**)&h_out[i], sizeof(E)*ntt_size*nof_ntts, cudaHostAllocDefault);
}

// on-device in-place data
// we need two on-device vectors to overlap data transfers with NTT kernel execution
E * d_vec[2];
for (int i = 0; i < 2; i++) {
cudaMalloc((void**)&d_vec[i], sizeof(E)*ntt_size*nof_ntts);
}

// initialize input data
initialize_input(ntt_size, nof_ntts, h_inp[0]);
initialize_input(ntt_size, nof_ntts, h_inp[1]);

cudaEvent_t compute_start, compute_stop;
cudaEventCreate(&compute_start);
cudaEventCreate(&compute_stop);

for ( int run = 0; run < 10; run++ ) {
int vec_compute = run % 2;
int vec_transfer = (run + 1) % 2;
std::cout << "Run: " << run << std::endl;
std::cout << "Compute Vector: " << vec_compute << std::endl;
std::cout << "Transfer Vector: " << vec_transfer << std::endl;
START_TIMER(inplace);
cudaEventRecord(compute_start, stream_compute);
bn254_ntt_cuda(d_vec[vec_compute], ntt_size, NTTDir::kForward, config_compute, d_vec[vec_compute]);
cudaEventRecord(compute_stop, stream_compute);
// delay the host-to-device upload one block behind the device-to-host download: a block of d_vec is overwritten only after it has been read out (write-after-read ordering on the in-place buffer)
for (int i = 0; i <= nof_blocks; i++) {
if (i < nof_blocks) {
cudaMemcpyAsync(&h_out[vec_transfer][i*block_size], &d_vec[vec_transfer][i*block_size], sizeof(E)*block_size, cudaMemcpyDeviceToHost, stream_d2h);
}
if (i>0) {
cudaMemcpyAsync(&d_vec[vec_transfer][(i-1)*block_size], &h_inp[vec_transfer][(i-1)*block_size], sizeof(E)*block_size, cudaMemcpyHostToDevice, stream_h2d);
}
// synchronize upload and download at the end of the block to ensure data integrity
cudaStreamSynchronize(stream_d2h);
cudaStreamSynchronize(stream_h2d);
}
// synchronize compute stream with the end of the computation
cudaEventSynchronize(compute_stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, compute_start, compute_stop);
END_TIMER(inplace, "Concurrent In-Place NTT");
std::cout << "NTT time: " << milliseconds << " ms" << std::endl;
}

// Clean-up
for (int i = 0; i < 2; i++) {
cudaFree(d_vec[i]);
cudaFreeHost(h_inp[i]);
cudaFreeHost(h_out[i]);
}
cudaEventDestroy(compute_start);
cudaEventDestroy(compute_stop);
cudaStreamDestroy(stream_compute);
cudaStreamDestroy(stream_d2h);
cudaStreamDestroy(stream_h2d);
return 0;
}
2 changes: 2 additions & 0 deletions examples/c++/best-practice-ntt/run.sh
@@ -0,0 +1,2 @@
#!/bin/bash
./build/example/example
34 changes: 34 additions & 0 deletions examples/golang/msm/README.md
@@ -0,0 +1,34 @@
# ICICLE example: MultiScalar Multiplication (MSM) in Golang

`ICICLE` provides Golang bindings to a CUDA-accelerated C++ implementation of [Multi-Scalar Multiplication](https://github.com/ingonyama-zk/ingopedia/blob/master/src/msm.md).

## Usage

```go
err := Msm(
/* Scalars input vector */ scalars,
/* Points input vector */ points,
/* MSMConfig reference */ &cfg,
/* Projective point result */ results)
```

In this example we use the `BN254` and `BLS12377` curves. The function computes $result = \sum_{i=0}^{size-1} scalars[i] \cdot points[i]$, where the input `points[]` are in affine coordinates and the `result` is in projective coordinates.

## What's in the example

1. Define the size of the MSM.
2. Generate random inputs on-device.
3. Configure the MSM.
4. Execute the MSM on-device.
5. Copy the result back to the host.

A minimal sketch of these steps is shown below.
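Here is a minimal sketch of these steps for `BN254`, assembled from the updated `docs/docs/icicle/golang-bindings/msm.md` snippet in this commit; the `Malloc` error handling is assumed to mirror `MallocAsync`'s return shape, and the shipped `main.go` additionally covers `BLS12377` and loops over the size range controlled by the `-l`/`-u` flags described below:

```go
package main

import (
	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
	bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
	bn254_msm "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/msm"
)

func main() {
	// 1. Define the size of the MSM.
	size := 1 << 17

	// 2. Generate random inputs.
	scalars := bn254.GenerateScalars(size)
	points := bn254.GenerateAffinePoints(size)

	// 3. Configure the MSM.
	cfg := core.GetDefaultMSMConfig()

	// 4. Execute the MSM on-device, leaving the result in device memory.
	var p bn254.Projective
	var out core.DeviceSlice
	if _, e := out.Malloc(p.Size(), p.Size()); e != cr.CudaSuccess {
		panic(e)
	}
	if e := bn254_msm.Msm(scalars, points, &cfg, out); e != cr.CudaSuccess {
		panic(e)
	}

	// 5. Copy the result back to the host and free device memory.
	outHost := make(core.HostSlice[bn254.Projective], 1)
	outHost.CopyFromDevice(&out)
	out.Free()
	// outHost[0] now holds the MSM result in projective coordinates.
}
```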

Running the example:
```sh
go run main.go
```

> [!NOTE]
> The default sizes are 2^17 - 2^22. You can change this range by passing the `-l <size>` and `-u <size>` options. For example, to use sizes 2^21 - 2^24, run:
> ```sh
> go run main.go -l=21 -u=24
> ```
