Commit cfea6eb
Merge branch 'ingonyama-zk:main' into main
liuhao230 authored May 17, 2024
2 parents 49c7fa4 + 76a82bf commit cfea6eb
Showing 90 changed files with 1,866 additions and 190 deletions.
95 changes: 48 additions & 47 deletions docs/docs/icicle/golang-bindings/msm.md
@@ -6,52 +6,53 @@
 package main
 
 import (
 	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
 	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
-	bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
+	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
+	bn254_msm "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/msm"
 )
 
 func main() {
 	// Obtain the default MSM configuration.
-	cfg := bn254.GetDefaultMSMConfig()
+	cfg := core.GetDefaultMSMConfig()
 
 	// Define the size of the problem, here 2^18.
 	size := 1 << 18
 
 	// Generate scalars and points for the MSM operation.
 	scalars := bn254.GenerateScalars(size)
 	points := bn254.GenerateAffinePoints(size)
 
 	// Create a CUDA stream for asynchronous operations.
 	stream, _ := cr.CreateStream()
 	var p bn254.Projective
 
 	// Allocate memory on the device for the result of the MSM operation.
 	var out core.DeviceSlice
 	_, e := out.MallocAsync(p.Size(), p.Size(), stream)
 
 	if e != cr.CudaSuccess {
 		panic(e)
 	}
 
 	// Set the CUDA stream in the MSM configuration.
 	cfg.Ctx.Stream = &stream
 	cfg.IsAsync = true
 
 	// Perform the MSM operation.
-	e = bn254.Msm(scalars, points, &cfg, out)
+	e = bn254_msm.Msm(scalars, points, &cfg, out)
 
 	if e != cr.CudaSuccess {
 		panic(e)
 	}
 
 	// Allocate host memory for the results and copy the results from the device.
 	outHost := make(core.HostSlice[bn254.Projective], 1)
 	cr.SynchronizeStream(&stream)
 	outHost.CopyFromDevice(&out)
 
 	// Free the device memory allocated for the results.
 	out.Free()
 }
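The substantive change in this hunk: `GetDefaultMSMConfig` now lives in the `core` package rather than the curve package, and `Msm` has moved into the curve's dedicated `msm` subpackage. In short, the updated call pattern under the v2 bindings is:

```go
cfg := core.GetDefaultMSMConfig()              // previously bn254.GetDefaultMSMConfig()
e := bn254_msm.Msm(scalars, points, &cfg, out) // previously bn254.Msm(...)
```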
@@ -169,23 +170,23 @@ This package includes `G2Projective` and `G2Affine` points as well as a `G2Msm` m
 package main
 
 import (
 	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
 	bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
 	g2 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/g2"
 )
 
 func main() {
-	cfg := bn254.GetDefaultMSMConfig()
+	cfg := core.GetDefaultMSMConfig()
 	size := 1 << 12
 	batchSize := 3
 	totalSize := size * batchSize
 	scalars := bn254.GenerateScalars(totalSize)
 	points := g2.G2GenerateAffinePoints(totalSize)
 
 	var p g2.G2Projective
 	var out core.DeviceSlice
 	out.Malloc(batchSize*p.Size(), p.Size())
 	g2.G2Msm(scalars, points, &cfg, out)
 }
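The G2 example allocates room for `batchSize` results but ends at `G2Msm`. A hypothetical continuation (not part of this commit, mirroring the BN254 example above) would copy the batched results back to the host and release the device memory:

```go
// Hypothetical continuation, following the pattern of the BN254 example above.
outHost := make(core.HostSlice[g2.G2Projective], batchSize)
outHost.CopyFromDevice(&out) // copy all batchSize results to the host
out.Free()                   // release the device-side result buffer
```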
23 changes: 23 additions & 0 deletions examples/c++/best-practice-ntt/CMakeLists.txt
@@ -0,0 +1,23 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # supported on CMake 3.24+; earlier versions ignore this value and pass no architecture flag
endif ()
project(example LANGUAGES CUDA CXX)

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")

add_executable(
example
example.cu
)
target_include_directories(example PRIVATE "../../../icicle/include")
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
33 changes: 33 additions & 0 deletions examples/c++/best-practice-ntt/README.md
@@ -0,0 +1,33 @@
# ICICLE best practices: Concurrent Data Transfer and NTT Computation

The [Number Theoretic Transform (NTT)](https://dev.ingonyama.com/icicle/primitives/ntt) is an integral component of many cryptographic algorithms, such as polynomial multiplication in Zero-Knowledge Proofs. The performance bottleneck of NTT on GPUs is the data transfer between the host (CPU) and the device (GPU): on a typical NVIDIA GPU, this transfer dominates the total NTT execution time.
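For a rough sense of scale (illustrative numbers, not a measured benchmark): the example below runs a batch of 16 NTTs of size $2^{22}$ over a 32-byte field, so each direction transfers $16 \cdot 2^{22} \cdot 32\ \text{B} = 2\ \text{GiB}$; at an assumed effective PCIe 3.0 x16 bandwidth of roughly 12 GB/s, that is on the order of 180 ms per direction.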

## Key-Takeaway

When you have to run several NTTs, overlap data download, upload, and computation to improve data-bus (PCIe) and GPU utilization and reduce the total execution time.

Typically, you concurrently:

1. Download the output of the previous NTT back to the host
2. Upload the input of the next NTT to the device
3. Run the current NTT

> [!NOTE]
> This approach requires two on-device vectors, halving the maximum NTT size that fits in device memory.

## Best-Practices

1. Use three separate CUDA streams for the download, upload, and compute operations.
2. Use pinned (page-locked) host memory to speed up data-bus transfers; `cudaHostAlloc` allocates pinned memory.
3. Use in-place NTT to save device memory.

The `example.cu` in this directory implements all three practices.

## Running the example

To change the default curve (BN254), edit `compile.sh` and `CMakeLists.txt`:

```sh
./compile.sh
./run.sh
```

To compare against the ICICLE baseline (i.e., non-concurrent) NTT, run [this example](../ntt/README.md).
16 changes: 16 additions & 0 deletions examples/c++/best-practice-ntt/compile.sh
@@ -0,0 +1,16 @@
#!/bin/bash

# Exit immediately on error
set -e

mkdir -p build/example
mkdir -p build/icicle

# Configure and build Icicle
cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DG2=OFF -DMSM=OFF
cmake --build build/icicle

# Configure and build the example application
cmake -S . -B build/example
cmake --build build/example

142 changes: 142 additions & 0 deletions examples/c++/best-practice-ntt/example.cu
@@ -0,0 +1,142 @@
#include <stdio.h>
#include <iostream>
#include <string>
#include <chrono>

#include "curves/params/bn254.cuh"
#include "api/bn254.h"
using namespace bn254;
using namespace ntt;

const std::string curve = "BN254";

typedef scalar_t S;
typedef scalar_t E;

const unsigned max_log_ntt_size = 27;

void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E* elements) {
for (unsigned i = 0; i < ntt_size * nof_ntts; i++) {
elements[i] = E::from(i+1);
}
}

using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());

int main(int argc, char** argv) {
cudaDeviceReset();
cudaDeviceProp deviceProperties;
int deviceId=0;
cudaGetDeviceProperties(&deviceProperties, deviceId);
std::string gpu_full_name = deviceProperties.name;
std::cout << gpu_full_name << std::endl;

std::cout << "Curve: " << curve << std::endl;

S basic_root = S::omega(max_log_ntt_size);

// change these parameters to match the desired NTT size and batch size
const unsigned log_ntt_size = 22;
const unsigned nof_ntts = 16;

std::cout << "log NTT size: " << log_ntt_size << std::endl;
const unsigned ntt_size = 1 << log_ntt_size;

std::cout << "Batch size: " << nof_ntts << std::endl;

// Create separate CUDA streams for overlapping data transfers and kernel execution.
cudaStream_t stream_compute, stream_h2d, stream_d2h;
cudaStreamCreate(&stream_compute);
cudaStreamCreate(&stream_h2d);
cudaStreamCreate(&stream_d2h);

// Create device context for NTT computation
auto ctx_compute = device_context::DeviceContext{
stream_compute, // stream
0, // device_id
0, // mempool
};

// Initialize NTT domain and configuration
bn254_initialize_domain(&basic_root, ctx_compute, /* fast twiddles */ true);
NTTConfig<S> config_compute = default_ntt_config<S>(ctx_compute);
config_compute.ntt_algorithm = NttAlgorithm::MixedRadix;
config_compute.batch_size = nof_ntts;
config_compute.are_inputs_on_device = true;
config_compute.are_outputs_on_device = true;
config_compute.is_async = true;

std::cout << "Concurrent Download, Upload, and Compute In-place NTT" << std::endl;
int nof_blocks = 32;
std::cout << "Number of blocks: " << nof_blocks << std::endl;
int block_size = ntt_size*nof_ntts/nof_blocks;

// on-host pinned data
E * h_inp[2];
E * h_out[2];
for (int i = 0; i < 2; i++) {
cudaHostAlloc((void**)&h_inp[i], sizeof(E)*ntt_size*nof_ntts, cudaHostAllocDefault);
cudaHostAlloc((void**)&h_out[i], sizeof(E)*ntt_size*nof_ntts, cudaHostAllocDefault);
}

// on-device in-place data
// we need two on-device vectors to overlap data transfers with NTT kernel execution
E * d_vec[2];
for (int i = 0; i < 2; i++) {
cudaMalloc((void**)&d_vec[i], sizeof(E)*ntt_size*nof_ntts);
}

// initialize input data
initialize_input(ntt_size, nof_ntts, h_inp[0]);
initialize_input(ntt_size, nof_ntts, h_inp[1]);

cudaEvent_t compute_start, compute_stop;
cudaEventCreate(&compute_start);
cudaEventCreate(&compute_stop);

for ( int run = 0; run < 10; run++ ) {
int vec_compute = run % 2;
int vec_transfer = (run + 1) % 2;
std::cout << "Run: " << run << std::endl;
std::cout << "Compute Vector: " << vec_compute << std::endl;
std::cout << "Transfer Vector: " << vec_transfer << std::endl;
START_TIMER(inplace);
cudaEventRecord(compute_start, stream_compute);
bn254_ntt_cuda(d_vec[vec_compute], ntt_size, NTTDir::kForward, config_compute, d_vec[vec_compute]);
cudaEventRecord(compute_stop, stream_compute);
// delay the host-to-device upload one block behind the device-to-host download: a block of d_vec is overwritten only after it has been read out (write-after-read ordering on the in-place buffer)
for (int i = 0; i <= nof_blocks; i++) {
if (i < nof_blocks) {
cudaMemcpyAsync(&h_out[vec_transfer][i*block_size], &d_vec[vec_transfer][i*block_size], sizeof(E)*block_size, cudaMemcpyDeviceToHost, stream_d2h);
}
if (i>0) {
cudaMemcpyAsync(&d_vec[vec_transfer][(i-1)*block_size], &h_inp[vec_transfer][(i-1)*block_size], sizeof(E)*block_size, cudaMemcpyHostToDevice, stream_h2d);
}
// synchronize upload and download at the end of the block to ensure data integrity
cudaStreamSynchronize(stream_d2h);
cudaStreamSynchronize(stream_h2d);
}
// synchronize compute stream with the end of the computation
cudaEventSynchronize(compute_stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, compute_start, compute_stop);
END_TIMER(inplace, "Concurrent In-Place NTT");
std::cout << "NTT time: " << milliseconds << " ms" << std::endl;
}

// Clean-up
for (int i = 0; i < 2; i++) {
cudaFree(d_vec[i]);
cudaFreeHost(h_inp[i]);
cudaFreeHost(h_out[i]);
}
cudaEventDestroy(compute_start);
cudaEventDestroy(compute_stop);
cudaStreamDestroy(stream_compute);
cudaStreamDestroy(stream_d2h);
cudaStreamDestroy(stream_h2d);
return 0;
}
2 changes: 2 additions & 0 deletions examples/c++/best-practice-ntt/run.sh
@@ -0,0 +1,2 @@
#!/bin/bash
./build/example/example
34 changes: 34 additions & 0 deletions examples/golang/msm/README.md
@@ -0,0 +1,34 @@
# ICICLE example: MultiScalar Multiplication (MSM) in Golang

`ICICLE` provides Golang bindings to a CUDA-accelerated C++ implementation of [Multi-Scalar Multiplication](https://github.com/ingonyama-zk/ingopedia/blob/master/src/msm.md).

## Usage

```go
err := Msm(
/* Scalars input vector */ scalars,
/* Points input vector */ points,
/* MSMConfig reference */ &cfg,
/* Projective point result */ results)
```

In this example we use the `BN254` and `BLS12377` curves. The function computes $result = \sum_{i=0}^{size-1} scalars[i] \cdot points[i]$, where the input `points[]` are in affine coordinates and the `result` is in projective coordinates.

## What's in the example

1. Define the size of the MSM.
2. Generate random inputs on-device.
3. Configure the MSM.
4. Execute the MSM on-device.
5. Copy the result back to the host.

A minimal sketch of these steps is shown below.
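Here is a minimal sketch of these steps for `BN254`, assembled from the updated `docs/docs/icicle/golang-bindings/msm.md` snippet in this commit; the `Malloc` error handling is assumed to mirror `MallocAsync`'s return shape, and the shipped `main.go` additionally covers `BLS12377` and loops over the size range controlled by the `-l`/`-u` flags described below:

```go
package main

import (
	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
	bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
	bn254_msm "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/msm"
)

func main() {
	// 1. Define the size of the MSM.
	size := 1 << 17

	// 2. Generate random inputs.
	scalars := bn254.GenerateScalars(size)
	points := bn254.GenerateAffinePoints(size)

	// 3. Configure the MSM.
	cfg := core.GetDefaultMSMConfig()

	// 4. Execute the MSM on-device, leaving the result in device memory.
	var p bn254.Projective
	var out core.DeviceSlice
	if _, e := out.Malloc(p.Size(), p.Size()); e != cr.CudaSuccess {
		panic(e)
	}
	if e := bn254_msm.Msm(scalars, points, &cfg, out); e != cr.CudaSuccess {
		panic(e)
	}

	// 5. Copy the result back to the host and free device memory.
	outHost := make(core.HostSlice[bn254.Projective], 1)
	outHost.CopyFromDevice(&out)
	out.Free()
	// outHost[0] now holds the MSM result in projective coordinates.
}
```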

Running the example:
```sh
go run main.go
```

> [!NOTE]
> The default sizes are 2^17 - 2^22. You can change this range by passing the `-l <size>` and `-u <size>` options. For example, to use sizes 2^21 - 2^24, run:
> ```sh
> go run main.go -l=21 -u=24
> ```
