Skip to content

Commit

Permalink
Release for cuda archs 75,80,86 for better compatability (#609)
Browse files Browse the repository at this point in the history
  • Loading branch information
yshekel authored Sep 10, 2024
1 parent d6c0034 commit 2e62cca
Show file tree
Hide file tree
Showing 12 changed files with 50 additions and 62 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,9 @@ cmake --install build # install icicle to /path/to/install/dir/
**Run tests (optional):**
> [!CAUTION]
> Most tests assume a CUDA backend exists and will fail otherwise, if a CUDA device is not found.
Add `-DBUILD_TESTS=ON` to the cmake command, build and execute tests:
```bash
Expand All @@ -207,9 +210,6 @@ or choose the test-suite
./build/tests/test_field_api --gtest_filter="*ntt*"
```
> [!NOTE]
> Most tests assume a CUDA backend exists and will fail otherwise, if a CUDA device is not found.
**Build Flags:**
You can customize your ICICLE build with the following flags:
Expand Down
8 changes: 4 additions & 4 deletions icicle/include/icicle/curves/projective.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ class Projective

static HOST_DEVICE_INLINE Projective neg(const Projective& point) { return {point.x, FF::neg(point.y), point.z}; }

static HOST_DEVICE_INLINE Projective dbl(const Projective& point)
static HOST_DEVICE Projective dbl(const Projective& point)
{
const FF X = point.x;
const FF Y = point.y;
Expand Down Expand Up @@ -78,7 +78,7 @@ class Projective
return {X3, Y3, Z3};
}

friend HOST_DEVICE_INLINE Projective operator+(Projective p1, const Projective& p2)
friend HOST_DEVICE Projective operator+(Projective p1, const Projective& p2)
{
const FF X1 = p1.x; // < 2
const FF Y1 = p1.y; // < 2
Expand Down Expand Up @@ -126,7 +126,7 @@ class Projective

friend HOST_DEVICE_INLINE Projective operator-(Projective p1, const Projective& p2) { return p1 + neg(p2); }

friend HOST_DEVICE_INLINE Projective operator+(Projective p1, const Affine<FF>& p2)
friend HOST_DEVICE Projective operator+(Projective p1, const Affine<FF>& p2)
{
const FF X1 = p1.x; // < 2
const FF Y1 = p1.y; // < 2
Expand Down Expand Up @@ -176,7 +176,7 @@ class Projective
return p1 + Affine<FF>::neg(p2);
}

friend HOST_DEVICE_INLINE Projective operator*(SCALAR_FF scalar, const Projective& point)
friend HOST_DEVICE Projective operator*(SCALAR_FF scalar, const Projective& point)
{
Projective res = zero();
#ifdef __CUDA_ARCH__
Expand Down
20 changes: 10 additions & 10 deletions icicle/include/icicle/fields/field.h
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ class Field
return rs;
}

friend HOST_DEVICE_INLINE Wide operator+(Wide xs, const Wide& ys)
friend HOST_DEVICE Wide operator+(Wide xs, const Wide& ys)
{
Wide rs = {};
add_limbs<2 * TLC, false>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
Expand Down Expand Up @@ -730,14 +730,14 @@ class Field
return os;
}

friend HOST_DEVICE_INLINE Field operator+(Field xs, const Field& ys)
friend HOST_DEVICE Field operator+(Field xs, const Field& ys)
{
Field rs = {};
add_limbs<TLC, false>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
return sub_modulus<1>(rs);
}

friend HOST_DEVICE_INLINE Field operator-(Field xs, const Field& ys)
friend HOST_DEVICE Field operator-(Field xs, const Field& ys)
{
Field rs = {};
uint32_t carry = sub_limbs<TLC, true>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
Expand Down Expand Up @@ -802,21 +802,21 @@ class Field
return r;
}

HOST_DEVICE_INLINE Field& operator=(Field const& other)
HOST_DEVICE Field& operator=(Field const& other)
{
for (int i = 0; i < TLC; i++) {
this->limbs_storage.limbs[i] = other.limbs_storage.limbs[i];
}
return *this;
}

friend HOST_DEVICE_INLINE Field operator*(const Field& xs, const Field& ys)
friend HOST_DEVICE Field operator*(const Field& xs, const Field& ys)
{
Wide xy = mul_wide(xs, ys); // full mult
return reduce(xy); // reduce mod p
}

friend HOST_DEVICE_INLINE bool operator==(const Field& xs, const Field& ys)
friend HOST_DEVICE bool operator==(const Field& xs, const Field& ys)
{
#ifdef __CUDA_ARCH__
const uint32_t* x = xs.limbs_storage.limbs;
Expand All @@ -833,7 +833,7 @@ class Field
#endif
}

friend HOST_DEVICE_INLINE bool operator!=(const Field& xs, const Field& ys) { return !(xs == ys); }
friend HOST_DEVICE bool operator!=(const Field& xs, const Field& ys) { return !(xs == ys); }

template <const Field& multiplier>
static HOST_DEVICE_INLINE Field mul_const(const Field& xs)
Expand Down Expand Up @@ -892,7 +892,7 @@ class Field
}

template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE Field neg(const Field& xs)
static constexpr HOST_DEVICE Field neg(const Field& xs)
{
const ff_storage modulus = get_modulus<MODULUS_MULTIPLE>();
Field rs = {};
Expand Down Expand Up @@ -934,7 +934,7 @@ class Field

static constexpr HOST_DEVICE_INLINE bool is_even(const Field& xs) { return ~xs.limbs_storage.limbs[0] & 1; }

static constexpr HOST_DEVICE_INLINE Field inverse(const Field& xs)
static constexpr HOST_DEVICE Field inverse(const Field& xs)
{
if (xs == zero()) return zero();
constexpr Field one = Field{CONFIG::one};
Expand Down Expand Up @@ -965,7 +965,7 @@ class Field
return (u == one) ? b : c;
}

static constexpr HOST_DEVICE_INLINE Field pow(Field base, int exp)
static constexpr HOST_DEVICE Field pow(Field base, int exp)
{
Field res = one();
while (exp > 0) {
Expand Down
4 changes: 3 additions & 1 deletion icicle/include/icicle/utils/modifiers.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,14 @@

#define HOST_INLINE __host__ INLINE_MACRO
#define DEVICE_INLINE __device__ INLINE_MACRO
#define HOST_DEVICE_INLINE __host__ __device__ INLINE_MACRO
#define HOST_DEVICE __host__ __device__
#define HOST_DEVICE_INLINE HOST_DEVICE INLINE_MACRO
#else // not CUDA
#define INLINE_MACRO
#define UNROLL
#define HOST_INLINE
#define DEVICE_INLINE
#define HOST_DEVICE
#define HOST_DEVICE_INLINE
#define __host__
#define __device__
Expand Down
40 changes: 9 additions & 31 deletions icicle/tests/test_curve_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,58 +69,36 @@ class CurveApiTest : public ::testing::Test
const int batch = 2;
const int N = 1 << logn;
const int precompute_factor = 2;
const bool points_montgomery = false;
int c = std::max(logn, 8) - 1;
if (scalar_t::NBITS % c == 0) { c++; }
int n_threads = std::thread::hardware_concurrency();
if (n_threads <= 0) {
ICICLE_LOG_WARNING << "Unable to detect number of hardware supported threads - fixing it to 1\n";
n_threads = 1;
}
const int total_nof_elemets = batch * N;

auto scalars = std::make_unique<scalar_t[]>(total_nof_elemets);
auto bases = std::make_unique<A[]>(N);

auto precomp_bases = std::make_unique<A[]>(N * precompute_factor);
scalar_t::rand_host_many(scalars.get(), total_nof_elemets);
P::rand_host_many(bases.get(), N);

auto result_main = std::make_unique<P[]>(batch);
auto result_ref = std::make_unique<P[]>(batch);

auto config = default_msm_config();
config.batch_size = batch;
config.are_points_shared_in_batch = true;
config.precompute_factor = precompute_factor;

auto run = [&](const std::string& dev_type, P* result, const char* msg, bool measure, int iters) {
Device dev = {dev_type, 0};
icicle_set_device(dev);

std::ostringstream oss;
oss << dev_type << " " << msg;

auto config = default_msm_config();
config.c = c;
config.batch_size = batch;
config.are_points_shared_in_batch = true;
config.precompute_factor = precompute_factor;

config.are_results_on_device = true;

ConfigExtension ext;
ext.set(CpuBackendConfig::CPU_NOF_THREADS, n_threads);
config.ext = &ext;

// Note: allocating the precompute_bases on device since CUDA backend assumes that.
// TODO: fix CUDA backend to support host memory too.
A* precomp_bases = nullptr;
ICICLE_CHECK(icicle_malloc((void**)&precomp_bases, N * precompute_factor * sizeof(A)));
ICICLE_CHECK(msm_precompute_bases(bases.get(), N, config, precomp_bases));

config.are_points_on_device = true;
config.are_results_on_device = false;
ICICLE_CHECK(msm_precompute_bases(bases.get(), N, config, precomp_bases.get()));

START_TIMER(MSM_sync)
for (int i = 0; i < iters; ++i) {
ICICLE_CHECK(msm(scalars.get(), precomp_bases, N, config, result));
ICICLE_CHECK(msm(scalars.get(), precomp_bases.get(), N, config, result));
}
END_TIMER(MSM_sync, oss.str().c_str(), measure);
ICICLE_CHECK(icicle_free(precomp_bases));
};

run(s_main_target, result_main.get(), "msm", VERBOSE /*=measure*/, 1 /*=iters*/);
Expand Down
2 changes: 2 additions & 0 deletions scripts/release/Dockerfile.ubi8
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,5 @@ FROM nvidia/cuda:12.2.2-devel-ubi8

# install cmake
RUN dnf update -y && dnf install -y cmake
# install ninja-build
RUN dnf install -y ninja-build
4 changes: 3 additions & 1 deletion scripts/release/Dockerfile.ubi9
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,6 @@
FROM nvidia/cuda:12.2.2-devel-ubi9

# install cmake
RUN dnf update -y && dnf install -y cmake
RUN dnf update -y && dnf install -y cmake
# install ninja-build
RUN dnf install -y ninja-build
3 changes: 2 additions & 1 deletion scripts/release/Dockerfile.ubuntu20
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ RUN apt-get update && apt-get install -y \
libssl-dev \
libcurl4-openssl-dev \
libarchive-dev \
zlib1g-dev
zlib1g-dev \
ninja-build

# Install the latest stable version of CMake from source
RUN CMAKE_VERSION=3.27.4 \
Expand Down
3 changes: 2 additions & 1 deletion scripts/release/Dockerfile.ubuntu22
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ FROM nvidia/cuda:12.2.2-devel-ubuntu22.04
RUN apt-get update && apt-get install -y \
build-essential \
cmake \
tar
tar \
ninja-build

RUN apt install cargo -y
8 changes: 4 additions & 4 deletions scripts/release/build_all.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,25 +32,25 @@ docker run --rm --gpus all \
-v ./icicle:/icicle \
-v "$output_dir:/output" \
-v ./scripts:/scripts \
icicle-release-ubuntu22-cuda122 bash /scripts/release/build_release_and_tar.sh icicle30 ubuntu22 cuda122
icicle-release-ubuntu22-cuda122 bash /scripts/release/build_release_and_tar.sh icicle30 ubuntu22 cuda122 &

# ubuntu 20
docker run --rm --gpus all \
-v ./icicle:/icicle \
-v "$output_dir:/output" \
-v ./scripts:/scripts \
icicle-release-ubuntu20-cuda122 bash /scripts/release/build_release_and_tar.sh icicle30 ubuntu20 cuda122
icicle-release-ubuntu20-cuda122 bash /scripts/release/build_release_and_tar.sh icicle30 ubuntu20 cuda122 &

# ubi 8 (rhel compatible)
docker run --rm --gpus all \
-v ./icicle:/icicle \
-v "$output_dir:/output" \
-v ./scripts:/scripts \
icicle-release-ubi8-cuda122 bash /scripts/release/build_release_and_tar.sh icicle30 ubi8 cuda122
icicle-release-ubi8-cuda122 bash /scripts/release/build_release_and_tar.sh icicle30 ubi8 cuda122 &

# ubi 9 (rhel compatible)
docker run --rm --gpus all \
-v ./icicle:/icicle \
-v "$output_dir:/output" \
-v ./scripts:/scripts \
icicle-release-ubi9-cuda122 bash /scripts/release/build_release_and_tar.sh icicle30 ubi9 cuda122
icicle-release-ubi9-cuda122 bash /scripts/release/build_release_and_tar.sh icicle30 ubi9 cuda122 &
10 changes: 6 additions & 4 deletions scripts/release/build_release_and_tar.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@ for field in "${fields[@]}"; do

mkdir -p build && rm -rf build/*
# Configure, build, and install
cmake -S icicle -B build -DFIELD=$field -DCUDA_BACKEND=local -DCMAKE_INSTALL_PREFIX=install_dir/icicle -DCMAKE_BUILD_TYPE=Release
cmake --build build -j # build
# Precompile SASS for modern architectures (Turing, Ampere, etc.) and include PTX fallback (?)
cmake -S icicle -B build -G Ninja -DFIELD=$field -DCUDA_BACKEND=local -DCMAKE_INSTALL_PREFIX=install_dir/icicle -DCMAKE_BUILD_TYPE=Release -DCUDA_ARCH="75;80;86"
cmake --build build # build
cmake --install build # install
done

Expand All @@ -31,8 +32,9 @@ for curve in "${curves[@]}"; do

mkdir -p build && rm -rf build/*
# Configure, build, and install
cmake -S icicle -B build -DCURVE=$curve -DCUDA_BACKEND=local -DCMAKE_INSTALL_PREFIX=install_dir/icicle -DCMAKE_BUILD_TYPE=Release
cmake --build build -j # build
# Precompile SASS for modern architectures (Turing, Ampere, etc.) and include PTX fallback (?)
cmake -S icicle -B build -G Ninja -DCURVE=$curve -DCUDA_BACKEND=local -DCMAKE_INSTALL_PREFIX=install_dir/icicle -DCMAKE_BUILD_TYPE=Release -DCUDA_ARCH="75;80;86"
cmake --build build # build
cmake --install build # install
done

Expand Down
4 changes: 2 additions & 2 deletions wrappers/rust/icicle-core/src/msm/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ pub fn check_msm_batch_shared<C: Curve + MSM<C>>()
where
<C::ScalarField as FieldImpl>::Config: GenerateRandom<C::ScalarField>,
{
let test_sizes = [1000, 1 << 16];
let test_sizes = [1000, 1 << 14];
let batch_sizes = [1, 3, 1 << 4];
let mut stream = IcicleStream::create().unwrap();
let precompute_factor = 8;
Expand Down Expand Up @@ -175,7 +175,7 @@ pub fn check_msm_batch_not_shared<C: Curve + MSM<C>>()
where
<C::ScalarField as FieldImpl>::Config: GenerateRandom<C::ScalarField>,
{
let test_sizes = [1000, 1 << 16];
let test_sizes = [1000, 1 << 14];
let batch_sizes = [1, 3, 1 << 4];
let mut stream = IcicleStream::create().unwrap();
let precompute_factor = 8;
Expand Down

0 comments on commit 2e62cca

Please sign in to comment.