Implement SIMD and reduce quantisation scale to 181 (#334)
Elo | 15.50 +- 7.12 (95%)
SPRT | 8.0+0.08s Threads=1 Hash=16MB
LLR | 2.97 (-2.94, 2.94) [0.00, 3.00]
Games | N: 4710 W: 1317 L: 1107 D: 2286
Penta | [43, 490, 1089, 680, 53]
Bench 6920775

Co-authored-by: jeff-pow <103015219+jeff-pow@users.noreply.github.com>
cj5716 and jeff-pow authored Feb 11, 2024
1 parent f7ad980 commit d0ed9e9
Showing 6 changed files with 136 additions and 27 deletions.
3 changes: 2 additions & 1 deletion Contributors
@@ -2,7 +2,8 @@
Cosmo - relative NNUE inference code
cj5716 - Several gainers
Disservin - Makefile and CI PR
Gabe - fat speedup
fireandice - Writing SIMD so we don't have to
Gabe - Fat speedup
JW - 16bits move structure shenanigans
Kimmy - Search stuff
kz04px (Big T) - Code refactoring
71 changes: 49 additions & 22 deletions makefile
@@ -1,17 +1,17 @@

NETWORK_NAME = nn.net
_THIS := $(realpath $(dir $(abspath $(lastword $(MAKEFILE_LIST)))))
_ROOT := $(_THIS)
_THIS := $(realpath $(dir $(abspath $(lastword $(MAKEFILE_LIST)))))
_ROOT := $(_THIS)
EVALFILE = $(NETWORK_NAME)
CXX := g++
TARGET := Alexandria
CXX := g++
TARGET := Alexandria
WARNINGS = -Wall -Wcast-qual -Wextra -Wshadow -Wdouble-promotion -Wformat=2 -Wnull-dereference -Wlogical-op -Wold-style-cast -Wundef -pedantic
CXXFLAGS := -funroll-loops -O3 -flto -fno-exceptions -std=gnu++2a -DNDEBUG $(WARNINGS)
NATIVE = -march=native
NATIVE = -march=native


# engine name
NAME := Alexandria
NAME := Alexandria

TMPDIR = .tmp

@@ -22,10 +22,10 @@ endif

# Detect Windows
ifeq ($(OS), Windows_NT)
MKDIR := mkdir
MKDIR := mkdir
else
ifeq ($(COMP), MINGW)
MKDIR := mkdir
MKDIR := mkdir
else
MKDIR := mkdir -p
endif
@@ -52,45 +52,72 @@ endif
# Remove native for builds
ifdef build
NATIVE =
else
build = native
endif

# SPECIFIC BUILDS
ifeq ($(build), native)
NATIVE = -march=native
ARCH = -x86-64-native
NATIVE = -march=native
ARCH = -x86-64-native
PROPERTIES = $(shell echo | $(CXX) -march=native -E -dM -)
ifneq ($(findstring __AVX512F__, $(PROPERTIES)),)
ifneq ($(findstring __AVX512BW__, $(PROPERTIES)),)
CXXFLAGS += -DUSE_AVX512 -mavx512f -mavx512bw
endif
else ifneq ($(findstring __BMI2__, $(PROPERTIES)),)
CXXFLAGS += -DUSE_AVX2 -mavx2 -mbmi
else ifneq ($(findstring __AVX2__, $(PROPERTIES)),)
CXXFLAGS += -DUSE_AVX2 -mavx2 -mbmi
endif

endif

ifeq ($(build), x86-64)
NATIVE = -mtune=znver1
NATIVE = -mtune=znver1
INSTRUCTIONS = -msse -msse2 -mpopcnt
ARCH = -x86-64
ARCH = -x86-64
endif

ifeq ($(build), x86-64-modern)
NATIVE = -mtune=znver2
NATIVE = -mtune=znver2
INSTRUCTIONS = -m64 -msse -msse3 -mpopcnt
ARCH = -x86-64-modern
ARCH = -x86-64-modern
endif

ifeq ($(build), x86-64-avx2)
NATIVE = -march=bdver4 -mno-tbm -mno-sse4a -mno-bmi2
ARCH = -x86-64-avx2
NATIVE = -march=bdver4 -mno-tbm -mno-sse4a -mno-bmi2
ARCH = -x86-64-avx2
CXXFLAGS += -DUSE_AVX2 -mavx2 -mbmi
endif

ifeq ($(build), x86-64-bmi2)
NATIVE = -march=haswell
ARCH = -x86-64-bmi2
NATIVE = -march=haswell
ARCH = -x86-64-bmi2
CXXFLAGS += -DUSE_AVX2 -mavx2 -mbmi
endif

ifeq ($(build), x86-64-avx512)
NATIVE = -march=x86-64-v4 -mtune=znver4
ARCH = -x86-64-avx512
NATIVE = -march=x86-64-v4 -mtune=znver4
ARCH = -x86-64-avx512
CXXFLAGS += -DUSE_AVX512 -mavx512f -mavx512bw
endif

ifeq ($(build), debug)
CXXFLAGS = -O3 -g3 -fno-omit-frame-pointer -std=gnu++2a
NATIVE = -msse -msse3 -mpopcnt
FLAGS = -lpthread -lstdc++
FLAGS = -lpthread -lstdc++

PROPERTIES = $(shell echo | $(CXX) -march=native -E -dM -)
ifneq ($(findstring __AVX512F__, $(PROPERTIES)),)
ifneq ($(findstring __AVX512BW__, $(PROPERTIES)),)
CXXFLAGS += -DUSE_AVX512 -mavx512f -mavx512bw
endif
else ifneq ($(findstring __BMI2__, $(PROPERTIES)),)
CXXFLAGS += -DUSE_AVX2 -mavx2 -mbmi
else ifneq ($(findstring __AVX2__, $(PROPERTIES)),)
CXXFLAGS += -DUSE_AVX2 -mavx2 -mbmi
endif
endif

# Add network name and Evalfile
@@ -99,7 +126,7 @@ CXXFLAGS += -DNETWORK_NAME=\"$(NETWORK_NAME)\" -DEVALFILE=\"$(EVALFILE)\"
SOURCES := $(wildcard src/*.cpp)
OBJECTS := $(patsubst %.cpp,$(TMPDIR)/%.o,$(SOURCES))
DEPENDS := $(patsubst %.cpp,$(TMPDIR)/%.d,$(SOURCES))
EXE := $(NAME)$(SUFFIX)
EXE := $(NAME)$(SUFFIX)

all: $(TARGET)
clean:
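
A usage sketch, inferred from the build targets above rather than taken from the commit: the SIMD path is fixed at compile time, so pick the build that matches the host CPU.

make                        # native: probes the compiler and picks AVX512F/BW, BMI2 or AVX2
make build=x86-64-avx2      # force the AVX2 path (-DUSE_AVX2 -mavx2 -mbmi)
make build=x86-64-avx512    # force the AVX512 path (-DUSE_AVX512 -mavx512f -mavx512bw)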
Binary file modified nn.net
78 changes: 75 additions & 3 deletions src/nnue.cpp
@@ -4,6 +4,7 @@
#include <cstdio>
#include <cstring>
#include <iostream>
#include <immintrin.h>
#include "incbin/incbin.h"

// Macro to embed the default efficiently updatable neural network (NNUE) file
@@ -26,9 +27,75 @@ Network net;
// Thanks to Disservin for having me look at his code and Luecx for the
// invaluable help and the immense patience

#if defined(USE_AVX512)
constexpr int32_t CHUNK_SIZE = 32;
#elif defined(USE_AVX2)
constexpr int32_t CHUNK_SIZE = 16;
#else
constexpr int32_t CHUNK_SIZE = 1;
#endif
constexpr int32_t REQUIRED_ITERS = HIDDEN_SIZE / CHUNK_SIZE;
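// Lane math: a 512-bit register holds 32 int16 values and a 256-bit register
// holds 16, so with HIDDEN_SIZE = 1024 the flatten loops below run 32 iterations
// under AVX512 and 64 under AVX2. (The #if ordering assumes USE_AVX512 and
// USE_AVX2 are never defined together, which the makefile guarantees.)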

#if defined(USE_AVX2)
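// 181 appears to be chosen as the largest value whose square still fits in
// int16 (181 * 181 = 32761 <= 32767), so the _mm256_mullo_epi16 below cannot
// overflow; the previous scale of 255 squares to 65025, which does not fit.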
__m256i NNUE::simd_screlu(const __m256i vec) {
auto min = _mm256_set1_epi16(0);
auto max = _mm256_set1_epi16(181);
auto clamped = _mm256_min_epi16(_mm256_max_epi16(vec, min), max);
return _mm256_mullo_epi16(clamped, clamped);
}

int32_t NNUE::horizontal_add(const __m256i sum) {
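// Fold the eight int32 lanes pairwise, halving the width at each step
// (256 -> 128 -> 64 -> 32 bits) until a single scalar sum remains.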
auto upper_128 = _mm256_extracti128_si256(sum, 1);
auto lower_128 = _mm256_castsi256_si128(sum);
auto sum_128 = _mm_add_epi32(upper_128, lower_128);

auto upper_64 = _mm_unpackhi_epi64(sum_128, sum_128);
auto sum_64 = _mm_add_epi32(upper_64, sum_128);

auto upper_32 = _mm_shuffle_epi32(sum_64, 1);
auto sum_32 = _mm_add_epi32(upper_32, sum_64);

return _mm_cvtsi128_si32(sum_32);
}

int32_t NNUE::flatten(const int16_t *acc, const int16_t *weights) {
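// Dot product of the activated accumulator with the output weights:
// _mm256_madd_epi16 multiplies int16 lanes elementwise and sums adjacent
// pairs into int32 lanes, which are accumulated and reduced at the end.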
auto sum = _mm256_setzero_si256();
for (int i = 0; i < REQUIRED_ITERS; i++) {
auto us_vector = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(acc + i * CHUNK_SIZE));
auto activated = simd_screlu(us_vector);
auto weights_vec = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(weights + i * CHUNK_SIZE));
auto mul = _mm256_madd_epi16(activated, weights_vec);
sum = _mm256_add_epi32(sum, mul);
}
return horizontal_add(sum);
}

#elif defined(USE_AVX512)

__m512i NNUE::simd_screlu(const __m512i vec) {
auto min = _mm512_set1_epi16(0);
auto max = _mm512_set1_epi16(181);
auto clamped = _mm512_min_epi16(_mm512_max_epi16(vec, min), max);
return _mm512_mullo_epi16(clamped, clamped);
}

int32_t NNUE::flatten(const int16_t *acc, const int16_t *weights) {
auto sum = _mm512_setzero_si512();
for (int i = 0; i < REQUIRED_ITERS; i++) {
auto us_vector = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(acc + i * CHUNK_SIZE));
auto activated = simd_screlu(us_vector);
auto weights_vec = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(weights + i * CHUNK_SIZE));
auto mul = _mm512_madd_epi16(activated, weights_vec);
sum = _mm512_add_epi32(sum, mul);
}
return _mm512_reduce_add_epi32(sum);
}
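// Design note: AVX512 ships a built-in reduction (_mm512_reduce_add_epi32),
// so no hand-rolled horizontal add is needed on this path.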

#endif

int32_t NNUE::SCReLU(int16_t x) {
constexpr int16_t CR_MIN = 0;
constexpr int16_t CR_MAX = 255;
constexpr int16_t CR_MAX = 181;
// compute squared clipped ReLU
int16_t clipped = std::clamp(x, CR_MIN, CR_MAX);
int32_t wide = clipped;
@@ -166,15 +233,20 @@ int32_t NNUE::output(const NNUE::accumulator& board_accumulator, const bool whiteToMove)
us = board_accumulator[1].data();
them = board_accumulator[0].data();
}
#if defined(USE_AVX512) || defined(USE_AVX2)
int32_t output = flatten(us, net.outputWeights) + flatten(them, net.outputWeights + HIDDEN_SIZE);
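// Dequantisation, assuming the scales implied by the constants (QA = 181 for
// activations, QB = 64 for output weights): output holds QA^2 * QB units, the
// division by 181 strips the extra QA from the squared activation, and
// 400 / (64 * 181) maps the remaining QA * QB fixed point to centipawns.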
return (net.outputBias + output / 181) * 400 / (64 * 181);
#else
int32_t output = 0;
for (int i = 0; i < HIDDEN_SIZE; i++) {
output += SCReLU(us[i]) * static_cast<int32_t>(net.outputWeights[i]);
}
for (int i = 0; i < HIDDEN_SIZE; i++) {
output += SCReLU(them[i]) * static_cast<int32_t>(net.outputWeights[HIDDEN_SIZE + i]);
}
int32_t unsquared = output / 255 + net.outputBias;
return unsquared * 400 / (64 * 255);
int32_t unsquared = output / 181 + net.outputBias;
return unsquared * 400 / (64 * 181);
#endif
}

std::pair<std::size_t, std::size_t> NNUE::GetIndex(const int piece, const int square) {
9 changes: 9 additions & 0 deletions src/nnue.h
@@ -3,6 +3,7 @@
#include <cstdint>
#include <array>
#include <vector>
#include <immintrin.h>

constexpr int INPUT_WEIGHTS = 768;
constexpr int HIDDEN_SIZE = 1024;
@@ -28,4 +29,12 @@ class NNUE {
[[nodiscard]] int32_t SCReLU(int16_t x);
[[nodiscard]] int32_t output(const NNUE::accumulator& board_accumulator, const bool whiteToMove);
[[nodiscard]] std::pair<std::size_t, std::size_t> GetIndex(const int piece, const int square);
#if defined(USE_AVX2)
[[nodiscard]] int32_t flatten(const int16_t *acc, const int16_t *weights);
[[nodiscard]] int32_t horizontal_add(const __m256i sum);
[[nodiscard]] __m256i simd_screlu(const __m256i vec);
#elif defined(USE_AVX512)
[[nodiscard]] int32_t flatten(const int16_t *acc, const int16_t *weights);
[[nodiscard]] __m512i simd_screlu(const __m512i vec);
#endif
};
2 changes: 1 addition & 1 deletion src/types.h
@@ -2,7 +2,7 @@

#include <cstdint>

#define NAME "Alexandria-6.0.4"
#define NAME "Alexandria-6.0.5"

// define bitboard data type
using Bitboard = uint64_t;