Implement SIMD and reduce quantisation scale to 181 (#334)
Elo | 15.50 +- 7.12 (95%)
SPRT | 8.0+0.08s Threads=1 Hash=16MB
LLR | 2.97 (-2.94, 2.94) [0.00, 3.00]
Games | N: 4710 W: 1317 L: 1107 D: 2286
Penta | [43, 490, 1089, 680, 53]
Bench 6920775

Co-authored-by: jeff-pow <103015219+jeff-pow@users.noreply.github.com>
cj5716 and jeff-pow authored Feb 11, 2024
1 parent f7ad980 commit d0ed9e9
Showing 6 changed files with 136 additions and 27 deletions.
3 changes: 2 additions & 1 deletion Contributors
@@ -2,7 +2,8 @@
Cosmo - relative NNUE inference code
cj5716 - Several gainers
Disservin - Makefile and CI PR
Gabe - fat speedup
fireandice - Writing SIMD so we don't have to
Gabe - Fat speedup
JW - 16bits move structure shenanigans
Kimmy - Search stuff
kz04px (Big T) - Code refactoring
71 changes: 49 additions & 22 deletions makefile
@@ -1,17 +1,17 @@

NETWORK_NAME = nn.net
_THIS := $(realpath $(dir $(abspath $(lastword $(MAKEFILE_LIST)))))
_ROOT := $(_THIS)
_THIS := $(realpath $(dir $(abspath $(lastword $(MAKEFILE_LIST)))))
_ROOT := $(_THIS)
EVALFILE = $(NETWORK_NAME)
CXX := g++
TARGET := Alexandria
CXX := g++
TARGET := Alexandria
WARNINGS = -Wall -Wcast-qual -Wextra -Wshadow -Wdouble-promotion -Wformat=2 -Wnull-dereference -Wlogical-op -Wold-style-cast -Wundef -pedantic
CXXFLAGS := -funroll-loops -O3 -flto -fno-exceptions -std=gnu++2a -DNDEBUG $(WARNINGS)
NATIVE = -march=native
NATIVE = -march=native


# engine name
NAME := Alexandria
NAME := Alexandria

TMPDIR = .tmp

@@ -22,10 +22,10 @@ endif

# Detect Windows
ifeq ($(OS), Windows_NT)
MKDIR := mkdir
MKDIR := mkdir
else
ifeq ($(COMP), MINGW)
MKDIR := mkdir
MKDIR := mkdir
else
MKDIR := mkdir -p
endif
@@ -52,45 +52,72 @@ endif
# Remove native for builds
ifdef build
NATIVE =
else
build = native
endif

# SPECIFIC BUILDS
ifeq ($(build), native)
NATIVE = -march=native
ARCH = -x86-64-native
NATIVE = -march=native
ARCH = -x86-64-native
PROPERTIES = $(shell echo | $(CXX) -march=native -E -dM -)
ifneq ($(findstring __AVX512F__, $(PROPERTIES)),)
ifneq ($(findstring __AVX512BW__, $(PROPERTIES)),)
CXXFLAGS += -DUSE_AVX512 -mavx512f -mavx512bw
endif
else ifneq ($(findstring __BMI2__, $(PROPERTIES)),)
CXXFLAGS += -DUSE_AVX2 -mavx2 -mbmi
else ifneq ($(findstring __AVX2__, $(PROPERTIES)),)
CXXFLAGS += -DUSE_AVX2 -mavx2 -mbmi
endif

endif

ifeq ($(build), x86-64)
NATIVE = -mtune=znver1
NATIVE = -mtune=znver1
INSTRUCTIONS = -msse -msse2 -mpopcnt
ARCH = -x86-64
ARCH = -x86-64
endif

ifeq ($(build), x86-64-modern)
NATIVE = -mtune=znver2
NATIVE = -mtune=znver2
INSTRUCTIONS = -m64 -msse -msse3 -mpopcnt
ARCH = -x86-64-modern
ARCH = -x86-64-modern
endif

ifeq ($(build), x86-64-avx2)
NATIVE = -march=bdver4 -mno-tbm -mno-sse4a -mno-bmi2
ARCH = -x86-64-avx2
NATIVE = -march=bdver4 -mno-tbm -mno-sse4a -mno-bmi2
ARCH = -x86-64-avx2
CXXFLAGS += -DUSE_AVX2 -mavx2 -mbmi
endif

ifeq ($(build), x86-64-bmi2)
NATIVE = -march=haswell
ARCH = -x86-64-bmi2
NATIVE = -march=haswell
ARCH = -x86-64-bmi2
CXXFLAGS += -DUSE_AVX2 -mavx2 -mbmi
endif

ifeq ($(build), x86-64-avx512)
NATIVE = -march=x86-64-v4 -mtune=znver4
ARCH = -x86-64-avx512
NATIVE = -march=x86-64-v4 -mtune=znver4
ARCH = -x86-64-avx512
CXXFLAGS += -DUSE_AVX512 -mavx512f -mavx512bw
endif

ifeq ($(build), debug)
CXXFLAGS = -O3 -g3 -fno-omit-frame-pointer -std=gnu++2a
NATIVE = -msse -msse3 -mpopcnt
FLAGS = -lpthread -lstdc++
FLAGS = -lpthread -lstdc++

PROPERTIES = $(shell echo | $(CXX) -march=native -E -dM -)
ifneq ($(findstring __AVX512F__, $(PROPERTIES)),)
ifneq ($(findstring __AVX512BW__, $(PROPERTIES)),)
CXXFLAGS += -DUSE_AVX512 -mavx512f -mavx512bw
endif
else ifneq ($(findstring __BMI2__, $(PROPERTIES)),)
CXXFLAGS += -DUSE_AVX2 -mavx2 -mbmi
else ifneq ($(findstring __AVX2__, $(PROPERTIES)),)
CXXFLAGS += -DUSE_AVX2 -mavx2 -mbmi
endif
endif

# Add network name and Evalfile
@@ -99,7 +126,7 @@ CXXFLAGS += -DNETWORK_NAME=\"$(NETWORK_NAME)\" -DEVALFILE=\"$(EVALFILE)\"
SOURCES := $(wildcard src/*.cpp)
OBJECTS := $(patsubst %.cpp,$(TMPDIR)/%.o,$(SOURCES))
DEPENDS := $(patsubst %.cpp,$(TMPDIR)/%.d,$(SOURCES))
EXE := $(NAME)$(SUFFIX)
EXE := $(NAME)$(SUFFIX)

all: $(TARGET)
clean:
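
A usage sketch, inferred from the build targets above rather than taken from the commit: the SIMD path is fixed at compile time, so pick the build that matches the host CPU.

make                        # native: probes the compiler and picks AVX512F/BW, BMI2 or AVX2
make build=x86-64-avx2      # force the AVX2 path (-DUSE_AVX2 -mavx2 -mbmi)
make build=x86-64-avx512    # force the AVX512 path (-DUSE_AVX512 -mavx512f -mavx512bw)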
Binary file modified nn.net
78 changes: 75 additions & 3 deletions src/nnue.cpp
@@ -4,6 +4,7 @@
#include <cstdio>
#include <cstring>
#include <iostream>
#include <immintrin.h>
#include "incbin/incbin.h"

// Macro to embed the default efficiently updatable neural network (NNUE) file
@@ -26,9 +27,75 @@ Network net;
// Thanks to Disservin for having me look at his code and Luecx for the
// invaluable help and the immense patience

#if defined(USE_AVX512)
constexpr int32_t CHUNK_SIZE = 32;
#elif defined(USE_AVX2)
constexpr int32_t CHUNK_SIZE = 16;
#else
constexpr int32_t CHUNK_SIZE = 1;
#endif
constexpr int32_t REQUIRED_ITERS = HIDDEN_SIZE / CHUNK_SIZE;
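// Lane math: a 512-bit register holds 32 int16 values and a 256-bit register
// holds 16, so with HIDDEN_SIZE = 1024 the flatten loops below run 32 iterations
// under AVX512 and 64 under AVX2. (The #if ordering assumes USE_AVX512 and
// USE_AVX2 are never defined together, which the makefile guarantees.)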

#if defined(USE_AVX2)
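// 181 appears to be chosen as the largest value whose square still fits in
// int16 (181 * 181 = 32761 <= 32767), so the _mm256_mullo_epi16 below cannot
// overflow; the previous scale of 255 squares to 65025, which does not fit.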
__m256i NNUE::simd_screlu(const __m256i vec) {
auto min = _mm256_set1_epi16(0);
auto max = _mm256_set1_epi16(181);
auto clamped = _mm256_min_epi16(_mm256_max_epi16(vec, min), max);
return _mm256_mullo_epi16(clamped, clamped);
}

int32_t NNUE::horizontal_add(const __m256i sum) {
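// Fold the eight int32 lanes pairwise, halving the width at each step
// (256 -> 128 -> 64 -> 32 bits) until a single scalar sum remains.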
auto upper_128 = _mm256_extracti128_si256(sum, 1);
auto lower_128 = _mm256_castsi256_si128(sum);
auto sum_128 = _mm_add_epi32(upper_128, lower_128);

auto upper_64 = _mm_unpackhi_epi64(sum_128, sum_128);
auto sum_64 = _mm_add_epi32(upper_64, sum_128);

auto upper_32 = _mm_shuffle_epi32(sum_64, 1);
auto sum_32 = _mm_add_epi32(upper_32, sum_64);

return _mm_cvtsi128_si32(sum_32);
}

int32_t NNUE::flatten(const int16_t *acc, const int16_t *weights) {
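// Dot product of the activated accumulator with the output weights:
// _mm256_madd_epi16 multiplies int16 lanes elementwise and sums adjacent
// pairs into int32 lanes, which are accumulated and reduced at the end.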
auto sum = _mm256_setzero_si256();
for (int i = 0; i < REQUIRED_ITERS; i++) {
auto us_vector = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(acc + i * CHUNK_SIZE));
auto activated = simd_screlu(us_vector);
auto weights_vec = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(weights + i * CHUNK_SIZE));
auto mul = _mm256_madd_epi16(activated, weights_vec);
sum = _mm256_add_epi32(sum, mul);
}
return horizontal_add(sum);
}

#elif defined(USE_AVX512)

__m512i NNUE::simd_screlu(const __m512i vec) {
auto min = _mm512_set1_epi16(0);
auto max = _mm512_set1_epi16(181);
auto clamped = _mm512_min_epi16(_mm512_max_epi16(vec, min), max);
return _mm512_mullo_epi16(clamped, clamped);
}

int32_t NNUE::flatten(const int16_t *acc, const int16_t *weights) {
auto sum = _mm512_setzero_si512();
for (int i = 0; i < REQUIRED_ITERS; i++) {
auto us_vector = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(acc + i * CHUNK_SIZE));
auto activated = simd_screlu(us_vector);
auto weights_vec = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(weights + i * CHUNK_SIZE));
auto mul = _mm512_madd_epi16(activated, weights_vec);
sum = _mm512_add_epi32(sum, mul);
}
return _mm512_reduce_add_epi32(sum);
}
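// Design note: AVX512 ships a built-in reduction (_mm512_reduce_add_epi32),
// so no hand-rolled horizontal add is needed on this path.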

#endif

int32_t NNUE::SCReLU(int16_t x) {
constexpr int16_t CR_MIN = 0;
constexpr int16_t CR_MAX = 255;
constexpr int16_t CR_MAX = 181;
// compute squared clipped ReLU
int16_t clipped = std::clamp(x, CR_MIN, CR_MAX);
int32_t wide = clipped;
@@ -166,15 +233,20 @@ int32_t NNUE::output(const NNUE::accumulator& board_accumulator, const bool whiteToMove)
us = board_accumulator[1].data();
them = board_accumulator[0].data();
}
#if defined(USE_AVX512) || defined(USE_AVX2)
int32_t output = flatten(us, net.outputWeights) + flatten(them, net.outputWeights + HIDDEN_SIZE);
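// Dequantisation, assuming the scales implied by the constants (QA = 181 for
// activations, QB = 64 for output weights): output holds QA^2 * QB units, the
// division by 181 strips the extra QA from the squared activation, and
// 400 / (64 * 181) maps the remaining QA * QB fixed point to centipawns.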
return (net.outputBias + output / 181) * 400 / (64 * 181);
#else
int32_t output = 0;
for (int i = 0; i < HIDDEN_SIZE; i++) {
output += SCReLU(us[i]) * static_cast<int32_t>(net.outputWeights[i]);
}
for (int i = 0; i < HIDDEN_SIZE; i++) {
output += SCReLU(them[i]) * static_cast<int32_t>(net.outputWeights[HIDDEN_SIZE + i]);
}
int32_t unsquared = output / 255 + net.outputBias;
return unsquared * 400 / (64 * 255);
int32_t unsquared = output / 181 + net.outputBias;
return unsquared * 400 / (64 * 181);
#endif
}

std::pair<std::size_t, std::size_t> NNUE::GetIndex(const int piece, const int square) {
9 changes: 9 additions & 0 deletions src/nnue.h
@@ -3,6 +3,7 @@
#include <cstdint>
#include <array>
#include <vector>
#include <immintrin.h>

constexpr int INPUT_WEIGHTS = 768;
constexpr int HIDDEN_SIZE = 1024;
@@ -28,4 +29,12 @@ class NNUE {
[[nodiscard]] int32_t SCReLU(int16_t x);
[[nodiscard]] int32_t output(const NNUE::accumulator& board_accumulator, const bool whiteToMove);
[[nodiscard]] std::pair<std::size_t, std::size_t> GetIndex(const int piece, const int square);
#if defined(USE_AVX2)
[[nodiscard]] int32_t flatten(const int16_t *acc, const int16_t *weights);
[[nodiscard]] int32_t horizontal_add(const __m256i sum);
[[nodiscard]] __m256i simd_screlu(const __m256i vec);
#elif defined(USE_AVX512)
[[nodiscard]] int32_t flatten(const int16_t *acc, const int16_t *weights);
[[nodiscard]] __m512i simd_screlu(const __m512i vec);
#endif
};
2 changes: 1 addition & 1 deletion src/types.h
@@ -2,7 +2,7 @@

#include <cstdint>

#define NAME "Alexandria-6.0.4"
#define NAME "Alexandria-6.0.5"

// define bitboard data type
using Bitboard = uint64_t;