Skip to content

Commit

Permalink
ifdefの条件を修正した (#291)
Browse files Browse the repository at this point in the history
* ifdefの条件を修正した

* NEONでのエラーを修正
  • Loading branch information
KazApps authored Oct 8, 2024
1 parent 863bb39 commit 44bbb88
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 6 deletions.
4 changes: 2 additions & 2 deletions source/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -595,7 +595,7 @@ else ifeq ($(TARGET_CPU),AVX512VNNI)
# cascadelakeとicelakeとの違いがあるのかは知らないので、cascadelakeのみでいいや。

else ifeq ($(TARGET_CPU),AVXVNNI)
CPPFLAGS += -DUSE_AVX2 -DUSE_BMI2 -DUSE_VNNI -march=alderlake
CPPFLAGS += -DUSE_AVX2 -DUSE_BMI2 -DUSE_VNNI -DUSE_AVXVNNI -march=alderlake
# NNUEのコード、USE_VNNIが指定されているとVNNI対応のコードになる。
# Intel Alder Lake、Sapphire Rapids 以降追加の命令群。LLVM12, GCC11 以降。
# AVXVNNI (AVX2VNNI という表記も有り?) は AVX512VNNIの256bit幅以下限定版。
Expand Down Expand Up @@ -626,7 +626,7 @@ else ifeq ($(TARGET_CPU),NO_SSE)
else ifeq ($(TARGET_CPU),GRAVITON2)
# for Amazon Web Services EC2, the Graviton2 CPU [M6g/M6gd, C6g/C6gd/C6gn, R6g/R6gd, T4g, X2gd] instances
# https://github.com/aws/aws-graviton-getting-started/blob/main/c-c++.md
CPPFLAGS += -DIS_64BIT -DUSE_NEON -march=armv8.2-a+fp16+rcpc+dotprod+crypto
CPPFLAGS += -DIS_64BIT -DUSE_NEON=8 -march=armv8.2-a+fp16+rcpc+dotprod+crypto
else ifeq ($(TARGET_CPU),APPLEAVX2)
CPPFLAGS += -DIS_64BIT -DUSE_AVX2 -DUSE_BMI2 -target x86_64-apple-macos11 -mbmi -mbmi2 -mavx2 -mpopcnt
else ifeq ($(TARGET_CPU),APPLESSE42)
Expand Down
12 changes: 8 additions & 4 deletions source/eval/nnue/layers/affine_transform_sparse_input.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@

namespace Eval::NNUE::Layers {

#if defined(USE_SSSE3) || USE_NEON >= 8

alignas(kCacheLineSize) static inline const
std::array<std::array<std::uint16_t, 8>, 256> lookup_indices = []() {
std::array<std::array<std::uint16_t, 8>, 256> v{};
Expand All @@ -36,7 +38,7 @@ void find_nnz(const std::int32_t* input, std::uint16_t* out, IndexType& count_ou
#define vec_nnz(a) _mm512_cmpgt_epi32_mask(a, _mm512_setzero_si512())
#elif defined(USE_AVX2)
using vec_t = __m256i;
#if defined(USE_VNNI) && defined(USE_AVX512)
#if defined(USE_VNNI) && !defined(USE_AVXVNNI)
#define vec_nnz(a) _mm256_cmpgt_epi32_mask(a, _mm256_setzero_si256())
#else
#define vec_nnz(a) \
Expand Down Expand Up @@ -105,6 +107,8 @@ void find_nnz(const std::int32_t* input, std::uint16_t* out, IndexType& count_ou
#undef vec128_storeu
#undef vec128_add

#endif

// AffineTransform layer that takes block-sparse input
// ブロック疎な入力を受け取るアフィン変換層
template <typename PreviousLayer, IndexType OutputDimensions>
Expand All @@ -131,7 +135,7 @@ class AffineTransformSparseInput {
// 入力層からこの層までで使用する順伝播用バッファのサイズ
static constexpr std::size_t kBufferSize = PreviousLayer::kBufferSize + kSelfBufferSize;

#if defined(USE_SSSE3) || defined(USE_NEON_DOTPROD)
#if defined(USE_SSSE3) || USE_NEON >= 8
static constexpr IndexType kChunkSize = 4;
#else
static constexpr IndexType kChunkSize = 1;
Expand Down Expand Up @@ -159,7 +163,7 @@ class AffineTransformSparseInput {
}

static constexpr IndexType get_weight_index(IndexType i) {
#if defined(USE_SSSE3) || defined(USE_NEON_DOTPROD)
#if defined(USE_SSSE3) || USE_NEON >= 8
return get_weight_index_scrambled(i);
#else
return i;
Expand Down Expand Up @@ -210,7 +214,7 @@ class AffineTransformSparseInput {
}
#endif

#if defined(USE_SSSE3) || defined(USE_NEON_DOTPROD)
#if defined(USE_SSSE3) || USE_NEON >= 8

#if defined(USE_AVX512)
if constexpr (kOutputDimensions % 16 == 0)
Expand Down
15 changes: 15 additions & 0 deletions source/eval/nnue/layers/simd.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,11 @@ namespace Simd
#if defined(USE_NEON)

// Horizontal add: returns the sum of the four 32-bit lanes of `s`.
[[maybe_unused]] static int neon_m128_reduce_add_epi32(int32x4_t s) {
#if USE_NEON >= 8
    // With USE_NEON set to 8 or higher, a single across-vector add
    // intrinsic performs the whole reduction.
    const int lane_total = vaddvq_s32(s);
    return lane_total;
#else
    // Otherwise accumulate the lanes one at a time, left to right.
    int lane_total = s[0];
    lane_total += s[1];
    lane_total += s[2];
    lane_total += s[3];
    return lane_total;
#endif
}

[[maybe_unused]] static int neon_m128_hadd(int32x4_t sum, int bias) {
Expand All @@ -96,6 +100,17 @@ namespace Simd

#endif

#if USE_NEON >= 8
// Multiplies the sixteen 8-bit lanes of `a` and `b` element-wise, sums the
// products in groups of four adjacent lanes, and adds each group sum to the
// matching 32-bit lane of `acc` (updated in place).
// Both operands are multiplied as signed 8-bit values (vmull_s8 /
// vmull_high_s8).
// NOTE(review): the intermediate pairwise add is done in 16 bits; presumably
// callers keep the inputs small enough that it cannot wrap - confirm.
[[maybe_unused]] static void neon_m128_add_dpbusd_epi32(int32x4_t& acc, int8x16_t a, int8x16_t b) {
    // Widening 8-bit -> 16-bit products of the lower and upper eight lanes.
    const int16x8_t lo_products = vmull_s8(vget_low_s8(a), vget_low_s8(b));
    const int16x8_t hi_products = vmull_high_s8(a, b);

    // Pairwise-add adjacent products, then pairwise-add again while widening
    // to 32 bits and accumulating into acc.
    acc = vpadalq_s16(acc, vpaddq_s16(lo_products, hi_products));
}

#endif


} // namespace Simd

Expand Down

0 comments on commit 44bbb88

Please sign in to comment.