diff --git a/include/private/dsp/arch/x86/avx512/hmath.h b/include/private/dsp/arch/x86/avx512/hmath.h index ced53654..41bbec54 100644 --- a/include/private/dsp/arch/x86/avx512/hmath.h +++ b/include/private/dsp/arch/x86/avx512/hmath.h @@ -27,6 +27,7 @@ #endif /* PRIVATE_DSP_ARCH_X86_AVX512_IMPL */ +#include #include diff --git a/include/private/dsp/arch/x86/avx512/hmath/hdotp.h b/include/private/dsp/arch/x86/avx512/hmath/hdotp.h new file mode 100644 index 00000000..edf30f57 --- /dev/null +++ b/include/private/dsp/arch/x86/avx512/hmath/hdotp.h @@ -0,0 +1,354 @@ +/* + * Copyright (C) 2024 Linux Studio Plugins Project + * (C) 2024 Vladimir Sadovnikov + * + * This file is part of lsp-dsp-lib + * Created on: 11 дек. 2024 г. + * + * lsp-dsp-lib is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * lsp-dsp-lib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#ifndef PRIVATE_DSP_ARCH_X86_AVX512_HMATH_HDOTP_H_
+#define PRIVATE_DSP_ARCH_X86_AVX512_HMATH_HDOTP_H_
+
+#ifndef PRIVATE_DSP_ARCH_X86_AVX512_IMPL
+    #error "This header should not be included directly"
+#endif /* PRIVATE_DSP_ARCH_X86_AVX512_IMPL */
+
+/*
+ * Horizontal dot-product kernels written in GCC extended inline assembly.
+ *
+ * Conventions shared by all three routines below:
+ *   - zmm0/zmm1 (and their ymm/xmm low parts) are two independent accumulators,
+ *     both zeroed on entry, so count == 0 yields 0.0f;
+ *   - 'off' is a byte offset applied to both 'a' and 'b';
+ *   - 'count' is kept biased: before each block it is adjusted by the block
+ *     size and the sign/borrow of the result decides whether the block runs;
+ *   - the accumulators are folded 512 -> 256 -> 128 bits as the block size
+ *     shrinks, and finally reduced with two vhaddps; the scalar tail then
+ *     accumulates into lane 0;
+ *   - the result is returned through the "Yz" constraint (first SSE register,
+ *     xmm0).
+ * Partial sums are accumulated per lane, so rounding may differ from a
+ * strictly sequential scalar summation.
+ */
+namespace lsp
+{
+    namespace avx512
+    {
+        /**
+         * Compute the dot product of two float arrays: sum of a[i] * b[i].
+         *
+         * @param a first input array, at least count elements (loaded unaligned)
+         * @param b second input array, at least count elements
+         * @param count number of elements to process
+         * @return accumulated sum, 0.0f when count is zero
+         */
+        float h_dotp(const float *a, const float *b, size_t count)
+        {
+            IF_ARCH_X86(
+                float result;
+                size_t off;
+            );
+            ARCH_X86_ASM
+            (
+                __ASM_EMIT("vxorps %%zmm0, %%zmm0, %%zmm0")
+                __ASM_EMIT("xor %[off], %[off]")
+                __ASM_EMIT("vxorps %%zmm1, %%zmm1, %%zmm1")
+                /* x64 blocks */
+                __ASM_EMIT("sub $64, %[count]")
+                __ASM_EMIT("jb 2f")
+                __ASM_EMIT("1:")
+                __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%zmm2")
+                __ASM_EMIT("vmovups 0x40(%[a], %[off]), %%zmm3")
+                __ASM_EMIT("vmovups 0x80(%[a], %[off]), %%zmm4")
+                __ASM_EMIT("vmovups 0xc0(%[a], %[off]), %%zmm5")
+                /* each register must be multiplied by the vector of 'b' at the SAME offset it was loaded from in 'a' */
+                __ASM_EMIT("vmulps 0x00(%[b], %[off]), %%zmm2, %%zmm2")
+                __ASM_EMIT("vmulps 0x40(%[b], %[off]), %%zmm3, %%zmm3")
+                __ASM_EMIT("vmulps 0x80(%[b], %[off]), %%zmm4, %%zmm4")
+                __ASM_EMIT("vmulps 0xc0(%[b], %[off]), %%zmm5, %%zmm5")
+                __ASM_EMIT("vaddps %%zmm2, %%zmm0, %%zmm0")
+                __ASM_EMIT("vaddps %%zmm3, %%zmm1, %%zmm1")
+                __ASM_EMIT("vaddps %%zmm4, %%zmm0, %%zmm0")
+                __ASM_EMIT("vaddps %%zmm5, %%zmm1, %%zmm1")
+                __ASM_EMIT("add $0x100, %[off]")
+                __ASM_EMIT("sub $64, %[count]")
+                __ASM_EMIT("jae 1b")
+                __ASM_EMIT("2:")
+                /* x32 block */
+                __ASM_EMIT("add $32, %[count]")
+                __ASM_EMIT("jl 4f")
+                __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%zmm2")
+                __ASM_EMIT("vmovups 0x40(%[a], %[off]), %%zmm3")
+                __ASM_EMIT("vmulps 0x00(%[b], %[off]), %%zmm2, %%zmm2")
+                __ASM_EMIT("vmulps 0x40(%[b], %[off]), %%zmm3, %%zmm3")
+                __ASM_EMIT("vaddps %%zmm2, %%zmm0, %%zmm0")
+                __ASM_EMIT("vaddps %%zmm3, %%zmm1, %%zmm1")
+                __ASM_EMIT("add $0x80, %[off]")
+                __ASM_EMIT("sub $32, %[count]")
+                __ASM_EMIT("4:")
+                /* fold the upper 256 bits of both accumulators into the lower halves */
+                __ASM_EMIT("vextractf64x4 $1, %%zmm0, %%ymm2")
+                __ASM_EMIT("vextractf64x4 $1, %%zmm1, %%ymm3")
+                __ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
+                __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1")
+                /* x16 block */
+                __ASM_EMIT("add $16, %[count]")
+                __ASM_EMIT("jl 6f")
+                __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%ymm2")
+                __ASM_EMIT("vmovups 0x20(%[a], %[off]), %%ymm3")
+                __ASM_EMIT("vmulps 0x00(%[b], %[off]), %%ymm2, %%ymm2")
+                __ASM_EMIT("vmulps 0x20(%[b], %[off]), %%ymm3, %%ymm3")
+                __ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
+                __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1")
+                __ASM_EMIT("add $0x40, %[off]")
+                __ASM_EMIT("sub $16, %[count]")
+                __ASM_EMIT("6:")
+                /* x8 block */
+                __ASM_EMIT("add $8, %[count]")
+                /* merge the two accumulators; vaddps leaves EFLAGS intact, so the jl below still tests the add above */
+                __ASM_EMIT("vaddps %%ymm1, %%ymm0, %%ymm0")
+                __ASM_EMIT("jl 8f")
+                __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%ymm2")
+                __ASM_EMIT("vmulps 0x00(%[b], %[off]), %%ymm2, %%ymm2")
+                __ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
+                __ASM_EMIT("add $0x20, %[off]")
+                __ASM_EMIT("sub $8, %[count]")
+                __ASM_EMIT("8:")
+                /* x4 block */
+                __ASM_EMIT("vextractf128 $0x01, %%ymm0, %%xmm1")
+                __ASM_EMIT("add $4, %[count]")
+                __ASM_EMIT("vaddps %%xmm1, %%xmm0, %%xmm0")
+                __ASM_EMIT("jl 10f")
+                __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%xmm2")
+                __ASM_EMIT("vmulps 0x00(%[b], %[off]), %%xmm2, %%xmm2")
+                __ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0")
+                __ASM_EMIT("add $0x10, %[off]")
+                __ASM_EMIT("sub $4, %[count]")
+                __ASM_EMIT("10:")
+                /* x1 block */
+                __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0")
+                __ASM_EMIT("add $3, %[count]")
+                __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0")
+                __ASM_EMIT("jl 12f")
+                __ASM_EMIT("11:")
+                __ASM_EMIT("vmovss 0x00(%[a], %[off]), %%xmm2")
+                __ASM_EMIT("vmulss 0x00(%[b], %[off]), %%xmm2, %%xmm2")
+                __ASM_EMIT("vaddss %%xmm2, %%xmm0, %%xmm0")
+                __ASM_EMIT("add $0x04, %[off]")
+                __ASM_EMIT("dec %[count]")
+                __ASM_EMIT("jge 11b")
+                __ASM_EMIT("12:")
+                /* end */
+                : [count] "+r" (count), [off] "=&r" (off),
+                  [res] "=Yz" (result)
+                : [a] "r" (a), [b] "r" (b)
+                : "cc", "memory",
+                  "%xmm1", "%xmm2", "%xmm3",
+                  "%xmm4", "%xmm5"
+            );
+
+            return result;
+        }
+
+        /**
+         * Compute the dot product of the squared inputs: sum of (a[i]*a[i]) * (b[i]*b[i]).
+         *
+         * @param a first input array, at least count elements (loaded unaligned)
+         * @param b second input array, at least count elements
+         * @param count number of elements to process
+         * @return accumulated sum, 0.0f when count is zero
+         */
+        float h_sqr_dotp(const float *a, const float *b, size_t count)
+        {
+            IF_ARCH_X86(
+                float result;
+                size_t off;
+            );
+            ARCH_X86_ASM
+            (
+                __ASM_EMIT("vxorps %%zmm0, %%zmm0, %%zmm0")
+                __ASM_EMIT("xor %[off], %[off]")
+                __ASM_EMIT("vxorps %%zmm1, %%zmm1, %%zmm1")
+                /* x32 blocks */
+                __ASM_EMIT("sub $32, %[count]")
+                __ASM_EMIT("jb 2f")
+                __ASM_EMIT("1:")
+                __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%zmm2")
+                __ASM_EMIT("vmovups 0x40(%[a], %[off]), %%zmm3")
+                __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%zmm4")
+                __ASM_EMIT("vmovups 0x40(%[b], %[off]), %%zmm5")
+                /* square each operand, then fused multiply-add the squares into the accumulators */
+                __ASM_EMIT("vmulps %%zmm2, %%zmm2, %%zmm2")
+                __ASM_EMIT("vmulps %%zmm3, %%zmm3, %%zmm3")
+                __ASM_EMIT("vmulps %%zmm4, %%zmm4, %%zmm4")
+                __ASM_EMIT("vmulps %%zmm5, %%zmm5, %%zmm5")
+                __ASM_EMIT("vfmadd231ps %%zmm4, %%zmm2, %%zmm0")
+                __ASM_EMIT("vfmadd231ps %%zmm5, %%zmm3, %%zmm1")
+                __ASM_EMIT("add $0x80, %[off]")
+                __ASM_EMIT("sub $32, %[count]")
+                __ASM_EMIT("jae 1b")
+                __ASM_EMIT("2:")
+                /* fold the upper 256 bits of both accumulators into the lower halves */
+                __ASM_EMIT("vextractf64x4 $1, %%zmm0, %%ymm2")
+                __ASM_EMIT("vextractf64x4 $1, %%zmm1, %%ymm3")
+                __ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
+                __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1")
+                /* x16 block */
+                __ASM_EMIT("add $16, %[count]")
+                __ASM_EMIT("jl 4f")
+                __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%ymm2")
+                __ASM_EMIT("vmovups 0x20(%[a], %[off]), %%ymm3")
+                __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%ymm4")
+                __ASM_EMIT("vmovups 0x20(%[b], %[off]), %%ymm5")
+                __ASM_EMIT("vmulps %%ymm2, %%ymm2, %%ymm2")
+                __ASM_EMIT("vmulps %%ymm3, %%ymm3, %%ymm3")
+                __ASM_EMIT("vmulps %%ymm4, %%ymm4, %%ymm4")
+                __ASM_EMIT("vmulps %%ymm5, %%ymm5, %%ymm5")
+                __ASM_EMIT("vfmadd231ps %%ymm4, %%ymm2, %%ymm0")
+                __ASM_EMIT("vfmadd231ps %%ymm5, %%ymm3, %%ymm1")
+                __ASM_EMIT("add $0x40, %[off]")
+                __ASM_EMIT("sub $16, %[count]")
+                __ASM_EMIT("4:")
+                /* fold the upper 128 bits of both accumulators into the lower halves */
+                __ASM_EMIT("vextractf128 $0x01, %%ymm0, %%xmm2")
+                __ASM_EMIT("vextractf128 $0x01, %%ymm1, %%xmm3")
+                __ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0")
+                __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1")
+                /* x8 block */
+                __ASM_EMIT("add $8, %[count]")
+                __ASM_EMIT("jl 6f")
+                __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%xmm2")
+                __ASM_EMIT("vmovups 0x10(%[a], %[off]), %%xmm3")
+                __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%xmm4")
+                __ASM_EMIT("vmovups 0x10(%[b], %[off]), %%xmm5")
+                __ASM_EMIT("vmulps %%xmm2, %%xmm2, %%xmm2")
+                __ASM_EMIT("vmulps %%xmm3, %%xmm3, %%xmm3")
+                __ASM_EMIT("vmulps %%xmm4, %%xmm4, %%xmm4")
+                __ASM_EMIT("vmulps %%xmm5, %%xmm5, %%xmm5")
+                __ASM_EMIT("vfmadd231ps %%xmm4, %%xmm2, %%xmm0")
+                __ASM_EMIT("vfmadd231ps %%xmm5, %%xmm3, %%xmm1")
+                __ASM_EMIT("add $0x20, %[off]")
+                __ASM_EMIT("sub $8, %[count]")
+                __ASM_EMIT("6:")
+                /* merge the two accumulators into xmm0 */
+                __ASM_EMIT("vaddps %%xmm1, %%xmm0, %%xmm0")
+                /* x4 block */
+                __ASM_EMIT("add $4, %[count]")
+                __ASM_EMIT("jl 8f")
+                __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%xmm2")
+                __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%xmm4")
+                __ASM_EMIT("vmulps %%xmm2, %%xmm2, %%xmm2")
+                __ASM_EMIT("vmulps %%xmm4, %%xmm4, %%xmm4")
+                __ASM_EMIT("vfmadd231ps %%xmm4, %%xmm2, %%xmm0")
+                __ASM_EMIT("add $0x10, %[off]")
+                __ASM_EMIT("sub $4, %[count]")
+                __ASM_EMIT("8:")
+                /* x1 block */
+                __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0")
+                __ASM_EMIT("add $3, %[count]")
+                __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0")
+                __ASM_EMIT("jl 10f")
+                __ASM_EMIT("9:")
+                __ASM_EMIT("vmovss 0x00(%[a], %[off]), %%xmm2")
+                __ASM_EMIT("vmovss 0x00(%[b], %[off]), %%xmm4")
+                __ASM_EMIT("vmulss %%xmm2, %%xmm2, %%xmm2")
+                __ASM_EMIT("vmulss %%xmm4, %%xmm4, %%xmm4")
+                __ASM_EMIT("vfmadd231ss %%xmm4, %%xmm2, %%xmm0")
+                __ASM_EMIT("add $0x04, %[off]")
+                __ASM_EMIT("dec %[count]")
+                __ASM_EMIT("jge 9b")
+                __ASM_EMIT("10:")
+                /* end */
+                : [count] "+r" (count), [off] "=&r" (off),
+                  [res] "=Yz" (result)
+                : [a] "r" (a), [b] "r" (b)
+                : "cc", "memory",
+                  "%xmm1", "%xmm2", "%xmm3",
+                  "%xmm4", "%xmm5"
+            );
+
+            return result;
+        }
+
+        /*
+         * Mask used with vandps to clear the IEEE-754 sign bit of each float (absolute value).
+         * It is loaded with vmovaps, which requires an aligned memory operand.
+         * NOTE(review): LSP_DSP_VEC16 and __lsp_aligned64 are defined elsewhere; presumably they
+         * replicate the value 16 times and force 64-byte alignment - confirm.
+         */
+        IF_ARCH_X86(
+            static const uint32_t h_abs_dotp_const[] __lsp_aligned64 =
+            {
+                LSP_DSP_VEC16(0x7fffffff)
+            };
+        )
+
+        /**
+         * Compute the dot product of absolute values: sum of |a[i]| * |b[i]|.
+         *
+         * @param a first input array, at least count elements
+         * @param b second input array, at least count elements
+         * @param count number of elements to process
+         * @return accumulated sum, 0.0f when count is zero
+         */
+        float h_abs_dotp(const float *a, const float *b, size_t count)
+        {
+            IF_ARCH_X86(
+                float result;
+                size_t off;
+            );
+            ARCH_X86_ASM
+            (
+                __ASM_EMIT("vxorps %%zmm0, %%zmm0, %%zmm0")
+                __ASM_EMIT("vmovaps %[CC], %%zmm6")
+                __ASM_EMIT("xor %[off], %[off]")
+                __ASM_EMIT("vxorps %%zmm1, %%zmm1, %%zmm1")
+                __ASM_EMIT("vmovaps %%zmm6, %%zmm7")
+                /* x32 blocks */
+                __ASM_EMIT("sub $32, %[count]")
+                __ASM_EMIT("jb 2f")
+                __ASM_EMIT("1:")
+                __ASM_EMIT("vandps 0x00(%[a], %[off]), %%zmm6, %%zmm2")
+                __ASM_EMIT("vandps 0x40(%[a], %[off]), %%zmm7, %%zmm3")
+                __ASM_EMIT("vandps 0x00(%[b], %[off]), %%zmm6, %%zmm4")
+                __ASM_EMIT("vandps 0x40(%[b], %[off]), %%zmm7, %%zmm5")
+                __ASM_EMIT("vfmadd231ps %%zmm4, %%zmm2, %%zmm0")
+                __ASM_EMIT("vfmadd231ps %%zmm5, %%zmm3, %%zmm1")
+                __ASM_EMIT("add $0x80, %[off]")
+                __ASM_EMIT("sub $32, %[count]")
+                __ASM_EMIT("jae 1b")
+                __ASM_EMIT("2:")
+                /* fold the upper 256 bits of both accumulators into the lower halves */
+                __ASM_EMIT("vextractf64x4 $1, %%zmm0, %%ymm2")
+                __ASM_EMIT("vextractf64x4 $1, %%zmm1, %%ymm3")
+                __ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
+                __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1")
+                /* x16 block: two ymm vectors, so the second one starts 0x20 bytes (8 floats) in */
+                __ASM_EMIT("add $16, %[count]")
+                __ASM_EMIT("jl 4f")
+                __ASM_EMIT("vandps 0x00(%[a], %[off]), %%ymm6, %%ymm2")
+                __ASM_EMIT("vandps 0x20(%[a], %[off]), %%ymm7, %%ymm3")
+                __ASM_EMIT("vandps 0x00(%[b], %[off]), %%ymm6, %%ymm4")
+                __ASM_EMIT("vandps 0x20(%[b], %[off]), %%ymm7, %%ymm5")
+                __ASM_EMIT("vfmadd231ps %%ymm4, %%ymm2, %%ymm0")
+                __ASM_EMIT("vfmadd231ps %%ymm5, %%ymm3, %%ymm1")
+                __ASM_EMIT("add $0x40, %[off]")
+                __ASM_EMIT("sub $16, %[count]")
+                __ASM_EMIT("4:")
+                /* fold the upper 128 bits of both accumulators into the lower halves */
+                __ASM_EMIT("vextractf128 $0x01, %%ymm0, %%xmm2")
+                __ASM_EMIT("vextractf128 $0x01, %%ymm1, %%xmm3")
+                __ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0")
+                __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1")
+                /* x8 block: two xmm vectors, so the second one starts 0x10 bytes (4 floats) in */
+                __ASM_EMIT("add $8, %[count]")
+                __ASM_EMIT("jl 6f")
+                __ASM_EMIT("vandps 0x00(%[a], %[off]), %%xmm6, %%xmm2")
+                __ASM_EMIT("vandps 0x10(%[a], %[off]), %%xmm7, %%xmm3")
+                __ASM_EMIT("vandps 0x00(%[b], %[off]), %%xmm6, %%xmm4")
+                __ASM_EMIT("vandps 0x10(%[b], %[off]), %%xmm7, %%xmm5")
+                __ASM_EMIT("vfmadd231ps %%xmm4, %%xmm2, %%xmm0")
+                __ASM_EMIT("vfmadd231ps %%xmm5, %%xmm3, %%xmm1")
+                __ASM_EMIT("add $0x20, %[off]")
+                __ASM_EMIT("sub $8, %[count]")
+                __ASM_EMIT("6:")
+                /* merge the two accumulators into xmm0 */
+                __ASM_EMIT("vaddps %%xmm1, %%xmm0, %%xmm0")
+                /* x4 block */
+                __ASM_EMIT("add $4, %[count]")
+                __ASM_EMIT("jl 8f")
+                __ASM_EMIT("vandps 0x00(%[a], %[off]), %%xmm6, %%xmm2")
+                __ASM_EMIT("vandps 0x00(%[b], %[off]), %%xmm7, %%xmm4")
+                __ASM_EMIT("vfmadd231ps %%xmm4, %%xmm2, %%xmm0")
+                __ASM_EMIT("add $0x10, %[off]")
+                __ASM_EMIT("sub $4, %[count]")
+                __ASM_EMIT("8:")
+                /* x1 block */
+                __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0")
+                __ASM_EMIT("add $3, %[count]")
+                __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0")
+                __ASM_EMIT("jl 10f")
+                __ASM_EMIT("9:")
+                __ASM_EMIT("vmovss 0x00(%[a], %[off]), %%xmm2")
+                __ASM_EMIT("vmovss 0x00(%[b], %[off]), %%xmm4")
+                __ASM_EMIT("vandps %%xmm2, %%xmm6, %%xmm2")
+                __ASM_EMIT("vandps %%xmm4, %%xmm7, %%xmm4")
+                /* packed form is safe here: vmovss zeroed lanes 1..3, so they contribute 0 and lane 0 holds the result */
+                __ASM_EMIT("vfmadd231ps %%xmm4, %%xmm2, %%xmm0")
+                __ASM_EMIT("add $0x04, %[off]")
+                __ASM_EMIT("dec %[count]")
+                __ASM_EMIT("jge 9b")
+                __ASM_EMIT("10:")
+                /* end */
+                : [count] "+r" (count), [off] "=&r" (off),
+                  [res] "=Yz" (result)
+                : [a] "r" (a), [b] "r" (b),
+                  [CC] "m" (h_abs_dotp_const)
+                : "cc", "memory",
+                  "%xmm1", "%xmm2", "%xmm3",
+                  "%xmm4", "%xmm5", "%xmm6", "%xmm7"
+            );
+
+            return result;
+        }
+    } /* namespace avx512 */
+} /* namespace lsp */
+
+
+
+#endif /* PRIVATE_DSP_ARCH_X86_AVX512_HMATH_HDOTP_H_ */
diff --git a/src/main/x86/avx512.cpp b/src/main/x86/avx512.cpp
index c33431ae..88deddd6 100644
--- a/src/main/x86/avx512.cpp
+++ b/src/main/x86/avx512.cpp
@@ -1,6 +1,6 @@
 /*
- * Copyright (C) 2023 Linux Studio Plugins Project
- *           (C) 2023 Vladimir Sadovnikov
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ *           (C) 2024 Vladimir Sadovnikov
  *
  * This file is part of lsp-dsp-lib
  * Created on: 24 мая 2023 г.
@@ -326,6 +326,10 @@ CEXPORT1(vl, h_sum); CEXPORT1(vl, h_sqr_sum); CEXPORT1(vl, h_abs_sum); + + CEXPORT1(vl, h_dotp); + CEXPORT1(vl, h_sqr_dotp); + CEXPORT1(vl, h_abs_dotp); } } /* namespace avx2 */ } /* namespace lsp */ diff --git a/src/test/ptest/hmath/hdotp.cpp b/src/test/ptest/hmath/h_abs_dotp.cpp similarity index 65% rename from src/test/ptest/hmath/hdotp.cpp rename to src/test/ptest/hmath/h_abs_dotp.cpp index 9394c8a8..481d6f4d 100644 --- a/src/test/ptest/hmath/hdotp.cpp +++ b/src/test/ptest/hmath/h_abs_dotp.cpp @@ -1,9 +1,9 @@ /* - * Copyright (C) 2020 Linux Studio Plugins Project - * (C) 2020 Vladimir Sadovnikov + * Copyright (C) 2024 Linux Studio Plugins Project + * (C) 2024 Vladimir Sadovnikov * * This file is part of lsp-dsp-lib - * Created on: 31 мар. 2020 г. + * Created on: 11 дек. 2024 г. * * lsp-dsp-lib is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by @@ -25,30 +25,29 @@ #include #include -#define MIN_RANK 8 +#define MIN_RANK 5 #define MAX_RANK 16 namespace lsp { namespace generic { - float h_dotp(const float *a, const float *b, size_t count); - float h_sqr_dotp(const float *a, const float *b, size_t count); float h_abs_dotp(const float *a, const float *b, size_t count); } IF_ARCH_X86( namespace sse { - float h_dotp(const float *a, const float *b, size_t count); - float h_sqr_dotp(const float *a, const float *b, size_t count); float h_abs_dotp(const float *a, const float *b, size_t count); } namespace avx { - float h_dotp(const float *a, const float *b, size_t count); - float h_sqr_dotp(const float *a, const float *b, size_t count); + float h_abs_dotp(const float *a, const float *b, size_t count); + } + + namespace avx512 + { float h_abs_dotp(const float *a, const float *b, size_t count); } ) @@ -56,8 +55,6 @@ namespace lsp IF_ARCH_ARM( namespace neon_d32 { - float h_dotp(const float *a, const float *b, size_t count); - float h_sqr_dotp(const float *a, const float 
*b, size_t count); float h_abs_dotp(const float *a, const float *b, size_t count); } ) @@ -65,8 +62,6 @@ namespace lsp IF_ARCH_AARCH64( namespace asimd { - float h_dotp(const float *a, const float *b, size_t count); - float h_sqr_dotp(const float *a, const float *b, size_t count); float h_abs_dotp(const float *a, const float *b, size_t count); } ) @@ -74,7 +69,7 @@ namespace lsp typedef float (* h_dotp_t)(const float *a, const float *b, size_t count); } -PTEST_BEGIN("dsp.hmath", hdotp, 5, 10000) +PTEST_BEGIN("dsp.hmath", h_abs_dotp, 5, 5000) void call(const char *label, float *a, float *b, size_t count, h_dotp_t func) { @@ -106,26 +101,13 @@ PTEST_BEGIN("dsp.hmath", hdotp, 5, 10000) { size_t count = 1 << i; - CALL(generic::h_dotp); - IF_ARCH_X86(CALL(sse::h_dotp)); - IF_ARCH_X86(CALL(avx::h_dotp)); - IF_ARCH_ARM(CALL(neon_d32::h_dotp)); - IF_ARCH_AARCH64(CALL(asimd::h_dotp)); - PTEST_SEPARATOR; - - CALL(generic::h_sqr_dotp); - IF_ARCH_X86(CALL(sse::h_sqr_dotp)); - IF_ARCH_X86(CALL(avx::h_sqr_dotp)); - IF_ARCH_ARM(CALL(neon_d32::h_sqr_dotp)); - IF_ARCH_AARCH64(CALL(asimd::h_sqr_dotp)); - PTEST_SEPARATOR; - CALL(generic::h_abs_dotp); IF_ARCH_X86(CALL(sse::h_abs_dotp)); IF_ARCH_X86(CALL(avx::h_abs_dotp)); + IF_ARCH_X86(CALL(avx512::h_abs_dotp)); IF_ARCH_ARM(CALL(neon_d32::h_abs_dotp)); IF_ARCH_AARCH64(CALL(asimd::h_abs_dotp)); - PTEST_SEPARATOR2; + PTEST_SEPARATOR; } free_aligned(data); diff --git a/src/test/ptest/hmath/h_abs_sum.cpp b/src/test/ptest/hmath/h_abs_sum.cpp index 4b26b16b..f00d4620 100644 --- a/src/test/ptest/hmath/h_abs_sum.cpp +++ b/src/test/ptest/hmath/h_abs_sum.cpp @@ -24,7 +24,7 @@ #include #include -#define MIN_RANK 8 +#define MIN_RANK 5 #define MAX_RANK 16 namespace lsp @@ -68,7 +68,7 @@ namespace lsp typedef float (* h_sum_t)(const float *src, size_t count); } -PTEST_BEGIN("dsp.hmath", h_abs_sum, 5, 10000) +PTEST_BEGIN("dsp.hmath", h_abs_sum, 5, 5000) void call(const char *label, float *src, size_t count, h_sum_t func) { diff --git 
a/src/test/ptest/hmath/h_dotp.cpp b/src/test/ptest/hmath/h_dotp.cpp new file mode 100644 index 00000000..9073c835 --- /dev/null +++ b/src/test/ptest/hmath/h_dotp.cpp @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2024 Linux Studio Plugins Project + * (C) 2024 Vladimir Sadovnikov + * + * This file is part of lsp-dsp-lib + * Created on: 31 мар. 2020 г. + * + * lsp-dsp-lib is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * lsp-dsp-lib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with lsp-dsp-lib. If not, see . + */ + +#include +#include +#include +#include +#include + +#define MIN_RANK 5 +#define MAX_RANK 16 + +namespace lsp +{ + namespace generic + { + float h_dotp(const float *a, const float *b, size_t count); + } + + IF_ARCH_X86( + namespace sse + { + float h_dotp(const float *a, const float *b, size_t count); + } + + namespace avx + { + float h_dotp(const float *a, const float *b, size_t count); + } + + namespace avx512 + { + float h_dotp(const float *a, const float *b, size_t count); + } + ) + + IF_ARCH_ARM( + namespace neon_d32 + { + float h_dotp(const float *a, const float *b, size_t count); + } + ) + + IF_ARCH_AARCH64( + namespace asimd + { + float h_dotp(const float *a, const float *b, size_t count); + } + ) + + typedef float (* h_dotp_t)(const float *a, const float *b, size_t count); +} + +PTEST_BEGIN("dsp.hmath", h_dotp, 5, 5000) + + void call(const char *label, float *a, float *b, size_t count, h_dotp_t func) + { + if (!PTEST_SUPPORTED(func)) + return; + + char buf[80]; + snprintf(buf, sizeof(buf), 
"%s x %d", label, int(count)); + printf("Testing %s numbers...\n", buf); + + PTEST_LOOP(buf, + func(a, b, count); + ); + } + + PTEST_MAIN + { + size_t buf_size = 1 << MAX_RANK; + uint8_t *data = NULL; + float *a = alloc_aligned(data, buf_size * 2, 64); + float *b = &a[buf_size]; + + randomize_sign(a, buf_size * 2); + + #define CALL(func) \ + call(#func, a, b, count, func) + + for (size_t i=MIN_RANK; i <= MAX_RANK; ++i) + { + size_t count = 1 << i; + + CALL(generic::h_dotp); + IF_ARCH_X86(CALL(sse::h_dotp)); + IF_ARCH_X86(CALL(avx::h_dotp)); + IF_ARCH_X86(CALL(avx512::h_dotp)); + IF_ARCH_ARM(CALL(neon_d32::h_dotp)); + IF_ARCH_AARCH64(CALL(asimd::h_dotp)); + PTEST_SEPARATOR; + } + + free_aligned(data); + } + +PTEST_END + + + diff --git a/src/test/ptest/hmath/h_sqr_dotp.cpp b/src/test/ptest/hmath/h_sqr_dotp.cpp new file mode 100644 index 00000000..90cedd06 --- /dev/null +++ b/src/test/ptest/hmath/h_sqr_dotp.cpp @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2024 Linux Studio Plugins Project + * (C) 2024 Vladimir Sadovnikov + * + * This file is part of lsp-dsp-lib + * Created on: 11 дек. 2024 г. + * + * lsp-dsp-lib is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * lsp-dsp-lib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with lsp-dsp-lib. If not, see . 
+ */ + +#include +#include +#include +#include +#include + +#define MIN_RANK 5 +#define MAX_RANK 16 + +namespace lsp +{ + namespace generic + { + float h_sqr_dotp(const float *a, const float *b, size_t count); + } + + IF_ARCH_X86( + namespace sse + { + float h_sqr_dotp(const float *a, const float *b, size_t count); + } + + namespace avx + { + float h_sqr_dotp(const float *a, const float *b, size_t count); + } + + namespace avx512 + { + float h_sqr_dotp(const float *a, const float *b, size_t count); + } + ) + + IF_ARCH_ARM( + namespace neon_d32 + { + float h_sqr_dotp(const float *a, const float *b, size_t count); + } + ) + + IF_ARCH_AARCH64( + namespace asimd + { + float h_sqr_dotp(const float *a, const float *b, size_t count); + } + ) + + typedef float (* h_dotp_t)(const float *a, const float *b, size_t count); +} + +PTEST_BEGIN("dsp.hmath", h_sqr_dotp, 5, 5000) + + void call(const char *label, float *a, float *b, size_t count, h_dotp_t func) + { + if (!PTEST_SUPPORTED(func)) + return; + + char buf[80]; + snprintf(buf, sizeof(buf), "%s x %d", label, int(count)); + printf("Testing %s numbers...\n", buf); + + PTEST_LOOP(buf, + func(a, b, count); + ); + } + + PTEST_MAIN + { + size_t buf_size = 1 << MAX_RANK; + uint8_t *data = NULL; + float *a = alloc_aligned(data, buf_size * 2, 64); + float *b = &a[buf_size]; + + randomize_sign(a, buf_size * 2); + + #define CALL(func) \ + call(#func, a, b, count, func) + + for (size_t i=MIN_RANK; i <= MAX_RANK; ++i) + { + size_t count = 1 << i; + + CALL(generic::h_sqr_dotp); + IF_ARCH_X86(CALL(sse::h_sqr_dotp)); + IF_ARCH_X86(CALL(avx::h_sqr_dotp)); + IF_ARCH_X86(CALL(avx512::h_sqr_dotp)); + IF_ARCH_ARM(CALL(neon_d32::h_sqr_dotp)); + IF_ARCH_AARCH64(CALL(asimd::h_sqr_dotp)); + PTEST_SEPARATOR; + } + + free_aligned(data); + } + +PTEST_END + + + diff --git a/src/test/ptest/hmath/h_sqr_sum.cpp b/src/test/ptest/hmath/h_sqr_sum.cpp index 11517a20..d6e9d1e8 100644 --- a/src/test/ptest/hmath/h_sqr_sum.cpp +++ 
b/src/test/ptest/hmath/h_sqr_sum.cpp @@ -24,7 +24,7 @@ #include #include -#define MIN_RANK 8 +#define MIN_RANK 5 #define MAX_RANK 16 namespace lsp @@ -69,7 +69,7 @@ namespace lsp typedef float (* h_sum_t)(const float *src, size_t count); } -PTEST_BEGIN("dsp.hmath", h_sqr_sum, 5, 10000) +PTEST_BEGIN("dsp.hmath", h_sqr_sum, 5, 5000) void call(const char *label, float *src, size_t count, h_sum_t func) { diff --git a/src/test/ptest/hmath/h_sum.cpp b/src/test/ptest/hmath/h_sum.cpp index 1ced0f0f..020cf79d 100644 --- a/src/test/ptest/hmath/h_sum.cpp +++ b/src/test/ptest/hmath/h_sum.cpp @@ -24,7 +24,7 @@ #include #include -#define MIN_RANK 8 +#define MIN_RANK 5 #define MAX_RANK 16 namespace lsp @@ -68,7 +68,7 @@ namespace lsp typedef float (* h_sum_t)(const float *src, size_t count); } -PTEST_BEGIN("dsp.hmath", h_sum, 5, 10000) +PTEST_BEGIN("dsp.hmath", h_sum, 5, 5000) void call(const char *label, float *src, size_t count, h_sum_t func) { diff --git a/src/test/utest/hmath/hdotp.cpp b/src/test/utest/hmath/h_abs_dotp.cpp similarity index 70% rename from src/test/utest/hmath/hdotp.cpp rename to src/test/utest/hmath/h_abs_dotp.cpp index 276538ba..25926c23 100644 --- a/src/test/utest/hmath/hdotp.cpp +++ b/src/test/utest/hmath/h_abs_dotp.cpp @@ -1,9 +1,9 @@ /* - * Copyright (C) 2020 Linux Studio Plugins Project - * (C) 2020 Vladimir Sadovnikov + * Copyright (C) 2024 Linux Studio Plugins Project + * (C) 2024 Vladimir Sadovnikov * * This file is part of lsp-dsp-lib - * Created on: 31 мар. 2020 г. + * Created on: 11 дек. 2024 г. 
* * lsp-dsp-lib is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by @@ -32,23 +32,22 @@ namespace lsp { namespace generic { - float h_dotp(const float *a, const float *b, size_t count); - float h_sqr_dotp(const float *a, const float *b, size_t count); float h_abs_dotp(const float *a, const float *b, size_t count); } IF_ARCH_X86( namespace sse { - float h_dotp(const float *a, const float *b, size_t count); - float h_sqr_dotp(const float *a, const float *b, size_t count); float h_abs_dotp(const float *a, const float *b, size_t count); } namespace avx { - float h_dotp(const float *a, const float *b, size_t count); - float h_sqr_dotp(const float *a, const float *b, size_t count); + float h_abs_dotp(const float *a, const float *b, size_t count); + } + + namespace avx512 + { float h_abs_dotp(const float *a, const float *b, size_t count); } ) @@ -56,8 +55,6 @@ namespace lsp IF_ARCH_ARM( namespace neon_d32 { - float h_dotp(const float *a, const float *b, size_t count); - float h_sqr_dotp(const float *a, const float *b, size_t count); float h_abs_dotp(const float *a, const float *b, size_t count); } ) @@ -65,8 +62,6 @@ namespace lsp IF_ARCH_AARCH64( namespace asimd { - float h_dotp(const float *a, const float *b, size_t count); - float h_sqr_dotp(const float *a, const float *b, size_t count); float h_abs_dotp(const float *a, const float *b, size_t count); } ) @@ -74,7 +69,7 @@ namespace lsp typedef float (* h_dotp_t)(const float *a, const float *b, size_t count); } -UTEST_BEGIN("dsp.hmath", hdotp) +UTEST_BEGIN("dsp.hmath", h_abs_dotp) void call(const char *label, size_t align, h_dotp_t func1, h_dotp_t func2) { @@ -119,20 +114,10 @@ UTEST_BEGIN("dsp.hmath", hdotp) #define CALL(generic, func, align) \ call(#func, align, generic, func); - IF_ARCH_X86(CALL(generic::h_dotp, sse::h_dotp, 16)); - IF_ARCH_X86(CALL(generic::h_sqr_dotp, sse::h_sqr_dotp, 16)); IF_ARCH_X86(CALL(generic::h_abs_dotp, 
sse::h_abs_dotp, 16)); - - IF_ARCH_X86(CALL(generic::h_dotp, avx::h_dotp, 32)); - IF_ARCH_X86(CALL(generic::h_sqr_dotp, avx::h_sqr_dotp, 32)); IF_ARCH_X86(CALL(generic::h_abs_dotp, avx::h_abs_dotp, 32)); - - IF_ARCH_ARM(CALL(generic::h_dotp, neon_d32::h_dotp, 16)); - IF_ARCH_ARM(CALL(generic::h_sqr_dotp, neon_d32::h_sqr_dotp, 16)); + IF_ARCH_X86(CALL(generic::h_abs_dotp, avx512::h_abs_dotp, 64)); IF_ARCH_ARM(CALL(generic::h_abs_dotp, neon_d32::h_abs_dotp, 16)); - - IF_ARCH_AARCH64(CALL(generic::h_dotp, asimd::h_dotp, 16)); - IF_ARCH_AARCH64(CALL(generic::h_sqr_dotp, asimd::h_sqr_dotp, 16)); IF_ARCH_AARCH64(CALL(generic::h_abs_dotp, asimd::h_abs_dotp, 16)); } UTEST_END diff --git a/src/test/utest/hmath/h_dotp.cpp b/src/test/utest/hmath/h_dotp.cpp new file mode 100644 index 00000000..11ae2b9e --- /dev/null +++ b/src/test/utest/hmath/h_dotp.cpp @@ -0,0 +1,123 @@ +/* + * Copyright (C) 2024 Linux Studio Plugins Project + * (C) 2024 Vladimir Sadovnikov + * + * This file is part of lsp-dsp-lib + * Created on: 31 мар. 2020 г. + * + * lsp-dsp-lib is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * lsp-dsp-lib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with lsp-dsp-lib. If not, see . 
+ */ + +#include +#include +#include +#include + +#ifndef TOLERANCE + #define TOLERANCE 1e-4 +#endif + +namespace lsp +{ + namespace generic + { + float h_dotp(const float *a, const float *b, size_t count); + } + + IF_ARCH_X86( + namespace sse + { + float h_dotp(const float *a, const float *b, size_t count); + } + + namespace avx + { + float h_dotp(const float *a, const float *b, size_t count); + } + + namespace avx512 + { + float h_dotp(const float *a, const float *b, size_t count); + } + ) + + IF_ARCH_ARM( + namespace neon_d32 + { + float h_dotp(const float *a, const float *b, size_t count); + } + ) + + IF_ARCH_AARCH64( + namespace asimd + { + float h_dotp(const float *a, const float *b, size_t count); + } + ) + + typedef float (* h_dotp_t)(const float *a, const float *b, size_t count); +} + +UTEST_BEGIN("dsp.hmath", h_dotp) + + void call(const char *label, size_t align, h_dotp_t func1, h_dotp_t func2) + { + if (!UTEST_SUPPORTED(func1)) + return; + if (!UTEST_SUPPORTED(func2)) + return; + + UTEST_FOREACH(count, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 32, 64, 65, 100, 768, 999, 0x1fff) + { + for (size_t mask=0; mask <= 0x03; ++mask) + { + printf("Testing %s on input buffer of %d numbers, mask=0x%x...\n", label, int(count), int(mask)); + + FloatBuffer a(count, align, mask & 0x01); + FloatBuffer b(count, align, mask & 0x02); + + a.randomize_sign(); + b.randomize_sign(); + + // Call functions + float xa = func1(a, b, count); + float xb = func2(a, b, count); + + UTEST_ASSERT_MSG(a.valid(), "Source buffer A corrupted"); + UTEST_ASSERT_MSG(b.valid(), "Source buffer B corrupted"); + + // Compare buffers + if (!float_equals_adaptive(xa, xb, TOLERANCE)) + { + a.dump("A"); + b.dump("B"); + UTEST_FAIL_MSG("%s: Result of function 1 (%f) differs result of function 2 (%f)", label, xa, xb); + } + } + } + } + + UTEST_MAIN + { + #define CALL(generic, func, align) \ + call(#func, align, generic, func); + + IF_ARCH_X86(CALL(generic::h_dotp, sse::h_dotp, 16)); + 
IF_ARCH_X86(CALL(generic::h_dotp, avx::h_dotp, 32)); + IF_ARCH_X86(CALL(generic::h_dotp, avx512::h_dotp, 64)); + IF_ARCH_ARM(CALL(generic::h_dotp, neon_d32::h_dotp, 16)); + IF_ARCH_AARCH64(CALL(generic::h_dotp, asimd::h_dotp, 16)); + } +UTEST_END diff --git a/src/test/utest/hmath/h_sqr_dotp.cpp b/src/test/utest/hmath/h_sqr_dotp.cpp new file mode 100644 index 00000000..d430a175 --- /dev/null +++ b/src/test/utest/hmath/h_sqr_dotp.cpp @@ -0,0 +1,123 @@ +/* + * Copyright (C) 2024 Linux Studio Plugins Project + * (C) 2024 Vladimir Sadovnikov + * + * This file is part of lsp-dsp-lib + * Created on: 11 дек. 2024 г. + * + * lsp-dsp-lib is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * lsp-dsp-lib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with lsp-dsp-lib. If not, see . 
+ */ + +#include +#include +#include +#include + +#ifndef TOLERANCE + #define TOLERANCE 1e-4 +#endif + +namespace lsp +{ + namespace generic + { + float h_sqr_dotp(const float *a, const float *b, size_t count); + } + + IF_ARCH_X86( + namespace sse + { + float h_sqr_dotp(const float *a, const float *b, size_t count); + } + + namespace avx + { + float h_sqr_dotp(const float *a, const float *b, size_t count); + } + + namespace avx512 + { + float h_sqr_dotp(const float *a, const float *b, size_t count); + } + ) + + IF_ARCH_ARM( + namespace neon_d32 + { + float h_sqr_dotp(const float *a, const float *b, size_t count); + } + ) + + IF_ARCH_AARCH64( + namespace asimd + { + float h_sqr_dotp(const float *a, const float *b, size_t count); + } + ) + + typedef float (* h_dotp_t)(const float *a, const float *b, size_t count); +} + +UTEST_BEGIN("dsp.hmath", h_sqr_dotp) + + void call(const char *label, size_t align, h_dotp_t func1, h_dotp_t func2) + { + if (!UTEST_SUPPORTED(func1)) + return; + if (!UTEST_SUPPORTED(func2)) + return; + + UTEST_FOREACH(count, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 32, 64, 65, 100, 768, 999, 0x1fff) + { + for (size_t mask=0; mask <= 0x03; ++mask) + { + printf("Testing %s on input buffer of %d numbers, mask=0x%x...\n", label, int(count), int(mask)); + + FloatBuffer a(count, align, mask & 0x01); + FloatBuffer b(count, align, mask & 0x02); + + a.randomize_sign(); + b.randomize_sign(); + + // Call functions + float xa = func1(a, b, count); + float xb = func2(a, b, count); + + UTEST_ASSERT_MSG(a.valid(), "Source buffer A corrupted"); + UTEST_ASSERT_MSG(b.valid(), "Source buffer B corrupted"); + + // Compare buffers + if (!float_equals_adaptive(xa, xb, TOLERANCE)) + { + a.dump("A"); + b.dump("B"); + UTEST_FAIL_MSG("%s: Result of function 1 (%f) differs result of function 2 (%f)", label, xa, xb); + } + } + } + } + + UTEST_MAIN + { + #define CALL(generic, func, align) \ + call(#func, align, generic, func); + + 
IF_ARCH_X86(CALL(generic::h_sqr_dotp, sse::h_sqr_dotp, 16)); + IF_ARCH_X86(CALL(generic::h_sqr_dotp, avx::h_sqr_dotp, 32)); + IF_ARCH_X86(CALL(generic::h_sqr_dotp, avx512::h_sqr_dotp, 64)); + IF_ARCH_ARM(CALL(generic::h_sqr_dotp, neon_d32::h_sqr_dotp, 16)); + IF_ARCH_AARCH64(CALL(generic::h_sqr_dotp, asimd::h_sqr_dotp, 16)); + } +UTEST_END