From b42abe1532ee0700751eabd3cb3d9d0c56e4fe8c Mon Sep 17 00:00:00 2001 From: lucshi Date: Thu, 18 Aug 2022 15:20:25 +0800 Subject: [PATCH 1/4] enable avx512 support for base64 encoding. Reuse WojciechMula/base64-avx512 code --- CMakeLists.txt | 4 + Makefile | 9 +- cmake/Modules/TargetSIMDInstructionSet.cmake | 2 + cmake/config.h.in | 3 + include/libbase64.h | 1 + lib/arch/avx512/chromiumbase64.c | 410 +++++++++++++++++++ lib/arch/avx512/chromiumbase64.h | 168 ++++++++ lib/arch/avx512/codec.c | 40 ++ lib/arch/avx512/enc_loop.c | 74 ++++ lib/codec_choose.c | 32 +- lib/lib.c | 2 +- test/ci/test.sh | 5 +- test/codec_supported.c | 1 + 13 files changed, 742 insertions(+), 9 deletions(-) create mode 100644 lib/arch/avx512/chromiumbase64.c create mode 100644 lib/arch/avx512/chromiumbase64.h create mode 100644 lib/arch/avx512/codec.c create mode 100644 lib/arch/avx512/enc_loop.c diff --git a/CMakeLists.txt b/CMakeLists.txt index dcca17f6..c6020190 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -62,6 +62,8 @@ cmake_dependent_option(BASE64_WITH_AVX "add AVX codepath" ON ${_IS_X86} OFF) add_feature_info(AVX BASE64_WITH_AVX "add AVX codepath") cmake_dependent_option(BASE64_WITH_AVX2 "add AVX 2 codepath" ON ${_IS_X86} OFF) add_feature_info(AVX2 BASE64_WITH_AVX2 "add AVX2 codepath") +cmake_dependent_option(BASE64_WITH_AVX512 "add AVX 512 codepath" ON ${_IS_X86} OFF) +add_feature_info(AVX2 BASE64_WITH_AVX512 "add AVX512 codepath") cmake_dependent_option(BASE64_WITH_NEON32 "add NEON32 codepath" OFF _TARGET_ARCH_arm OFF) add_feature_info(NEON32 BASE64_WITH_NEON32 "add NEON32 codepath") @@ -118,6 +120,7 @@ add_library(base64 lib/arch/sse42/codec.c lib/arch/avx/codec.c lib/arch/avx2/codec.c + lib/arch/avx512/codec.c lib/arch/neon32/codec.c lib/arch/neon64/codec.c @@ -206,6 +209,7 @@ if (_TARGET_ARCH STREQUAL "x86" OR _TARGET_ARCH STREQUAL "x64") configure_codec(SSE42 __SSSE4_2__) configure_codec(AVX) configure_codec(AVX2) + configure_codec(AVX512) elseif (_TARGET_ARCH STREQUAL "arm") set(BASE64_NEON32_CFLAGS "${COMPILE_FLAGS_NEON32}" CACHE STRING "the NEON32 compile flags (for 'lib/arch/neon32/codec.c')") diff --git a/Makefile b/Makefile index 2bb01e20..8dd55388 100644 --- a/Makefile +++ b/Makefile @@ -4,6 +4,7 @@ CFLAGS += -std=c99 -O3 -Wall -Wextra -pedantic OBJCOPY ?= objcopy OBJS = \ + lib/arch/avx512/codec.o \ lib/arch/avx2/codec.o \ lib/arch/generic/codec.o \ lib/arch/neon32/codec.o \ @@ -16,6 +17,7 @@ OBJS = \ lib/codec_choose.o \ lib/tables/tables.o +HAVE_AVX512 = 0 HAVE_AVX2 = 0 HAVE_NEON32 = 0 HAVE_NEON64 = 0 @@ -26,6 +28,9 @@ HAVE_AVX = 0 # The user should supply compiler flags for the codecs they want to build. # Check which codecs we're going to include: +ifdef AVX512_CFLAGS + HAVE_AVX512 = 1 +endif ifdef AVX2_CFLAGS HAVE_AVX2 = 1 endif @@ -64,7 +69,8 @@ lib/libbase64.o: $(OBJS) $(OBJCOPY) --keep-global-symbols=lib/exports.txt $@ lib/config.h: - @echo "#define HAVE_AVX2 $(HAVE_AVX2)" > $@ + @echo "#define HAVE_AVX512 $(HAVE_AVX512)" > $@ + @echo "#define HAVE_AVX2 $(HAVE_AVX2)" >> $@ @echo "#define HAVE_NEON32 $(HAVE_NEON32)" >> $@ @echo "#define HAVE_NEON64 $(HAVE_NEON64)" >> $@ @echo "#define HAVE_SSSE3 $(HAVE_SSSE3)" >> $@ @@ -75,6 +81,7 @@ lib/config.h: $(OBJS): lib/config.h $(OBJS): CFLAGS += -Ilib +lib/arch/avx512/codec.o: CFLAGS += $(AVX512_CFLAGS) lib/arch/avx2/codec.o: CFLAGS += $(AVX2_CFLAGS) lib/arch/neon32/codec.o: CFLAGS += $(NEON32_CFLAGS) lib/arch/neon64/codec.o: CFLAGS += $(NEON64_CFLAGS) diff --git a/cmake/Modules/TargetSIMDInstructionSet.cmake b/cmake/Modules/TargetSIMDInstructionSet.cmake index ba1f6e51..48508090 100644 --- a/cmake/Modules/TargetSIMDInstructionSet.cmake +++ b/cmake/Modules/TargetSIMDInstructionSet.cmake @@ -21,6 +21,7 @@ macro(define_SIMD_compile_flags) set(COMPILE_FLAGS_SSE42 "-msse4.2") set(COMPILE_FLAGS_AVX "-mavx") set(COMPILE_FLAGS_AVX2 "-mavx2") + set(COMPILE_FLAGS_AVX512 "-mavx512vl -mavx512vbmi") #arm set(COMPILE_FLAGS_NEON32 "-mfpu=neon") @@ -30,5 +31,6 @@ macro(define_SIMD_compile_flags) set(COMPILE_FLAGS_SSE42 " ") set(COMPILE_FLAGS_AVX "/arch:AVX") set(COMPILE_FLAGS_AVX2 "/arch:AVX2") + set(COMPILE_FLAGS_AVX512 "/arch:AVX512") endif() endmacro(define_SIMD_compile_flags) diff --git a/cmake/config.h.in b/cmake/config.h.in index 8530d1e1..c7faa94b 100644 --- a/cmake/config.h.in +++ b/cmake/config.h.in @@ -16,6 +16,9 @@ #cmakedefine01 BASE64_WITH_AVX2 #define HAVE_AVX2 BASE64_WITH_AVX2 +#cmakedefine01 BASE64_WITH_AVX512 +#define HAVE_AVX512 BASE64_WITH_AVX512 + #cmakedefine01 BASE64_WITH_NEON32 #define HAVE_NEON32 BASE64_WITH_NEON32 diff --git a/include/libbase64.h b/include/libbase64.h index d470a82f..c5908973 100644 --- a/include/libbase64.h +++ b/include/libbase64.h @@ -53,6 +53,7 @@ extern "C" { #define BASE64_FORCE_SSE41 (1 << 5) #define BASE64_FORCE_SSE42 (1 << 6) #define BASE64_FORCE_AVX (1 << 7) +#define BASE64_FORCE_AVX512 (1 << 8) struct base64_state { int eof; diff --git a/lib/arch/avx512/chromiumbase64.c b/lib/arch/avx512/chromiumbase64.c new file mode 100644 index 00000000..8a8a417d --- /dev/null +++ b/lib/arch/avx512/chromiumbase64.c @@ -0,0 +1,410 @@ +#include "chromiumbase64.h" + +#define CHAR62 '+' +#define CHAR63 '/' +#define CHARPAD '=' +static const char e0[256] = { + 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'C', 'C', + 'C', 'C', 'D', 'D', 'D', 'D', 'E', 'E', 'E', 'E', + 'F', 'F', 'F', 'F', 'G', 'G', 'G', 'G', 'H', 'H', + 'H', 'H', 'I', 'I', 'I', 'I', 'J', 'J', 'J', 'J', + 'K', 'K', 'K', 'K', 'L', 'L', 'L', 'L', 'M', 'M', + 'M', 'M', 'N', 'N', 'N', 'N', 'O', 'O', 'O', 'O', + 'P', 'P', 'P', 'P', 'Q', 'Q', 'Q', 'Q', 'R', 'R', + 'R', 'R', 'S', 'S', 'S', 'S', 'T', 'T', 'T', 'T', + 'U', 'U', 'U', 'U', 'V', 'V', 'V', 'V', 'W', 'W', + 'W', 'W', 'X', 'X', 'X', 'X', 'Y', 'Y', 'Y', 'Y', + 'Z', 'Z', 'Z', 'Z', 'a', 'a', 'a', 'a', 'b', 'b', + 'b', 'b', 'c', 'c', 'c', 'c', 'd', 'd', 'd', 'd', + 'e', 'e', 'e', 'e', 'f', 'f', 'f', 'f', 'g', 'g', + 'g', 'g', 'h', 'h', 'h', 'h', 'i', 'i', 'i', 'i', + 'j', 'j', 'j', 'j', 'k', 'k', 'k', 'k', 'l', 'l', + 'l', 'l', 'm', 'm', 'm', 'm', 'n', 'n', 'n', 'n', + 'o', 'o', 'o', 'o', 'p', 'p', 'p', 'p', 'q', 'q', + 'q', 'q', 'r', 'r', 'r', 'r', 's', 's', 's', 's', + 't', 't', 't', 't', 'u', 'u', 'u', 'u', 'v', 'v', + 'v', 'v', 'w', 'w', 'w', 'w', 'x', 'x', 'x', 'x', + 'y', 'y', 'y', 'y', 'z', 'z', 'z', 'z', '0', '0', + '0', '0', '1', '1', '1', '1', '2', '2', '2', '2', + '3', '3', '3', '3', '4', '4', '4', '4', '5', '5', + '5', '5', '6', '6', '6', '6', '7', '7', '7', '7', + '8', '8', '8', '8', '9', '9', '9', '9', '+', '+', + '+', '+', '/', '/', '/', '/' +}; + +static const char e1[256] = { + 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', + 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', + 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', + 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', + 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', + 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', + 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', + 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', + 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', + 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', + 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', + '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', + 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', + 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', + 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', + 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', + 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', + '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', + 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', + 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', + 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', + 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', + 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', + '6', '7', '8', '9', '+', '/' +}; + +static const char e2[256] = { + 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', + 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', + 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', + 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', + 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', + 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', + 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', + 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', + 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', + 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', + 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', + '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', + 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', + 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', + 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', + 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', + 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', + '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', + 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', + 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', + 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', + 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', + 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', + '6', '7', '8', '9', '+', '/' +}; + + + +/* SPECIAL DECODE TABLES FOR LITTLE ENDIAN (INTEL) CPUS */ + +static const uint32_t d0[256] = { +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x000000f8, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x000000fc, +0x000000d0, 0x000000d4, 0x000000d8, 0x000000dc, 0x000000e0, 0x000000e4, +0x000000e8, 0x000000ec, 0x000000f0, 0x000000f4, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, +0x00000004, 0x00000008, 0x0000000c, 0x00000010, 0x00000014, 0x00000018, +0x0000001c, 0x00000020, 0x00000024, 0x00000028, 0x0000002c, 0x00000030, +0x00000034, 0x00000038, 0x0000003c, 0x00000040, 0x00000044, 0x00000048, +0x0000004c, 0x00000050, 0x00000054, 0x00000058, 0x0000005c, 0x00000060, +0x00000064, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x00000068, 0x0000006c, 0x00000070, 0x00000074, 0x00000078, +0x0000007c, 0x00000080, 0x00000084, 0x00000088, 0x0000008c, 0x00000090, +0x00000094, 0x00000098, 0x0000009c, 0x000000a0, 0x000000a4, 0x000000a8, +0x000000ac, 0x000000b0, 0x000000b4, 0x000000b8, 0x000000bc, 0x000000c0, +0x000000c4, 0x000000c8, 0x000000cc, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff +}; + + +static const uint32_t d1[256] = { +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x0000e003, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x0000f003, +0x00004003, 0x00005003, 0x00006003, 0x00007003, 0x00008003, 0x00009003, +0x0000a003, 0x0000b003, 0x0000c003, 0x0000d003, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, +0x00001000, 0x00002000, 0x00003000, 0x00004000, 0x00005000, 0x00006000, +0x00007000, 0x00008000, 0x00009000, 0x0000a000, 0x0000b000, 0x0000c000, +0x0000d000, 0x0000e000, 0x0000f000, 0x00000001, 0x00001001, 0x00002001, +0x00003001, 0x00004001, 0x00005001, 0x00006001, 0x00007001, 0x00008001, +0x00009001, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x0000a001, 0x0000b001, 0x0000c001, 0x0000d001, 0x0000e001, +0x0000f001, 0x00000002, 0x00001002, 0x00002002, 0x00003002, 0x00004002, +0x00005002, 0x00006002, 0x00007002, 0x00008002, 0x00009002, 0x0000a002, +0x0000b002, 0x0000c002, 0x0000d002, 0x0000e002, 0x0000f002, 0x00000003, +0x00001003, 0x00002003, 0x00003003, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff +}; + + +static const uint32_t d2[256] = { +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x00800f00, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00c00f00, +0x00000d00, 0x00400d00, 0x00800d00, 0x00c00d00, 0x00000e00, 0x00400e00, +0x00800e00, 0x00c00e00, 0x00000f00, 0x00400f00, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, +0x00400000, 0x00800000, 0x00c00000, 0x00000100, 0x00400100, 0x00800100, +0x00c00100, 0x00000200, 0x00400200, 0x00800200, 0x00c00200, 0x00000300, +0x00400300, 0x00800300, 0x00c00300, 0x00000400, 0x00400400, 0x00800400, +0x00c00400, 0x00000500, 0x00400500, 0x00800500, 0x00c00500, 0x00000600, +0x00400600, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x00800600, 0x00c00600, 0x00000700, 0x00400700, 0x00800700, +0x00c00700, 0x00000800, 0x00400800, 0x00800800, 0x00c00800, 0x00000900, +0x00400900, 0x00800900, 0x00c00900, 0x00000a00, 0x00400a00, 0x00800a00, +0x00c00a00, 0x00000b00, 0x00400b00, 0x00800b00, 0x00c00b00, 0x00000c00, +0x00400c00, 0x00800c00, 0x00c00c00, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff +}; + + +static const uint32_t d3[256] = { +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x003e0000, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x003f0000, +0x00340000, 0x00350000, 0x00360000, 0x00370000, 0x00380000, 0x00390000, +0x003a0000, 0x003b0000, 0x003c0000, 0x003d0000, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, +0x00010000, 0x00020000, 0x00030000, 0x00040000, 0x00050000, 0x00060000, +0x00070000, 0x00080000, 0x00090000, 0x000a0000, 0x000b0000, 0x000c0000, +0x000d0000, 0x000e0000, 0x000f0000, 0x00100000, 0x00110000, 0x00120000, +0x00130000, 0x00140000, 0x00150000, 0x00160000, 0x00170000, 0x00180000, +0x00190000, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x001a0000, 0x001b0000, 0x001c0000, 0x001d0000, 0x001e0000, +0x001f0000, 0x00200000, 0x00210000, 0x00220000, 0x00230000, 0x00240000, +0x00250000, 0x00260000, 0x00270000, 0x00280000, 0x00290000, 0x002a0000, +0x002b0000, 0x002c0000, 0x002d0000, 0x002e0000, 0x002f0000, 0x00300000, +0x00310000, 0x00320000, 0x00330000, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, +0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff +}; + + + +#define BADCHAR 0x01FFFFFF + +/** + * you can control if we use padding by commenting out this + * next line. However, I highly recommend you use padding and not + * using it should only be for compatability with a 3rd party. + * Also, 'no padding' is not tested! + */ +#define DOPAD 1 + +/* + * if we aren't doing padding + * set the pad character to NULL + */ +#ifndef DOPAD +#undef CHARPAD +#define CHARPAD '\0' +#endif + +size_t chromium_base64_encode(char* dest, const char* str, size_t len) +{ + size_t i = 0; + uint8_t* p = (uint8_t*) dest; + + /* unsigned here is important! */ + uint8_t t1, t2, t3; + + if (len > 2) { + for (; i < len - 2; i += 3) { + t1 = str[i]; t2 = str[i+1]; t3 = str[i+2]; + *p++ = e0[t1]; + *p++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; + *p++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)]; + *p++ = e2[t3]; + } + } + + switch (len - i) { + case 0: + break; + case 1: + t1 = str[i]; + *p++ = e0[t1]; + *p++ = e1[(t1 & 0x03) << 4]; + *p++ = CHARPAD; + *p++ = CHARPAD; + break; + default: /* case 2 */ + t1 = str[i]; t2 = str[i+1]; + *p++ = e0[t1]; + *p++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; + *p++ = e2[(t2 & 0x0F) << 2]; + *p++ = CHARPAD; + } + + *p = '\0'; + return p - (uint8_t*)dest; +} + + +size_t chromium_base64_decode(char* dest, const char* src, size_t len) +{ + if (len == 0) return 0; + +#ifdef DOPAD + /* + * if padding is used, then the message must be at least + * 4 chars and be a multiple of 4 + */ + if (len < 4 || (len % 4 != 0)) { + return MODP_B64_ERROR; /* error */ + } + /* there can be at most 2 pad chars at the end */ + if (src[len-1] == CHARPAD) { + len--; + if (src[len -1] == CHARPAD) { + len--; + } + } +#endif + + size_t i; + int leftover = len % 4; + size_t chunks = (leftover == 0) ? len / 4 - 1 : len /4; + + uint8_t* p = (uint8_t*)dest; + uint32_t x = 0; + const uint8_t* y = (uint8_t*)src; + for (i = 0; i < chunks; ++i, y += 4) { + x = d0[y[0]] | d1[y[1]] | d2[y[2]] | d3[y[3]]; + if (x >= BADCHAR) return MODP_B64_ERROR; + *p++ = ((uint8_t*)(&x))[0]; + *p++ = ((uint8_t*)(&x))[1]; + *p++ = ((uint8_t*)(&x))[2]; + } + + switch (leftover) { + case 0: + x = d0[y[0]] | d1[y[1]] | d2[y[2]] | d3[y[3]]; + + if (x >= BADCHAR) return MODP_B64_ERROR; + *p++ = ((uint8_t*)(&x))[0]; + *p++ = ((uint8_t*)(&x))[1]; + *p = ((uint8_t*)(&x))[2]; + return (chunks+1)*3; + break; + case 1: /* with padding this is an impossible case */ + x = d0[y[0]]; + *p = *((uint8_t*)(&x)); // i.e. first char/byte in int + break; + case 2: // * case 2, 1 output byte */ + x = d0[y[0]] | d1[y[1]]; + *p = *((uint8_t*)(&x)); // i.e. first char + break; + default: /* case 3, 2 output bytes */ + x = d0[y[0]] | d1[y[1]] | d2[y[2]]; /* 0x3c */ + *p++ = ((uint8_t*)(&x))[0]; + *p = ((uint8_t*)(&x))[1]; + break; + } + + if (x >= BADCHAR) return MODP_B64_ERROR; + + return 3*chunks + (6*leftover)/8; +} \ No newline at end of file diff --git a/lib/arch/avx512/chromiumbase64.h b/lib/arch/avx512/chromiumbase64.h new file mode 100644 index 00000000..d6a38ae7 --- /dev/null +++ b/lib/arch/avx512/chromiumbase64.h @@ -0,0 +1,168 @@ +/*************** +* Taken more or less as-is from the chromium project +****************/ + + + +/** + * \file + *
+ * High performance base64 encoder / decoder
+ * Version 1.3 -- 17-Mar-2006
+ *
+ * Copyright © 2005, 2006, Nick Galbreath -- nickg [at] modp [dot] com
+ * All rights reserved.
+ *
+ * http://modp.com/release/base64
+ *
+ * Released under bsd license.  See modp_b64.c for details.
+ * 
+ * + * The default implementation is the standard b64 encoding with padding. + * It's easy to change this to use "URL safe" characters and to remove + * padding. See the modp_b64.c source code for details. + * + */ + +#ifndef MODP_B64 +#define MODP_B64 + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define MODP_B64_ERROR ((size_t)-1) +/** + * Encode a raw binary string into base 64. + * src contains the bytes + * len contains the number of bytes in the src + * dest should be allocated by the caller to contain + * at least chromium_base64_encode_len(len) bytes (see below) + * This will contain the null-terminated b64 encoded result + * returns length of the destination string plus the ending null byte + * i.e. the result will be equal to strlen(dest) + 1 + * + * Example + * + * \code + * char* src = ...; + * int srclen = ...; //the length of number of bytes in src + * char* dest = (char*) malloc(chromium_base64_decode_len(srclen)); + * int len = chromium_base64_encode(dest, src, sourcelen); + * if (len == MODP_B64_ERROR) { + * printf("Error\n"); + * } else { + * printf("b64 = %s\n", dest); + * } + * \endcode + * + */ +size_t chromium_base64_encode(char* dest, const char* str, size_t len); + +/** + * Decode a base64 encoded string + * + * + * src should contain exactly len bytes of b64 characters. + * if src contains -any- non-base characters (such as white + * space, MODP_B64_ERROR is returned. + * + * dest should be allocated by the caller to contain at least + * len * 3 / 4 bytes. + * + * Returns the length (strlen) of the output, or MODP_B64_ERROR if unable to + * decode + * + * \code + * char* src = ...; + * int srclen = ...; // or if you don't know use strlen(src) + * char* dest = (char*) malloc(chromium_base64_encode_len(srclen)); + * int len = chromium_base64_decode(dest, src, sourcelen); + * if (len == MODP_B64_ERROR) { error } + * \endcode + */ +size_t chromium_base64_decode(char* dest, const char* src, size_t len); + +/** + * Given a source string of length len, this returns the amount of + * memory the destination string should have. + * + * remember, this is integer math + * 3 bytes turn into 4 chars + * ceiling[len / 3] * 4 + 1 + * + * +1 is for any extra null. + */ +#define chromium_base64_encode_len(A) ((A+2)/3 * 4 + 1) + +/** + * Given a base64 string of length len, + * this returns the amount of memory required for output string + * It maybe be more than the actual number of bytes written. + * NOTE: remember this is integer math + * this allocates a bit more memory than traditional versions of b64 + * decode 4 chars turn into 3 bytes + * floor[len * 3/4] + 2 + */ +#define chromium_base64_decode_len(A) (A / 4 * 3 + 2) + +/** + * Will return the strlen of the output from encoding. + * This may be less than the required number of bytes allocated. + * + * This allows you to 'deserialized' a struct + * \code + * char* b64encoded = "..."; + * int len = strlen(b64encoded); + * + * struct datastuff foo; + * if (chromium_base64_encode_strlen(sizeof(struct datastuff)) != len) { + * // wrong size + * return false; + * } else { + * // safe to do; + * if (chromium_base64_encode((char*) &foo, b64encoded, len) == MODP_B64_ERROR) { + * // bad characters + * return false; + * } + * } + * // foo is filled out now + * \endcode + */ +#define chromium_base64_encode_strlen(A) ((A + 2)/ 3 * 4) + + + +#ifdef __cplusplus +} + +#include + + +/** + * base 64 decode a string (self-modifing) + * On failure, the string is empty. + * + * This function is for C++ only (duh) + * + * \param[in,out] s the string to be decoded + * \return a reference to the input string + */ +inline std::string& chromium_base64_encode(std::string& s) +{ + std::string x(chromium_base64_encode_len(s.size()), '\0'); + size_t d = chromium_base64_encode(const_cast(x.data()), s.data(), (int)s.size()); + if (d == MODP_B64_ERROR) { + x.clear(); + } else { + x.erase(d, std::string::npos); + } + s.swap(x); + return s; +} + +#endif /* __cplusplus */ +#endif \ No newline at end of file diff --git a/lib/arch/avx512/codec.c b/lib/arch/avx512/codec.c new file mode 100644 index 00000000..ef5ea5f7 --- /dev/null +++ b/lib/arch/avx512/codec.c @@ -0,0 +1,40 @@ +#include +#include +#include + +#include "../../../include/libbase64.h" +#include "../../tables/tables.h" +#include "../../codecs.h" +#include "config.h" +#include "../../env.h" + +#if HAVE_AVX512 +#include + +#include "enc_loop.c" +#include "chromiumbase64.c" +#include "../avx2/dec_reshuffle.c" +#include "../avx2/dec_loop.c" + +#endif // HAVE_AVX512 + +BASE64_ENC_FUNCTION(avx512) +{ +#if HAVE_AVX512 + enc_loop_avx512(src, srclen, out, outlen); +#else + BASE64_ENC_STUB +#endif +} + +BASE64_DEC_FUNCTION(avx512) +{ +// avx512 decode is not implemented yet, reuse avx2 version +#if HAVE_AVX512 + #include "../generic/dec_head.c" + dec_loop_avx2(&s, &slen, &o, &olen); + #include "../generic/dec_tail.c" +#else + BASE64_DEC_STUB +#endif +} \ No newline at end of file diff --git a/lib/arch/avx512/enc_loop.c b/lib/arch/avx512/enc_loop.c new file mode 100644 index 00000000..9024037c --- /dev/null +++ b/lib/arch/avx512/enc_loop.c @@ -0,0 +1,74 @@ +#include +#include "chromiumbase64.h" + +static inline void +enc_loop_avx512 (const char* src, size_t slen, char* dst, size_t* dlen) { + size_t dlen_encoded = 0; + // compliant with current simd base64 impl which only supports NORMAL mode + static char base64_table[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; + const char* lookup_tbl = base64_table; + // 32-bit input + // [ 0 0 0 0 0 0 0 0|c1 c0 d5 d4 d3 d2 d1 d0| + // b3 b2 b1 b0 c5 c4 c3 c2|a5 a4 a3 a2 a1 a0 b5 b4] + // output order [1, 2, 0, 1] + // [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0| + // a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0] + + const __m512i shuffle_input = _mm512_setr_epi32(0x01020001, + 0x04050304, + 0x07080607, + 0x0a0b090a, + 0x0d0e0c0d, + 0x10110f10, + 0x13141213, + 0x16171516, + 0x191a1819, + 0x1c1d1b1c, + 0x1f201e1f, + 0x22232122, + 0x25262425, + 0x28292728, + 0x2b2c2a2b, + 0x2e2f2d2e); + const __m512i lookup = _mm512_loadu_si512(lookup_tbl); + + while (slen >= 64) { + const __m512i v = _mm512_loadu_si512(src); + + // Reorder bytes + // [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0| + // a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0] + const __m512i in = _mm512_permutexvar_epi8(shuffle_input, v); + + // After multishift a single 32-bit lane has following layout + // [c1 c0 d5 d4 d3 d2 d1 d0|b1 b0 c5 c4 c3 c2 c1 c0| + // a1 a0 b5 b4 b3 b2 b1 b0|d1 d0 a5 a4 a3 a2 a1 a0] + // (a = [10:17], b = [4:11], c = [22:27], d = [16:21]) + + // 48, 54, 36, 42, 16, 22, 4, 10 + const __m512i shifts = _mm512_set1_epi64(0x3036242a1016040alu); + const __m512i indices = _mm512_multishift_epi64_epi8(shifts, in); + + // Note: the two higher bits of each indices' byte have garbage + // but the following permutexvar instruction masks them out + + // Translation 6-bit values to ASCII. + const __m512i result = _mm512_permutexvar_epi8(indices, lookup); + + _mm512_storeu_si512(dst, result); + + dlen_encoded +=64; + dst += 64; + src += 48; + slen -= 48; + } + + // Fallback to a fast Base64 encoding used in Chromium project + if (slen > 0) { + dlen_encoded += chromium_base64_encode(dst, src, slen); + } + + *dlen = dlen_encoded; +} \ No newline at end of file diff --git a/lib/codec_choose.c b/lib/codec_choose.c index 6a07d6a7..a1b2c187 100644 --- a/lib/codec_choose.c +++ b/lib/codec_choose.c @@ -2,6 +2,7 @@ #include #include #include +#include #include "../include/libbase64.h" #include "codecs.h" @@ -10,7 +11,7 @@ #if (__x86_64__ || __i386__ || _M_X86 || _M_X64) #define BASE64_X86 - #if (HAVE_SSSE3 || HAVE_SSE41 || HAVE_SSE42 || HAVE_AVX || HAVE_AVX2) + #if (HAVE_SSSE3 || HAVE_SSE41 || HAVE_SSE42 || HAVE_AVX || HAVE_AVX2 || HAVE_AVX512) #define BASE64_X86_SIMD #endif #endif @@ -31,7 +32,7 @@ __cpuid_count(__level, 0, __eax, __ebx, __ecx, __edx) #else #include - #if HAVE_AVX2 || HAVE_AVX + #if HAVE_AVX512 || HAVE_AVX2 || HAVE_AVX #if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3)) static inline uint64_t _xgetbv (uint32_t index) { @@ -45,6 +46,10 @@ #endif #endif +#ifndef bit_AVX512 +#define bit_AVX512vl (1 << 31) +#define bit_AVX512vbmi (1 << 1) +#endif #ifndef bit_AVX2 #define bit_AVX2 (1 << 5) #endif @@ -75,6 +80,7 @@ BASE64_ENC_FUNCTION(arch); \ BASE64_DEC_FUNCTION(arch); \ +BASE64_CODEC_FUNCS(avx512) BASE64_CODEC_FUNCS(avx2) BASE64_CODEC_FUNCS(neon32) BASE64_CODEC_FUNCS(neon64) @@ -91,9 +97,10 @@ codec_choose_forced (struct codec *codec, int flags) // always allow it, even if the codec is a no-op. // For testing purposes. - if (!(flags & 0xFF)) { + if (!(flags & 0xFFFF)) { return false; } + if (flags & BASE64_FORCE_AVX2) { codec->enc = base64_stream_encode_avx2; codec->dec = base64_stream_decode_avx2; @@ -134,6 +141,11 @@ codec_choose_forced (struct codec *codec, int flags) codec->dec = base64_stream_decode_avx; return true; } + if (flags & BASE64_FORCE_AVX512) { + codec->enc = base64_stream_encode_avx512; + codec->dec = base64_stream_decode_avx512; + return true; + } return false; } @@ -178,8 +190,8 @@ codec_choose_x86 (struct codec *codec) max_level = __get_cpuid_max(0, NULL); #endif - #if HAVE_AVX2 || HAVE_AVX - // Check for AVX/AVX2 support: + #if HAVE_AVX512 || HAVE_AVX2 || HAVE_AVX + // Check for AVX/AVX2/AVX512 support: // Checking for AVX requires 3 things: // 1) CPUID indicates that the OS uses XSAVE and XRSTORE instructions // (allowing saving YMM registers on context switch) @@ -195,6 +207,16 @@ codec_choose_x86 (struct codec *codec) uint64_t xcr_mask; xcr_mask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); if (xcr_mask & _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS) { + #if HAVE_AVX512 + if (max_level >= 7) { + __cpuid_count(7, 0, eax, ebx, ecx, edx); + if (ebx & bit_AVX512vl && ecx & bit_AVX512VBMI) { + codec->enc = base64_stream_encode_avx512; + codec->dec = base64_stream_decode_avx512; + return true; + } + } + #endif #if HAVE_AVX2 if (max_level >= 7) { __cpuid_count(7, 0, eax, ebx, ecx, edx); diff --git a/lib/lib.c b/lib/lib.c index 4703512b..053931a9 100644 --- a/lib/lib.c +++ b/lib/lib.c @@ -68,7 +68,7 @@ void base64_stream_decode_init (struct base64_state *state, int flags) { // If any of the codec flags are set, redo choice: - if (codec.dec == NULL || flags & 0xFF) { + if (codec.dec == NULL || flags & 0xFFFF) { codec_choose(&codec, flags); } state->eof = 0; diff --git a/test/ci/test.sh b/test/ci/test.sh index 066a49f4..803cd1bd 100755 --- a/test/ci/test.sh +++ b/test/ci/test.sh @@ -9,7 +9,8 @@ if [ "${MACHINE}" == "x86_64" ]; then export AVX_CFLAGS=-mavx # no AVX2 on GHA macOS if [ "$(uname -s)" != "Darwin" ]; then - export AVX2_CFLAGS=-mavx2 + export AVX2_CFLAGS=-mavx2 + export AVX512_CFLAGS="-mavx512vl -mavx512vbmi" fi elif [ "${MACHINE}" == "aarch64" ]; then export NEON64_CFLAGS="-march=armv8-a" @@ -22,7 +23,7 @@ if [ "${OPENMP:-}" == "0" ]; then fi uname -a -${CC} --version +gcc --version make make -C test diff --git a/test/codec_supported.c b/test/codec_supported.c index a027b994..f68c7668 100644 --- a/test/codec_supported.c +++ b/test/codec_supported.c @@ -11,6 +11,7 @@ static char *_codecs[] = , "SSE41" , "SSE42" , "AVX" +, "AVX512" , NULL } ; From 8731e4720508e1442e3a991a95e66309059397aa Mon Sep 17 00:00:00 2001 From: lucshi Date: Mon, 26 Sep 2022 17:24:50 +0800 Subject: [PATCH 2/4] fix bug --- lib/arch/ssse3/enc_translate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/arch/ssse3/enc_translate.c b/lib/arch/ssse3/enc_translate.c index 04f288fc..175777e0 100644 --- a/lib/arch/ssse3/enc_translate.c +++ b/lib/arch/ssse3/enc_translate.c @@ -30,4 +30,4 @@ enc_translate (const __m128i in) // Add offsets to input values: return _mm_add_epi8(in, _mm_shuffle_epi8(lut, indices)); -} +} \ No newline at end of file From ad528a9bc133263232ed74bb5c176e648595fd40 Mon Sep 17 00:00:00 2001 From: lucshi Date: Thu, 29 Sep 2022 15:39:10 +0800 Subject: [PATCH 3/4] updated the AVX512 encoding code and README, removed the Chromium code --- README.md | 23 +- lib/arch/avx512/chromiumbase64.c | 410 ---------------------- lib/arch/avx512/chromiumbase64.h | 168 --------- lib/arch/avx512/codec.c | 18 +- lib/arch/avx512/dec_loop.c | 110 ++++++ lib/arch/avx512/dec_reshuffle.c | 34 ++ lib/arch/avx512/enc_loop.c | 125 +++---- lib/arch/avx512/enc_reshuffle_translate.c | 56 +++ test/ci/test.sh | 4 +- 9 files changed, 288 insertions(+), 660 deletions(-) delete mode 100644 lib/arch/avx512/chromiumbase64.c delete mode 100644 lib/arch/avx512/chromiumbase64.h create mode 100644 lib/arch/avx512/dec_loop.c create mode 100644 lib/arch/avx512/dec_reshuffle.c create mode 100644 lib/arch/avx512/enc_reshuffle_translate.c diff --git a/README.md b/README.md index b953c324..a99ef540 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![Build Status](https://github.com/aklomp/base64/actions/workflows/test.yml/badge.svg)](https://github.com/aklomp/base64/actions/workflows/test.yml) This is an implementation of a base64 stream encoding/decoding library in C99 -with SIMD (AVX2, NEON, AArch64/NEON, SSSE3, SSE4.1, SSE4.2, AVX) and +with SIMD (AVX2, AVX512, NEON, AArch64/NEON, SSSE3, SSE4.1, SSE4.2, AVX) and [OpenMP](http://www.openmp.org) acceleration. It also contains wrapper functions to encode/decode simple length-delimited strings. This library aims to be: @@ -19,6 +19,10 @@ will pick an optimized codec that lets it encode/decode 12 or 24 bytes at a time, which gives a speedup of four or more times compared to the "plain" bytewise codec. +AVX512 support is only for encoding at present, utilizing the AVX512 VL and VBMI +instructions. Decoding part reused AVX2 implementations. For CPUs later than +Cannonlake (manufactured in 2018) supports these instructions. + NEON support is hardcoded to on or off at compile time, because portable runtime feature detection is unavailable on ARM. @@ -59,6 +63,9 @@ optimizations described by Wojciech Muła in a [articles](http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html). His own code is [here](https://github.com/WojciechMula/toys/tree/master/base64). +The AVX512 encoder code is also referenced from the project of Wojciech Muła and +the project code is [here](https://github.com/WojciechMula/base64-avx512) + The OpenMP implementation was added by Ferry Toth (@htot) from [Exalon Delft](http://www.exalondelft.nl). ## Building @@ -76,8 +83,8 @@ To compile just the "plain" library without SIMD codecs, type: make lib/libbase64.o ``` -Optional SIMD codecs can be included by specifying the `AVX2_CFLAGS`, `NEON32_CFLAGS`, `NEON64_CFLAGS`, -`SSSE3_CFLAGS`, `SSE41_CFLAGS`, `SSE42_CFLAGS` and/or `AVX_CFLAGS` environment variables. +Optional SIMD codecs can be included by specifying the `AVX2_CFLAGS`, `AVX512_CFLAGS`, +`NEON32_CFLAGS`, `NEON64_CFLAGS`, `SSSE3_CFLAGS`, `SSE41_CFLAGS`, `SSE42_CFLAGS` and/or `AVX_CFLAGS` environment variables. A typical build invocation on x86 looks like this: ```sh @@ -93,6 +100,15 @@ Example: AVX2_CFLAGS=-mavx2 make ``` +### AVX512 + +To build and include the AVX512 codec, set the `AVX512_CFLAGS` environment variable to a value that will turn on AVX512 support in your compiler, typically `-mavx512vl -mavx512vbmi`. +Example: + +```sh +AVX512_CFLAGS="-mavx512vl -mavx512vbmi" make +``` + The codec will only be used if runtime feature detection shows that the target machine supports AVX2. ### SSSE3 @@ -208,6 +224,7 @@ Mainly there for testing purposes, this is also useful on ARM where the only way The following constants can be used: - `BASE64_FORCE_AVX2` +- `BASE64_FORCE_AVX512` - `BASE64_FORCE_NEON32` - `BASE64_FORCE_NEON64` - `BASE64_FORCE_PLAIN` diff --git a/lib/arch/avx512/chromiumbase64.c b/lib/arch/avx512/chromiumbase64.c deleted file mode 100644 index 8a8a417d..00000000 --- a/lib/arch/avx512/chromiumbase64.c +++ /dev/null @@ -1,410 +0,0 @@ -#include "chromiumbase64.h" - -#define CHAR62 '+' -#define CHAR63 '/' -#define CHARPAD '=' -static const char e0[256] = { - 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'C', 'C', - 'C', 'C', 'D', 'D', 'D', 'D', 'E', 'E', 'E', 'E', - 'F', 'F', 'F', 'F', 'G', 'G', 'G', 'G', 'H', 'H', - 'H', 'H', 'I', 'I', 'I', 'I', 'J', 'J', 'J', 'J', - 'K', 'K', 'K', 'K', 'L', 'L', 'L', 'L', 'M', 'M', - 'M', 'M', 'N', 'N', 'N', 'N', 'O', 'O', 'O', 'O', - 'P', 'P', 'P', 'P', 'Q', 'Q', 'Q', 'Q', 'R', 'R', - 'R', 'R', 'S', 'S', 'S', 'S', 'T', 'T', 'T', 'T', - 'U', 'U', 'U', 'U', 'V', 'V', 'V', 'V', 'W', 'W', - 'W', 'W', 'X', 'X', 'X', 'X', 'Y', 'Y', 'Y', 'Y', - 'Z', 'Z', 'Z', 'Z', 'a', 'a', 'a', 'a', 'b', 'b', - 'b', 'b', 'c', 'c', 'c', 'c', 'd', 'd', 'd', 'd', - 'e', 'e', 'e', 'e', 'f', 'f', 'f', 'f', 'g', 'g', - 'g', 'g', 'h', 'h', 'h', 'h', 'i', 'i', 'i', 'i', - 'j', 'j', 'j', 'j', 'k', 'k', 'k', 'k', 'l', 'l', - 'l', 'l', 'm', 'm', 'm', 'm', 'n', 'n', 'n', 'n', - 'o', 'o', 'o', 'o', 'p', 'p', 'p', 'p', 'q', 'q', - 'q', 'q', 'r', 'r', 'r', 'r', 's', 's', 's', 's', - 't', 't', 't', 't', 'u', 'u', 'u', 'u', 'v', 'v', - 'v', 'v', 'w', 'w', 'w', 'w', 'x', 'x', 'x', 'x', - 'y', 'y', 'y', 'y', 'z', 'z', 'z', 'z', '0', '0', - '0', '0', '1', '1', '1', '1', '2', '2', '2', '2', - '3', '3', '3', '3', '4', '4', '4', '4', '5', '5', - '5', '5', '6', '6', '6', '6', '7', '7', '7', '7', - '8', '8', '8', '8', '9', '9', '9', '9', '+', '+', - '+', '+', '/', '/', '/', '/' -}; - -static const char e1[256] = { - 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', - 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', - 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', - 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', - 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', - 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', - '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', - 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', - 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', - 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', - 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', - 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', - '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', - 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', - 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', - 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', - 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', - 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', - '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', - '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', - 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', - 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', - 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', - 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', - 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', - '6', '7', '8', '9', '+', '/' -}; - -static const char e2[256] = { - 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', - 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', - 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', - 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', - 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', - 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', - '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', - 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', - 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', - 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', - 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', - 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', - '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', - 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', - 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', - 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', - 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', - 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', - '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', - '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', - 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', - 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', - 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', - 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', - 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', - '6', '7', '8', '9', '+', '/' -}; - - - -/* SPECIAL DECODE TABLES FOR LITTLE ENDIAN (INTEL) CPUS */ - -static const uint32_t d0[256] = { -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x000000f8, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x000000fc, -0x000000d0, 0x000000d4, 0x000000d8, 0x000000dc, 0x000000e0, 0x000000e4, -0x000000e8, 0x000000ec, 0x000000f0, 0x000000f4, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, -0x00000004, 0x00000008, 0x0000000c, 0x00000010, 0x00000014, 0x00000018, -0x0000001c, 0x00000020, 0x00000024, 0x00000028, 0x0000002c, 0x00000030, -0x00000034, 0x00000038, 0x0000003c, 0x00000040, 0x00000044, 0x00000048, -0x0000004c, 0x00000050, 0x00000054, 0x00000058, 0x0000005c, 0x00000060, -0x00000064, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x00000068, 0x0000006c, 0x00000070, 0x00000074, 0x00000078, -0x0000007c, 0x00000080, 0x00000084, 0x00000088, 0x0000008c, 0x00000090, -0x00000094, 0x00000098, 0x0000009c, 0x000000a0, 0x000000a4, 0x000000a8, -0x000000ac, 0x000000b0, 0x000000b4, 0x000000b8, 0x000000bc, 0x000000c0, -0x000000c4, 0x000000c8, 0x000000cc, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff -}; - - -static const uint32_t d1[256] = { -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x0000e003, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x0000f003, -0x00004003, 0x00005003, 0x00006003, 0x00007003, 0x00008003, 0x00009003, -0x0000a003, 0x0000b003, 0x0000c003, 0x0000d003, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, -0x00001000, 0x00002000, 0x00003000, 0x00004000, 0x00005000, 0x00006000, -0x00007000, 0x00008000, 0x00009000, 0x0000a000, 0x0000b000, 0x0000c000, -0x0000d000, 0x0000e000, 0x0000f000, 0x00000001, 0x00001001, 0x00002001, -0x00003001, 0x00004001, 0x00005001, 0x00006001, 0x00007001, 0x00008001, -0x00009001, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x0000a001, 0x0000b001, 0x0000c001, 0x0000d001, 0x0000e001, -0x0000f001, 0x00000002, 0x00001002, 0x00002002, 0x00003002, 0x00004002, -0x00005002, 0x00006002, 0x00007002, 0x00008002, 0x00009002, 0x0000a002, -0x0000b002, 0x0000c002, 0x0000d002, 0x0000e002, 0x0000f002, 0x00000003, -0x00001003, 0x00002003, 0x00003003, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff -}; - - -static const uint32_t d2[256] = { -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x00800f00, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00c00f00, -0x00000d00, 0x00400d00, 0x00800d00, 0x00c00d00, 0x00000e00, 0x00400e00, -0x00800e00, 0x00c00e00, 0x00000f00, 0x00400f00, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, -0x00400000, 0x00800000, 0x00c00000, 0x00000100, 0x00400100, 0x00800100, -0x00c00100, 0x00000200, 0x00400200, 0x00800200, 0x00c00200, 0x00000300, -0x00400300, 0x00800300, 0x00c00300, 0x00000400, 0x00400400, 0x00800400, -0x00c00400, 0x00000500, 0x00400500, 0x00800500, 0x00c00500, 0x00000600, -0x00400600, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x00800600, 0x00c00600, 0x00000700, 0x00400700, 0x00800700, -0x00c00700, 0x00000800, 0x00400800, 0x00800800, 0x00c00800, 0x00000900, -0x00400900, 0x00800900, 0x00c00900, 0x00000a00, 0x00400a00, 0x00800a00, -0x00c00a00, 0x00000b00, 0x00400b00, 0x00800b00, 0x00c00b00, 0x00000c00, -0x00400c00, 0x00800c00, 0x00c00c00, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff -}; - - -static const uint32_t d3[256] = { -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x003e0000, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x003f0000, -0x00340000, 0x00350000, 0x00360000, 0x00370000, 0x00380000, 0x00390000, -0x003a0000, 0x003b0000, 0x003c0000, 0x003d0000, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, -0x00010000, 0x00020000, 0x00030000, 0x00040000, 0x00050000, 0x00060000, -0x00070000, 0x00080000, 0x00090000, 0x000a0000, 0x000b0000, 0x000c0000, -0x000d0000, 0x000e0000, 0x000f0000, 0x00100000, 0x00110000, 0x00120000, -0x00130000, 0x00140000, 0x00150000, 0x00160000, 0x00170000, 0x00180000, -0x00190000, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x001a0000, 0x001b0000, 0x001c0000, 0x001d0000, 0x001e0000, -0x001f0000, 0x00200000, 0x00210000, 0x00220000, 0x00230000, 0x00240000, -0x00250000, 0x00260000, 0x00270000, 0x00280000, 0x00290000, 0x002a0000, -0x002b0000, 0x002c0000, 0x002d0000, 0x002e0000, 0x002f0000, 0x00300000, -0x00310000, 0x00320000, 0x00330000, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, -0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff -}; - - - -#define BADCHAR 0x01FFFFFF - -/** - * you can control if we use padding by commenting out this - * next line. However, I highly recommend you use padding and not - * using it should only be for compatability with a 3rd party. - * Also, 'no padding' is not tested! - */ -#define DOPAD 1 - -/* - * if we aren't doing padding - * set the pad character to NULL - */ -#ifndef DOPAD -#undef CHARPAD -#define CHARPAD '\0' -#endif - -size_t chromium_base64_encode(char* dest, const char* str, size_t len) -{ - size_t i = 0; - uint8_t* p = (uint8_t*) dest; - - /* unsigned here is important! */ - uint8_t t1, t2, t3; - - if (len > 2) { - for (; i < len - 2; i += 3) { - t1 = str[i]; t2 = str[i+1]; t3 = str[i+2]; - *p++ = e0[t1]; - *p++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; - *p++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)]; - *p++ = e2[t3]; - } - } - - switch (len - i) { - case 0: - break; - case 1: - t1 = str[i]; - *p++ = e0[t1]; - *p++ = e1[(t1 & 0x03) << 4]; - *p++ = CHARPAD; - *p++ = CHARPAD; - break; - default: /* case 2 */ - t1 = str[i]; t2 = str[i+1]; - *p++ = e0[t1]; - *p++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; - *p++ = e2[(t2 & 0x0F) << 2]; - *p++ = CHARPAD; - } - - *p = '\0'; - return p - (uint8_t*)dest; -} - - -size_t chromium_base64_decode(char* dest, const char* src, size_t len) -{ - if (len == 0) return 0; - -#ifdef DOPAD - /* - * if padding is used, then the message must be at least - * 4 chars and be a multiple of 4 - */ - if (len < 4 || (len % 4 != 0)) { - return MODP_B64_ERROR; /* error */ - } - /* there can be at most 2 pad chars at the end */ - if (src[len-1] == CHARPAD) { - len--; - if (src[len -1] == CHARPAD) { - len--; - } - } -#endif - - size_t i; - int leftover = len % 4; - size_t chunks = (leftover == 0) ? len / 4 - 1 : len /4; - - uint8_t* p = (uint8_t*)dest; - uint32_t x = 0; - const uint8_t* y = (uint8_t*)src; - for (i = 0; i < chunks; ++i, y += 4) { - x = d0[y[0]] | d1[y[1]] | d2[y[2]] | d3[y[3]]; - if (x >= BADCHAR) return MODP_B64_ERROR; - *p++ = ((uint8_t*)(&x))[0]; - *p++ = ((uint8_t*)(&x))[1]; - *p++ = ((uint8_t*)(&x))[2]; - } - - switch (leftover) { - case 0: - x = d0[y[0]] | d1[y[1]] | d2[y[2]] | d3[y[3]]; - - if (x >= BADCHAR) return MODP_B64_ERROR; - *p++ = ((uint8_t*)(&x))[0]; - *p++ = ((uint8_t*)(&x))[1]; - *p = ((uint8_t*)(&x))[2]; - return (chunks+1)*3; - break; - case 1: /* with padding this is an impossible case */ - x = d0[y[0]]; - *p = *((uint8_t*)(&x)); // i.e. first char/byte in int - break; - case 2: // * case 2, 1 output byte */ - x = d0[y[0]] | d1[y[1]]; - *p = *((uint8_t*)(&x)); // i.e. first char - break; - default: /* case 3, 2 output bytes */ - x = d0[y[0]] | d1[y[1]] | d2[y[2]]; /* 0x3c */ - *p++ = ((uint8_t*)(&x))[0]; - *p = ((uint8_t*)(&x))[1]; - break; - } - - if (x >= BADCHAR) return MODP_B64_ERROR; - - return 3*chunks + (6*leftover)/8; -} \ No newline at end of file diff --git a/lib/arch/avx512/chromiumbase64.h b/lib/arch/avx512/chromiumbase64.h deleted file mode 100644 index d6a38ae7..00000000 --- a/lib/arch/avx512/chromiumbase64.h +++ /dev/null @@ -1,168 +0,0 @@ -/*************** -* Taken more or less as-is from the chromium project -****************/ - - - -/** - * \file - *
- * High performance base64 encoder / decoder
- * Version 1.3 -- 17-Mar-2006
- *
- * Copyright © 2005, 2006, Nick Galbreath -- nickg [at] modp [dot] com
- * All rights reserved.
- *
- * http://modp.com/release/base64
- *
- * Released under bsd license.  See modp_b64.c for details.
- * 
- * - * The default implementation is the standard b64 encoding with padding. - * It's easy to change this to use "URL safe" characters and to remove - * padding. See the modp_b64.c source code for details. - * - */ - -#ifndef MODP_B64 -#define MODP_B64 - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define MODP_B64_ERROR ((size_t)-1) -/** - * Encode a raw binary string into base 64. - * src contains the bytes - * len contains the number of bytes in the src - * dest should be allocated by the caller to contain - * at least chromium_base64_encode_len(len) bytes (see below) - * This will contain the null-terminated b64 encoded result - * returns length of the destination string plus the ending null byte - * i.e. the result will be equal to strlen(dest) + 1 - * - * Example - * - * \code - * char* src = ...; - * int srclen = ...; //the length of number of bytes in src - * char* dest = (char*) malloc(chromium_base64_decode_len(srclen)); - * int len = chromium_base64_encode(dest, src, sourcelen); - * if (len == MODP_B64_ERROR) { - * printf("Error\n"); - * } else { - * printf("b64 = %s\n", dest); - * } - * \endcode - * - */ -size_t chromium_base64_encode(char* dest, const char* str, size_t len); - -/** - * Decode a base64 encoded string - * - * - * src should contain exactly len bytes of b64 characters. - * if src contains -any- non-base characters (such as white - * space, MODP_B64_ERROR is returned. - * - * dest should be allocated by the caller to contain at least - * len * 3 / 4 bytes. - * - * Returns the length (strlen) of the output, or MODP_B64_ERROR if unable to - * decode - * - * \code - * char* src = ...; - * int srclen = ...; // or if you don't know use strlen(src) - * char* dest = (char*) malloc(chromium_base64_encode_len(srclen)); - * int len = chromium_base64_decode(dest, src, sourcelen); - * if (len == MODP_B64_ERROR) { error } - * \endcode - */ -size_t chromium_base64_decode(char* dest, const char* src, size_t len); - -/** - * Given a source string of length len, this returns the amount of - * memory the destination string should have. - * - * remember, this is integer math - * 3 bytes turn into 4 chars - * ceiling[len / 3] * 4 + 1 - * - * +1 is for any extra null. - */ -#define chromium_base64_encode_len(A) ((A+2)/3 * 4 + 1) - -/** - * Given a base64 string of length len, - * this returns the amount of memory required for output string - * It maybe be more than the actual number of bytes written. - * NOTE: remember this is integer math - * this allocates a bit more memory than traditional versions of b64 - * decode 4 chars turn into 3 bytes - * floor[len * 3/4] + 2 - */ -#define chromium_base64_decode_len(A) (A / 4 * 3 + 2) - -/** - * Will return the strlen of the output from encoding. - * This may be less than the required number of bytes allocated. - * - * This allows you to 'deserialized' a struct - * \code - * char* b64encoded = "..."; - * int len = strlen(b64encoded); - * - * struct datastuff foo; - * if (chromium_base64_encode_strlen(sizeof(struct datastuff)) != len) { - * // wrong size - * return false; - * } else { - * // safe to do; - * if (chromium_base64_encode((char*) &foo, b64encoded, len) == MODP_B64_ERROR) { - * // bad characters - * return false; - * } - * } - * // foo is filled out now - * \endcode - */ -#define chromium_base64_encode_strlen(A) ((A + 2)/ 3 * 4) - - - -#ifdef __cplusplus -} - -#include - - -/** - * base 64 decode a string (self-modifing) - * On failure, the string is empty. - * - * This function is for C++ only (duh) - * - * \param[in,out] s the string to be decoded - * \return a reference to the input string - */ -inline std::string& chromium_base64_encode(std::string& s) -{ - std::string x(chromium_base64_encode_len(s.size()), '\0'); - size_t d = chromium_base64_encode(const_cast(x.data()), s.data(), (int)s.size()); - if (d == MODP_B64_ERROR) { - x.clear(); - } else { - x.erase(d, std::string::npos); - } - s.swap(x); - return s; -} - -#endif /* __cplusplus */ -#endif \ No newline at end of file diff --git a/lib/arch/avx512/codec.c b/lib/arch/avx512/codec.c index ef5ea5f7..3fd73521 100644 --- a/lib/arch/avx512/codec.c +++ b/lib/arch/avx512/codec.c @@ -11,30 +11,32 @@ #if HAVE_AVX512 #include +#include "dec_reshuffle.c" +#include "dec_loop.c" +#include "enc_reshuffle_translate.c" #include "enc_loop.c" -#include "chromiumbase64.c" -#include "../avx2/dec_reshuffle.c" -#include "../avx2/dec_loop.c" #endif // HAVE_AVX512 BASE64_ENC_FUNCTION(avx512) { -#if HAVE_AVX512 - enc_loop_avx512(src, srclen, out, outlen); +#if HAVE_AVX2 + #include "../generic/enc_head.c" + enc_loop_avx512(&s, &slen, &o, &olen); + #include "../generic/enc_tail.c" #else BASE64_ENC_STUB #endif } +// Reuse AVX2 decoding. Not supporting AVX512 at present BASE64_DEC_FUNCTION(avx512) { -// avx512 decode is not implemented yet, reuse avx2 version -#if HAVE_AVX512 +#if HAVE_AVX2 #include "../generic/dec_head.c" dec_loop_avx2(&s, &slen, &o, &olen); #include "../generic/dec_tail.c" #else BASE64_DEC_STUB #endif -} \ No newline at end of file +} diff --git a/lib/arch/avx512/dec_loop.c b/lib/arch/avx512/dec_loop.c new file mode 100644 index 00000000..f959fc4b --- /dev/null +++ b/lib/arch/avx512/dec_loop.c @@ -0,0 +1,110 @@ +static inline int +dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds) +{ + const __m256i lut_lo = _mm256_setr_epi8( + 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, + 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A, + 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, + 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A); + + const __m256i lut_hi = _mm256_setr_epi8( + 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10); + + const __m256i lut_roll = _mm256_setr_epi8( + 0, 16, 19, 4, -65, -65, -71, -71, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 16, 19, 4, -65, -65, -71, -71, + 0, 0, 0, 0, 0, 0, 0, 0); + + const __m256i mask_2F = _mm256_set1_epi8(0x2F); + + // Load input: + __m256i str = _mm256_loadu_si256((__m256i *) *s); + + // See the SSSE3 decoder for an explanation of the algorithm. + const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F); + const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F); + const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles); + const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles); + + if (!_mm256_testz_si256(lo, hi)) { + return 0; + } + + const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F); + const __m256i roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles)); + + // Now simply add the delta values to the input: + str = _mm256_add_epi8(str, roll); + + // Reshuffle the input to packed 12-byte output format: + str = dec_reshuffle(str); + + // Store the output: + _mm256_storeu_si256((__m256i *) *o, str); + + *s += 32; + *o += 24; + *rounds -= 1; + + return 1; +} + +static inline void +dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) +{ + if (*slen < 45) { + return; + } + + // Process blocks of 32 bytes per round. Because 8 extra zero bytes are + // written after the output, ensure that there will be at least 13 + // bytes of input data left to cover the gap. (11 data bytes and up to + // two end-of-string markers.) + size_t rounds = (*slen - 13) / 32; + + *slen -= rounds * 32; // 32 bytes consumed per round + *olen += rounds * 24; // 24 bytes produced per round + + do { + if (rounds >= 8) { + if (dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds)) { + continue; + } + break; + } + if (rounds >= 4) { + if (dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds)) { + continue; + } + break; + } + if (rounds >= 2) { + if (dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds)) { + continue; + } + break; + } + dec_loop_avx2_inner(s, o, &rounds); + break; + + } while (rounds > 0); + + // Adjust for any rounds that were skipped: + *slen += rounds * 32; + *olen -= rounds * 24; +} diff --git a/lib/arch/avx512/dec_reshuffle.c b/lib/arch/avx512/dec_reshuffle.c new file mode 100644 index 00000000..f3518098 --- /dev/null +++ b/lib/arch/avx512/dec_reshuffle.c @@ -0,0 +1,34 @@ +static inline __m256i +dec_reshuffle (const __m256i in) +{ + // in, lower lane, bits, upper case are most significant bits, lower + // case are least significant bits: + // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ + // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG + // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD + // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA + + const __m256i merge_ab_and_bc = _mm256_maddubs_epi16(in, _mm256_set1_epi32(0x01400140)); + // 0000kkkk LLllllll 0000JJJJ JJjjKKKK + // 0000hhhh IIiiiiii 0000GGGG GGggHHHH + // 0000eeee FFffffff 0000DDDD DDddEEEE + // 0000bbbb CCcccccc 0000AAAA AAaaBBBB + + __m256i out = _mm256_madd_epi16(merge_ab_and_bc, _mm256_set1_epi32(0x00011000)); + // 00000000 JJJJJJjj KKKKkkkk LLllllll + // 00000000 GGGGGGgg HHHHhhhh IIiiiiii + // 00000000 DDDDDDdd EEEEeeee FFffffff + // 00000000 AAAAAAaa BBBBbbbb CCcccccc + + // Pack bytes together in each lane: + out = _mm256_shuffle_epi8(out, _mm256_setr_epi8( + 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1, + 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1)); + // 00000000 00000000 00000000 00000000 + // LLllllll KKKKkkkk JJJJJJjj IIiiiiii + // HHHHhhhh GGGGGGgg FFffffff EEEEeeee + // DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa + + // Pack lanes: + return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1)); +} diff --git a/lib/arch/avx512/enc_loop.c b/lib/arch/avx512/enc_loop.c index 9024037c..a6daca66 100644 --- a/lib/arch/avx512/enc_loop.c +++ b/lib/arch/avx512/enc_loop.c @@ -1,74 +1,61 @@ -#include -#include "chromiumbase64.h" - static inline void -enc_loop_avx512 (const char* src, size_t slen, char* dst, size_t* dlen) { - size_t dlen_encoded = 0; - // compliant with current simd base64 impl which only supports NORMAL mode - static char base64_table[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - "abcdefghijklmnopqrstuvwxyz" - "0123456789+/"; - const char* lookup_tbl = base64_table; - // 32-bit input - // [ 0 0 0 0 0 0 0 0|c1 c0 d5 d4 d3 d2 d1 d0| - // b3 b2 b1 b0 c5 c4 c3 c2|a5 a4 a3 a2 a1 a0 b5 b4] - // output order [1, 2, 0, 1] - // [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0| - // a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0] - - const __m512i shuffle_input = _mm512_setr_epi32(0x01020001, - 0x04050304, - 0x07080607, - 0x0a0b090a, - 0x0d0e0c0d, - 0x10110f10, - 0x13141213, - 0x16171516, - 0x191a1819, - 0x1c1d1b1c, - 0x1f201e1f, - 0x22232122, - 0x25262425, - 0x28292728, - 0x2b2c2a2b, - 0x2e2f2d2e); - const __m512i lookup = _mm512_loadu_si512(lookup_tbl); - - while (slen >= 64) { - const __m512i v = _mm512_loadu_si512(src); - - // Reorder bytes - // [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0| - // a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0] - const __m512i in = _mm512_permutexvar_epi8(shuffle_input, v); - - // After multishift a single 32-bit lane has following layout - // [c1 c0 d5 d4 d3 d2 d1 d0|b1 b0 c5 c4 c3 c2 c1 c0| - // a1 a0 b5 b4 b3 b2 b1 b0|d1 d0 a5 a4 a3 a2 a1 a0] - // (a = [10:17], b = [4:11], c = [22:27], d = [16:21]) +enc_loop_avx512_inner (const uint8_t **s, uint8_t **o) +{ + // Load input: + __m512i src = _mm512_loadu_si512((__m512i *) *s); - // 48, 54, 36, 42, 16, 22, 4, 10 - const __m512i shifts = _mm512_set1_epi64(0x3036242a1016040alu); - const __m512i indices = _mm512_multishift_epi64_epi8(shifts, in); + // Reshuffle, translate, store: + src = enc_reshuffle_translate(src); + _mm512_storeu_si512((__m512i *) *o, src); - // Note: the two higher bits of each indices' byte have garbage - // but the following permutexvar instruction masks them out + *s += 48; + *o += 64; +} - // Translation 6-bit values to ASCII. - const __m512i result = _mm512_permutexvar_epi8(indices, lookup); - - _mm512_storeu_si512(dst, result); - - dlen_encoded +=64; - dst += 64; - src += 48; - slen -= 48; - } - - // Fallback to a fast Base64 encoding used in Chromium project - if (slen > 0) { - dlen_encoded += chromium_base64_encode(dst, src, slen); - } - - *dlen = dlen_encoded; +static inline void +enc_loop_avx512 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) +{ + if (*slen < 64) { + return; + } + + // Process blocks of 48 bytes at a time. Because blocks are loaded 64 + // bytes at a time, ensure that there will be at least 24 remaining bytes + // after the last round, so that the final read will not pass beyond the + // bounds of the input buffer: + size_t rounds = (*slen - 24) / 48; + + *slen -= rounds * 48; // 48 bytes consumed per round + *olen += rounds * 64; // 64 bytes produced per round + + while (rounds > 0) { + if (rounds >= 8) { + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + rounds -= 8; + continue; + } + if (rounds >= 4) { + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + rounds -= 4; + continue; + } + if (rounds >= 2) { + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + rounds -= 2; + continue; + } + enc_loop_avx512_inner(s, o); + break; + } } \ No newline at end of file diff --git a/lib/arch/avx512/enc_reshuffle_translate.c b/lib/arch/avx512/enc_reshuffle_translate.c new file mode 100644 index 00000000..c033d81b --- /dev/null +++ b/lib/arch/avx512/enc_reshuffle_translate.c @@ -0,0 +1,56 @@ +// AVX512 algorithm is based on permutevar and multishift. The code is +// referenced from https://github.com/WojciechMula/base64-avx512 which +// is under BSD-3 license + +static inline __m512i +enc_reshuffle_translate (const __m512i input) +{ + // 32-bit input + // [ 0 0 0 0 0 0 0 0|c1 c0 d5 d4 d3 d2 d1 d0| + // b3 b2 b1 b0 c5 c4 c3 c2|a5 a4 a3 a2 a1 a0 b5 b4] + // output order [1, 2, 0, 1] + // [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0| + // a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0] + + const __m512i shuffle_input = _mm512_setr_epi32(0x01020001, + 0x04050304, + 0x07080607, + 0x0a0b090a, + 0x0d0e0c0d, + 0x10110f10, + 0x13141213, + 0x16171516, + 0x191a1819, + 0x1c1d1b1c, + 0x1f201e1f, + 0x22232122, + 0x25262425, + 0x28292728, + 0x2b2c2a2b, + 0x2e2f2d2e); + + // Reorder bytes + // [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0| + // a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0] + const __m512i in = _mm512_permutexvar_epi8(shuffle_input, input); + + + // After multishift a single 32-bit lane has following layout + // [c1 c0 d5 d4 d3 d2 d1 d0|b1 b0 c5 c4 c3 c2 c1 c0| + // a1 a0 b5 b4 b3 b2 b1 b0|d1 d0 a5 a4 a3 a2 a1 a0] + // (a = [10:17], b = [4:11], c = [22:27], d = [16:21]) + + // 48, 54, 36, 42, 16, 22, 4, 10 + const __m512i shifts = _mm512_set1_epi64(0x3036242a1016040alu); + __m512i shuffled_in = _mm512_multishift_epi64_epi8(shifts, in); + + // Translate immediatedly after reshuffled + static char base64_table[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; + const char* lookup_tbl = base64_table; + const __m512i lookup = _mm512_loadu_si512(lookup_tbl); + + // Translation 6-bit values to ASCII. + return _mm512_permutexvar_epi8(shuffled_in, lookup); +} \ No newline at end of file diff --git a/test/ci/test.sh b/test/ci/test.sh index 803cd1bd..30407b9b 100755 --- a/test/ci/test.sh +++ b/test/ci/test.sh @@ -7,10 +7,10 @@ if [ "${MACHINE}" == "x86_64" ]; then export SSE41_CFLAGS=-msse4.1 export SSE42_CFLAGS=-msse4.2 export AVX_CFLAGS=-mavx - # no AVX2 on GHA macOS + # no AVX2 or AVX512 on GHA macOS if [ "$(uname -s)" != "Darwin" ]; then export AVX2_CFLAGS=-mavx2 - export AVX512_CFLAGS="-mavx512vl -mavx512vbmi" + export AVX512_CFLAGS="-mavx512vl -mavx512vbmi" fi elif [ "${MACHINE}" == "aarch64" ]; then export NEON64_CFLAGS="-march=armv8-a" From 234760eaa53ed69e418cae979ba57fd8322cae12 Mon Sep 17 00:00:00 2001 From: lucshi Date: Thu, 29 Sep 2022 15:45:04 +0800 Subject: [PATCH 4/4] resume to the main branch --- lib/arch/ssse3/enc_translate.c | 2 +- test/ci/test.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/arch/ssse3/enc_translate.c b/lib/arch/ssse3/enc_translate.c index 175777e0..04f288fc 100644 --- a/lib/arch/ssse3/enc_translate.c +++ b/lib/arch/ssse3/enc_translate.c @@ -30,4 +30,4 @@ enc_translate (const __m128i in) // Add offsets to input values: return _mm_add_epi8(in, _mm_shuffle_epi8(lut, indices)); -} \ No newline at end of file +} diff --git a/test/ci/test.sh b/test/ci/test.sh index 30407b9b..a296bfcf 100755 --- a/test/ci/test.sh +++ b/test/ci/test.sh @@ -23,7 +23,7 @@ if [ "${OPENMP:-}" == "0" ]; then fi uname -a -gcc --version +${CC} --version make make -C test