From ea175e28f8ef7f6a8f5931ebad1835d95ec466ed Mon Sep 17 00:00:00 2001 From: costan Date: Mon, 27 Feb 2017 14:29:18 -0800 Subject: [PATCH] Implement support for Intel crc32 instruction (SSE 4.2) This change authored by vadimskipin and submitted via: https://github.com/google/leveldb/pull/309 Changes made to support iOS builds and other architectures without support for SSE 4.2. db_bench reports original crc32 speed at: crc32c : 3.610 micros/op; 1082.0 MB/s (4K per op) with this change performance has increased to: crc32c : 0.843 micros/op; 4633.6 MB/s (4K per op) ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=148694935 --- Makefile | 6 ++ build_detect_platform | 30 +++++++++- port/port_example.h | 6 ++ port/port_posix.h | 2 + port/port_posix_sse.cc | 125 +++++++++++++++++++++++++++++++++++++++++ util/crc32c.cc | 18 ++++++ 6 files changed, 186 insertions(+), 1 deletion(-) create mode 100644 port/port_posix_sse.cc diff --git a/Makefile b/Makefile index 07a5a1ead6..66e32261f6 100644 --- a/Makefile +++ b/Makefile @@ -412,3 +412,9 @@ $(SHARED_OUTDIR)/%.o: %.cc $(SHARED_OUTDIR)/%.o: %.c $(CC) $(CFLAGS) $(PLATFORM_SHARED_CFLAGS) -c $< -o $@ + +$(STATIC_OUTDIR)/port/port_posix_sse.o: port/port_posix_sse.cc + $(CXX) $(CXXFLAGS) $(PLATFORM_SSEFLAGS) -c $< -o $@ + +$(SHARED_OUTDIR)/port/port_posix_sse.o: port/port_posix_sse.cc + $(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(PLATFORM_SSEFLAGS) -c $< -o $@ diff --git a/build_detect_platform b/build_detect_platform index f0629939ca..d2a20ce5b6 100755 --- a/build_detect_platform +++ b/build_detect_platform @@ -63,6 +63,7 @@ PLATFORM_SHARED_EXT="so" PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl," PLATFORM_SHARED_CFLAGS="-fPIC" PLATFORM_SHARED_VERSIONED=true +PLATFORM_SSEFLAGS= MEMCMP_FLAG= if [ "$CXX" = "g++" ]; then @@ -77,6 +78,7 @@ case "$TARGET_OS" in COMMON_FLAGS="$MEMCMP_FLAG -lpthread -DOS_LINUX -DCYGWIN" PLATFORM_LDFLAGS="-lpthread" PORT_FILE=port/port_posix.cc + PORT_SSE_FILE=port/port_posix_sse.cc ;; Darwin) PLATFORM=OS_MACOSX @@ -85,48 +87,56 @@ case "$TARGET_OS" in [ -z "$INSTALL_PATH" ] && INSTALL_PATH=`pwd` PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name $INSTALL_PATH/" PORT_FILE=port/port_posix.cc + PORT_SSE_FILE=port/port_posix_sse.cc ;; Linux) PLATFORM=OS_LINUX COMMON_FLAGS="$MEMCMP_FLAG -pthread -DOS_LINUX" PLATFORM_LDFLAGS="-pthread" PORT_FILE=port/port_posix.cc + PORT_SSE_FILE=port/port_posix_sse.cc ;; SunOS) PLATFORM=OS_SOLARIS COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_SOLARIS" PLATFORM_LIBS="-lpthread -lrt" PORT_FILE=port/port_posix.cc + PORT_SSE_FILE=port/port_posix_sse.cc ;; FreeBSD) PLATFORM=OS_FREEBSD COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_FREEBSD" PLATFORM_LIBS="-lpthread" PORT_FILE=port/port_posix.cc + PORT_SSE_FILE=port/port_posix_sse.cc ;; NetBSD) PLATFORM=OS_NETBSD COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_NETBSD" PLATFORM_LIBS="-lpthread -lgcc_s" PORT_FILE=port/port_posix.cc + PORT_SSE_FILE=port/port_posix_sse.cc ;; OpenBSD) PLATFORM=OS_OPENBSD COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_OPENBSD" PLATFORM_LDFLAGS="-pthread" PORT_FILE=port/port_posix.cc + PORT_SSE_FILE=port/port_posix_sse.cc ;; DragonFly) PLATFORM=OS_DRAGONFLYBSD COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_DRAGONFLYBSD" PLATFORM_LIBS="-lpthread" PORT_FILE=port/port_posix.cc + PORT_SSE_FILE=port/port_posix_sse.cc ;; OS_ANDROID_CROSSCOMPILE) PLATFORM=OS_ANDROID COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX" PLATFORM_LDFLAGS="" # All pthread features are in the Android C library PORT_FILE=port/port_posix.cc + PORT_SSE_FILE=port/port_posix_sse.cc CROSS_COMPILE=true ;; HP-UX) @@ -134,6 +144,7 @@ case "$TARGET_OS" in COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_HPUX" PLATFORM_LDFLAGS="-pthread" PORT_FILE=port/port_posix.cc + PORT_SSE_FILE=port/port_posix_sse.cc # man ld: +h internal_name PLATFORM_SHARED_LDFLAGS="-shared -Wl,+h -Wl," ;; @@ -142,6 +153,7 @@ case "$TARGET_OS" in COMMON_FLAGS="$MEMCMP_FLAG -DOS_MACOSX" [ -z "$INSTALL_PATH" ] && INSTALL_PATH=`pwd` PORT_FILE=port/port_posix.cc + PORT_SSE_FILE=port/port_posix_sse.cc PLATFORM_SHARED_EXT= PLATFORM_SHARED_LDFLAGS= PLATFORM_SHARED_CFLAGS= @@ -168,7 +180,7 @@ set +f # re-enable globbing # The sources consist of the portable files, plus the platform-specific port # file. -echo "SOURCES=$PORTABLE_FILES $PORT_FILE" >> $OUTPUT +echo "SOURCES=$PORTABLE_FILES $PORT_FILE $PORT_SSE_FILE" >> $OUTPUT echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> $OUTPUT if [ "$CROSS_COMPILE" = "true" ]; then @@ -210,6 +222,21 @@ EOF fi rm -f $CXXOUTPUT 2>/dev/null + + # Test if gcc SSE 4.2 is supported + $CXX $CXXFLAGS -x c++ - -o $CXXOUTPUT -msse4.2 2>/dev/null </dev/null +fi + +# Use the SSE 4.2 CRC32C intrinsics iff runtime checks indicate compiler supports them. +if [ -n "$PLATFORM_SSEFLAGS" ]; then + PLATFORM_SSEFLAGS="$PLATFORM_SSEFLAGS -DLEVELDB_PLATFORM_POSIX_SSE" fi PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS" @@ -222,6 +249,7 @@ echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> $OUTPUT echo "PLATFORM_LIBS=$PLATFORM_LIBS" >> $OUTPUT echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> $OUTPUT echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> $OUTPUT +echo "PLATFORM_SSEFLAGS=$PLATFORM_SSEFLAGS" >> $OUTPUT echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> $OUTPUT echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> $OUTPUT echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> $OUTPUT diff --git a/port/port_example.h b/port/port_example.h index ab9e489b32..97bd669a5e 100644 --- a/port/port_example.h +++ b/port/port_example.h @@ -129,6 +129,12 @@ extern bool Snappy_Uncompress(const char* input_data, size_t input_length, // The concatenation of all "data[0,n-1]" fragments is the heap profile. extern bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg); +// Extend the CRC to include the first n bytes of buf. +// +// Returns zero if the CRC cannot be extended using acceleration, else returns +// the newly extended CRC value (which may also be zero). +uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size); + } // namespace port } // namespace leveldb diff --git a/port/port_posix.h b/port/port_posix.h index 89fc222c04..d67ab68c27 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -148,6 +148,8 @@ inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { return false; } +uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size); + } // namespace port } // namespace leveldb diff --git a/port/port_posix_sse.cc b/port/port_posix_sse.cc new file mode 100644 index 0000000000..57ec8fee7e --- /dev/null +++ b/port/port_posix_sse.cc @@ -0,0 +1,125 @@ +// Copyright 2016 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A portable implementation of crc32c, optimized to handle +// four bytes at a time. +// +// In a separate source file to allow this accelerated CRC32C function to be +// compiled with the appropriate compiler flags to enable x86 SSE 4.2 +// instructions. + +#include +#include +#include "port/port.h" + +#if defined(LEVELDB_PLATFORM_POSIX_SSE) + +#if defined(_MSC_VER) +#include +#elif defined(__GNUC__) && defined(__SSE4_2__) +#include +#include +#endif + +#endif // defined(LEVELDB_PLATFORM_POSIX_SSE) + +namespace leveldb { +namespace port { + +#if defined(LEVELDB_PLATFORM_POSIX_SSE) + +// Used to fetch a naturally-aligned 32-bit word in little endian byte-order +static inline uint32_t LE_LOAD32(const uint8_t *p) { + // SSE is x86 only, so ensured that |p| is always little-endian. + uint32_t word; + memcpy(&word, p, sizeof(word)); + return word; +} + +// Used to fetch a naturally-aligned 64-bit word in little endian byte-order +static inline uint64_t LE_LOAD64(const uint8_t *p) { + uint64_t dword; + memcpy(&dword, p, sizeof(dword)); + return dword; +} + +static inline bool HaveSSE42() { +#if defined(_MSC_VER) + int cpu_info[4]; + __cpuid(cpu_info, 1); + return (cpu_info[2] & (1 << 20)) != 0; +#elif defined(__GNUC__) + unsigned int eax, ebx, ecx, edx; + __get_cpuid(1, &eax, &ebx, &ecx, &edx); + return (ecx & (1 << 20)) != 0; +#else + return false; +#endif +} + +#endif // defined(LEVELDB_PLATFORM_POSIX_SSE) + +// For further improvements see Intel publication at: +// http://download.intel.com/design/intarch/papers/323405.pdf +uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size) { +#if !defined(LEVELDB_PLATFORM_POSIX_SSE) + return 0; +#else + static bool have = HaveSSE42(); + if (!have) { + return 0; + } + + const uint8_t *p = reinterpret_cast(buf); + const uint8_t *e = p + size; + uint32_t l = crc ^ 0xffffffffu; + +#define STEP1 do { \ + l = _mm_crc32_u8(l, *p++); \ +} while (0) +#define STEP4 do { \ + l = _mm_crc32_u32(l, LE_LOAD32(p)); \ + p += 4; \ +} while (0) +#define STEP8 do { \ + l = _mm_crc32_u64(l, LE_LOAD64(p)); \ + p += 8; \ +} while (0) + + if (size > 16) { + // Process unaligned bytes + for (unsigned int i = reinterpret_cast(p) % 8; i; --i) { + STEP1; + } + + // _mm_crc32_u64 is only available on x64. +#if defined(_M_X64) || defined(__x86_64__) + // Process 8 bytes at a time + while ((e-p) >= 8) { + STEP8; + } + // Process 4 bytes at a time + if ((e-p) >= 4) { + STEP4; + } +#else // !(defined(_M_X64) || defined(__x86_64__)) + // Process 4 bytes at a time + while ((e-p) >= 4) { + STEP4; + } +#endif // defined(_M_X64) || defined(__x86_64__) + } + // Process the last few bytes + while (p != e) { + STEP1; + } +#undef STEP8 +#undef STEP4 +#undef STEP1 + return l ^ 0xffffffffu; +#endif // defined(LEVELDB_PLATFORM_POSIX_SSE) +} + +} // namespace port +} // namespace leveldb diff --git a/util/crc32c.cc b/util/crc32c.cc index 6db9e77077..edd61cfd6f 100644 --- a/util/crc32c.cc +++ b/util/crc32c.cc @@ -8,6 +8,8 @@ #include "util/crc32c.h" #include + +#include "port/port.h" #include "util/coding.h" namespace leveldb { @@ -283,7 +285,23 @@ static inline uint32_t LE_LOAD32(const uint8_t *p) { return DecodeFixed32(reinterpret_cast(p)); } +// Determine if the CPU running this program can accelerate the CRC32C +// calculation. +static bool CanAccelerateCRC32C() { + // port::AcceleretedCRC32C returns zero when unable to accelerate. + static const char kTestCRCBuffer[] = "TestCRCBuffer"; + static const char kBufSize = sizeof(kTestCRCBuffer) - 1; + static const uint32_t kTestCRCValue = 0xdcbc59fa; + + return port::AcceleratedCRC32C(0, kTestCRCBuffer, kBufSize) == kTestCRCValue; +} + uint32_t Extend(uint32_t crc, const char* buf, size_t size) { + static bool accelerate = CanAccelerateCRC32C(); + if (accelerate) { + return port::AcceleratedCRC32C(crc, buf, size); + } + const uint8_t *p = reinterpret_cast(buf); const uint8_t *e = p + size; uint32_t l = crc ^ 0xffffffffu;