Skip to content

Commit

Permalink
ht_dec.c: Improve MSVC arm64 popcount performance (#1479)
Browse files Browse the repository at this point in the history
Use NEON instructions for ARM64 (implementation based on microsoft/STL#2127).

Godbolt output here: https://godbolt.org/z/q7GPTqT14
  • Loading branch information
PeterJohnson authored Dec 9, 2023
1 parent dfdedea commit 41c25e3
Showing 1 changed file with 13 additions and 0 deletions.
13 changes: 13 additions & 0 deletions src/lib/openjp2/ht_dec.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,16 @@
#define OPJ_COMPILER_GNUC
#endif

/* Feature gate for the NEON-based code path on MSVC/ARM64.
 * MSVC_NEON_INTRINSICS is defined only when OPJ_COMPILER_MSVC and _M_ARM64
 * are both defined and none of _M_ARM64EC, _M_CEE_PURE, __CUDACC__,
 * __INTEL_COMPILER or __clang__ is defined.
 * NOTE(review): the exclusion list appears to mirror the one used by
 * microsoft/STL#2127 (per the commit message) -- confirm before changing. */
#if defined(OPJ_COMPILER_MSVC) && defined(_M_ARM64) \
&& !defined(_M_ARM64EC) && !defined(_M_CEE_PURE) && !defined(__CUDACC__) \
&& !defined(__INTEL_COMPILER) && !defined(__clang__)
#define MSVC_NEON_INTRINSICS
#endif

/* Include the ARM64 NEON intrinsics header only when the gate above is
 * enabled, so other compilers/targets never see <arm64_neon.h>. */
#ifdef MSVC_NEON_INTRINSICS
#include <arm64_neon.h>
#endif

//************************************************************************/
/** @brief Displays the error message for disabling the decoding of SPP and
* MRP passes
Expand All @@ -71,6 +81,9 @@ OPJ_UINT32 population_count(OPJ_UINT32 val)
{
#if defined(OPJ_COMPILER_MSVC) && (defined(_M_IX86) || defined(_M_AMD64))
return (OPJ_UINT32)__popcnt(val);
#elif defined(OPJ_COMPILER_MSVC) && defined(MSVC_NEON_INTRINSICS)
const __n64 temp = neon_cnt(__uint64ToN64_v(val));
return neon_addv8(temp).n8_i8[0];
#elif (defined OPJ_COMPILER_GNUC)
return (OPJ_UINT32)__builtin_popcount(val);
#else
Expand Down

0 comments on commit 41c25e3

Please sign in to comment.