<bit>: `popcount()` does not utilize `cnt` instruction on arm64 #1924

JoelLinn · 2021-05-16T11:01:57Z

The bug
`std::popcount()` uses the slow fallback implementation on ARM64 because there is no variant that utilizes the `cnt` instruction (as there is for x64 AVX, which uses `popcnt`).

#include <cstdint>
#include <bit>

int xxx(uint64_t num) {
    return std::popcount(num);
}

which compiles to (cl -O2 /std:c++latest):

|int xxx(unsigned __int64)| PROC                                    ; xxx
        lsr         x8,x0,#1
        and         x9,x8,#0x5555555555555555
        sub         x11,x0,x9
        lsr         x8,x11,#2
        and         x10,x8,#0x3333333333333333
        and         x9,x11,#0x3333333333333333
        add         x10,x10,x9
        add         x8,x10,x10,lsr #4
        mov         w10,#8
        and         x11,x8,#0xF0F0F0F0F0F0F0F
|$LL8@xxx|
        lsr         x9,x11,x10
        lsl         w10,w10,#1
        cmp         w10,#0x40
        add         x11,x9,x11
        blt         |$LL8@xxx|
        and         w0,w11,#0x7F
        ret

        ENDP  ; |int xxx(unsigned __int64)|, xxx

Expected behavior
The cnt instruction should be utilized, resulting in something similar to (gcc -O2 -std=c++20):

xxx(unsigned long):
        fmov    d0, x0
        cnt     v0.8b, v0.8b
        addv    b0, v0.8b
        fmov    w0, s0
        ret

godbolt sample

The text was updated successfully, but these errors were encountered:

AlexGuteniev · 2021-08-15T18:38:13Z

The needed intrinsic is present (`neon_cnt`, which compiles to the `cnt` instruction in the listing below):

D:\temp2>"C:\Program Files\Microsoft Visual Studio\2022\Preview\VC\Auxiliary\Build\vcvarsamd64_arm64.bat"
**********************************************************************
** Visual Studio 2022 Developer Command Prompt v17.0.0-pre.3.0
** Copyright (c) 2021 Microsoft Corporation
**********************************************************************
[vcvarsall.bat] Environment initialized for: 'x64_arm64'

D:\temp2>type arm64_pocnt.cpp
#include <arm64_neon.h>

int main(int argc, const char* argv[])
{
    return neon_cnt(__int64ToN64_v(argc)).n64_u64[0];
}

D:\temp2>cl /c /FA /O2 /MT arm64_pocnt.cpp
Microsoft (R) C/C++ Optimizing Compiler Version 19.30.30423 for ARM64
Copyright (C) Microsoft Corporation.  All rights reserved.

arm64_pocnt.cpp

D:\temp2>type arm64_pocnt.asm
; Listing generated by Microsoft (R) Optimizing Compiler Version 19.30.30423.0

        TTL     D:\temp2\arm64_pocnt.obj
        ;ARM64

        AREA    |.drectve|, DRECTVE

        EXPORT  |__int64ToN64_v|
        EXPORT  |main|

        AREA    |.pdata|, PDATA
|$pdata$__int64ToN64_v| DCD |$LN4|
        DCD     0x800015
        ;Flags[SingleProEpi] functionLength[20] RegF[0] RegI[0] H[0] frameChainReturn[UnChained] frameSize[16]

        AREA    |.pdata|, PDATA
|$pdata$main| DCD |$LN6|
        DCD     0x800025
        ;Flags[SingleProEpi] functionLength[36] RegF[0] RegI[0] H[0] frameChainReturn[UnChained] frameSize[16]
; Function compile flags: /Ogtpy

        AREA    |.text$mn|, CODE, ARM64

|main|  PROC
; File D:\temp2\arm64_pocnt.cpp
; Line 4
|$LN6|
        sub         sp,sp,#0x10
; Line 5
        sxtw        x8,w0
        fmov        d16,x8
        str         x8,[sp]
        cnt         v17.8b,v16.8b
        str         d17,[sp]
        fmov        x0,d17
        add         sp,sp,#0x10
        ret

        ENDP  ; |main|

; Function compile flags: /Ogtpy

        AREA    |.text$mn|, CODE, ARM64

|__int64ToN64_v| PROC
; File C:\Program Files\Microsoft Visual Studio\2022\Preview\VC\Tools\MSVC\14.30.30423\include\arm64_neon.h
; Line 254
|$LN4|
        sub         sp,sp,#0x10
; Line 256
        str         x0,[sp]
; Line 257
        fmov        d0,x0
        add         sp,sp,#0x10
        ret

        ENDP  ; |__int64ToN64_v|

        END

cbezault added the "ARM64" (Related to the ARM64 architecture) and "performance" (Must go faster) labels May 17, 2021

fsb4000 mentioned this issue Jul 26, 2021

optimize _Popcount_fallback #2079

Merged

fsb4000 mentioned this issue Aug 15, 2021

<bit>: popcount() utilizes cnt instruction on arm64 #2127

Merged

StephanTLavavej closed this as completed in #2127 Sep 11, 2021

StephanTLavavej added the "fixed" (Something works now, yay!) label Sep 11, 2021

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

<bit>: `popcount()` does not utilize `cnt` instruction on arm64 #1924

<bit>: `popcount()` does not utilize `cnt` instruction on arm64 #1924

JoelLinn commented May 16, 2021

AlexGuteniev commented Aug 15, 2021

<bit>: popcount() does not utilize cnt instruction on arm64 #1924

<bit>: popcount() does not utilize cnt instruction on arm64 #1924

Comments

JoelLinn commented May 16, 2021

AlexGuteniev commented Aug 15, 2021

<bit>: `popcount()` does not utilize `cnt` instruction on arm64 #1924

<bit>: `popcount()` does not utilize `cnt` instruction on arm64 #1924