Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

close #11330 sets uses optimized countSetBits #17334

Merged
merged 18 commits into from
Mar 21, 2021
92 changes: 12 additions & 80 deletions lib/pure/bitops.nim
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,13 @@

import macros
import std/private/since
from std/private/vmutils import forwardImpl, toUnsigned


# sets.nim cannot import bitops, but bitops can import
# system/sets to eliminate code duplication. countBits32 and
# countBits64 are defined in system/countbits_impl.
import system/sets
ringabout marked this conversation as resolved.
Show resolved Hide resolved
Araq marked this conversation as resolved.
Show resolved Hide resolved

func bitnot*[T: SomeInteger](x: T): T {.magic: "BitnotI".}
## Computes the `bitwise complement` of the integer `x`.
Expand Down Expand Up @@ -58,34 +65,6 @@ macro bitxor*[T: SomeInteger](x, y: T; z: varargs[T]): T =
for extra in z:
result = newCall(fn, result, extra)

const
  # Intrinsics are enabled unless `noIntrinsicsBitOpts` is defined.
  useBuiltins = not defined(noIntrinsicsBitOpts)
  # Set by defining `noUndefinedBitOpts`.
  noUndefined = defined(noUndefinedBitOpts)
  # Per-compiler switches; each one also requires `useBuiltins`.
  useGCC_builtins = useBuiltins and
    (defined(gcc) or defined(llvm_gcc) or defined(clang))
  useICC_builtins = useBuiltins and defined(icc)
  useVCC_builtins = useBuiltins and defined(vcc)
  # True when `int` occupies 8 bytes.
  arch64 = sizeof(int) == 8
  # Rotate intrinsics: x86/amd64 targets with gcc, clang, vcc, or icl
  # (icl only when not compiling to C++), and only with `useBuiltins`.
  useBuiltinsRotate = useBuiltins and
    (defined(amd64) or defined(i386)) and
    (defined(gcc) or defined(clang) or defined(vcc) or
      (defined(icl) and not defined(cpp)))

# Bit-for-bit reinterpretation of each signed integer type as the
# unsigned type of the same width (a `cast`, not a value conversion).
template toUnsigned(x: int8): uint8 =
  cast[uint8](x)

template toUnsigned(x: int16): uint16 =
  cast[uint16](x)

template toUnsigned(x: int32): uint32 =
  cast[uint32](x)

template toUnsigned(x: int64): uint64 =
  cast[uint64](x)

template toUnsigned(x: int): uint =
  cast[uint](x)

template forwardImpl(impl, arg) {.dirty.} =
  ## Calls `impl` with `arg` widened to a 32- or 64-bit unsigned integer.
  ##
  ## * `sizeof(arg) <= 4`: `impl` receives a `uint32`.
  ## * otherwise: `impl` receives a `uint64`.
  ##
  ## A signed `arg` is first converted to `int32`/`int64` and then bit-cast
  ## to the unsigned type; an unsigned `arg` is converted directly.
  ##
  ## The body reads the `arg` parameter itself, so the call site does not
  ## need a variable that happens to be named `x`.
  when sizeof(arg) <= 4:
    when arg is SomeSignedInt:
      impl(cast[uint32](arg.int32))
    else:
      impl(arg.uint32)
  else:
    when arg is SomeSignedInt:
      impl(cast[uint64](arg.int64))
    else:
      impl(arg.uint64)

type BitsRange*[T] = range[0..sizeof(T)*8-1]
## A range with all bit positions for type `T`.
Expand Down Expand Up @@ -436,13 +415,12 @@ func fastlog2Nim(x: uint64): int {.inline.} =
v = v or v shr 32
result = lookup[(v * 0x03F6EAF2CD271461'u64) shr 58].int

# sets.nim cannot import bitops, but both modules can import
# system/countbits_impl to eliminate code duplication; it defines
# countBits32 and countBits64.
import system/countbits_impl

# Width-specific entry points that forward to `countBits32` /
# `countBits64`.
template countSetBitsNim(n: uint32): int =
  countBits32(n)

template countSetBitsNim(n: uint64): int =
  countBits64(n)
const
  # True when `int` occupies 8 bytes.
  arch64 = sizeof(int) == 8
  # Rotate intrinsics: x86/amd64 targets with gcc, clang, vcc, or icl
  # (icl only when not compiling to C++), and only with `useBuiltins`.
  useBuiltinsRotate = useBuiltins and
    (defined(amd64) or defined(i386)) and
    (defined(gcc) or defined(clang) or defined(vcc) or
      (defined(icl) and not defined(cpp)))

template parityImpl[T](value: T): int =
# formula id from: https://graphics.stanford.edu/%7Eseander/bithacks.html#ParityParallel
Expand All @@ -459,11 +437,6 @@ template parityImpl[T](value: T): int =


when useGCC_builtins:
# Returns the number of set 1-bits in value.
proc builtin_popcount(x: cuint): cint {.importc: "__builtin_popcount", cdecl.}
proc builtin_popcountll(x: culonglong): cint {.
importc: "__builtin_popcountll", cdecl.}

# Returns the bit parity in value
proc builtin_parity(x: cuint): cint {.importc: "__builtin_parity", cdecl.}
proc builtin_parityll(x: culonglong): cint {.importc: "__builtin_parityll", cdecl.}
Expand All @@ -481,14 +454,6 @@ when useGCC_builtins:
proc builtin_ctzll(x: culonglong): cint {.importc: "__builtin_ctzll", cdecl.}

elif useVCC_builtins:
# Counts the number of one bits (population count) in a 16-, 32-, or 64-bit unsigned integer.
func builtin_popcnt16(a2: uint16): uint16 {.
importc: "__popcnt16", header: "<intrin.h>".}
func builtin_popcnt32(a2: uint32): uint32 {.
importc: "__popcnt", header: "<intrin.h>".}
func builtin_popcnt64(a2: uint64): uint64 {.
importc: "__popcnt64", header: "<intrin.h>".}

# Search the mask data from most significant bit (MSB) to least significant bit (LSB) for a set bit (1).
func bitScanReverse(index: ptr culong, mask: culong): cuchar {.
importc: "_BitScanReverse", header: "<intrin.h>".}
Expand All @@ -507,15 +472,6 @@ elif useVCC_builtins:
index.int

elif useICC_builtins:

# Intel compiler intrinsics: http://fulla.fnal.gov/intel/compiler_c/main_cls/intref_cls/common/intref_allia_misc.htm
# see also: https://software.intel.com/en-us/node/523362
# Count the number of bits set to 1 in an integer a, and return that count in dst.
func builtin_popcnt32(a: cint): cint {.
importc: "_popcnt", header: "<immintrin.h>".}
func builtin_popcnt64(a: uint64): cint {.
importc: "_popcnt64", header: "<immintrin.h>".}

# Returns the number of trailing 0-bits in x, starting at the least significant bit position. If x is 0, the result is undefined.
func bitScanForward(p: ptr uint32, b: uint32): cuchar {.
importc: "_BitScanForward", header: "<immintrin.h>".}
Expand All @@ -533,37 +489,13 @@ elif useICC_builtins:
discard fnc(index.addr, v)
index.int


func countSetBits*(x: SomeInteger): int {.inline.} =
  ## Counts the set bits in an integer (also called `Hamming weight`:idx:).
  runnableExamples:
    doAssert countSetBits(0b0000_0011'u8) == 2
    doAssert countSetBits(0b1010_1010'u8) == 4

  # All of the work (signed-to-unsigned conversion, the compile-time VM
  # path and the per-compiler intrinsics) lives in `countSetBitsImpl`
  # from system/countbits_impl. The dispatch that used to be repeated
  # here was dead code: its result was always overwritten by this call.
  result = countSetBitsImpl(x)

func popcount*(x: SomeInteger): int {.inline.} =
## Alias for `countSetBits <#countSetBits,SomeInteger>`_ (Hamming weight).
Expand Down
17 changes: 17 additions & 0 deletions lib/std/private/vmutils.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
template forwardImpl*(impl, arg) {.dirty.} =
  ## Calls `impl` with `arg` widened to a 32- or 64-bit unsigned integer.
  ##
  ## * `sizeof(arg) <= 4`: `impl` receives a `uint32`.
  ## * otherwise: `impl` receives a `uint64`.
  ##
  ## A signed `arg` is first converted to `int32`/`int64` and then bit-cast
  ## to the unsigned type; an unsigned `arg` is converted directly.
  ##
  ## The body reads the `arg` parameter itself, so the call site does not
  ## need a variable that happens to be named `x`.
  when sizeof(arg) <= 4:
    when arg is SomeSignedInt:
      impl(cast[uint32](arg.int32))
    else:
      impl(arg.uint32)
  else:
    when arg is SomeSignedInt:
      impl(cast[uint64](arg.int64))
    else:
      impl(arg.uint64)

# Bit-for-bit reinterpretation of each signed integer type as the
# unsigned type of the same width (a `cast`, not a value conversion).
template toUnsigned*(x: int8): uint8 =
  cast[uint8](x)

template toUnsigned*(x: int16): uint16 =
  cast[uint16](x)

template toUnsigned*(x: int32): uint32 =
  cast[uint32](x)

template toUnsigned*(x: int64): uint64 =
  cast[uint64](x)

template toUnsigned*(x: int): uint =
  cast[uint](x)
3 changes: 1 addition & 2 deletions lib/system.nim
Original file line number Diff line number Diff line change
Expand Up @@ -2339,8 +2339,7 @@ when notJSnotNims:
when hostOS != "standalone" and hostOS != "any":
include "system/dyncalls"

import system/countbits_impl
include "system/sets"
from system/sets import cardSet
ringabout marked this conversation as resolved.
Show resolved Hide resolved
ringabout marked this conversation as resolved.
Show resolved Hide resolved

when defined(gogc):
const GenericSeqSize = (3 * sizeof(int))
Expand Down
77 changes: 73 additions & 4 deletions lib/system/countbits_impl.nim
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,86 @@

## Contains the used algorithms for counting bits.

proc countBits32*(n: uint32): int {.compilerproc.} =
from std/private/vmutils import forwardImpl, toUnsigned


const
  # Intrinsics are enabled unless `noIntrinsicsBitOpts` is defined.
  useBuiltins* = not defined(noIntrinsicsBitOpts)
  # Set by defining `noUndefinedBitOpts`.
  noUndefined* = defined(noUndefinedBitOpts)
  # Per-compiler switches; each one also requires `useBuiltins`.
  useGCC_builtins* = useBuiltins and
    (defined(gcc) or defined(llvm_gcc) or defined(clang))
  useICC_builtins* = useBuiltins and defined(icc)
  useVCC_builtins* = useBuiltins and defined(vcc)

template countBitsImpl(n: uint32): int =
  ## Portable population count of a 32-bit value; evaluates to the number
  ## of 1-bits in `n` as an `int`.
  ##
  ## A template has no implicit `result`, so the count is produced as the
  ## final expression only. The former `result = ...` line is dropped
  ## because it would have written to the caller's `result`.
  # generic formula is from: https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
  var v = uint32(n)
  # Sum adjacent bits into 2-bit fields, then 2-bit fields into 4-bit ones.
  v = v - ((v shr 1'u32) and 0x55555555'u32)
  v = (v and 0x33333333'u32) + ((v shr 2'u32) and 0x33333333'u32)
  # `+` binds tighter than `and`; the extra parentheses only make the
  # grouping explicit. Folding into bytes and multiplying by 0x01010101
  # accumulates the byte sums in the top byte, which `shr 24` extracts.
  ((((v + (v shr 4'u32)) and 0xF0F0F0F'u32) * 0x1010101'u32) shr 24'u32).int

proc countBits64*(n: uint64): int {.compilerproc, inline.} =
template countBitsImpl(n: uint64): int =
  ## Portable population count of a 64-bit value; evaluates to the number
  ## of 1-bits in `n` as an `int`.
  ##
  ## A template has no implicit `result`, so the count is produced as the
  ## final expression only. The former `result = ...` line is dropped
  ## because it would have written to the caller's `result`.
  # generic formula is from: https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
  var v = uint64(n)
  # Sum adjacent bits into 2-bit fields, then 4-bit, then per-byte counts.
  v = v - ((v shr 1'u64) and 0x5555555555555555'u64)
  v = (v and 0x3333333333333333'u64) + ((v shr 2'u64) and 0x3333333333333333'u64)
  # `+` binds tighter than `and`; the parentheses only make that explicit.
  v = (v + (v shr 4'u64)) and 0x0F0F0F0F0F0F0F0F'u64
  # The multiplication accumulates all byte counts in the top byte.
  ((v * 0x0101010101010101'u64) shr 56'u64).int


# Population-count intrinsics for the C compiler selected by the
# `use*_builtins` constants. At most one group is declared; when none of
# the constants is true, nothing is declared here.
when useGCC_builtins:
  # Returns the number of set 1-bits in value.
  proc builtin_popcount(x: cuint): cint {.importc: "__builtin_popcount", cdecl.}
  proc builtin_popcountll(x: culonglong): cint {.
      importc: "__builtin_popcountll", cdecl.}

elif useVCC_builtins:
  # Counts the number of one bits (population count) in a 16-, 32-, or 64-bit unsigned integer.
  func builtin_popcnt16(a2: uint16): uint16 {.
      importc: "__popcnt16", header: "<intrin.h>".}
  func builtin_popcnt32(a2: uint32): uint32 {.
      importc: "__popcnt", header: "<intrin.h>".}
  func builtin_popcnt64(a2: uint64): uint64 {.
      importc: "__popcnt64", header: "<intrin.h>".}

elif useICC_builtins:
  # Intel compiler intrinsics: http://fulla.fnal.gov/intel/compiler_c/main_cls/intref_cls/common/intref_allia_misc.htm
  # see also: https://software.intel.com/en-us/node/523362
  # Count the number of bits set to 1 in an integer a, and return that count in dst.
  func builtin_popcnt32(a: cint): cint {.
      importc: "_popcnt", header: "<immintrin.h>".}
  func builtin_popcnt64(a: uint64): cint {.
      importc: "_popcnt64", header: "<immintrin.h>".}


func countSetBitsImpl*(x: SomeInteger): int {.inline.} =
  ## Counts the set bits in an integer (also called `Hamming weight`:idx:).
  ##
  ## Signed inputs are reinterpreted as unsigned first. In the compile-time
  ## VM the portable `countBitsImpl` templates are used; otherwise the
  ## compiler's popcount intrinsic is used when one is enabled, with
  ## `countBitsImpl` as the fallback.
  # TODO: figure out if ICC support _popcnt32/_popcnt64 on platform without POPCNT.
  # like GCC and MSVC

  # Declared locally: this module has no module-level `arch64`, so the
  # VCC/ICC branches below would otherwise use an undeclared identifier.
  const arch64 = sizeof(int) == 8
  when x is SomeSignedInt:
    let x = x.toUnsigned
  when nimvm:
    result = forwardImpl(countBitsImpl, x)
  else:
    when useGCC_builtins:
      when sizeof(x) <= 4: result = builtin_popcount(x.cuint).int
      else: result = builtin_popcountll(x.culonglong).int
    elif useVCC_builtins:
      when sizeof(x) <= 2: result = builtin_popcnt16(x.uint16).int
      elif sizeof(x) <= 4: result = builtin_popcnt32(x.uint32).int
      elif arch64: result = builtin_popcnt64(x.uint64).int
      # 64-bit value without 8-byte `int`: count the two 32-bit halves.
      else: result = builtin_popcnt32((x.uint64 and 0xFFFFFFFF'u64).uint32).int +
                     builtin_popcnt32((x.uint64 shr 32'u64).uint32).int
    elif useICC_builtins:
      when sizeof(x) <= 4: result = builtin_popcnt32(x.cint).int
      elif arch64: result = builtin_popcnt64(x.uint64).int
      # 64-bit value without 8-byte `int`: count the two 32-bit halves.
      else: result = builtin_popcnt32((x.uint64 and 0xFFFFFFFF'u64).cint).int +
                     builtin_popcnt32((x.uint64 shr 32'u64).cint).int
    else:
      when sizeof(x) <= 4: result = countBitsImpl(x.uint32)
      else: result = countBitsImpl(x.uint64)

proc countBits32*(n: uint32): int {.compilerproc, inline.} =
  ## Number of set bits in the 32-bit value `n`; forwards to
  ## `countSetBitsImpl`.
  countSetBitsImpl(n)

proc countBits64*(n: uint64): int {.compilerproc, inline.} =
  ## Number of set bits in the 64-bit value `n`; forwards to
  ## `countSetBitsImpl`.
  countSetBitsImpl(n)
4 changes: 2 additions & 2 deletions lib/system/sets.nim
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@
#

# set handling
import system/countbits_impl
ringabout marked this conversation as resolved.
Show resolved Hide resolved

type
NimSet = array[0..4*2048-1, uint8]

# bitops can't be imported here, therefore the code duplication.

proc cardSet(s: NimSet, len: int): int {.compilerproc, inline.} =
proc cardSet*(s: NimSet, len: int): int {.compilerproc, inline.} =
ringabout marked this conversation as resolved.
Show resolved Hide resolved
var i = 0
result = 0
when defined(x86) or defined(amd64):
Expand Down