-
Notifications
You must be signed in to change notification settings - Fork 1.5k
make bitset use popcount #2126
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
make bitset use popcount #2126
Conversation
Similar to `all()` method
tr1/bitset test has enough coverage |
try to depend on local variable
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd like to see benchmark results using the actual __isa_available symbol or at the very least using a normal extern and not a volatile.
The benchmark that compares old implementation, new implementation and fallback of new implementation: // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include <bit>
#include <chrono>
#include <iostream>
#include <random>
int get_isa_available() noexcept {
if (_STD is_constant_evaluated()) {
return 0;
}
#if defined(_M_IX86) || defined(_M_X64)
return std::__isa_available;
#else
return 0;
#endif
}
template <class _Ty>
int checked_x86_x64_popcount(const _Ty _Val, const int _Isa_available) noexcept {
constexpr int _Digits = std::numeric_limits<_Ty>::digits;
#ifndef __AVX__
const bool _Definitely_have_popcnt = _Isa_available >= __ISA_AVAILABLE_SSE42;
if (!_Definitely_have_popcnt) {
return std::_Popcount_fallback(_Val);
}
#else // ^^^ pre-__AVX__ ^^^ / vvv __AVX__ vvv
(void)_Isa_available;
#endif // ^^^ __AVX__ ^^^
if constexpr (_Digits <= 16) {
return static_cast<int>(__popcnt16(_Val));
}
else if constexpr (_Digits == 32) {
return static_cast<int>(__popcnt(_Val));
}
else {
#ifdef _M_IX86
return static_cast<int>(__popcnt(_Val >> 32) + __popcnt(static_cast<unsigned int>(_Val)));
#else // ^^^ _M_IX86 / !_M_IX86 vvv
return static_cast<int>(__popcnt64(_Val));
#endif // _M_IX86
}
}
template <class _Ty, std::enable_if_t<std::_Is_standard_unsigned_integer<_Ty>, int> _Enabled = 0>
int popcount(const _Ty _Val, const int _Isa_available = _Get_isa_available()) noexcept {
if (!_STD is_constant_evaluated())
{
return checked_x86_x64_popcount(_Val, _Isa_available);
}
return std::_Popcount_fallback(_Val);
}
template<std::size_t Words>
struct bitset {
void populate() {
std::mt19937 mt(43223);
std::uniform_int_distribution<Ty> dis(std::numeric_limits<Ty>::min(), std::numeric_limits<Ty>::max());
for (std::size_t Wpos = 0; Wpos <= Words; ++Wpos) {
Array[Wpos] = dis(mt);
}
}
using Ty = std::uint64_t;
size_t count_old() const noexcept {
const char* const _Bitsperbyte =
"\0\1\1\2\1\2\2\3\1\2\2\3\2\3\3\4"
"\1\2\2\3\2\3\3\4\2\3\3\4\3\4\4\5"
"\1\2\2\3\2\3\3\4\2\3\3\4\3\4\4\5"
"\2\3\3\4\3\4\4\5\3\4\4\5\4\5\5\6"
"\1\2\2\3\2\3\3\4\2\3\3\4\3\4\4\5"
"\2\3\3\4\3\4\4\5\3\4\4\5\4\5\5\6"
"\2\3\3\4\3\4\4\5\3\4\4\5\4\5\5\6"
"\3\4\4\5\4\5\5\6\4\5\5\6\5\6\6\7"
"\1\2\2\3\2\3\3\4\2\3\3\4\3\4\4\5"
"\2\3\3\4\3\4\4\5\3\4\4\5\4\5\5\6"
"\2\3\3\4\3\4\4\5\3\4\4\5\4\5\5\6"
"\3\4\4\5\4\5\5\6\4\5\5\6\5\6\6\7"
"\2\3\3\4\3\4\4\5\3\4\4\5\4\5\5\6"
"\3\4\4\5\4\5\5\6\4\5\5\6\5\6\6\7"
"\3\4\4\5\4\5\5\6\4\5\5\6\5\6\6\7"
"\4\5\5\6\5\6\6\7\5\6\6\7\6\7\7\x8";
const unsigned char* _Ptr = &reinterpret_cast<const unsigned char&>(Array);
const unsigned char* const _End = _Ptr + sizeof(Array);
size_t _Val = 0;
for (; _Ptr != _End; ++_Ptr) {
_Val += _Bitsperbyte[*_Ptr];
}
return _Val;
}
size_t count_new() const noexcept {
const int _Isa_available = get_isa_available();
size_t _Val = 0;
for (size_t _Wpos = 0; _Wpos <= Words; ++_Wpos) {
_Val += popcount(Array[_Wpos], _Isa_available);
}
return _Val;
}
Ty Array[Words + 1];
};
volatile std::size_t result;
int main()
{
const int N = 200000;
bitset<10'000> bs;
bs.populate();
std::chrono::steady_clock::duration d1{};
std::chrono::steady_clock::duration d2{};
std::chrono::steady_clock::duration d3{};
std::chrono::steady_clock::time_point t;
t = std::chrono::steady_clock::now();
for (int i = 0; i < N; ++i) {
result = bs.count_old();
}
d1 = std::chrono::steady_clock::now() - t;
t = std::chrono::steady_clock::now();
for (int i = 0; i < N; ++i) {
result = bs.count_new();
}
d2 = std::chrono::steady_clock::now() - t;
std::__isa_available = 0;
t = std::chrono::steady_clock::now();
for (int i = 0; i < N; ++i) {
result = bs.count_new();
}
d3 = std::chrono::steady_clock::now() - t;
std::cout << "old\t" << d1 << "\n";
std::cout << "new\t" << d2 << "\n";
std::cout << "new fb\t" << d3 << "\n";
return 0;
} |
x64 benchmark result:
x86 benchmark result:
So it works, but the fallback become worse. |
Turns out that for the best performance, more significant changes are needed. I'll wait for #2127 to be merged, then will continue, as the changes going to be extensive. |
Re-doe in a different PR, used another approach -- please review #2201 |
Resolve #667