Skip to content

Commit

Permalink
Getting vxsort working on Linux amd64 (#98712)
Browse files Browse the repository at this point in the history
Co-authored-by: Jan Vorlicek <janvorli@microsoft.com>
Co-authored-by: Adeel Mujahid <3840695+am11@users.noreply.github.com>
  • Loading branch information
3 people authored Apr 3, 2024
1 parent 71f45aa commit c08bd7b
Show file tree
Hide file tree
Showing 32 changed files with 142 additions and 141 deletions.
6 changes: 6 additions & 0 deletions src/coreclr/dlls/mscoree/coreclr/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,12 @@ set(CORECLR_LIBRARIES
gc_pal
)

if(CLR_CMAKE_TARGET_ARCH_AMD64)
list(APPEND CORECLR_LIBRARIES
gc_vxsort
)
endif(CLR_CMAKE_TARGET_ARCH_AMD64)

if(CLR_CMAKE_TARGET_WIN32)
list(APPEND CORECLR_LIBRARIES
${STATIC_MT_CRT_LIB}
Expand Down
26 changes: 11 additions & 15 deletions src/coreclr/gc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,20 +36,9 @@ else()
windows/Native.rc)
endif(CLR_CMAKE_HOST_UNIX)

if (CLR_CMAKE_TARGET_ARCH_AMD64 AND CLR_CMAKE_TARGET_WIN32)
set (GC_SOURCES
${GC_SOURCES}
vxsort/isa_detection.cpp
vxsort/do_vxsort_avx2.cpp
vxsort/do_vxsort_avx512.cpp
vxsort/machine_traits.avx2.cpp
vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.cpp
vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.cpp
vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.cpp
vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.cpp
vxsort/smallsort/avx2_load_mask_tables.cpp
)
endif (CLR_CMAKE_TARGET_ARCH_AMD64 AND CLR_CMAKE_TARGET_WIN32)
if (CLR_CMAKE_TARGET_ARCH_AMD64)
add_subdirectory(vxsort)
endif (CLR_CMAKE_TARGET_ARCH_AMD64)

if (CLR_CMAKE_TARGET_WIN32)
set(GC_HEADERS
Expand Down Expand Up @@ -87,7 +76,7 @@ if (CLR_CMAKE_TARGET_WIN32)
handletablepriv.h
objecthandle.h
softwarewritewatch.h
vxsort/do_vxsort.h)
)
endif(CLR_CMAKE_TARGET_WIN32)

if(CLR_CMAKE_HOST_WIN32)
Expand All @@ -100,6 +89,13 @@ endif(CLR_CMAKE_HOST_WIN32)

set (GC_LINK_LIBRARIES ${GC_LINK_LIBRARIES} gc_pal)

if(CLR_CMAKE_TARGET_ARCH_AMD64)
list(APPEND GC_LINK_LIBRARIES
gc_vxsort
)
endif(CLR_CMAKE_TARGET_ARCH_AMD64)


list(APPEND GC_SOURCES ${GC_HEADERS})

convert_to_absolute_path(GC_SOURCES ${GC_SOURCES})
Expand Down
6 changes: 3 additions & 3 deletions src/coreclr/gc/gc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

#include "gcpriv.h"

#if defined(TARGET_AMD64) && defined(TARGET_WINDOWS)
#ifdef TARGET_AMD64
#define USE_VXSORT
#else
#define USE_INTROSORT
Expand Down Expand Up @@ -10305,11 +10305,11 @@ static void do_vxsort (uint8_t** item_array, ptrdiff_t item_count, uint8_t* rang
{
// above this threshold, using AVX2 for sorting will likely pay off
// despite possible downclocking on some devices
const size_t AVX2_THRESHOLD_SIZE = 8 * 1024;
const ptrdiff_t AVX2_THRESHOLD_SIZE = 8 * 1024;

// above this threshold, using AVX512F for sorting will likely pay off
// despite possible downclocking on current devices
const size_t AVX512F_THRESHOLD_SIZE = 128 * 1024;
const ptrdiff_t AVX512F_THRESHOLD_SIZE = 128 * 1024;

if (item_count <= 1)
return;
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/gc/gcsvr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

#define SERVER_GC 1

#if defined(TARGET_AMD64) && defined(TARGET_WINDOWS)
#ifdef TARGET_AMD64
#include "vxsort/do_vxsort.h"
#endif

Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/gc/gcwks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
#undef SERVER_GC
#endif

#if defined(TARGET_AMD64) && defined(TARGET_WINDOWS)
#ifdef TARGET_AMD64
#include "vxsort/do_vxsort.h"
#endif

Expand Down
6 changes: 0 additions & 6 deletions src/coreclr/gc/unix/gcenv.unix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,6 @@
#define __has_cpp_attribute(x) (0)
#endif

#if __has_cpp_attribute(fallthrough)
#define FALLTHROUGH [[fallthrough]]
#else
#define FALLTHROUGH
#endif

#include <algorithm>

#if HAVE_SYS_TIME_H
Expand Down
29 changes: 29 additions & 0 deletions src/coreclr/gc/vxsort/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
set(CMAKE_INCLUDE_CURRENT_DIR ON)
include_directories("../env")

if(CLR_CMAKE_HOST_UNIX)
set_source_files_properties(isa_detection.cpp PROPERTIES COMPILE_FLAGS -mavx2)
set_source_files_properties(do_vxsort_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
set_source_files_properties(do_vxsort_avx512.cpp PROPERTIES COMPILE_FLAGS -mavx2)
set_source_files_properties(machine_traits.avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
set_source_files_properties(smallsort/bitonic_sort.AVX2.int64_t.generated.cpp PROPERTIES COMPILE_FLAGS -mavx2)
set_source_files_properties(smallsort/bitonic_sort.AVX2.int32_t.generated.cpp PROPERTIES COMPILE_FLAGS -mavx2)
set_source_files_properties(smallsort/bitonic_sort.AVX512.int64_t.generated.cpp PROPERTIES COMPILE_FLAGS -mavx2)
set_source_files_properties(smallsort/bitonic_sort.AVX512.int32_t.generated.cpp PROPERTIES COMPILE_FLAGS -mavx2)
set_source_files_properties(smallsort/avx2_load_mask_tables.cpp PROPERTIES COMPILE_FLAGS -mavx2)
endif(CLR_CMAKE_HOST_UNIX)

set (VXSORT_SOURCES
isa_detection.cpp
do_vxsort_avx2.cpp
do_vxsort_avx512.cpp
machine_traits.avx2.cpp
smallsort/bitonic_sort.AVX2.int64_t.generated.cpp
smallsort/bitonic_sort.AVX2.int32_t.generated.cpp
smallsort/bitonic_sort.AVX512.int64_t.generated.cpp
smallsort/bitonic_sort.AVX512.int32_t.generated.cpp
smallsort/avx2_load_mask_tables.cpp
do_vxsort.h
)

add_library(gc_vxsort STATIC ${VXSORT_SOURCES})
31 changes: 1 addition & 30 deletions src/coreclr/gc/vxsort/defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,36 +45,7 @@
#define NOINLINE __attribute__((noinline))
#endif

namespace std {
template <class _Ty>
class numeric_limits {
public:
static constexpr _Ty Max() { static_assert(sizeof(_Ty) != sizeof(_Ty), "func must be specialized!"); return _Ty(); }
static constexpr _Ty Min() { static_assert(sizeof(_Ty) != sizeof(_Ty), "func must be specialized!"); return _Ty(); }
};

template <>
class numeric_limits<int32_t> {
public:
static constexpr int32_t Max() { return 0x7fffffff; }
static constexpr int32_t Min() { return -0x7fffffff - 1; }
};

template <>
class numeric_limits<uint32_t> {
public:
static constexpr uint32_t Max() { return 0xffffffff; }
static constexpr uint32_t Min() { return 0; }
};

template <>
class numeric_limits<int64_t> {
public:
static constexpr int64_t Max() { return 0x7fffffffffffffffi64; }

static constexpr int64_t Min() { return -0x7fffffffffffffffi64 - 1; }
};
} // namespace std
#include <limits>

#ifndef max
template <typename T>
Expand Down
4 changes: 2 additions & 2 deletions src/coreclr/gc/vxsort/machine_traits.avx2.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <immintrin.h>
#include <assert.h>
#include <inttypes.h>
#include <type_traits>
#include "defs.h"
#include "machine_traits.h"

Expand Down Expand Up @@ -123,8 +124,7 @@ class vxsort_machine_traits<int64_t, AVX2> {

template <int Shift>
static constexpr bool can_pack(T span) {
const auto PACK_LIMIT = (((TU) std::numeric_limits<uint32_t>::Max() + 1)) << Shift;
return ((TU) span) < PACK_LIMIT;
return ((TU) span) < ((((TU) std::numeric_limits<uint32_t>::max() + 1)) << Shift);
}

static INLINE TV load_vec(TV* p) { return _mm256_lddqu_si256(p); }
Expand Down
4 changes: 2 additions & 2 deletions src/coreclr/gc/vxsort/machine_traits.avx512.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "vxsort_targets_enable_avx512.h"

#include <immintrin.h>
#include <type_traits>
#include "defs.h"
#include "machine_traits.h"

Expand Down Expand Up @@ -92,8 +93,7 @@ class vxsort_machine_traits<int64_t, AVX512> {

template <int Shift>
static constexpr bool can_pack(T span) {
const auto PACK_LIMIT = (((TU) std::numeric_limits<uint32_t>::Max() + 1)) << Shift;
return ((TU) span) < PACK_LIMIT;
return ((TU) span) < ((((TU) std::numeric_limits<uint32_t>::max() + 1)) << Shift);
}

static INLINE TV load_vec(TV* p) { return _mm512_loadu_si512(p); }
Expand Down
12 changes: 6 additions & 6 deletions src/coreclr/gc/vxsort/packer.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ class packer {
public:

static void pack(TFrom *mem, size_t len, TFrom base) {
TFrom offset = MT::template shift_n_sub<Shift>(base, (TFrom) std::numeric_limits<TTo>::Min());
TFrom offset = MT::template shift_n_sub<Shift>(base, (TFrom) std::numeric_limits<TTo>::min());
auto baseVec = MT::broadcast(offset);

auto pre_aligned_mem = reinterpret_cast<TFrom *>(reinterpret_cast<size_t>(mem) & ~ALIGN_MASK);
Expand Down Expand Up @@ -87,8 +87,8 @@ class packer {

assert(AH::is_aligned(mem_read));

auto memv_read = (TV *) mem_read;
auto memv_write = (TV *) mem_write;
TV * memv_read = (TV *) mem_read;
TV * memv_write = (TV *) mem_write;

auto lenv = len / N;
len -= (lenv * N);
Expand Down Expand Up @@ -156,7 +156,7 @@ class packer {


static void unpack(TTo *mem, size_t len, TFrom base) {
TFrom offset = MT::template shift_n_sub<Shift>(base, (TFrom) std::numeric_limits<TTo>::Min());
TFrom offset = MT::template shift_n_sub<Shift>(base, (TFrom) std::numeric_limits<TTo>::min());
auto baseVec = MT::broadcast(offset);

auto mem_read = mem + len;
Expand Down Expand Up @@ -184,8 +184,8 @@ class packer {
assert(AH::is_aligned(mem_read));

auto lenv = len / (N * 2);
auto memv_read = ((TV *) mem_read) - 1;
auto memv_write = ((TV *) mem_write) - 2;
TV * memv_read = ((TV *) mem_read) - 1;
TV * memv_write = ((TV *) mem_write) - 2;
len -= lenv * N * 2;

while (lenv >= Unroll) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ extern "C" const uint8_t mask_table_8[M8_SIZE];

template<> struct bitonic<int32_t, AVX2> {
static const int N = 8;
static constexpr int32_t MAX = std::numeric_limits<int32_t>::Max();
static constexpr int32_t MAX = std::numeric_limits<int32_t>::max();
public:

static INLINE void sort_01v_ascending(__m256i& d01) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ extern "C" const uint8_t mask_table_8[M8_SIZE];

template<> struct bitonic<int64_t, AVX2> {
static const int N = 4;
static constexpr int64_t MAX = std::numeric_limits<int64_t>::Max();
static constexpr int64_t MAX = std::numeric_limits<int64_t>::max();
public:

static INLINE void sort_01v_ascending(__m256i& d01) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ namespace vxsort {
namespace smallsort {
template<> struct bitonic<int32_t, AVX512> {
static const int N = 16;
static constexpr int32_t MAX = std::numeric_limits<int32_t>::Max();
static constexpr int32_t MAX = std::numeric_limits<int32_t>::max();
public:

static INLINE void sort_01v_ascending(__m512i& d01) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ namespace vxsort {
namespace smallsort {
template<> struct bitonic<int64_t, AVX512> {
static const int N = 8;
static constexpr int64_t MAX = std::numeric_limits<int64_t>::Max();
static constexpr int64_t MAX = std::numeric_limits<int64_t>::max();
public:

static INLINE void sort_01v_ascending(__m512i& d01) {
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/gc/vxsort/smallsort/codegen/avx2.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ def generate_prologue(self, f):
template<> struct bitonic<{t}, AVX2> {{
static const int N = {self.vector_size()};
static constexpr {t} MAX = std::numeric_limits<{t}>::Max();
static constexpr {t} MAX = std::numeric_limits<{t}>::max();
public:
"""
print(s, file=f)
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/gc/vxsort/smallsort/codegen/avx512.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ def generate_prologue(self, f):
namespace smallsort {{
template<> struct bitonic<{t}, AVX512> {{
static const int N = {self.vector_size()};
static constexpr {t} MAX = std::numeric_limits<{t}>::Max();
static constexpr {t} MAX = std::numeric_limits<{t}>::max();
public:
"""
print(s, file=f)
Expand Down
Loading

0 comments on commit c08bd7b

Please sign in to comment.