Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make QMC_SIMD_ALIGNMENT configured via CMake #2981

Merged
merged 5 commits into from
Mar 5, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions CMake/CheckSIMDAlignment.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Check if AVX512 is activated in the compilation
# Since cross-compiling is not unusual on HPC systems (Cray),
# try_compile is robust against
try_compile(CXX_COMPILER_HAVE_AVX512_MACRO ${CMAKE_BINARY_DIR}
${PROJECT_CMAKE}/try_compile_sources/checkAVX512.cxx
CMAKE_FLAGS "${CMAKE_CXX_FLAGS}")

if (CXX_COMPILER_HAVE_AVX512_MACRO)
set(default_alignment 64)
else()
set(default_alignment 32)
endif()
12 changes: 3 additions & 9 deletions CMake/ClangCompilers.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,14 @@ IF(QMC_OMP)
SET(ENABLE_OPENMP 1)
IF(ENABLE_OFFLOAD AND NOT CMAKE_SYSTEM_NAME STREQUAL "CrayLinuxEnvironment")
SET(OFFLOAD_TARGET "nvptx64-nvidia-cuda" CACHE STRING "Offload target architecture")
SET(CLANG_OPENMP_OFFLOAD_FLAGS "-fopenmp-targets=${OFFLOAD_TARGET}")
SET(OPENMP_OFFLOAD_COMPILE_OPTIONS "-fopenmp-targets=${OFFLOAD_TARGET}")

IF(DEFINED OFFLOAD_ARCH)
SET(CLANG_OPENMP_OFFLOAD_FLAGS "${CLANG_OPENMP_OFFLOAD_FLAGS} -Xopenmp-target=${OFFLOAD_TARGET} -march=${OFFLOAD_ARCH}")
SET(OPENMP_OFFLOAD_COMPILE_OPTIONS "${OPENMP_OFFLOAD_COMPILE_OPTIONS} -Xopenmp-target=${OFFLOAD_TARGET} -march=${OFFLOAD_ARCH}")
ENDIF()

IF(OFFLOAD_TARGET MATCHES "nvptx64")
SET(CLANG_OPENMP_OFFLOAD_FLAGS "${CLANG_OPENMP_OFFLOAD_FLAGS} -Wno-unknown-cuda-version")
SET(OPENMP_OFFLOAD_COMPILE_OPTIONS "${OPENMP_OFFLOAD_COMPILE_OPTIONS} -Wno-unknown-cuda-version")
ENDIF()

# Intel clang compiler needs a different flag for the host side OpenMP library when offload is used.
Expand Down Expand Up @@ -118,12 +118,6 @@ ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64" OR CMAKE_SYSTEM_PROCESSOR MATCHES
ENDIF()
ENDIF()

# Add OpenMP offload flags
# This step is intentionally put after the -march parsing for CPUs.
IF(DEFINED CLANG_OPENMP_OFFLOAD_FLAGS)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CLANG_OPENMP_OFFLOAD_FLAGS}")
ENDIF()

# Add static flags if necessary
IF(QMC_BUILD_STATIC)
SET(CMAKE_CXX_LINK_FLAGS " -static")
Expand Down
7 changes: 3 additions & 4 deletions CMake/GNUCompilers.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,10 @@ SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99")
IF(QMC_OMP)
SET(ENABLE_OPENMP 1)
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp")
IF(ENABLE_OFFLOAD)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
IF(ENABLE_OFFLOAD AND NOT CMAKE_SYSTEM_NAME STREQUAL "CrayLinuxEnvironment")
SET(OFFLOAD_TARGET "nvptx-none" CACHE STRING "Offload target architecture")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -foffload=${OFFLOAD_TARGET} -foffload=\"-lm -latomic\"")
ELSE()
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
SET(OPENMP_OFFLOAD_COMPILE_OPTIONS "-foffload=${OFFLOAD_TARGET} -foffload=\"-lm -latomic\"")
ENDIF()
ENDIF(QMC_OMP)

Expand Down
6 changes: 3 additions & 3 deletions CMake/IBMCompilers.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,10 @@ SET( CMAKE_CXX_FLAGS_RELWITHDEBINFO "-g -O3" )

IF(QMC_OMP)
SET(ENABLE_OPENMP 1)
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -qsmp=omp")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qsmp=omp")
IF(ENABLE_OFFLOAD)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qsmp=omp -qoffload")
ELSE(ENABLE_OFFLOAD)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qsmp=omp")
set(OPENMP_OFFLOAD_COMPILE_OPTIONS "-qoffload")
ENDIF(ENABLE_OFFLOAD)
ELSE(QMC_OMP)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qnothreaded")
Expand Down
2 changes: 1 addition & 1 deletion CMake/PGICompilers.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ IF(QMC_OMP)
MESSAGE(FATAL_ERROR "NVIDIA HPC compiler requires -gpu=ccXX option set based on the target GPU architecture! "
"Please add -DOFFLOAD_ARCH=ccXX to cmake. For example, cc70 is for Volta.")
ENDIF()
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mp=gpu -gpu=${OFFLOAD_ARCH}")
SET(OPENMP_OFFLOAD_COMPILE_OPTIONS "-mp=gpu -gpu=${OFFLOAD_ARCH}")
ELSE()
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mp=allcores")
ENDIF()
Expand Down
8 changes: 8 additions & 0 deletions CMake/try_compile_sources/checkAVX512.cxx
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
// Check if AVX512 is activated by the compiler
int main(int argc, char **argv)
{
#if !defined(__AVX512F__)
#error "AVX512 not found"
#endif
return 0;
}
26 changes: 26 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ SET(OHMMS_DIM 3 CACHE STRING "Select physical dimension")
SET(OHMMS_INDEXTYPE int)
MESSAGE(STATUS "defining the float point precision")
SET(OHMMS_PRECISION_FULL double)

IF(QMC_CUDA)
SET(QMC_MIXED_PRECISION 1 CACHE BOOL "Enable/disable mixed precision")
SET(OHMMS_PRECISION double)
Expand Down Expand Up @@ -427,6 +428,31 @@ IF (BUILD_AFQMC AND NOT APPLE)
LINK_LIBRARIES("rt")
ENDIF()

#-------------------------------------------------------------------------------
# Check SIMD alignment for CPU only
#-------------------------------------------------------------------------------
# This is intentionally placed before adding OpenMP offload compile options
# to avoid contamination from device compilation pass.
# When '-march=skylake-avx512 -fopenmp-targets=nvptx64 -march=sm_70' is added
# for Clang, the source code is parsed twice for both host and offload targets.
# A trap for macro __AVX512F__ always fails because the offload pass doesn't
# carry '-march=skylake-avx512' but only takes '-march=sm_70'.
#-------------------------------------------------------------------------------
include(CMake/CheckSIMDAlignment.cmake)
set(QMC_SIMD_ALIGNMENT ${default_alignment} CACHE STRING "Cache/SIMD alignment in bytes")
math(EXPR alignment_remainder "${QMC_SIMD_ALIGNMENT} % ${default_alignment}")
if (alignment_remainder)
message(FATAL_ERROR "QMC_SIMD_ALIGNMENT must be a multiple of ${default_alignment}. Bad cached value is ${QMC_SIMD_ALIGNMENT}")
endif()
message(STATUS "Setting QMC_SIMD_ALIGNMENT to ${QMC_SIMD_ALIGNMENT}")

#---------------------------------------------------------
# Set up OpenMP offload compile options
#---------------------------------------------------------
if (ENABLE_OFFLOAD AND DEFINED OPENMP_OFFLOAD_COMPILE_OPTIONS)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_OFFLOAD_COMPILE_OPTIONS}")
endif()

#-------------------------------------------------------------------
# check MPI installation
#-------------------------------------------------------------------
Expand Down
16 changes: 12 additions & 4 deletions src/Platforms/CPU/SIMD/aligned_allocator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,20 @@

#include <vector>
#include <cstdlib>
#include "alignment.config.h"
#include "config.h"
#include "Mallocator.hpp"

#if defined(__INTEL_COMPILER)
#define ASSUME_ALIGNED(x) __assume_aligned(x,QMC_SIMD_ALIGNMENT)
#elif defined(__GNUC__) && !defined(__ibmxl__)
#define ASSUME_ALIGNED(x) (x) = (__typeof__(x)) __builtin_assume_aligned(x,QMC_SIMD_ALIGNMENT)
#else
#define ASSUME_ALIGNED(x)
#endif

namespace qmcplusplus
{
template<class T, size_t ALIGN = QMC_CLINE>
template<class T, size_t ALIGN = QMC_SIMD_ALIGNMENT>
using aligned_allocator = Mallocator<T, ALIGN>;
template<class T>
using aligned_vector = std::vector<T, aligned_allocator<T>>;
Expand All @@ -30,15 +38,15 @@ using aligned_vector = std::vector<T, aligned_allocator<T>>;

/** return size in T's of allocated aligned memory
*/
template<typename T, size_t ALIGN = QMC_CLINE>
template<typename T, size_t ALIGN = QMC_SIMD_ALIGNMENT>
inline size_t getAlignedSize(size_t n)
{
constexpr size_t ND = ALIGN / sizeof(T);
static_assert(ALIGN % sizeof(T) == 0, "getAlignedSize ALIGN must be a multiple of sizeof(T)");
return ((n + ND - 1) / ND) * ND;
}

template<typename T, size_t ALIGN = QMC_CLINE>
template<typename T, size_t ALIGN = QMC_SIMD_ALIGNMENT>
inline size_t getAlignment()
{
static_assert(ALIGN % sizeof(T) == 0, "getAlignedSize ALIGN must be a multiple of sizeof(T)");
Expand Down
37 changes: 0 additions & 37 deletions src/Platforms/CPU/SIMD/alignment.config.h

This file was deleted.

3 changes: 1 addition & 2 deletions src/Platforms/CUDA/CUDAallocator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
#include "cudaError.h"
#include "allocator_traits.hpp"
#include "CUDAfill.hpp"
#include "CPU/SIMD/alignment.config.h"

namespace qmcplusplus
{
Expand Down Expand Up @@ -62,7 +61,7 @@ struct CUDAManagedAllocator
{
void* pt;
cudaErrorCheck(cudaMallocManaged(&pt, n * sizeof(T)), "Allocation failed in CUDAManagedAllocator!");
if ((size_t(pt)) & (QMC_CLINE - 1))
if ((size_t(pt)) & (QMC_SIMD_ALIGNMENT - 1))
throw std::runtime_error("Unaligned memory allocated in CUDAManagedAllocator");
return static_cast<T*>(pt);
}
Expand Down
2 changes: 1 addition & 1 deletion src/Platforms/PinnedAllocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ using PinnedAllocator = CUDALockedPageAllocator<T>;
using PinnedAllocator = std::allocator<T>;
#endif

template<typename T, size_t ALIGN = QMC_CLINE>
template<typename T, size_t ALIGN = QMC_SIMD_ALIGNMENT>
#ifdef ENABLE_CUDA
using PinnedAlignedAllocator = CUDALockedPageAllocator<T, aligned_allocator<T, ALIGN>>;
#else
Expand Down
16 changes: 8 additions & 8 deletions src/Platforms/tests/CPU/test_aligned_allocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,21 +23,21 @@ TEST_CASE("Aligned allocator", "[numerics]")
bool not_aligned;

aligned_vector<float> a(311);
std::cout << "address=" << a.data() << " require=" << (void*)(QMC_CLINE - 1) << std::endl;
not_aligned = (size_t)a.data() & (QMC_CLINE - 1);
std::cout << "address=" << a.data() << " require=" << (void*)(QMC_SIMD_ALIGNMENT - 1) << std::endl;
not_aligned = (size_t)a.data() & (QMC_SIMD_ALIGNMENT - 1);
REQUIRE(!not_aligned);
a.resize(829);
std::cout << "address=" << a.data() << " require=" << (void*)(QMC_CLINE - 1) << std::endl;
not_aligned = (size_t)a.data() & (QMC_CLINE - 1);
std::cout << "address=" << a.data() << " require=" << (void*)(QMC_SIMD_ALIGNMENT - 1) << std::endl;
not_aligned = (size_t)a.data() & (QMC_SIMD_ALIGNMENT - 1);
REQUIRE(!not_aligned);

aligned_vector<double> b(311);
std::cout << "address=" << b.data() << " require=" << (void*)(QMC_CLINE - 1) << std::endl;
not_aligned = (size_t)b.data() & (QMC_CLINE - 1);
std::cout << "address=" << b.data() << " require=" << (void*)(QMC_SIMD_ALIGNMENT - 1) << std::endl;
not_aligned = (size_t)b.data() & (QMC_SIMD_ALIGNMENT - 1);
REQUIRE(!not_aligned);
b.resize(829);
std::cout << "address=" << b.data() << " require=" << (void*)(QMC_CLINE - 1) << std::endl;
not_aligned = (size_t)b.data() & (QMC_CLINE - 1);
std::cout << "address=" << b.data() << " require=" << (void*)(QMC_SIMD_ALIGNMENT - 1) << std::endl;
not_aligned = (size_t)b.data() & (QMC_SIMD_ALIGNMENT - 1);
REQUIRE(!not_aligned);
}

Expand Down
2 changes: 1 addition & 1 deletion src/Utilities/PooledMemory.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ struct PooledMemory
inline void allocate()
{
myData.resize(Current + Current_scalar * scalar_multiplier);
if ((size_t(myData.data())) & (QMC_CLINE - 1))
if ((size_t(myData.data())) & (QMC_SIMD_ALIGNMENT - 1))
throw std::runtime_error("Unaligned memory allocated in PooledMemory");
Scalar_ptr = reinterpret_cast<T_scalar*>(myData.data() + Current);
}
Expand Down
2 changes: 1 addition & 1 deletion src/Utilities/tests/test_pooled_memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ TEST_CASE("pack scalar", "[utilities]")
p >> i5;

p.get(i6_dummy.data(), i6_dummy.data() + i6_dummy.size());
bool not_aligned = (((size_t)p.data()) + p.current()) & (QMC_CLINE - 1);
bool not_aligned = (((size_t)p.data()) + p.current()) & (QMC_SIMD_ALIGNMENT - 1);
REQUIRE(!not_aligned);

p.get(i6.data(), i6.data() + i6.size());
Expand Down
3 changes: 3 additions & 0 deletions src/config.h.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@
/* Define the full precision: double, long double */
#cmakedefine OHMMS_PRECISION_FULL @OHMMS_PRECISION_FULL@

/* Define Cache/SIMD alignment in bytes */
#cmakedefine QMC_SIMD_ALIGNMENT @QMC_SIMD_ALIGNMENT@

/* Define to 1 if precision is mixed, only for the CPU code */
#cmakedefine MIXED_PRECISION @MIXED_PRECISION@

Expand Down
1 change: 1 addition & 0 deletions src/qmcpack.settings
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ QMC_OMP = @QMC_OMP@
QMC_CUDA = @QMC_CUDA@
QMC_COMPLEX = @QMC_COMPLEX@
QMC_MIXED_PRECISION = @QMC_MIXED_PRECISION@
QMC_SIMD_ALIGNMENT = @QMC_SIMD_ALIGNMENT@
BUILD_AFQMC = @BUILD_AFQMC@
BUILD_FCIQMC = @BUILD_FCIQMC@
ENABLE_OFFLOAD = @ENABLE_OFFLOAD@
Expand Down