Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Find CUDA libraries with NVHPC package #1194

Merged
merged 3 commits into from
Nov 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,13 @@ if(MINGW OR CYGWIN)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wa,-mbig-obj")
endif()

# For now, PGI/NVHPC nvc++ compiler doesn't seem to support
# `#pragma omp declare reduction`
if (${CMAKE_CXX_COMPILER_ID} MATCHES "PGI|NVHPC")
message(STATUS "OpenMP: Switching to OFF because PGI/NVHPC nvc++ compiler lacks important features.")
set(GINKGO_BUILD_OMP OFF)
endif()

set(GINKGO_CIRCULAR_DEPS_FLAGS "-Wl,--no-undefined")

# Use ccache as compilation launcher
Expand Down
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,12 @@ For Ginkgo core library:
* _clang 3.9+_
* _Intel compiler 2018+_
* _Apple LLVM 8.0+_
* _Cray Compiler 14.0.1+_
* _NVHPC Compiler 22.7+_

The Ginkgo CUDA module has the following __additional__ requirements:

* _CUDA 9.2+_
* _CUDA 9.2+_ or _NVHPC Package 22.7+_
* Any host compiler restrictions your version of CUDA may impose also apply
here. For the newest CUDA version, this information can be found in the
[CUDA installation guide for Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html)
Expand Down
56 changes: 36 additions & 20 deletions cmake/cuda.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,44 @@ cas_variable_cuda_architectures(GINKGO_CUDA_ARCH_FLAGS
ARCHITECTURES ${GINKGO_CUDA_ARCHITECTURES}
UNSUPPORTED "20" "21")

if (${CMAKE_CXX_COMPILER_ID} MATCHES "PGI|NVHPC")
find_package(NVHPC REQUIRED
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this need to be required in this case, or we silently disable CUDA if not found?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure about PGI.
For NVHPC, I will assume it is always shipped with the cuda toolkit

Copy link
Member Author

@tcojean tcojean Nov 11, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the problem is the nvc++ compiler is identified as PGI at least on Perlmutter...

Also, if the user sets -DGINKGO_BUILD_CUDA=OFF it should still succeed even with a PGI compiler which doesn't include CUDA, as this would not be evaluated.

HINTS
$ENV{NVIDIA_PATH}
${CMAKE_CUDA_COMPILER}/../../..
)

set(CUDA_RUNTIME_LIBS_DYNAMIC ${NVHPC_CUDART_LIBRARY})
set(CUDA_RUNTIME_LIBS_STATIC ${NVHPC_CUDART_LIBRARY_STATIC})
set(CUBLAS ${NVHPC_CUBLAS_LIBRARY})
set(CUSPARSE ${NVHPC_CUSPARSE_LIBRARY})
set(CURAND ${NVHPC_CURAND_LIBRARY})
set(CUFFT ${NVHPC_CUFFT_LIBRARY})
else()
find_library(CUDA_RUNTIME_LIBS_DYNAMIC cudart
HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
find_library(CUDA_RUNTIME_LIBS_STATIC cudart_static
HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})

# CUDA 10.1/10.2 put cublas, cublasLt, cudnn in /usr/lib/<arch>-linux-gnu/, but
# other versions (<= 10.0 or >= 11) put them in CUDA's own directory.
# If the environment installs several CUDA versions including 10.1/10.2, CMake will
# find the 10.1/10.2 .so files when searching for other CUDA versions in the default path.
# CMake already puts /usr/lib/<arch>-linux-gnu/ after cuda own directory in the
# `CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES`, so we always put NO_DEFAULT_PATH here.
find_library(CUBLAS cublas
HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES} NO_DEFAULT_PATH)
find_library(CUSPARSE cusparse
HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
find_library(CURAND curand
HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
find_library(CUFFT cufft
HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
endif()

# MSVC nvcc uses the static cudart library by default, and other platforms use the shared cudart library.
# Add `-cudart shared` or `-cudart=shared` (depending on the system) to CMAKE_CUDA_FLAGS
# to force nvcc to use the dynamic cudart library on MSVC.
find_library(CUDA_RUNTIME_LIBS_DYNAMIC cudart
HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
find_library(CUDA_RUNTIME_LIBS_STATIC cudart_static
HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
if(MSVC)
if("${CMAKE_CUDA_FLAGS}" MATCHES "-cudart(=| )shared")
set(CUDA_RUNTIME_LIBS "${CUDA_RUNTIME_LIBS_DYNAMIC}" CACHE STRING "Path to a library" FORCE)
Expand All @@ -38,21 +69,6 @@ else()
set(CUDA_RUNTIME_LIBS "${CUDA_RUNTIME_LIBS_DYNAMIC}" CACHE STRING "Path to a library" FORCE)
endif()

# CUDA 10.1/10.2 put cublas, cublasLt, cudnn in /usr/lib/<arch>-linux-gnu/, but
# others (<= 10.0 or >= 11) put them in cuda own directory
# If the environment installs several cuda including 10.1/10.2, cmake will find
# the 10.1/10.2 .so files when searching others cuda in the default path.
# CMake already puts /usr/lib/<arch>-linux-gnu/ after cuda own directory in the
# `CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES`, so we always put NO_DEFAULT_PATH here.
find_library(CUBLAS cublas
HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES} NO_DEFAULT_PATH)
find_library(CUSPARSE cusparse
HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
find_library(CURAND curand
HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
find_library(CUFFT cufft
HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})

if (NOT CMAKE_CUDA_HOST_COMPILER AND NOT GINKGO_CUDA_DEFAULT_HOST_COMPILER)
set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}" CACHE STRING "" FORCE)
elseif(GINKGO_CUDA_DEFAULT_HOST_COMPILER)
Expand All @@ -75,4 +91,4 @@ if (CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION
message(FATAL_ERROR "There is a bug between nvcc 9.2 and clang 5.0 which create a compiling issue."
"Consider using a different CUDA host compiler or CUDA version.")
endif()
endif()
endif()
2 changes: 0 additions & 2 deletions include/ginkgo/core/stop/combined.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,6 @@ std::shared_ptr<const CriterionFactory> combine(FactoryContainer&& factories)
switch (factories.size()) {
case 0:
GKO_NOT_SUPPORTED(nullptr);
return nullptr;
case 1:
if (factories[0] == nullptr) {
GKO_NOT_SUPPORTED(nullptr);
Expand All @@ -135,7 +134,6 @@ std::shared_ptr<const CriterionFactory> combine(FactoryContainer&& factories)
if (factories[0] == nullptr) {
// first factory must be valid to capture executor
GKO_NOT_SUPPORTED(nullptr);
return nullptr;
} else {
auto exec = factories[0]->get_executor();
return Combined::build()
Expand Down
1 change: 1 addition & 0 deletions omp/distributed/partition_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ void build_starting_indices(std::shared_ptr<const DefaultExecutor> exec,
}
#pragma omp barrier
// exclusive prefix sum over local sizes
// FIXME: PGI/NVHPC(22.7) doesn't like reduction with references
#pragma omp for reduction(+ : num_empty_parts)
for (comm_index_type part = 0; part < num_parts; ++part) {
LocalIndexType size{};
Expand Down