Skip to content

Commit

Permalink
Merge branch 'develop' for 2.8.00
Browse files Browse the repository at this point in the history
Part of Kokkos C++ Performance Portability Programming EcoSystem 2.8
  • Loading branch information
ndellingwood committed Feb 6, 2019
2 parents 94456cf + 6a79032 commit 4ee5f3c
Show file tree
Hide file tree
Showing 158 changed files with 3,855 additions and 669 deletions.
20 changes: 20 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,25 @@
# Change Log

## [2.8.00](https://github.com/kokkos/kokkos-kernels/tree/2.8.00) (2019-02-05)
[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/2.7.24...2.8.00)

**Implemented enhancements:**

- Capability, Tests: C++14 Support and Testing [\#351](https://github.com/kokkos/kokkos-kernels/issues/351)
- Capability: Batched getrs [\#332](https://github.com/kokkos/kokkos-kernels/issues/332)
- More Kernel Labels for KokkosBlas [\#239](https://github.com/kokkos/kokkos-kernels/issues/239)
- Name all parallel kernels and regions [\#124](https://github.com/kokkos/kokkos-kernels/issues/124)

**Fixed bugs:**

- BLAS TPL: BLAS underscore mangling [\#369](https://github.com/kokkos/kokkos-kernels/issues/369)
- BLAS TPL, Complex: Promotion 2.7.24 broke MV unit tests in Tpetra with complex types [\#360](https://github.com/kokkos/kokkos-kernels/issues/360)
- GEMM: GEMM uses wrong function for computing shared memory allocation size [\#368](https://github.com/kokkos/kokkos-kernels/issues/368)
- BuildSystem: BLAS TPL macro not properly enabled with MKL BLAS [\#347](https://github.com/kokkos/kokkos-kernels/issues/347)
- BuildSystem: make clean - errors [\#353](https://github.com/kokkos/kokkos-kernels/issues/353)
- Compiler Workaround: Internal compiler error in KokkosBatched::Experimental::TeamGemm [\#349](https://github.com/kokkos/kokkos-kernels/issues/349)
- KokkosBlas: Some KokkosBlas kernels assume default execution space [\#14](https://github.com/kokkos/kokkos-kernels/issues/14)

## [2.7.24](https://github.com/kokkos/kokkos-kernels/tree/2.7.24) (2018-11-04)
[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/2.7.00...2.7.24)

Expand Down
13 changes: 13 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,10 @@ IF (TPL_ENABLE_MKL)
ENDIF()

IF(${Kokkos_ENABLE_Cuda})
IF (NOT KOKKOSKERNELS_ENABLE_TPL_BLAS)
SET(KOKKOSKERNELS_ENABLE_TPL_BLAS ON)
LIST(APPEND TPL_LIST "BLAS")
ENDIF()
# CUBLAS is ON by default when CUDA is enabled
SET(KOKKOSKERNELS_ENABLE_TPL_CUBLAS ON)
# Tribit provides TPL mechanism for CUSPARSE; thus, use it
Expand All @@ -319,6 +323,15 @@ IF (KOKKOSKERNELS_ENABLE_TPL_CUBLAS)
LIST(APPEND TPL_LIST "CUBLAS")
ENDIF()

# ==================================================================
# Fortran Complex BLAS
# ==================================================================

# When a host BLAS (BLAS or MKL TPL) is enabled, run the
# CHECK_HOST_BLAS_RETURN_COMPLEX probe and store its outcome in
# KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX.
IF (KOKKOSKERNELS_ENABLE_TPL_BLAS OR KOKKOSKERNELS_ENABLE_TPL_MKL)
INCLUDE(CheckHostBlasReturnComplex.cmake)
CHECK_HOST_BLAS_RETURN_COMPLEX(KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX)
ENDIF()

# ==================================================================
# CMake Summary
# ==================================================================
Expand Down
38 changes: 38 additions & 0 deletions CheckHostBlasReturnComplex.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
INCLUDE(CheckCXXSourceRuns)

# CHECK_HOST_BLAS_RETURN_COMPLEX(VARNAME)
#
# Builds and runs a small C++ program that declares the Fortran BLAS routine
# zdotc as returning std::complex<double> by value, calls it on the vector
# (0+1i, 0+2i) with itself, and exits with status 0 only when the real part of
# the result equals 5.0. The outcome of CHECK_CXX_SOURCE_RUNS is stored in the
# variable named by VARNAME.
#
# Inputs read from the calling scope:
#   TPL_BLAS_LIBRARIES - libraries the probe program is linked against.
#   F77_BLAS_MANGLE    - text pasted directly after "#define F77_BLAS_MANGLE"
#                        to form the mangling macro used in the probe.
FUNCTION(CHECK_HOST_BLAS_RETURN_COMPLEX VARNAME)

# Link the probe against the host BLAS. Set inside the function, so the
# caller's CMAKE_REQUIRED_LIBRARIES is not modified.
SET(CMAKE_REQUIRED_LIBRARIES ${TPL_BLAS_LIBRARIES})

# Source of the probe program (passed verbatim to the C++ compiler).
SET(SOURCE
"
#include <complex>
#define F77_BLAS_MANGLE${F77_BLAS_MANGLE}
extern \"C\" {
std::complex<double> F77_BLAS_MANGLE(zdotc,ZDOTC)(
const int* n,
const std::complex<double> x[], const int* incx,
const std::complex<double> y[], const int* incy);
}
int main() {
const int NUM=2;
const int INC=1;
std::complex<double> f[NUM];
const std::complex<double>
ONE = std::complex<double>(0.0,1.0),
TWO = std::complex<double>(0.0,2.0);
f[0] = ONE;
f[1] = TWO;
std::complex<double> ret
= F77_BLAS_MANGLE(zdotc,ZDOTC)(&NUM, f, &INC, f, &INC);
return (ret.real() == double(5.0) ? 0 : 1);
}
"
)

# Compile, link and run the probe; ${VARNAME} receives the result.
CHECK_CXX_SOURCE_RUNS("${SOURCE}" ${VARNAME})

ENDFUNCTION()
45 changes: 45 additions & 0 deletions Makefile.kokkos-kernels
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,40 @@ tmp := $(shell echo "----------------------------------------------*/" >> Kokkos
tmp := $(shell echo "\#ifndef KOKKOSKERNELS_CONFIG_H_" >> KokkosKernels_config.tmp)
tmp := $(shell echo "\#define KOKKOSKERNELS_CONFIG_H_" >> KokkosKernels_config.tmp)


#==== User-settable options for Fortran mangling macros =================
# With the Makefile build the Fortran name-mangling scheme is not detected
# automatically; we rely on the user's input through KOKKOSKERNELS_OPTIONS:
#   blas-mangle__  -> symbol name followed by two underscores
#   blas-mangle_   -> symbol name followed by one underscore (same as default)
#   blas-mangle    -> symbol name with no suffix
# Each grep pattern is a prefix of the one before it, so the most specific
# option is tested first and the less specific ones only in the else branches.

# default mangling scheme with a single underscore
KOKKOSKERNELS_FORTRAN_GLOBAL = name\#\#_
KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_DOUBLE_UNDERSCORES := $(strip $(shell echo $(KOKKOSKERNELS_OPTIONS) | grep "blas-mangle__" | wc -l))
ifeq ($(KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_DOUBLE_UNDERSCORES), 1)
KOKKOSKERNELS_FORTRAN_GLOBAL = name\#\#__
else
KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_SINGLE_UNDERSCORE := $(strip $(shell echo $(KOKKOSKERNELS_OPTIONS) | grep "blas-mangle_" | wc -l))
ifeq ($(KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_SINGLE_UNDERSCORE), 1)
KOKKOSKERNELS_FORTRAN_GLOBAL = name\#\#_
else
KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_NO_UNDERSCORE := $(strip $(shell echo $(KOKKOSKERNELS_OPTIONS) | grep "blas-mangle" | wc -l))
ifeq ($(KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_NO_UNDERSCORE), 1)
# No suffix: emit the bare parameter. The token-pasting operator may not be
# the last token of a macro replacement list, so it must not be written here.
KOKKOSKERNELS_FORTRAN_GLOBAL = name
endif
endif
endif

# Append the Fortran BLAS mangling section to KokkosKernels_config.tmp.
# F77_BLAS_MANGLE is wrapped in "#if !defined" so that a definition already
# in effect when the header is included is left untouched.
tmp := $(shell echo "" >> KokkosKernels_config.tmp)
tmp := $(shell echo "/* ---------------------------------------------" >> KokkosKernels_config.tmp)
tmp := $(shell echo "Fortran BLAS mangling:" >> KokkosKernels_config.tmp)
tmp := $(shell echo " ---------------------------------------------*/" >> KokkosKernels_config.tmp)
tmp := $(shell echo "\#if !defined(F77_BLAS_MANGLE)" >> KokkosKernels_config.tmp )
tmp := $(shell echo "\#define F77_BLAS_MANGLE(name,NAME) $(KOKKOSKERNELS_FORTRAN_GLOBAL)" >> KokkosKernels_config.tmp)
tmp := $(shell echo "\#endif" >> KokkosKernels_config.tmp )

# Define KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX in the generated header only
# when KOKKOSKERNELS_OPTIONS contains "blas-return-complex".
KOKKOSKERNELS_INTERNAL_TPL_BLAS_RETURN_COMPLEX := $(strip $(shell echo $(KOKKOSKERNELS_OPTIONS) | grep "blas-return-complex" | wc -l))
ifeq ($(KOKKOSKERNELS_INTERNAL_TPL_BLAS_RETURN_COMPLEX), 1)
tmp := $(shell echo "\#define KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX" >> KokkosKernels_config.tmp )
endif

#==== ETI Macros Scalars =================================================
KOKKOSKERNELS_INTERNAL_INST_SCALARS =
KOKKOSKERNELS_INTERNAL_INST_DOUBLE=$(strip $(shell echo "$(KOKKOSKERNELS_SCALARS)" | grep "double" | wc -l))
Expand Down Expand Up @@ -401,6 +435,17 @@ ifeq (${KOKKOSKERNELS_INTERNAL_ENABLE_CUBLAS}, 1)
KOKKOSKERNELS_INTERNAL_SRC_BLAS += ${KOKKOSKERNELS_PATH}/src/impl/tpls/KokkosBlas_Cuda_tpl.cpp
endif

# A host BLAS is considered enabled when either the BLAS or the MKL TPL is
# enabled; in that case the host TPL wrapper source is added to the BLAS
# source list.
KOKKOSKERNELS_INTERNAL_ENABLE_HOST_BLAS=0
ifeq (${KOKKOSKERNELS_INTERNAL_ENABLE_BLAS}, 1)
KOKKOSKERNELS_INTERNAL_ENABLE_HOST_BLAS=1
endif
ifeq (${KOKKOSKERNELS_INTERNAL_ENABLE_MKL}, 1)
KOKKOSKERNELS_INTERNAL_ENABLE_HOST_BLAS=1
endif
ifeq (${KOKKOSKERNELS_INTERNAL_ENABLE_HOST_BLAS}, 1)
KOKKOSKERNELS_INTERNAL_SRC_BLAS += ${KOKKOSKERNELS_PATH}/src/impl/tpls/KokkosBlas_Host_tpl.cpp
endif

KOKKOSKERNELS_INTERNAL_HEADERS = $(wildcard ${KOKKOSKERNELS_PATH}/src/impl/*.hpp)
KOKKOSKERNELS_INTERNAL_HEADERS += $(wildcard ${KOKKOSKERNELS_PATH}/src/impl/generated_specializations_hpp/*/*eti_spec*.hpp)

Expand Down
9 changes: 9 additions & 0 deletions cmake/KokkosKernels_config.h.in
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
#ifndef KOKKOSKERNELS_CONFIG_H
#define KOKKOSKERNELS_CONFIG_H


/* Define Fortran mangle from Trilinos macro definition */
#ifndef F77_BLAS_MANGLE
# define F77_BLAS_MANGLE@F77_BLAS_MANGLE@
#endif

/* Define if Fortran BLAS level 1 functions can return a complex type */
#cmakedefine KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX

/* Define if building in debug mode */
#cmakedefine HAVE_KOKKOSKERNELS_DEBUG

Expand Down
1 change: 1 addition & 0 deletions master_history.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ tag: 2.5.00 date: 12/15/2017 master: e4c645e9 develop: 04d58766
tag: 2.6.00 date: 03/07/2018 master: 00b16484 develop: f81778ce
tag: 2.7.00 date: 05/24/2018 master: 6e8e97a9 develop: 692114a6
tag: 2.7.24 date: 11/05/2018 master: 1a7b524b develop: fab89e37
tag: 2.8.00 date: 02/05/2019 master: a6e05e06 develop: 6a790321
25 changes: 10 additions & 15 deletions perf_test/batched/KokkosBatched_Test_Gemm_Cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,7 @@ namespace KokkosBatched {
void operator()(const TeamTagV1 &, const MemberType &member) const {
const int kbeg = (member.league_rank()*(member.team_size()*VectorLength) +
member.team_rank()*VectorLength);
Kokkos::parallel_for
(Kokkos::ThreadVectorRange(member, VectorLength),
Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength),
[&](const int &k) {
const int kk = kbeg + k;
if (kk < int(_c.extent(0))) {
Expand All @@ -93,8 +92,7 @@ namespace KokkosBatched {
KOKKOS_INLINE_FUNCTION
void operator()(const TeamTagV2 &, const MemberType &member) const {
const int kbeg = member.league_rank()*VectorLength;
Kokkos::parallel_for
(Kokkos::ThreadVectorRange(member, VectorLength),
Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength),
[&](const int &k) {
const int kk = kbeg + k;
if (kk < int(_c.extent(0))) {
Expand All @@ -116,8 +114,7 @@ namespace KokkosBatched {
ScratchViewType<ViewType> sb(member.team_scratch(lvl), VectorLength, _b.extent(1), _b.extent(2));

const int kbeg = member.league_rank()*VectorLength;
Kokkos::parallel_for
(Kokkos::ThreadVectorRange(member, VectorLength),
Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength),
[&](const int &k) {
const int kk = kbeg + k;
if (kk < int(_c.extent(0))) {
Expand All @@ -142,14 +139,12 @@ namespace KokkosBatched {
KOKKOS_INLINE_FUNCTION
void operator()(const TeamTagHandmade &, const MemberType &member) const {
const int kbeg = member.league_rank()*VectorLength;
Kokkos::parallel_for
(Kokkos::ThreadVectorRange(member, VectorLength),
Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength),
[&](const int &k) {
const int kk = kbeg + k;
if (kk < int(_c.extent(0))) {
const int m = _c.extent(1), n = _c.extent(2), q = _a.extent(2);
Kokkos::parallel_for
(Kokkos::TeamThreadRange(member,0,m*n),
Kokkos::parallel_for(Kokkos::TeamThreadRange(member,0,m*n),
[&](const int &ij) {
const int i = ij%m, j = ij/m;
typename ViewType::non_const_value_type cval = 0;
Expand Down Expand Up @@ -315,7 +310,7 @@ namespace KokkosBatched {
DeviceSpaceType::fence();
timer.reset();

Kokkos::parallel_for("GEMM: RangePolicy version", policy, functor_type(a,b,c));
Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::RangeTag", policy, functor_type(a,b,c));

DeviceSpaceType::fence();
const double t = timer.seconds();
Expand Down Expand Up @@ -382,7 +377,7 @@ namespace KokkosBatched {
DeviceSpaceType::fence();
timer.reset();

Kokkos::parallel_for("GEMM: TeamPolicy version 1", policy,functor_type(a,b,c));
Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyV1", policy,functor_type(a,b,c));

DeviceSpaceType::fence();
const double t = timer.seconds();
Expand Down Expand Up @@ -455,7 +450,7 @@ namespace KokkosBatched {
DeviceSpaceType::fence();
timer.reset();

Kokkos::parallel_for("GEMM: TeamPolicy version 2", policy, functor_type(a,b,c));
Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyV2", policy, functor_type(a,b,c));

DeviceSpaceType::fence();
const double t = timer.seconds();
Expand Down Expand Up @@ -532,7 +527,7 @@ namespace KokkosBatched {
DeviceSpaceType::fence();
timer.reset();

Kokkos::parallel_for("GEMM: TeamPolicy version 3", policy, functor_type(a,b,c));
Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyV3", policy, functor_type(a,b,c));

DeviceSpaceType::fence();
const double t = timer.seconds();
Expand Down Expand Up @@ -604,7 +599,7 @@ namespace KokkosBatched {
DeviceSpaceType::fence();
timer.reset();

Kokkos::parallel_for("GEMM: TeamPolicy handmade", policy, functor_type(a,b,c));
Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyHandmade", policy, functor_type(a,b,c));

DeviceSpaceType::fence();
const double t = timer.seconds();
Expand Down
16 changes: 8 additions & 8 deletions perf_test/batched/KokkosBatched_Test_Gemm_Host.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,8 @@ namespace KokkosBatched {
amat_simd("amat_simd", N, BlkSize, BlkSize),
bmat_simd("bmat_simd", N, BlkSize, BlkSize);

Kokkos::parallel_for
(Kokkos::RangePolicy<HostSpaceType>(0, N*VectorLength),
Kokkos::parallel_for("KokkosBatched::PerfTest::GemmHost::Pack",
Kokkos::RangePolicy<HostSpaceType>(0, N*VectorLength),
KOKKOS_LAMBDA(const int k) {
const int k0 = k/VectorLength, k1 = k%VectorLength;
for (int i=0;i<BlkSize;++i)
Expand Down Expand Up @@ -128,8 +128,8 @@ namespace KokkosBatched {
HostSpaceType::fence();
timer.reset();

Kokkos::parallel_for
(policy,
Kokkos::parallel_for("KokkosBatched::PerfTest::GemmHost::CblasOpenMP",
policy,
KOKKOS_LAMBDA(const int k) {
auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL());
auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL());
Expand Down Expand Up @@ -385,8 +385,8 @@ namespace KokkosBatched {
HostSpaceType::fence();
timer.reset();

Kokkos::parallel_for
(policy,
Kokkos::parallel_for("KokkosBatched::PerfTest::GemmHost::libxswmmOpenMP",
policy,
KOKKOS_LAMBDA(const int k) {
auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL());
auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL());
Expand Down Expand Up @@ -515,8 +515,8 @@ namespace KokkosBatched {
HostSpaceType::fence();
timer.reset();

Kokkos::parallel_for
(policy,
Kokkos::parallel_for("KokkosBatched::PerfTest::GemmHost::SIMDSerialOpenMP",
policy,
KOKKOS_LAMBDA(const int k) {
auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL());
auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL());
Expand Down
12 changes: 6 additions & 6 deletions perf_test/batched/KokkosBatched_Test_Gemv_Host.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,8 @@ namespace KokkosBatched {
HostSpaceType::fence();
timer.reset();

Kokkos::parallel_for
(policy,
Kokkos::parallel_for("KokkosBatched::PerfTest::GemvHost::CblasOpenMP",
policy,
KOKKOS_LAMBDA(const int k) {
auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL());
for (int j=0;j<NumVecs;++j) {
Expand Down Expand Up @@ -174,8 +174,8 @@ namespace KokkosBatched {
HostSpaceType::fence();
timer.reset();

Kokkos::parallel_for
(policy,
Kokkos::parallel_for("KokkosBatched::PerfTest::GemvHost::SerialOpenMP",
policy,
KOKKOS_LAMBDA(const int k) {
auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL());

Expand Down Expand Up @@ -249,8 +249,8 @@ namespace KokkosBatched {
HostSpaceType::fence();
timer.reset();

Kokkos::parallel_for
(policy,
Kokkos::parallel_for("KokkosBatched::PerfTest::GemvHost::SIMDSerialOpenMP",
policy,
KOKKOS_LAMBDA(const int k) {
auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL());

Expand Down
8 changes: 4 additions & 4 deletions perf_test/batched/KokkosBatched_Test_LU_Cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ namespace KokkosBatched {
DeviceSpaceType::fence();
timer.reset();

Kokkos::parallel_for(policy, functor_type(a));
Kokkos::parallel_for("KokkosBatched::PerfTest::LUCuda::RangeTag", policy, functor_type(a));

DeviceSpaceType::fence();
const double t = timer.seconds();
Expand Down Expand Up @@ -345,7 +345,7 @@ namespace KokkosBatched {
DeviceSpaceType::fence();
timer.reset();

Kokkos::parallel_for(policy, functor_type(a));
Kokkos::parallel_for("KokkosBatched::PerfTest::LUCuda::TeamTagV1", policy, functor_type(a));

DeviceSpaceType::fence();
const double t = timer.seconds();
Expand Down Expand Up @@ -414,7 +414,7 @@ namespace KokkosBatched {
DeviceSpaceType::fence();
timer.reset();

Kokkos::parallel_for(policy, functor_type(a));
Kokkos::parallel_for("KokkosBatched::PerfTest::LUCuda::TeamTagV2", policy, functor_type(a));

DeviceSpaceType::fence();
const double t = timer.seconds();
Expand Down Expand Up @@ -486,7 +486,7 @@ namespace KokkosBatched {
DeviceSpaceType::fence();
timer.reset();

Kokkos::parallel_for(policy.set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)),
Kokkos::parallel_for("KokkosBatched::PerfTest::LUCuda::TeamTagV3", policy.set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)),
functor_type(a));

DeviceSpaceType::fence();
Expand Down
Loading

0 comments on commit 4ee5f3c

Please sign in to comment.