From 70a9eb0cfed068382a77197ad66bd544b5e436a1 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 30 Sep 2021 13:57:01 -0600 Subject: [PATCH 001/300] perf_test/blas/blas3: Add benchmarking script for reproducability --- cmake/KokkosKernels_config.h.in | 2 + cmake/kokkoskernels_eti_floats.cmake | 13 ++ .../KokkosBatched_BatchedGemm_benchmark.sh | 141 ++++++++++++++++++ src/common/KokkosKernels_default_types.hpp | 2 + 4 files changed, 158 insertions(+) create mode 100755 perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index 543f80b4f8..64efebcc29 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -63,6 +63,8 @@ #cmakedefine KOKKOSKERNELS_INST_DOUBLE /* Whether to build kernels for scalar type float */ #cmakedefine KOKKOSKERNELS_INST_FLOAT +/* Whether to build kernels for scalar type Kokkos::Experimental::half_t */ +#cmakedefine KOKKOSKERNELS_INST_HALF /* Whether to build kernels for scalar type complex */ #cmakedefine KOKKOSKERNELS_INST_COMPLEX_DOUBLE /* Whether to build kernels for scalar type complex */ diff --git a/cmake/kokkoskernels_eti_floats.cmake b/cmake/kokkoskernels_eti_floats.cmake index 69e50af3cd..dde3c88871 100644 --- a/cmake/kokkoskernels_eti_floats.cmake +++ b/cmake/kokkoskernels_eti_floats.cmake @@ -18,13 +18,22 @@ KOKKOSKERNELS_ADD_OPTION( "Whether to pre instantiate kernels for the scalar type float. Disabling this may increase build times. Default: OFF or unless enabled during a Trilinos build with Trilinos_ENABLE_FLOAT." ) +KOKKOSKERNELS_ADD_OPTION( + INST_HALF + OFF + BOOL + "Whether to pre instantiate kernels for the scalar type Kokkos::Experimental::half_t. Disabling this may increase build times. 
Default: OFF" +) + SET(FLOATS + HALF FLOAT DOUBLE COMPLEX_FLOAT COMPLEX_DOUBLE) SET(DOUBLE_CPP_TYPE "double") SET(FLOAT_CPP_TYPE "float") +SET(HALF_CPP_TYPE "Kokkos::Experimental::half_t") SET(COMPLEX_FLOAT_CPP_TYPE "Kokkos::complex") SET(COMPLEX_DOUBLE_CPP_TYPE "Kokkos::complex") @@ -63,6 +72,10 @@ IF (KOKKOSKERNELS_INST_FLOAT) LIST(APPEND SCALAR_LIST "float") ENDIF() +IF (KOKKOSKERNELS_INST_HALF) + LIST(APPEND SCALAR_LIST "Kokkos::Experimental::half_t") +ENDIF() + IF (KOKKOSKERNELS_INST_COMPLEX_DOUBLE) LIST(APPEND SCALAR_LIST "complex") ENDIF() diff --git a/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh b/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh new file mode 100755 index 0000000000..ae80da2f71 --- /dev/null +++ b/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh @@ -0,0 +1,141 @@ +#!/bin/bash +################################################################################ +# @Brief: On the specified arch, build and run KokkosBlas3_perf_test. +# +# The value of this script is to ensure that the benchmark results can be easily +# reproduced. 
+# +# Author: Evan Harvey +################################################################################ + +function envprint() { + for x in $@; do + echo $x:\$$x | envsubst + done +} + +function printhelp() { + echo "--Usage--" + echo "$0 PRECISION HOST_ARCH " + echo " PRECISION: Kokkos::Experimental::half_t, float, double" + echo " HOST_ARCH: POWER9, A64FX, SKX" + echo " ACCELERATOR_ARCH: VOLTA70" + echo "" +} + +function earlyexit() { + rm -rf $benchmark_dir + exit $1 +} + +function beval() { + local ret=0 + echo "---------------------------------------------------------------------------------------------------------------" + echo "START: \"$@\"" + if [ $dry_run == "off" ]; then + eval $@ + ret=$PIPESTATUS + fi + if [ $ret -ne 0 ]; then + echo "ERROR: \"$@\"" + earlyexit 1 + fi + echo "END : \"$@\"" + echo "---------------------------------------------------------------------------------------------------------------" +} + +# Handle input args +export KOKKOS_SRC_DIR=${KOKKOS_SRC_DIR:-"$HOME/KOKKOS.base/kokkos"} +export KOKKOS_SRC_DIR=$(realpath $KOKKOS_SRC_DIR) +export KOKKOS_SHA=${KOKKOS_SHA:-"2fc1050"} # Tip of develop as of 09-30-21 +export KOKKOSKERNELS_SRC_DIR=${KOKKOSKERNELS_SRC_DIR:-"$HOME/KOKKOS.base/kokkos-kernels"} +export KOKKOSKERNELS_SRC_DIR=$(realpath $KOKKOSKERNELS_SRC_DIR) +export KOKKOSKERNELS_SHA=${KOKKOSKERNELS_SHA:-"3d2992f"} # Tip of e10harvey/issue1045 as of 09-30-21 +envprint KOKKOS_SRC_DIR KOKKOS_SHA KOKKOSKERNELS_SRC_DIR KOKKOSKERNELS_SHA + +# Create benchmark directory +benchmark_dir=$0_$(date +"%Y-%m-%d_%H.%M.%S") +mkdir -p $benchmark_dir/kokkos-{build,instal} +mkdir -p $benchmark_dir/kokkos-kernels-{build,install} +export KOKKOS_BUILD_DIR=$(realpath $benchmark_dir/kokkos-build) +export KOKKOS_INSTALL_DIR=$(realpath $benchmark_dir/kokkos-install) +export KOKKOSKERNELS_BUILD_DIR=$(realpath $benchmark_dir/kokkos-kernels-build) +export KOKKOSKERNELS_INSTALL_DIR=$(realpath $benchmark_dir/kokkos-kernels-install) +envprint 
KOKKOS_INSTALL_DIR KOKKOS_BUILD_DIR KOKKOSKERNELS_BUILD_DIR KOKKOSKERNELS_INSTALL_DIR + +dry_run="off" +precision="$1" +arch_names="$2 $3" +echo "PRECISION=\"$1\", HOST_ARCH=\"$2\", ACCELERATOR_ARCH=\"$3\"" + +# Setup arch specific cmake configurations and job submission commands +if [[ "$arch_names" == " " || -z $precision ]]; then + printhelp; earlyexit 1 +elif [ "$arch_names" == "POWER9 VOLTA70" ]; then + module load cmake/3.18.0 gcc/7.2.0 cuda/10.2.2 + kokkos_config_cmd="cd $KOKKOS_BUILD_DIR; $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' --arch=Power9,Volta70 \ + --with-cuda=$CUDA_PATH --compiler=$KOKKOS_SRC_DIR/bin/nvcc_wrapper --kokkos-path=$KOKKOS_SRC_DIR \ + --prefix=$KOKKOS_INSTALL_DIR 2>&1 | tee kokkos_config_cmd.out" + + kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR; $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \ + --cxxflags='-O3' --arch=Power9,Volta70 \ + --with-scalars="$precision" \ + --with-cuda=$CUDA_PATH --compiler=$KOKKOS_INSTALL_DIR/bin/nvcc_wrapper \ + --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \ + --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR 2>&1 | \ + tee kokkoskernels_config_cmd.out" + kokkoskernels_config_layout_cmd="cd $KOKKOSKERNELS_BUILD_DIR; cmake -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \ + -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON \ + $KOKKOSKERNELS_SRC_DIR 2>&1 | tee -a kokkoskernels_config_cmd.out" + + kokkos_build_cmd="bsub -q rhel7W -W 2:00 -Is $KOKKOS_BUILD_DIR/build.sh" + kokkoskernels_build_cmd="bsub -q rhel7W -W 2:00 -Is $KOKKOSKERNELS_BUILD_DIR/build.sh" + benchmark_cmd="bsub -q rhel7W -W 2:00 -Is $KOKKOSKERNELS_BUILD_DIR/bench.sh" +elif [ "$arch_names" == "A64FX " ]; then + earlyexit 0 +elif [ "$arch_names" == "SKX " ]; then + earlyexit 0 +else + echo "Invalid arch: $arch_names" + printhelp; earlyexit 1 +fi + +# Set the arch agnostic commands +echo "#!/bin/bash" > $KOKKOS_BUILD_DIR/build.sh +echo "cd $KOKKOS_BUILD_DIR" >> 
$KOKKOS_BUILD_DIR/build.sh +echo "make -j40 install" >> $KOKKOS_BUILD_DIR/build.sh +chmod +x $KOKKOS_BUILD_DIR/build.sh + +echo "#!/bin/bash" > $KOKKOSKERNELS_BUILD_DIR/build.sh +echo "cd $KOKKOSKERNELS_BUILD_DIR/perf_test/blas/blas3" >> $KOKKOSKERNELS_BUILD_DIR/build.sh +echo "make -j40" >> $KOKKOSKERNELS_BUILD_DIR/build.sh +chmod +x $KOKKOSKERNELS_BUILD_DIR/build.sh + +echo "#!/bin/bash" > $KOKKOSKERNELS_BUILD_DIR/bench.sh +echo "cd $benchmark_dir" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh +echo "Writing output to: $benchmark_dir/bench.csv..." >> $KOKKOSKERNELS_BUILD_DIR/bench.sh +echo "KOKKOSKERNELS_BUILD_DIR/perf_test/blas/blas3/KokkosBlas3_perf_test \ + --precision=$precision \ + --test=batched_heuristic --routines=gemm --loop_type=parallel --batch_size_last_dim=0 \ + --matrix_size_start=2x2,2x2,2x2 --matrix_size_stop=64x64,64x64,64x64 + --matrix_size_step=2 --batch_size=$((80*1024)) \ + --warm_up_loop=10 --iter=20 --verify=0 \ + --csv=$benchmark_dir/bench.csv" \ + >> $KOKKOSKERNELS_BUILD_DIR/bench.sh +chmod +x $KOKKOSKERNELS_BUILD_DIR/bench.sh + +# Check out the correct SHAs +beval "cd $KOKKOS_SRC_DIR && git checkout $KOKKOS_SHA" +beval "cd $KOKKOSKERNELS_SRC_DIR && git checkout $KOKKOSKERNELS_SHA" + +# Build Kokkos +beval $kokkos_config_cmd +beval $kokkos_build_cmd + +# Build KokkosKernels +beval $kokkoskernels_config_cmd +beval $kokkoskernels_config_layout_cmd +beval $kokkoskernels_build_cmd + +# Run the benchmark +beval $benchmark_cmd \ No newline at end of file diff --git a/src/common/KokkosKernels_default_types.hpp b/src/common/KokkosKernels_default_types.hpp index aec2ff98f2..74a7a92183 100644 --- a/src/common/KokkosKernels_default_types.hpp +++ b/src/common/KokkosKernels_default_types.hpp @@ -76,6 +76,8 @@ using default_scalar = double; #elif defined(KOKKOSKERNELS_INST_FLOAT) using default_scalar = float; +#elif defined(KOKKOSKERNELS_INST_HALF) + using default_scalar = Kokkos::Experimental::half_t; #else using default_scalar = double; #endif From 
f59f2c2b4ee09cac535c32e3325a3354b7ccde31 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 1 Oct 2021 11:06:26 -0600 Subject: [PATCH 002/300] perf_test/blas/blas3: - Fix --csv option in KokkosBlas3_perf_test - Update benchmark script: - reduce build time - fix typos - add comments - wait for file system writes to land --- cmake/kokkoskernels_eti_floats.cmake | 8 ++--- .../KokkosBatched_BatchedGemm_benchmark.sh | 35 ++++++++++++------- .../blas/blas3/KokkosBlas3_perf_test.cpp | 3 +- 3 files changed, 28 insertions(+), 18 deletions(-) diff --git a/cmake/kokkoskernels_eti_floats.cmake b/cmake/kokkoskernels_eti_floats.cmake index dde3c88871..debf99bb0e 100644 --- a/cmake/kokkoskernels_eti_floats.cmake +++ b/cmake/kokkoskernels_eti_floats.cmake @@ -26,7 +26,6 @@ KOKKOSKERNELS_ADD_OPTION( ) SET(FLOATS - HALF FLOAT DOUBLE COMPLEX_FLOAT @@ -72,9 +71,10 @@ IF (KOKKOSKERNELS_INST_FLOAT) LIST(APPEND SCALAR_LIST "float") ENDIF() -IF (KOKKOSKERNELS_INST_HALF) - LIST(APPEND SCALAR_LIST "Kokkos::Experimental::half_t") -ENDIF() +# TODO: Fix build errors in kokkos when half_t is used in ETI +#IF (KOKKOSKERNELS_INST_HALF) +# LIST(APPEND SCALAR_LIST "Kokkos::Experimental::half_t") +#ENDIF() IF (KOKKOSKERNELS_INST_COMPLEX_DOUBLE) LIST(APPEND SCALAR_LIST "complex") diff --git a/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh b/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh index ae80da2f71..129e53f02d 100755 --- a/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh +++ b/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh @@ -54,7 +54,7 @@ export KOKKOSKERNELS_SHA=${KOKKOSKERNELS_SHA:-"3d2992f"} # Tip of e10harvey/issu envprint KOKKOS_SRC_DIR KOKKOS_SHA KOKKOSKERNELS_SRC_DIR KOKKOSKERNELS_SHA # Create benchmark directory -benchmark_dir=$0_$(date +"%Y-%m-%d_%H.%M.%S") +benchmark_dir=$PWD/$0_$(date +"%Y-%m-%d_%H.%M.%S") mkdir -p $benchmark_dir/kokkos-{build,instal} mkdir -p $benchmark_dir/kokkos-kernels-{build,install} export 
KOKKOS_BUILD_DIR=$(realpath $benchmark_dir/kokkos-build) @@ -73,19 +73,21 @@ if [[ "$arch_names" == " " || -z $precision ]]; then printhelp; earlyexit 1 elif [ "$arch_names" == "POWER9 VOLTA70" ]; then module load cmake/3.18.0 gcc/7.2.0 cuda/10.2.2 - kokkos_config_cmd="cd $KOKKOS_BUILD_DIR; $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' --arch=Power9,Volta70 \ + kokkos_config_cmd="cd $KOKKOS_BUILD_DIR && $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' --arch=Power9,Volta70 \ --with-cuda=$CUDA_PATH --compiler=$KOKKOS_SRC_DIR/bin/nvcc_wrapper --kokkos-path=$KOKKOS_SRC_DIR \ --prefix=$KOKKOS_INSTALL_DIR 2>&1 | tee kokkos_config_cmd.out" + kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR 2>&1 \ + | tee -a kokkos_config_cmd.out" - kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR; $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \ + kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \ --cxxflags='-O3' --arch=Power9,Volta70 \ --with-scalars="$precision" \ --with-cuda=$CUDA_PATH --compiler=$KOKKOS_INSTALL_DIR/bin/nvcc_wrapper \ --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \ --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR 2>&1 | \ tee kokkoskernels_config_cmd.out" - kokkoskernels_config_layout_cmd="cd $KOKKOSKERNELS_BUILD_DIR; cmake -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \ - -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON \ + kokkoskernels_config_defaults_cmd="cd $KOKKOSKERNELS_BUILD_DIR && cmake -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \ + -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF \ $KOKKOSKERNELS_SRC_DIR 2>&1 | tee -a kokkoskernels_config_cmd.out" kokkos_build_cmd="bsub -q rhel7W -W 2:00 -Is $KOKKOS_BUILD_DIR/build.sh" @@ -100,25 +102,26 @@ else printhelp; earlyexit 1 fi -# Set the arch agnostic commands +# Write the arch agnostic kokkos build script echo 
"#!/bin/bash" > $KOKKOS_BUILD_DIR/build.sh echo "cd $KOKKOS_BUILD_DIR" >> $KOKKOS_BUILD_DIR/build.sh echo "make -j40 install" >> $KOKKOS_BUILD_DIR/build.sh chmod +x $KOKKOS_BUILD_DIR/build.sh +# Write the arch agnostic kokkos-kernels build script echo "#!/bin/bash" > $KOKKOSKERNELS_BUILD_DIR/build.sh echo "cd $KOKKOSKERNELS_BUILD_DIR/perf_test/blas/blas3" >> $KOKKOSKERNELS_BUILD_DIR/build.sh echo "make -j40" >> $KOKKOSKERNELS_BUILD_DIR/build.sh chmod +x $KOKKOSKERNELS_BUILD_DIR/build.sh +# Write the arch agnostic kokkos-kernels benchmark script echo "#!/bin/bash" > $KOKKOSKERNELS_BUILD_DIR/bench.sh echo "cd $benchmark_dir" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh -echo "Writing output to: $benchmark_dir/bench.csv..." >> $KOKKOSKERNELS_BUILD_DIR/bench.sh -echo "KOKKOSKERNELS_BUILD_DIR/perf_test/blas/blas3/KokkosBlas3_perf_test \ - --precision=$precision \ +echo "echo \"Writing output to: $benchmark_dir/bench.csv...\"" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh +echo "$KOKKOSKERNELS_BUILD_DIR/perf_test/blas/blas3/KokkosBlas3_perf_test \ --test=batched_heuristic --routines=gemm --loop_type=parallel --batch_size_last_dim=0 \ - --matrix_size_start=2x2,2x2,2x2 --matrix_size_stop=64x64,64x64,64x64 - --matrix_size_step=2 --batch_size=$((80*1024)) \ + --matrix_size_start=2x2,2x2,2x2 --matrix_size_stop=64x64,64x64,64x64 \ + --matrix_size_step=2 --batch_size=1024 \ --warm_up_loop=10 --iter=20 --verify=0 \ --csv=$benchmark_dir/bench.csv" \ >> $KOKKOSKERNELS_BUILD_DIR/bench.sh @@ -130,12 +133,18 @@ beval "cd $KOKKOSKERNELS_SRC_DIR && git checkout $KOKKOSKERNELS_SHA" # Build Kokkos beval $kokkos_config_cmd +beval $kokkos_config_defaults_cmd beval $kokkos_build_cmd +# Wait for the file system on the head node to catch up +while [ ! 
-e $KOKKOS_INSTALL_DIR/bin/nvcc_wrapper ]; do + sleep 3s +done + # Build KokkosKernels beval $kokkoskernels_config_cmd -beval $kokkoskernels_config_layout_cmd +beval $kokkoskernels_config_defaults_cmd beval $kokkoskernels_build_cmd # Run the benchmark -beval $benchmark_cmd \ No newline at end of file +beval $benchmark_cmd diff --git a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp index 1069bc4d00..cf03f1b6a3 100644 --- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp @@ -46,6 +46,7 @@ #include "KokkosBlas3_gemm_perf_test.hpp" #include +#include #include #include @@ -242,6 +243,7 @@ int main(int argc, char **argv) { int option_idx = 0, ret, i; char *n_str = nullptr, *adim = nullptr, *bdim = nullptr, *cdim = nullptr; std::filebuf fb; + std::ostream out(&fb); char *out_file = nullptr; using rt_type = decltype(do_trmm_invoke); rt_type *routine_table[BLAS_ROUTINES_N] = { @@ -429,7 +431,6 @@ int main(int argc, char **argv) { if (out_file != nullptr) { fb.open(out_file, std::ios::out); - std::ostream out(&fb); options.out = &out; } From 4dcd182bc6825b74c183aaaa0db95d50148eb886 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 1 Oct 2021 11:26:16 -0600 Subject: [PATCH 003/300] perf_test/blas/blas3: - Small refactor in benchmark script --- .../blas3/KokkosBatched_BatchedGemm_benchmark.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh b/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh index 129e53f02d..15b38302a5 100755 --- a/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh +++ b/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh @@ -50,11 +50,16 @@ export KOKKOS_SRC_DIR=$(realpath $KOKKOS_SRC_DIR) export KOKKOS_SHA=${KOKKOS_SHA:-"2fc1050"} # Tip of develop as of 09-30-21 export 
KOKKOSKERNELS_SRC_DIR=${KOKKOSKERNELS_SRC_DIR:-"$HOME/KOKKOS.base/kokkos-kernels"} export KOKKOSKERNELS_SRC_DIR=$(realpath $KOKKOSKERNELS_SRC_DIR) -export KOKKOSKERNELS_SHA=${KOKKOSKERNELS_SHA:-"3d2992f"} # Tip of e10harvey/issue1045 as of 09-30-21 +export KOKKOSKERNELS_SHA=${KOKKOSKERNELS_SHA:-"issue933"} # Tip of e10harvey/issue933 as of 10-01-21 envprint KOKKOS_SRC_DIR KOKKOS_SHA KOKKOSKERNELS_SRC_DIR KOKKOSKERNELS_SHA +dry_run="off" +precision="$1" +arch_names="$2 $3" +echo "PRECISION=\"$1\", HOST_ARCH=\"$2\", ACCELERATOR_ARCH=\"$3\"" + # Create benchmark directory -benchmark_dir=$PWD/$0_$(date +"%Y-%m-%d_%H.%M.%S") +benchmark_dir=$precision_$PWD/$0_$(date +"%Y-%m-%d_%H.%M.%S") mkdir -p $benchmark_dir/kokkos-{build,instal} mkdir -p $benchmark_dir/kokkos-kernels-{build,install} export KOKKOS_BUILD_DIR=$(realpath $benchmark_dir/kokkos-build) @@ -63,11 +68,6 @@ export KOKKOSKERNELS_BUILD_DIR=$(realpath $benchmark_dir/kokkos-kernels-build) export KOKKOSKERNELS_INSTALL_DIR=$(realpath $benchmark_dir/kokkos-kernels-install) envprint KOKKOS_INSTALL_DIR KOKKOS_BUILD_DIR KOKKOSKERNELS_BUILD_DIR KOKKOSKERNELS_INSTALL_DIR -dry_run="off" -precision="$1" -arch_names="$2 $3" -echo "PRECISION=\"$1\", HOST_ARCH=\"$2\", ACCELERATOR_ARCH=\"$3\"" - # Setup arch specific cmake configurations and job submission commands if [[ "$arch_names" == " " || -z $precision ]]; then printhelp; earlyexit 1 From 74b5df2d8e46d5fd581e8327d251884ce0db45fa Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 1 Oct 2021 13:31:07 -0600 Subject: [PATCH 004/300] perf_test/blas/blas3: - Add A64FX and SKX to benchmark script - Add --use_simd option to perf_test --- .../KokkosBatched_BatchedGemm_benchmark.sh | 78 +++++++++++++++---- perf_test/blas/blas3/KokkosBlas3_common.hpp | 2 + .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 13 ++-- .../blas/blas3/KokkosBlas3_perf_test.cpp | 13 +++- 4 files changed, 83 insertions(+), 23 deletions(-) diff --git 
a/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh b/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh index 15b38302a5..4514b6be5d 100755 --- a/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh +++ b/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh @@ -59,9 +59,9 @@ arch_names="$2 $3" echo "PRECISION=\"$1\", HOST_ARCH=\"$2\", ACCELERATOR_ARCH=\"$3\"" # Create benchmark directory -benchmark_dir=$precision_$PWD/$0_$(date +"%Y-%m-%d_%H.%M.%S") -mkdir -p $benchmark_dir/kokkos-{build,instal} -mkdir -p $benchmark_dir/kokkos-kernels-{build,install} +benchmark_dir=$PWD/$0_$(date +"%Y-%m-%d_%H.%M.%S") +beval mkdir -p $benchmark_dir/kokkos-{build,install} +beval mkdir -p $benchmark_dir/kokkos-kernels-{build,install} export KOKKOS_BUILD_DIR=$(realpath $benchmark_dir/kokkos-build) export KOKKOS_INSTALL_DIR=$(realpath $benchmark_dir/kokkos-install) export KOKKOSKERNELS_BUILD_DIR=$(realpath $benchmark_dir/kokkos-kernels-build) @@ -72,17 +72,17 @@ envprint KOKKOS_INSTALL_DIR KOKKOS_BUILD_DIR KOKKOSKERNELS_BUILD_DIR KOKKOSKERNE if [[ "$arch_names" == " " || -z $precision ]]; then printhelp; earlyexit 1 elif [ "$arch_names" == "POWER9 VOLTA70" ]; then + module purge module load cmake/3.18.0 gcc/7.2.0 cuda/10.2.2 - kokkos_config_cmd="cd $KOKKOS_BUILD_DIR && $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' --arch=Power9,Volta70 \ - --with-cuda=$CUDA_PATH --compiler=$KOKKOS_SRC_DIR/bin/nvcc_wrapper --kokkos-path=$KOKKOS_SRC_DIR \ - --prefix=$KOKKOS_INSTALL_DIR 2>&1 | tee kokkos_config_cmd.out" + kokkos_config_cmd="cd $KOKKOS_BUILD_DIR && $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' \ + --arch=Power9,Volta70 --with-cuda=$CUDA_PATH --compiler=$KOKKOS_SRC_DIR/bin/nvcc_wrapper \ + --kokkos-path=$KOKKOS_SRC_DIR --prefix=$KOKKOS_INSTALL_DIR 2>&1 | tee kokkos_config_cmd.out" kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR 2>&1 \ | tee -a kokkos_config_cmd.out" 
kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \ - --cxxflags='-O3' --arch=Power9,Volta70 \ - --with-scalars="$precision" \ - --with-cuda=$CUDA_PATH --compiler=$KOKKOS_INSTALL_DIR/bin/nvcc_wrapper \ + --arch=Power9,Volta70 --with-cuda=$CUDA_PATH --compiler=$KOKKOS_INSTALL_DIR/bin/nvcc_wrapper \ + --cxxflags='-O3' --with-scalars=$precision \ --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \ --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR 2>&1 | \ tee kokkoskernels_config_cmd.out" @@ -94,9 +94,54 @@ elif [ "$arch_names" == "POWER9 VOLTA70" ]; then kokkoskernels_build_cmd="bsub -q rhel7W -W 2:00 -Is $KOKKOSKERNELS_BUILD_DIR/build.sh" benchmark_cmd="bsub -q rhel7W -W 2:00 -Is $KOKKOSKERNELS_BUILD_DIR/bench.sh" elif [ "$arch_names" == "A64FX " ]; then - earlyexit 0 + export OMP_PROC_BIND=close + export OMP_PLACES=cores + export OMP_NUM_THREADS=48 + module purge + module load gcc/10.2.0 cmake/3.17.0 + kokkos_config_cmd="cd $KOKKOS_BUILD_DIR && $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' \ + --arch=A64FX \ + --kokkos-path=$KOKKOS_SRC_DIR --prefix=$KOKKOS_INSTALL_DIR 2>&1 | tee kokkos_config_cmd.out" + kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR 2>&1 \ + | tee -a kokkos_config_cmd.out" + + kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \ + --cxxflags='-msve-vector-bits=512 -Ofast' --arch=A64FX --with-scalars=$precision --with-openmp \ + --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \ + --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR 2>&1 | \ + tee kokkoskernels_config_cmd.out" + kokkoskernels_config_defaults_cmd="cd $KOKKOSKERNELS_BUILD_DIR && cmake -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \ + -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF \ + 
$KOKKOSKERNELS_SRC_DIR 2>&1 | tee -a kokkoskernels_config_cmd.out" + + kokkos_build_cmd="salloc --time=2:00:00 -N1 $KOKKOS_BUILD_DIR/build.sh" + kokkoskernels_build_cmd="salloc --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/build.sh" + benchmark_cmd="salloc --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/bench.sh" elif [ "$arch_names" == "SKX " ]; then - earlyexit 0 + export OMP_PROC_BIND=close + export OMP_PLACES=cores + export OMP_NUM_THREADS=96 + module purge + module load gcc/7.2.0 cmake/3.19.3 + kokkos_config_cmd="cd $KOKKOS_BUILD_DIR && $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' \ + --arch=SKX \ + --kokkos-path=$KOKKOS_SRC_DIR --prefix=$KOKKOS_INSTALL_DIR 2>&1 | tee kokkos_config_cmd.out" + kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR 2>&1 \ + | tee -a kokkos_config_cmd.out" + + kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \ + --cxxflags='-O3' --arch=SKX --with-scalars=$precision --with-openmp \ + --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \ + --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR 2>&1 | \ + tee kokkoskernels_config_cmd.out" + kokkoskernels_config_defaults_cmd="cd $KOKKOSKERNELS_BUILD_DIR && cmake -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \ + -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF \ + $KOKKOSKERNELS_SRC_DIR 2>&1 | tee -a kokkoskernels_config_cmd.out" + + kokkos_build_cmd="salloc --time=2:00:00 -N1 $KOKKOS_BUILD_DIR/build.sh" + kokkoskernels_build_cmd="salloc --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/build.sh" + benchmark_cmd="salloc --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/bench.sh" + use_simd="--use_simd=1" else echo "Invalid arch: $arch_names" printhelp; earlyexit 1 @@ -111,19 +156,19 @@ chmod +x $KOKKOS_BUILD_DIR/build.sh # Write the arch agnostic kokkos-kernels build script echo "#!/bin/bash" > 
$KOKKOSKERNELS_BUILD_DIR/build.sh echo "cd $KOKKOSKERNELS_BUILD_DIR/perf_test/blas/blas3" >> $KOKKOSKERNELS_BUILD_DIR/build.sh -echo "make -j40" >> $KOKKOSKERNELS_BUILD_DIR/build.sh +echo "make -j40 KokkosBlas3_perf_test" >> $KOKKOSKERNELS_BUILD_DIR/build.sh chmod +x $KOKKOSKERNELS_BUILD_DIR/build.sh # Write the arch agnostic kokkos-kernels benchmark script echo "#!/bin/bash" > $KOKKOSKERNELS_BUILD_DIR/bench.sh echo "cd $benchmark_dir" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh -echo "echo \"Writing output to: $benchmark_dir/bench.csv...\"" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh echo "$KOKKOSKERNELS_BUILD_DIR/perf_test/blas/blas3/KokkosBlas3_perf_test \ --test=batched_heuristic --routines=gemm --loop_type=parallel --batch_size_last_dim=0 \ --matrix_size_start=2x2,2x2,2x2 --matrix_size_stop=64x64,64x64,64x64 \ --matrix_size_step=2 --batch_size=1024 \ - --warm_up_loop=10 --iter=20 --verify=0 \ - --csv=$benchmark_dir/bench.csv" \ + --warm_up_loop=10 --iter=20 --verify=1 \ + ${use_simd} \ + --csv=${benchmark_dir}/${precision}_bench.csv" \ >> $KOKKOSKERNELS_BUILD_DIR/bench.sh chmod +x $KOKKOSKERNELS_BUILD_DIR/bench.sh @@ -137,7 +182,7 @@ beval $kokkos_config_defaults_cmd beval $kokkos_build_cmd # Wait for the file system on the head node to catch up -while [ ! -e $KOKKOS_INSTALL_DIR/bin/nvcc_wrapper ]; do +while [[ "$arch_names" == "POWER9 VOLTA70" && ! 
-e $KOKKOS_INSTALL_DIR/bin/nvcc_wrapper ]]; do sleep 3s done @@ -148,3 +193,4 @@ beval $kokkoskernels_build_cmd # Run the benchmark beval $benchmark_cmd +beval "cat ${benchmark_dir}/${precision}_bench.csv" diff --git a/perf_test/blas/blas3/KokkosBlas3_common.hpp b/perf_test/blas/blas3/KokkosBlas3_common.hpp index ec34a1fb80..6d85fbd0da 100644 --- a/perf_test/blas/blas3/KokkosBlas3_common.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_common.hpp @@ -65,6 +65,7 @@ #define DEFAULT_BATCH_SIZE_LAST_DIM 0 #define DEFAULT_VERIFY 1 #define DEFAULT_NINTER 4 +#define DEFAULT_USE_SIMD 0 /************************ blas routine structure definitions **********/ struct perf_test_trmm_args { @@ -213,6 +214,7 @@ struct perf_test_options { std::string blas_routines; bool verify; int ninter; + bool use_simd; }; typedef struct perf_test_options options_t; diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 94b916348a..87d0cc0bdc 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -1903,11 +1903,7 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { gemm_args.dims = dims; gemm_args.transA = options.blas_args.gemm.gemm_args.c_str()[0]; gemm_args.transB = options.blas_args.gemm.gemm_args.c_str()[1]; - if (options.test == BATCHED_TEAM_SIMD || - options.test == BATCHED_TEAM_SIMD_BLOCKED || - options.test == BATCHED_SERIAL_SIMD || - options.test == BATCHED_SERIAL_SIMD_BLOCKED || - options.test == BATCHED_SERIAL_COMPACT_MKL) { + if (options.use_simd) { // Calculate the batch size for simd views auto a_simd_batch_size = dims.a.k / simd_vector_size + (dims.a.k % simd_vector_size > 0); @@ -2256,6 +2252,7 @@ void do_gemm_serial_simd_batched_parallel(options_t options) { STATUS; // SerialBatchDim3Tag // SerialSimdTag + options.use_simd = true; if (options.blas_args.batch_size_last_dim) __do_loop_and_invoke( options, @@ -2272,6 +2269,7 @@ void 
do_gemm_serial_simd_batched_blocked_parallel(options_t options) { STATUS; // SerialBatchDim3Tag // SerialSimdTag + options.use_simd = true; if (options.blas_args.batch_size_last_dim) __do_loop_and_invoke( options, @@ -2302,8 +2300,9 @@ void do_gemm_serial_batched_compact_mkl_parallel(options_t options) { return; } #else -void do_gemm_serial_batched_compact_mkl_parallel(options_t /*options*/) { +void do_gemm_serial_batched_compact_mkl_parallel(options_t options) { STATUS; + options.use_simd = true; #if !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) std::cerr << std::string(__func__) @@ -2370,6 +2369,7 @@ void do_gemm_team_vector_batched_parallel(options_t options) { void do_gemm_team_simd_batched_parallel(options_t options) { STATUS; + options.use_simd = true; if (options.blas_args.batch_size_last_dim) __do_loop_and_invoke( options, @@ -2384,6 +2384,7 @@ void do_gemm_team_simd_batched_parallel(options_t options) { void do_gemm_team_simd_batched_blocked_parallel(options_t options) { STATUS; + options.use_simd = true; if (options.blas_args.batch_size_last_dim) __do_loop_and_invoke( options, diff --git a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp index cf03f1b6a3..26b7b3abe1 100644 --- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp @@ -72,6 +72,7 @@ static struct option long_options[] = { {"routines", required_argument, 0, 'r'}, {"verify", required_argument, 0, 'v'}, {"ninter", required_argument, 0, 'j'}, + {"use_simd", required_argument, 0, 'f'}, {0, 0, 0, 0}}; static void __print_help_blas3_perf_test() { @@ -229,6 +230,14 @@ static void __print_help_blas3_perf_test() { "that evenly divides the batch size. " "(default: %d)\n", DEFAULT_NINTER); + + printf("\t-u, --use_simd=SIMD\n"); + printf( + "\t\tWhether to use SIMD views.\n"); + printf( + "\t\t\tValid values for SIMD are 1 to use SIMD views and 0 to use non-SIMD" + "views instead. 
(default: %d)\n", + DEFAULT_USE_SIMD); } static void __blas3_perf_test_input_error(char ** /*argv*/, char short_opt, @@ -283,6 +292,7 @@ int main(int argc, char **argv) { options.blas_args.batch_size_last_dim = DEFAULT_BATCH_SIZE_LAST_DIM; options.verify = DEFAULT_VERIFY; options.ninter = DEFAULT_NINTER; + options.use_simd = DEFAULT_USE_SIMD; options.blas_args.trmm.trmm_args = DEFAULT_TRMM_ARGS; options.blas_args.trmm.alpha = DEFAULT_TRMM_ALPHA; @@ -292,7 +302,7 @@ int main(int argc, char **argv) { options.blas_args.gemm.beta = DEFAULT_GEMM_BETA; while ( - (ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:u:p:d:v:j:", + (ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:u:p:d:v:j:f:", long_options, &option_idx)) != -1) { switch (ret) { case 'h': __print_help_blas3_perf_test(); return 0; @@ -419,6 +429,7 @@ int main(int argc, char **argv) { case 'z': options.blas_args.team_size = atoi(optarg); break; case 'n': options.blas_args.vector_len = atoi(optarg); break; case 'u': options.blas_args.use_auto = atoi(optarg); break; + case 'f': options.use_simd = atoi(optarg); break; case 'c': out_file = optarg; options.out_file = std::string(out_file); From cfb762f14bd6dae1de9ee16b3f482806cadc38ce Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 1 Oct 2021 14:18:23 -0600 Subject: [PATCH 005/300] src/batched: - Add simd support for KK_SQUARE. perf_test/blas/blas3: - pass simd views for batched heuristic. 
--- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 72 ++++++++++++++----- src/batched/dense/KokkosBatched_Gemm_Decl.hpp | 1 + 2 files changed, 57 insertions(+), 16 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 87d0cc0bdc..73523bff38 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -485,45 +485,85 @@ void __do_gemm_parallel_batched_heuristic_template(options_t options, // using C = Trans::ConjTranspose; STATUS; - if (a == 'N' && b == 'N') { if (options.blas_args.batch_size_last_dim) - KokkosBatched::BatchedGemm( - &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, - gemm_args.beta, gemm_args.C); + if (options.use_simd) + KokkosBatched::BatchedGemm( + &batchedGemmHandle, gemm_args.alpha, gemm_args.Av.vec_3d, + gemm_args.Bv.vec_3d, gemm_args.beta, gemm_args.Cv.vec_3d); + else + KokkosBatched::BatchedGemm( + &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, + gemm_args.beta, gemm_args.C); else - KokkosBatched::BatchedGemm( - &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, - gemm_args.beta, gemm_args.C); + if (options.use_simd) + KokkosBatched::BatchedGemm( + &batchedGemmHandle, gemm_args.alpha, gemm_args.Av.vec_3d, + gemm_args.Bv.vec_3d, gemm_args.beta, gemm_args.Cv.vec_3d); + else + KokkosBatched::BatchedGemm( + &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, + gemm_args.beta, gemm_args.C); + } else if (a == 'N' && b == 'T') { if (options.blas_args.batch_size_last_dim) + if (options.use_simd) KokkosBatched::BatchedGemm( - &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, - gemm_args.beta, gemm_args.C); + &batchedGemmHandle, gemm_args.alpha, gemm_args.Av.vec_3d, + gemm_args.Bv.vec_3d, gemm_args.beta, gemm_args.Cv.vec_3d); else - KokkosBatched::BatchedGemm( + KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.A, 
gemm_args.B, gemm_args.beta, gemm_args.C); + else + if (options.use_simd) + KokkosBatched::BatchedGemm( + &batchedGemmHandle, gemm_args.alpha, gemm_args.Av.vec_3d, + gemm_args.Bv.vec_3d, gemm_args.beta, gemm_args.Cv.vec_3d); + else + KokkosBatched::BatchedGemm( + &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, + gemm_args.beta, gemm_args.C); //} else if (a == 'N' && b == 'C') { // __do_gemm_serial_batched_template(options, gemm_args); } else if (a == 'T' && b == 'N') { if (options.blas_args.batch_size_last_dim) + if (options.use_simd) KokkosBatched::BatchedGemm( - &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, - gemm_args.beta, gemm_args.C); + &batchedGemmHandle, gemm_args.alpha, gemm_args.Av.vec_3d, + gemm_args.Bv.vec_3d, gemm_args.beta, gemm_args.Cv.vec_3d); else - KokkosBatched::BatchedGemm( + KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, gemm_args.beta, gemm_args.C); + else + if (options.use_simd) + KokkosBatched::BatchedGemm( + &batchedGemmHandle, gemm_args.alpha, gemm_args.Av.vec_3d, + gemm_args.Bv.vec_3d, gemm_args.beta, gemm_args.Cv.vec_3d); + else + KokkosBatched::BatchedGemm( + &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, + gemm_args.beta, gemm_args.C); } else if (a == 'T' && b == 'T') { if (options.blas_args.batch_size_last_dim) + if (options.use_simd) KokkosBatched::BatchedGemm( - &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, - gemm_args.beta, gemm_args.C); + &batchedGemmHandle, gemm_args.alpha, gemm_args.Av.vec_3d, + gemm_args.Bv.vec_3d, gemm_args.beta, gemm_args.Cv.vec_3d); else - KokkosBatched::BatchedGemm( + KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, gemm_args.beta, gemm_args.C); + else + if (options.use_simd) + KokkosBatched::BatchedGemm( + &batchedGemmHandle, gemm_args.alpha, gemm_args.Av.vec_3d, + gemm_args.Bv.vec_3d, gemm_args.beta, gemm_args.Cv.vec_3d); + else + 
KokkosBatched::BatchedGemm( + &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, + gemm_args.beta, gemm_args.C); //} else if (a == 'T' && b == 'C') { // __do_gemm_serial_batched_template(options, gemm_args); //} else if (a == 'C' && b == 'N') { diff --git a/src/batched/dense/KokkosBatched_Gemm_Decl.hpp b/src/batched/dense/KokkosBatched_Gemm_Decl.hpp index 51d3004e88..bc499db7a3 100644 --- a/src/batched/dense/KokkosBatched_Gemm_Decl.hpp +++ b/src/batched/dense/KokkosBatched_Gemm_Decl.hpp @@ -330,6 +330,7 @@ int BatchedGemm(BatchedGemmHandleType *const handle, const ScalarType alpha, // For SIMD views, we can have either 3-rank or 4-ranks inputs. switch (handle->get_kernel_algo_type()) { case BaseKokkosBatchedAlgos::KK_SERIAL: + case BaseHeuristicAlgos::SQUARE: static_assert(static_cast(AViewType::rank) == 3, "AViewType must have rank 3."); static_assert(static_cast(BViewType::rank) == 3, From b309fd26091bf70d923f557ec68543e40877be63 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 5 Oct 2021 09:31:44 -0600 Subject: [PATCH 006/300] src/batched: - Add debug prints for KK_SQUARE. perf_test/blas/blas3: - Use srun on SKX and A64FX. 
--- .../blas3/KokkosBatched_BatchedGemm_benchmark.sh | 12 ++++++------ src/batched/dense/KokkosBatched_Gemm_Decl.hpp | 7 +++++++ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh b/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh index 4514b6be5d..2f46f1d81f 100755 --- a/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh +++ b/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh @@ -114,9 +114,9 @@ elif [ "$arch_names" == "A64FX " ]; then -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF \ $KOKKOSKERNELS_SRC_DIR 2>&1 | tee -a kokkoskernels_config_cmd.out" - kokkos_build_cmd="salloc --time=2:00:00 -N1 $KOKKOS_BUILD_DIR/build.sh" - kokkoskernels_build_cmd="salloc --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/build.sh" - benchmark_cmd="salloc --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/bench.sh" + kokkos_build_cmd="srun --time=2:00:00 -N1 $KOKKOS_BUILD_DIR/build.sh" + kokkoskernels_build_cmd="srun --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/build.sh" + benchmark_cmd="srun --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/bench.sh" elif [ "$arch_names" == "SKX " ]; then export OMP_PROC_BIND=close export OMP_PLACES=cores @@ -138,9 +138,9 @@ elif [ "$arch_names" == "SKX " ]; then -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF \ $KOKKOSKERNELS_SRC_DIR 2>&1 | tee -a kokkoskernels_config_cmd.out" - kokkos_build_cmd="salloc --time=2:00:00 -N1 $KOKKOS_BUILD_DIR/build.sh" - kokkoskernels_build_cmd="salloc --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/build.sh" - benchmark_cmd="salloc --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/bench.sh" + kokkos_build_cmd="srun --time=2:00:00 -N1 $KOKKOS_BUILD_DIR/build.sh" + kokkoskernels_build_cmd="srun --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/build.sh" + benchmark_cmd="srun --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/bench.sh" use_simd="--use_simd=1" else echo "Invalid arch: $arch_names" 
diff --git a/src/batched/dense/KokkosBatched_Gemm_Decl.hpp b/src/batched/dense/KokkosBatched_Gemm_Decl.hpp index bc499db7a3..5ff6d1cdc4 100644 --- a/src/batched/dense/KokkosBatched_Gemm_Decl.hpp +++ b/src/batched/dense/KokkosBatched_Gemm_Decl.hpp @@ -453,6 +453,13 @@ int BatchedGemm(BatchedGemmHandleType *const handle, const ScalarType alpha, Algo::Gemm::Blocked>::type>::type>:: type; + if (handle->enableDebug) { + std::cout << "bsgResultsPerThread: " + << typeid(bsgResultsPerThread).name() << std::endl + << "bsgModeType: " + << typeid(bsgModeType).name() << std::endl; + } + // if (on_gpu && c_m >= 20 && // (alpha == 1.0F && beta == 0.0F) ? c_m <= 24 : c_m <= 21) { // // TODO: invoke TeamShmem From d30d2737116a5fb7d249506a9d80b340e5111170 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Thu, 7 Oct 2021 11:23:22 -0600 Subject: [PATCH 007/300] blas3: rps version of gemm test --- perf_test/CMakeLists.txt | 1 + perf_test/KokkosKernelsTrackedTesting.cpp | 3 + .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 133 +----------------- .../KokkosBlas3_gemm_tracked_perf_test.cpp | 125 ++++++++++++++++ .../KokkosBlas3_gemm_tracked_perf_test.hpp | 86 +++++++++++ perf_test/blas/blas3/tracked_testing.hpp | 58 ++++++++ 6 files changed, 280 insertions(+), 126 deletions(-) create mode 100644 perf_test/blas/blas3/KokkosBlas3_gemm_tracked_perf_test.cpp create mode 100644 perf_test/blas/blas3/KokkosBlas3_gemm_tracked_perf_test.hpp create mode 100644 perf_test/blas/blas3/tracked_testing.hpp diff --git a/perf_test/CMakeLists.txt b/perf_test/CMakeLists.txt index d9ec2a34d9..91dc727867 100644 --- a/perf_test/CMakeLists.txt +++ b/perf_test/CMakeLists.txt @@ -35,6 +35,7 @@ if (KokkosKernels_ENABLE_TESTS_AND_PERFSUITE) blas/blas2/KokkosBlas2_gemv_tracked_perf_test.cpp blas/blas1/KokkosBlas_dot_tracked_perf_test.cpp blas/blas1/KokkosBlas_team_dot_tracked_perf_test.cpp + blas/blas3/KokkosBlas3_gemm_tracked_perf_test.cpp PerfTestUtilities.cpp sparse/spmv/OpenMPSmartStatic_SPMV.cpp #sparse / 
KokkosSparse_spgemm_test.cpp diff --git a/perf_test/KokkosKernelsTrackedTesting.cpp b/perf_test/KokkosKernelsTrackedTesting.cpp index ffb7f98447..10fb834270 100644 --- a/perf_test/KokkosKernelsTrackedTesting.cpp +++ b/perf_test/KokkosKernelsTrackedTesting.cpp @@ -9,6 +9,7 @@ // For RPS version of BLAS Level-1 Tests #include "blas/blas1/tracked_testing.hpp" #include "blas/blas2/tracked_testing.hpp" +#include "blas/blas3/tracked_testing.hpp" int main(int argc, char* argv[]) { { // argument parsing for setting input data at runtime @@ -55,6 +56,8 @@ int main(int argc, char* argv[]) { test::blas2::build_blas2_executor(exec, argc, argv, run_params); + test::blas3::build_blas3_executor(exec, argc, argv, run_params); + exec.setupSuite(); // STEP 3: Report suite run summary diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 58962c728b..347e5b4a54 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -79,7 +79,6 @@ void do_gemm_serial_batched_blocked(options_t options); // void do_gemm_serial_blas_parallel(options_t options); // Not valid! The KokkosBlas::gemm function may take the entire device per // invocation! 
-void do_gemm_heuristic_batched_parallel(options_t options); void do_gemm_serial_batched_parallel(options_t options); void do_gemm_serial_batched_blocked_parallel(options_t options); void do_gemm_serial_simd_batched_parallel(options_t options); @@ -118,8 +117,7 @@ void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = { NULL, // Serial Experiment }, { - NULL, // BLAS - do_gemm_heuristic_batched_parallel, + NULL, // BLAS do_gemm_serial_batched_parallel, // Serial do_gemm_serial_batched_blocked_parallel, do_gemm_serial_simd_batched_parallel, @@ -287,18 +285,12 @@ static inline std::string __gemm_output_dim_string(options_t options, static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, double time_in_seconds, - const char *experiment_name = nullptr, - const char *team_size = nullptr, - const char *vec_len = nullptr, - const char *vec_type = nullptr) { - std::string algo_name = !experiment_name ? test_e_str[options.test] - : std::string(experiment_name); - std::string ts = !team_size ? std::to_string(gemm_args.bp.team_size) - : std::string(team_size); - std::string vlen = - !vec_len ? std::to_string(gemm_args.bp.vector_len) : std::string(vec_len); - std::string vtype = - !vec_type ? 
internal_vector_type::label() : std::string(vec_type); + const char *experiment_name = nullptr) { + std::string algo_name = test_e_str[options.test]; + std::string ts = std::to_string(gemm_args.bp.team_size); + std::string vlen = std::to_string(gemm_args.bp.vector_len); + std::string vtype = internal_vector_type::label(); + if (experiment_name) algo_name = std::string(experiment_name); if (options.blas_args.use_auto) ts = vlen = "Kokkos::AUTO"; double flops; @@ -476,93 +468,6 @@ void __do_gemm_serial_batched(options_t options, gemm_args_t gemm_args) { return; } -template -void __do_gemm_parallel_batched_heuristic_template(options_t options, - gemm_args_t gemm_args) { - BatchedGemmHandle batchedGemmHandle(BaseHeuristicAlgos::SQUARE); - char a = toupper(gemm_args.transA); - char b = toupper(gemm_args.transB); - using N = Trans::NoTranspose; - using T = Trans::Transpose; - // using C = Trans::ConjTranspose; - - STATUS; - - if (a == 'N' && b == 'N') { - if (options.blas_args.batch_size_last_dim) - KokkosBatched::BatchedGemm( - &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, - gemm_args.beta, gemm_args.C); - else - KokkosBatched::BatchedGemm( - &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, - gemm_args.beta, gemm_args.C); - } else if (a == 'N' && b == 'T') { - if (options.blas_args.batch_size_last_dim) - KokkosBatched::BatchedGemm( - &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, - gemm_args.beta, gemm_args.C); - else - KokkosBatched::BatchedGemm( - &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, - gemm_args.beta, gemm_args.C); - //} else if (a == 'N' && b == 'C') { - // __do_gemm_serial_batched_template(options, gemm_args); - } else if (a == 'T' && b == 'N') { - if (options.blas_args.batch_size_last_dim) - KokkosBatched::BatchedGemm( - &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, - gemm_args.beta, gemm_args.C); - else - KokkosBatched::BatchedGemm( - &batchedGemmHandle, gemm_args.alpha, 
gemm_args.A, gemm_args.B, - gemm_args.beta, gemm_args.C); - } else if (a == 'T' && b == 'T') { - if (options.blas_args.batch_size_last_dim) - KokkosBatched::BatchedGemm( - &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, - gemm_args.beta, gemm_args.C); - else - KokkosBatched::BatchedGemm( - &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, - gemm_args.beta, gemm_args.C); - //} else if (a == 'T' && b == 'C') { - // __do_gemm_serial_batched_template(options, gemm_args); - //} else if (a == 'C' && b == 'N') { - // __do_gemm_serial_batched_template(options, gemm_args); - //} else if (a == 'C' && b == 'T') { - // __do_gemm_serial_batched_template(options, gemm_args); - //} else if (a == 'C' && b == 'C') { - // __do_gemm_serial_batched_template(options, gemm_args); - } else { - FATAL_ERROR("Bad gemm_args TransA or TransB value"); - } -} - -template -void __do_gemm_parallel_batched_heuristic(options_t options, - gemm_args_t gemm_args) { - Kokkos::Timer timer; - - for (uint32_t i = 0; i < options.warm_up_n; ++i) - __do_gemm_parallel_batched_heuristic_template( - options, gemm_args); - Kokkos::fence(); - - timer.reset(); - for (uint32_t i = 0; i < options.n; ++i) - __do_gemm_parallel_batched_heuristic_template( - options, gemm_args); - Kokkos::fence(); - - __gemm_output_csv_row(options, gemm_args, timer.seconds(), nullptr, "-", "-", - "-"); -} - template struct parallel_batched_gemm_range_policy { gemm_args_t gemm_args_; @@ -1602,12 +1507,6 @@ template static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstViewType dst, options_t options) { - // clang-format off - // Related issue: https://github.com/kokkos/kokkos-kernels/issues/998 - // CUDA VERSION 10.2.2 generates a compiler error: - // KokkosBlas3_gemm_perf_test.hpp: error: ‘h_subview_type_2d’ was not declared in this scope - // clang-format on -#if (CUDA_VERSION != 10020) using dst_scalar_type = typename dstViewType::value_type; using src_scalar_type = typename 
view_type_5d::value_type; size_t remainder, vector_batch_size, simd_batch_size, last_batch; @@ -1735,10 +1634,6 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, out: Kokkos::deep_copy(dst, h_dst); Kokkos::fence(); -#else - Kokkos::abort( - "Cannot perform simd verification with cuda/10.2.2, rerun with -v 0"); -#endif // #if (CUDA_VERSION != 10020) } /** @@ -2213,20 +2108,6 @@ void do_gemm_serial_batched_blocked(options_t options) { return; } -void do_gemm_heuristic_batched_parallel(options_t options) { - STATUS; - if (options.blas_args.use_auto) { - fprintf(stderr, "ERROR: --test=%s does not support --use_auto=%d\n", - test_e_str[options.test].c_str(), (int)options.blas_args.use_auto); - exit(-EINVAL); - } - - __do_loop_and_invoke( - options, - __do_gemm_parallel_batched_heuristic); - return; -} - void do_gemm_serial_batched_parallel(options_t options) { STATUS; if (options.blas_args.batch_size_last_dim) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_tracked_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_gemm_tracked_perf_test.cpp new file mode 100644 index 0000000000..e1bf74ecaa --- /dev/null +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_tracked_perf_test.cpp @@ -0,0 +1,125 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include +#include + +// Required for tracked_testing version +#include "KokkosBlas3_gemm_tracked_perf_test.hpp" + +#ifdef KOKKOSKERNELS_ENABLE_TESTS_AND_PERFSUITE +#include +#endif + +// API ref for "General Matrix Multiplication" (gemm) +// https://github.com/kokkos/kokkos-kernels/wiki/BLAS-3%3A%3Agemm + +// Usage: KokkosBlas::gemm(modeA, modeB, alpha, A, B, beta, C); +/* +* transA [in] "N" for non-transpose, "T" for transpose, +* "C" for conjugate transpose. +* All characters after the first are ignored. This works just like the BLAS +routines. +* +* transB [in] "N" for non-transpose, +* "T" for transpose, +* "C" for conjugate transpose. 
+* All characters after the first are ignored. This works just like the BLAS +routines. + +* alpha [in] Input coefficient of A*x +* A [in] Input matrix, as a 2-D Kokkos::View +* B [in] Input matrix, as a 2-D Kokkos::View +* beta [in] Input coefficient of C +* C [in/out] Output vector, as a nonconst 2-D Kokkos::View + +*/ + +// Define setup_test +template +testData_gemm setup_test(int m, int n, int k, + int repeat) { + testData_gemm testData_gemm_obj(m, n, k, repeat); + + return testData_gemm_obj; +} + +test_list construct_gemm_kernel_base(const rajaperf::RunParams& run_params, + const std::vector& m_n_k_vect) + +{ + // instantiate kernel_base_vector as type test_list + // kernel_base_vector will contain which tests to run, and data to run them + test_list kernel_base_vector; + + for (const auto& value : m_n_k_vect) { + kernel_base_vector.push_back(rajaperf::make_kernel_base( + "BLAS3_GEMM_" + std::to_string(value.m) + "_" + + std::to_string(value.n) + "_" + std::to_string(value.k), + run_params, + // setup_test lambda captures by value; + // Mapping Kokkos features to RAJAPerf Suite + // repeat = runreps (RAJAPerf Suite) + // m = getActualRunSize() (RAJAPerf Suite) + [=](const int repeat, const int) { + // returns a tuple of testData objects + return std::make_tuple( + // setup_test is templated on ExecSpace and Layout + setup_test( + value.m, value.n, value.k, repeat)); + }, + // the run lambda will capture the returned setup_test tuple by + // reference + [&](const int iteration, const int runsize, auto& data) { + // KokkosBlas::gemm(modeA, modeB, alpha, A, B, beta, C); + KokkosBlas::gemm("N", "N", 1.0, data.A, data.B, 0.0, data.C); + })); + } + + // return a vector of kernel base objects of type test_list + return kernel_base_vector; +} diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_tracked_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_tracked_perf_test.hpp new file mode 100644 index 0000000000..3eac236f5c --- /dev/null +++ 
b/perf_test/blas/blas3/KokkosBlas3_gemm_tracked_perf_test.hpp @@ -0,0 +1,86 @@ + +// This file is for the "tracked test" version of +// a Kokkos Kernels performance test. +// Created by David Poliakoff and Amy Powell on 9/22/2021 + +#ifndef KOKKOSKERNELS_KOKKOSBLAS_GEMM_TEST_RPS_HPP +#define KOKKOSKERNELS_KOKKOSBLAS_GEMM_TEST_RPS_HPP + +#include +#include "blas/KokkosBlas3_gemm.hpp" +#include + +// These headers are required for RPS tracked perf testing +#ifdef KOKKOSKERNELS_ENABLE_TESTS_AND_PERFSUITE +#include + +struct gemmConfig { + int m; + int n; + int k; +}; + +test_list construct_gemm_kernel_base(const rajaperf::RunParams& run_params, + const std::vector& n_k_vect); + +#endif // KOKKOSKERNELS_ENABLE_TESTS_AND_PERFSUITE + +// Templating on these three types, mirroring +template +struct testData_gemm { + // Data for running tests + // m is the number of rows in A + int m = 1000; + // n is the number of columns in A; + // n is the number of rows in B; + int n = 1000; + // k is the number of columns in B; + int k = 1000; + int repeat = 1; + + std::string modeA = "N"; + std::string modeB = "N"; + + // coefficients for values in A and B + double alpha = 1.0; + double beta = 0.0; + + using Scalar = double; + using MemSpace = typename ExecSpace::memory_space; + using Device = Kokkos::Device; + + // Create 2D Kokoks::View, "A" containing an input matrix with m x n + // dimensions + Kokkos::View A; + + // Create 2D Kokkos::View, "B" containing an input matrix with n x k + // dimensions + Kokkos::View B; + + // Create 2D Kokkos::View, "C" , the resultant matrix + Kokkos::View C; + + // class Constructor: + testData_gemm(int m_in, int n_in, int k_in, int repeat_in) + : m(m_in), n(n_in), k(k_in), repeat(repeat_in) { + // You must set A, B and C equal to its intended value + A = Kokkos::View( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "A"), m, n); + B = Kokkos::View( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"), n, k); + C = Kokkos::View( + 
Kokkos::view_alloc(Kokkos::WithoutInitializing, "C"), m, k); + // Seed random number generation + Kokkos::Random_XorShift64_Pool pool(123); + // Fill input matrices A and x with 10 random values from pool + Kokkos::fill_random(A, pool, 10.0); + Kokkos::fill_random(B, pool, 10.0); + } +}; + +// Declare setup_test +template +testData_gemm setup_test(int m, int n, int k, + int repeat); + +#endif // KOKKOSKERNELS_KOKKOSBLAS_GEMM_TEST_RPS_HPP diff --git a/perf_test/blas/blas3/tracked_testing.hpp b/perf_test/blas/blas3/tracked_testing.hpp new file mode 100644 index 0000000000..a899fa91f1 --- /dev/null +++ b/perf_test/blas/blas3/tracked_testing.hpp @@ -0,0 +1,58 @@ +// +// Created by Poliakoff, David Zoeller on 4/26/21. +// +#ifndef KOKKOSKERNELS_BLAS3_TRACKED_TESTING_HPP +#define KOKKOSKERNELS_BLAS3_TRACKED_TESTING_HPP + +#include +#include + +#include "KokkosBlas3_gemm_tracked_perf_test.hpp" + +/* + *Three cases to test: + * + * 1) m = n = k + * 2) one case for m, n, k all pretty large, + * 3) and another for m, k small but n large + * + * You could use m = k = 5, n = 1 million or something like that for dot based + * gemm + * + */ + +namespace test { +namespace blas3 { + +// Change n and k values in the context of the backend +template +std::vector create_m_n_k_vect() { + std::string exec_space_name = ExecSpace::name(); + + return { + + // m = n = k; one case for m, n, k all pretty large, + {1000, 1000, 1000}, + // and another for m, k small but n large + {5, 1000000, 5}}; +} + +// Register kernels for a specific test +void build_gemm_executor(rajaperf::Executor& exec, int argc, char* argv[], + const rajaperf::RunParams& params) { + for (auto* kernel : construct_gemm_kernel_base( + params, create_m_n_k_vect())) { + exec.registerKernel("BLAS3", kernel); + } +} + +void build_blas3_executor(rajaperf::Executor& exec, int argc, char* argv[], + const rajaperf::RunParams& params) { + exec.registerGroup("BLAS3"); + build_gemm_executor(exec, argc, argv, params); +} + +} // 
namespace blas3 +} // namespace test + +#endif // KOKKOSKERNELS_TRACKED_TESTING_HPP From 58f46d47a5062cb5139d949da80fb60e393e380d Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Fri, 8 Oct 2021 10:16:13 -0600 Subject: [PATCH 008/300] Addressing BMK PR#1132 comments: all comments accepted --- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 133 +++++++++++++++++- .../KokkosBlas3_gemm_tracked_perf_test.hpp | 50 ++++++- 2 files changed, 175 insertions(+), 8 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 347e5b4a54..58962c728b 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -79,6 +79,7 @@ void do_gemm_serial_batched_blocked(options_t options); // void do_gemm_serial_blas_parallel(options_t options); // Not valid! The KokkosBlas::gemm function may take the entire device per // invocation! +void do_gemm_heuristic_batched_parallel(options_t options); void do_gemm_serial_batched_parallel(options_t options); void do_gemm_serial_batched_blocked_parallel(options_t options); void do_gemm_serial_simd_batched_parallel(options_t options); @@ -117,7 +118,8 @@ void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = { NULL, // Serial Experiment }, { - NULL, // BLAS + NULL, // BLAS + do_gemm_heuristic_batched_parallel, do_gemm_serial_batched_parallel, // Serial do_gemm_serial_batched_blocked_parallel, do_gemm_serial_simd_batched_parallel, @@ -285,12 +287,18 @@ static inline std::string __gemm_output_dim_string(options_t options, static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, double time_in_seconds, - const char *experiment_name = nullptr) { - std::string algo_name = test_e_str[options.test]; - std::string ts = std::to_string(gemm_args.bp.team_size); - std::string vlen = std::to_string(gemm_args.bp.vector_len); - std::string vtype = internal_vector_type::label(); - if (experiment_name) algo_name 
= std::string(experiment_name); + const char *experiment_name = nullptr, + const char *team_size = nullptr, + const char *vec_len = nullptr, + const char *vec_type = nullptr) { + std::string algo_name = !experiment_name ? test_e_str[options.test] + : std::string(experiment_name); + std::string ts = !team_size ? std::to_string(gemm_args.bp.team_size) + : std::string(team_size); + std::string vlen = + !vec_len ? std::to_string(gemm_args.bp.vector_len) : std::string(vec_len); + std::string vtype = + !vec_type ? internal_vector_type::label() : std::string(vec_type); if (options.blas_args.use_auto) ts = vlen = "Kokkos::AUTO"; double flops; @@ -468,6 +476,93 @@ void __do_gemm_serial_batched(options_t options, gemm_args_t gemm_args) { return; } +template +void __do_gemm_parallel_batched_heuristic_template(options_t options, + gemm_args_t gemm_args) { + BatchedGemmHandle batchedGemmHandle(BaseHeuristicAlgos::SQUARE); + char a = toupper(gemm_args.transA); + char b = toupper(gemm_args.transB); + using N = Trans::NoTranspose; + using T = Trans::Transpose; + // using C = Trans::ConjTranspose; + + STATUS; + + if (a == 'N' && b == 'N') { + if (options.blas_args.batch_size_last_dim) + KokkosBatched::BatchedGemm( + &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, + gemm_args.beta, gemm_args.C); + else + KokkosBatched::BatchedGemm( + &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, + gemm_args.beta, gemm_args.C); + } else if (a == 'N' && b == 'T') { + if (options.blas_args.batch_size_last_dim) + KokkosBatched::BatchedGemm( + &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, + gemm_args.beta, gemm_args.C); + else + KokkosBatched::BatchedGemm( + &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, + gemm_args.beta, gemm_args.C); + //} else if (a == 'N' && b == 'C') { + // __do_gemm_serial_batched_template(options, gemm_args); + } else if (a == 'T' && b == 'N') { + if (options.blas_args.batch_size_last_dim) + 
KokkosBatched::BatchedGemm( + &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, + gemm_args.beta, gemm_args.C); + else + KokkosBatched::BatchedGemm( + &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, + gemm_args.beta, gemm_args.C); + } else if (a == 'T' && b == 'T') { + if (options.blas_args.batch_size_last_dim) + KokkosBatched::BatchedGemm( + &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, + gemm_args.beta, gemm_args.C); + else + KokkosBatched::BatchedGemm( + &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, + gemm_args.beta, gemm_args.C); + //} else if (a == 'T' && b == 'C') { + // __do_gemm_serial_batched_template(options, gemm_args); + //} else if (a == 'C' && b == 'N') { + // __do_gemm_serial_batched_template(options, gemm_args); + //} else if (a == 'C' && b == 'T') { + // __do_gemm_serial_batched_template(options, gemm_args); + //} else if (a == 'C' && b == 'C') { + // __do_gemm_serial_batched_template(options, gemm_args); + } else { + FATAL_ERROR("Bad gemm_args TransA or TransB value"); + } +} + +template +void __do_gemm_parallel_batched_heuristic(options_t options, + gemm_args_t gemm_args) { + Kokkos::Timer timer; + + for (uint32_t i = 0; i < options.warm_up_n; ++i) + __do_gemm_parallel_batched_heuristic_template( + options, gemm_args); + Kokkos::fence(); + + timer.reset(); + for (uint32_t i = 0; i < options.n; ++i) + __do_gemm_parallel_batched_heuristic_template( + options, gemm_args); + Kokkos::fence(); + + __gemm_output_csv_row(options, gemm_args, timer.seconds(), nullptr, "-", "-", + "-"); +} + template struct parallel_batched_gemm_range_policy { gemm_args_t gemm_args_; @@ -1507,6 +1602,12 @@ template static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstViewType dst, options_t options) { + // clang-format off + // Related issue: https://github.com/kokkos/kokkos-kernels/issues/998 + // CUDA VERSION 10.2.2 generates a compiler error: + // KokkosBlas3_gemm_perf_test.hpp: error: 
‘h_subview_type_2d’ was not declared in this scope + // clang-format on +#if (CUDA_VERSION != 10020) using dst_scalar_type = typename dstViewType::value_type; using src_scalar_type = typename view_type_5d::value_type; size_t remainder, vector_batch_size, simd_batch_size, last_batch; @@ -1634,6 +1735,10 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, out: Kokkos::deep_copy(dst, h_dst); Kokkos::fence(); +#else + Kokkos::abort( + "Cannot perform simd verification with cuda/10.2.2, rerun with -v 0"); +#endif // #if (CUDA_VERSION != 10020) } /** @@ -2108,6 +2213,20 @@ void do_gemm_serial_batched_blocked(options_t options) { return; } +void do_gemm_heuristic_batched_parallel(options_t options) { + STATUS; + if (options.blas_args.use_auto) { + fprintf(stderr, "ERROR: --test=%s does not support --use_auto=%d\n", + test_e_str[options.test].c_str(), (int)options.blas_args.use_auto); + exit(-EINVAL); + } + + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched_heuristic); + return; +} + void do_gemm_serial_batched_parallel(options_t options) { STATUS; if (options.blas_args.batch_size_last_dim) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_tracked_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_tracked_perf_test.hpp index 3eac236f5c..7c2279c672 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_tracked_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_tracked_perf_test.hpp @@ -1,3 +1,47 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + // This file is for the "tracked test" version of // a Kokkos Kernels performance test. 
@@ -41,7 +85,11 @@ struct testData_gemm { std::string modeA = "N"; std::string modeB = "N"; - // coefficients for values in A and B + // Usage: KokkosBlas::gemm(modeA, modeB, alpha, A, B, beta, C); + // Dense matrix-matrix multiply: C = beta*C + alpha*op(A)*op(B); + // alpha [in] Input coefficient of op(A)*op(B) + // beta [in] Input coefficient of C + double alpha = 1.0; double beta = 0.0; From 015d82a30e4971b7a3a97ff07022ce30e11845f3 Mon Sep 17 00:00:00 2001 From: Kim Liegeois Date: Fri, 8 Oct 2021 15:19:03 -0600 Subject: [PATCH 009/300] Add batched axpy and batched spmv (#1092) * Add batched dense axpy and batched spmv --- src/batched/KokkosBatched_Util.hpp | 26 ++ src/batched/dense/KokkosBatched_Axpy.hpp | 148 +++++++ .../dense/impl/KokkosBatched_Axpy_Impl.hpp | 382 ++++++++++++++++++ src/batched/sparse/KokkosBatched_Spmv.hpp | 296 ++++++++++++++ .../impl/KokkosBatched_Spmv_Serial_Impl.hpp | 183 +++++++++ .../KokkosBatched_Spmv_TeamVector_Impl.hpp | 210 ++++++++++ .../impl/KokkosBatched_Spmv_Team_Impl.hpp | 210 ++++++++++ .../batched/dense/Test_Batched_Dense.hpp | 9 + .../batched/dense/Test_Batched_SerialAxpy.hpp | 142 +++++++ .../dense/Test_Batched_SerialAxpy_Complex.hpp | 10 + .../dense/Test_Batched_SerialAxpy_Real.hpp | 12 + .../batched/dense/Test_Batched_TeamAxpy.hpp | 150 +++++++ .../dense/Test_Batched_TeamAxpy_Complex.hpp | 10 + .../dense/Test_Batched_TeamAxpy_Real.hpp | 12 + .../dense/Test_Batched_TeamVectorAxpy.hpp | 150 +++++++ .../Test_Batched_TeamVectorAxpy_Complex.hpp | 10 + .../Test_Batched_TeamVectorAxpy_Real.hpp | 12 + .../sparse/Test_Batched_SerialSpmv.hpp | 237 +++++++++++ .../sparse/Test_Batched_SerialSpmv_Real.hpp | 14 + .../batched/sparse/Test_Batched_Sparse.hpp | 9 +- .../batched/sparse/Test_Batched_TeamSpmv.hpp | 245 +++++++++++ .../sparse/Test_Batched_TeamSpmv_Real.hpp | 14 + .../sparse/Test_Batched_TeamVectorSpmv.hpp | 244 +++++++++++ .../Test_Batched_TeamVectorSpmv_Real.hpp | 14 + 24 files changed, 2746 insertions(+), 3 deletions(-) 
create mode 100644 src/batched/dense/KokkosBatched_Axpy.hpp create mode 100644 src/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp create mode 100644 src/batched/sparse/KokkosBatched_Spmv.hpp create mode 100644 src/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp create mode 100644 src/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp create mode 100644 src/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp create mode 100644 unit_test/batched/dense/Test_Batched_SerialAxpy.hpp create mode 100644 unit_test/batched/dense/Test_Batched_SerialAxpy_Complex.hpp create mode 100644 unit_test/batched/dense/Test_Batched_SerialAxpy_Real.hpp create mode 100644 unit_test/batched/dense/Test_Batched_TeamAxpy.hpp create mode 100644 unit_test/batched/dense/Test_Batched_TeamAxpy_Complex.hpp create mode 100644 unit_test/batched/dense/Test_Batched_TeamAxpy_Real.hpp create mode 100644 unit_test/batched/dense/Test_Batched_TeamVectorAxpy.hpp create mode 100644 unit_test/batched/dense/Test_Batched_TeamVectorAxpy_Complex.hpp create mode 100644 unit_test/batched/dense/Test_Batched_TeamVectorAxpy_Real.hpp create mode 100644 unit_test/batched/sparse/Test_Batched_SerialSpmv.hpp create mode 100644 unit_test/batched/sparse/Test_Batched_SerialSpmv_Real.hpp create mode 100644 unit_test/batched/sparse/Test_Batched_TeamSpmv.hpp create mode 100644 unit_test/batched/sparse/Test_Batched_TeamSpmv_Real.hpp create mode 100644 unit_test/batched/sparse/Test_Batched_TeamVectorSpmv.hpp create mode 100644 unit_test/batched/sparse/Test_Batched_TeamVectorSpmv_Real.hpp diff --git a/src/batched/KokkosBatched_Util.hpp b/src/batched/KokkosBatched_Util.hpp index c19e9512c4..ae074b54ac 100644 --- a/src/batched/KokkosBatched_Util.hpp +++ b/src/batched/KokkosBatched_Util.hpp @@ -666,6 +666,32 @@ namespace KokkosBatched { } }; + template + KOKKOS_INLINE_FUNCTION + typename std::enable_if::value, void>::type + getIndices(const OrdinalType iTemp, + const OrdinalType /*numRows*/, + const OrdinalType 
numMatrices, + OrdinalType &iRow, + OrdinalType &iMatrix) { + iRow = iTemp / numMatrices; + iMatrix = iTemp % numMatrices; + } + + template + KOKKOS_INLINE_FUNCTION + typename std::enable_if::value, void>::type + getIndices(const OrdinalType iTemp, + const OrdinalType numRows, + const OrdinalType /*numMatrices*/, + OrdinalType &iRow, + OrdinalType &iMatrix) { + iRow = iTemp % numRows; + iMatrix = iTemp / numRows; + } + template KOKKOS_INLINE_FUNCTION auto transpose_2d_view(ViewType v, const int *order) { constexpr int rank = 2; diff --git a/src/batched/dense/KokkosBatched_Axpy.hpp b/src/batched/dense/KokkosBatched_Axpy.hpp new file mode 100644 index 0000000000..bdaa3d0a4d --- /dev/null +++ b/src/batched/dense/KokkosBatched_Axpy.hpp @@ -0,0 +1,148 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.4 +// Copyright (2021) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +#ifndef __KOKKOSBATCHED_AXPY_HPP__ +#define __KOKKOSBATCHED_AXPY_HPP__ + + +/// \author Kim Liegeois (knliege@sandia.gov) + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Vector.hpp" + +namespace KokkosBatched { + + /// \brief Serial Batched AXPY: + /// y_l <- alpha_l * x_l + y_l for all l = 1, ..., N + /// where: + /// * N is the number of vectors, + /// * x_1, ..., x_N are the N input vectors, + /// * y_1, ..., y_N are the N output vectors, + /// * alpha_1, ..., alpha_N are N scaling factors for x_1, ..., x_N. + /// + /// \tparam ViewType: Input type for X and Y, needs to be a 2D view + /// \tparam alphaViewType: Input type for alpha, needs to be a 1D view + /// + /// \param alpha [in]: input coefficient for X, a rank 1 view + /// \param X [in]: Input vector X, a rank 2 view + /// \param Y [in/out]: Output vector Y, a rank 2 view + /// + /// No nested parallel_for is used inside of the function. 
+ /// + + struct SerialAxpy { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const alphaViewType &alpha, + const ViewType &X, + const ViewType &Y); + }; + + /// \brief Team Batched AXPY: + /// y_l <- alpha_l * x_l + y_l for all l = 1, ..., N + /// where: + /// * N is the number of vectors, + /// * x_1, ..., x_N are the N input vectors, + /// * y_1, ..., y_N are the N output vectors, + /// * alpha_1, ..., alpha_N are N scaling factors for x_1, ..., x_N. + /// + /// \tparam ViewType: Input type for X and Y, needs to be a 2D view + /// \tparam alphaViewType: Input type for alpha, needs to be a 1D view + /// + /// \param member [in]: TeamPolicy member + /// \param alpha [in]: input coefficient for X, a rank 1 view + /// \param X [in]: Input vector X, a rank 2 view + /// \param Y [in/out]: Output vector Y, a rank 2 view + /// + /// A nested parallel_for with TeamThreadRange is used. + /// + + template + struct TeamAxpy { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const MemberType &member, + const alphaViewType &alpha, + const ViewType &X, + const ViewType &Y); + }; + + /// \brief TeamVector Batched AXPY: + /// y_l <- alpha_l * x_l + y_l for all l = 1, ..., N + /// where: + /// * N is the number of vectors, + /// * x_1, ..., x_N are the N input vectors, + /// * y_1, ..., y_N are the N output vectors, + /// * alpha_1, ..., alpha_N are N scaling factors for x_1, ..., x_N. + /// + /// \tparam ViewType: Input type for X and Y, needs to be a 2D view + /// \tparam alphaViewType: Input type for alpha, needs to be a 1D view + /// + /// \param member [in]: TeamPolicy member + /// \param alpha [in]: input coefficient for X, a rank 1 view + /// \param X [in]: Input vector X, a rank 2 view + /// \param Y [in/out]: Output vector Y, a rank 2 view + /// + /// Two nested parallel_for with both TeamThreadRange and ThreadVectorRange + /// (or one with TeamVectorRange) are used inside. 
+ /// + + template + struct TeamVectorAxpy { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const MemberType &member, + const alphaViewType &alpha, + const ViewType &X, + const ViewType &Y); + }; + +} + +#include "KokkosBatched_Axpy_Impl.hpp" + +#endif diff --git a/src/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp new file mode 100644 index 0000000000..7ba4746983 --- /dev/null +++ b/src/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp @@ -0,0 +1,382 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.4 +// Copyright (2021) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +#ifndef __KOKKOSBATCHED_AXPY_IMPL_HPP__ +#define __KOKKOSBATCHED_AXPY_IMPL_HPP__ + + +/// \author Kim Liegeois (knliege@sandia.gov) + +#include "KokkosBatched_Util.hpp" + +namespace KokkosBatched { + + /// + /// Serial Internal Impl + /// ==================== + struct SerialAxpyInternal { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const int m, + const ScalarType alpha, + const ValueType* KOKKOS_RESTRICT X, const int xs0, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { + +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int i=0;i + KOKKOS_INLINE_FUNCTION + static int + invoke(const int m, + const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, + const ValueType* KOKKOS_RESTRICT X, const int xs0, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { + +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int i=0;i + KOKKOS_INLINE_FUNCTION + static int + invoke(const int m, const int n, + const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, + const ValueType* KOKKOS_RESTRICT X, const int xs0, const int xs1, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1) { + + if (xs0 > xs1) + for (int i=0;i + KOKKOS_INLINE_FUNCTION + static int + invoke(const MemberType &member, + const int m, + const 
ScalarType alpha, + const ValueType* KOKKOS_RESTRICT X, const int xs0, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { + + Kokkos::parallel_for + (Kokkos::TeamThreadRange(member,m), + [&](const int &i) { + Y[i*ys0] += alpha*X[i*xs0]; + }); + //member.team_barrier(); + return 0; + } + + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const MemberType &member, + const int m, + const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, + const ValueType* KOKKOS_RESTRICT X, const int xs0, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { + + Kokkos::parallel_for + (Kokkos::TeamThreadRange(member,m), + [&](const int &i) { + Y[i*ys0] += alpha[i*alphas0]*X[i*xs0]; + }); + //member.team_barrier(); + return 0; + } + + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const MemberType &member, + const int m, const int n, + const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, + const ValueType* KOKKOS_RESTRICT X, const int xs0, const int xs1, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1) { + if (m > n) { + Kokkos::parallel_for + (Kokkos::TeamThreadRange(member,m), + [&](const int &i) { + SerialAxpyInternal::invoke(n, alpha[i*alphas0], X+i*xs0, xs1, Y+i*ys0, ys1); + }); + } else { + Kokkos::parallel_for + (Kokkos::TeamThreadRange(member,n), + [&](const int &j) { + SerialAxpyInternal::invoke(m, alpha, alphas0, X+j*xs1, xs0, Y+j*ys1, ys0); + }); + } + //member.team_barrier(); + return 0; + } + }; + + /// + /// TeamVector Internal Impl + /// ======================== + struct TeamVectorAxpyInternal { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const MemberType &member, + const int m, + const ScalarType alpha, + const ValueType* KOKKOS_RESTRICT X, const int xs0, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { + + Kokkos::parallel_for + (Kokkos::TeamVectorRange(member,m), + [&](const int &i) { + Y[i*ys0] += alpha*X[i*xs0]; + }); + //member.team_barrier(); + return 0; + } + + template + 
KOKKOS_INLINE_FUNCTION + static int + invoke(const MemberType &member, + const int m, + const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, + const ValueType* KOKKOS_RESTRICT X, const int xs0, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { + + Kokkos::parallel_for + (Kokkos::TeamVectorRange(member,m), + [&](const int &i) { + Y[i*ys0] += alpha[i*alphas0]*X[i*xs0]; + }); + //member.team_barrier(); + return 0; + } + + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const MemberType &member, + const int m, const int n, + const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, + const ValueType* KOKKOS_RESTRICT X, const int xs0, const int xs1, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1) { + Kokkos::parallel_for( + Kokkos::TeamVectorRange(member, 0, m * n), + [&](const int& iTemp) { + int i, j; + getIndices(iTemp, n, m, j, i); + Y[i*ys0+j*ys1] += alpha[i*alphas0] * X[i*xs0+j*xs1]; + }); + //member.team_barrier(); + return 0; + } + }; + + /// + /// Serial Impl + /// =========== + template + KOKKOS_INLINE_FUNCTION + int + SerialAxpy:: + invoke(const alphaViewType &alpha, + const ViewType &X, + const ViewType &Y) { + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert (Kokkos::Impl::is_view::value, "KokkosBatched::axpy: ViewType is not a Kokkos::View."); + static_assert (Kokkos::Impl::is_view::value, "KokkosBatched::axpy: alphaViewType is not a Kokkos::View."); + static_assert (ViewType::Rank == 2, "KokkosBatched::axpy: ViewType must have rank 2."); + static_assert (alphaViewType::Rank == 1, "KokkosBatched::axpy: alphaViewType must have rank 1."); + + // Check compatibility of dimensions at run time. 
+ if (X.extent(0) != Y.extent(0) || + X.extent(1) != Y.extent(1)) { + printf("KokkosBatched::axpy: Dimensions of X and Y do not match: X: %d x %d, Y: %d x %d\n", (int) X.extent(0), (int) X.extent(1), (int) Y.extent(0), (int) Y.extent(1)); + return 1; + } + if (X.extent(0) != alpha.extent(0)) { + printf("KokkosBatched::axpy: First dimension of X and alpha do not match: X: %d x %d, alpha: %d\n", (int) X.extent(0), (int) X.extent(1), (int) alpha.extent(0)); + return 1; + } +#endif + + return SerialAxpyInternal::template + invoke + (X.extent(0), X.extent(1), + alpha.data(), alpha.stride_0(), + X.data(), X.stride_0(), X.stride_1(), + Y.data(), Y.stride_0(), Y.stride_1()); + } + + /// + /// Team Impl + /// ========= + + template + template + KOKKOS_INLINE_FUNCTION + int + TeamAxpy:: + invoke(const MemberType &member, + const alphaViewType &alpha, + const ViewType &X, + const ViewType &Y) { + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert (Kokkos::Impl::is_view::value, "KokkosBatched::axpy: ViewType is not a Kokkos::View."); + static_assert (Kokkos::Impl::is_view::value, "KokkosBatched::axpy: alphaViewType is not a Kokkos::View."); + static_assert (ViewType::Rank == 2, "KokkosBatched::axpy: ViewType must have rank 2."); + static_assert (alphaViewType::Rank == 1, "KokkosBatched::axpy: alphaViewType must have rank 1."); + + // Check compatibility of dimensions at run time. 
+ if (X.extent(0) != Y.extent(0) || + X.extent(1) != Y.extent(1)) { + printf("KokkosBatched::axpy: Dimensions of X and Y do not match: X: %d x %d, Y: %d x %d\n", (int) X.extent(0), (int) X.extent(1), (int) Y.extent(0), (int) Y.extent(1)); + return 1; + } + if (X.extent(0) != alpha.extent(0)) { + printf("KokkosBatched::axpy: First dimension of X and alpha do not match: X: %d x %d, alpha: %d\n", (int) X.extent(0), (int) X.extent(1), (int) alpha.extent(0)); + return 1; + } +#endif + + return TeamAxpyInternal::template + invoke + (member, + X.extent(0), X.extent(1), + alpha.data(), alpha.stride_0(), + X.data(), X.stride_0(), X.stride_1(), + Y.data(), Y.stride_0(), Y.stride_1()); + } + + /// + /// TeamVector Impl + /// =============== + + template + template + KOKKOS_INLINE_FUNCTION + int + TeamVectorAxpy:: + invoke(const MemberType &member, + const alphaViewType &alpha, + const ViewType &X, + const ViewType &Y) { + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert (Kokkos::Impl::is_view::value, "KokkosBatched::axpy: ViewType is not a Kokkos::View."); + static_assert (Kokkos::Impl::is_view::value, "KokkosBatched::axpy: alphaViewType is not a Kokkos::View."); + static_assert (ViewType::Rank == 2, "KokkosBatched::axpy: ViewType must have rank 2."); + static_assert (alphaViewType::Rank == 1, "KokkosBatched::axpy: alphaViewType must have rank 1."); + + // Check compatibility of dimensions at run time. 
+ if (X.extent(0) != Y.extent(0) || + X.extent(1) != Y.extent(1)) { + printf("KokkosBatched::axpy: Dimensions of X and Y do not match: X: %d x %d, Y: %d x %d\n", (int) X.extent(0), (int) X.extent(1), (int) Y.extent(0), (int) Y.extent(1)); + return 1; + } + if (X.extent(0) != alpha.extent(0)) { + printf("KokkosBatched::axpy: First dimension of X and alpha do not match: X: %d x %d, alpha: %d\n", (int) X.extent(0), (int) X.extent(1), (int) alpha.extent(0)); + return 1; + } +#endif + + return TeamVectorAxpyInternal:: + invoke + (member, + X.extent(0), X.extent(1), + alpha.data(), alpha.stride_0(), + X.data(), X.stride_0(), X.stride_1(), + Y.data(), Y.stride_0(), Y.stride_1()); + } + +} + + +#endif diff --git a/src/batched/sparse/KokkosBatched_Spmv.hpp b/src/batched/sparse/KokkosBatched_Spmv.hpp new file mode 100644 index 0000000000..57aa732c98 --- /dev/null +++ b/src/batched/sparse/KokkosBatched_Spmv.hpp @@ -0,0 +1,296 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.4 +// Copyright (2021) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +#ifndef __KOKKOSBATCHED_SPMV_HPP__ +#define __KOKKOSBATCHED_SPMV_HPP__ + + +/// \author Kim Liegeois (knliege@sandia.gov) + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Vector.hpp" + +namespace KokkosBatched { + + /// \brief Serial Batched SPMV: + /// y_l <- alpha_l * A_l * x_l + beta_l * y_l for all l = 1, ..., N + /// where: + /// * N is the number of matrices, + /// * A_1, ..., A_N are N sparse matrices which share the same sparsity pattern, + /// * x_1, ..., x_N are the N input vectors, + /// * y_1, ..., y_N are the N output vectors, + /// * alpha_1, ..., alpha_N are N scaling factors for x_1, ..., x_N, + /// * beta_1, ..., beta_N are N scaling factors for y_1, ..., y_N. 
+ /// + /// \tparam ValuesViewType: Input type for the values of the batched crs matrix, needs to be a 2D view + /// \tparam IntView: Input type for row offset array and column-index array, needs to be a 1D view + /// \tparam xViewType: Input type for X, needs to be a 2D view + /// \tparam yViewType: Input type for Y, needs to be a 2D view + /// \tparam alphaViewType: Input type for alpha, needs to be a 1D view + /// \tparam betaViewType: Input type for beta, needs to be a 1D view + /// \tparam dobeta: Int which specifies if beta_l * y_l is used or not (if dobeta == 0, beta_l * y_l is not added to the result of alpha_l * A_l * x_l) + /// + /// \param alpha [in]: input coefficient for X, a rank 1 view + /// \param values [in]: values of the batched crs matrix, a rank 2 view + /// \param row_ptr [in]: row offset array of the batched crs matrix, a rank 1 view + /// \param colIndices [in]: column-index array of the batched crs matrix, a rank 1 view + /// \param X [in]: Input vector X, a rank 2 view + /// \param beta [in]: input coefficient for Y (if dobeta != 0), a rank 1 view + /// \param Y [in/out]: Output vector Y, a rank 2 view + /// + /// The matrices are represented using a Compressed Row Storage (CRS) format and + /// the shared sparsity pattern is reused from one matrix to the others. + /// + /// Concretely, instead of providing an array of N matrices to the batched SPMV kernel, + /// the user provides one row offset array (1D view), one column-index array (1D view), + /// and one value array (2D view, one dimension for the non-zero indices and one for the + /// matrix indices). + /// + /// No nested parallel_for is used inside of the function. 
+ /// + + template + struct SerialSpmv { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const alphaViewType &alpha, + const ValuesViewType &values, + const IntView &row_ptr, + const IntView &colIndices, + const xViewType &X, + const betaViewType &beta, + const yViewType &Y); + }; + + /// \brief Team Batched SPMV: + /// y_l <- alpha_l * A_l * x_l + beta_l * y_l for all l = 1, ..., N + /// where: + /// * N is the number of matrices, + /// * A_1, ..., A_N are N sparse matrices which share the same sparsity pattern, + /// * x_1, ..., x_N are the N input vectors, + /// * y_1, ..., y_N are the N output vectors, + /// * alpha_1, ..., alpha_N are N scaling factors for x_1, ..., x_N, + /// * beta_1, ..., beta_N are N scaling factors for y_1, ..., y_N. + /// + /// \tparam ValuesViewType: Input type for the values of the batched crs matrix, needs to be a 2D view + /// \tparam IntView: Input type for row offset array and column-index array, needs to be a 1D view + /// \tparam xViewType: Input type for X, needs to be a 2D view + /// \tparam yViewType: Input type for Y, needs to be a 2D view + /// \tparam alphaViewType: Input type for alpha, needs to be a 1D view + /// \tparam betaViewType: Input type for beta, needs to be a 1D view + /// \tparam dobeta: Int which specifies if beta_l * y_l is used or not (if dobeta == 0, beta_l * y_l is not added to the result of alpha_l * A_l * x_l) + /// + /// \param member [in]: TeamPolicy member + /// \param alpha [in]: input coefficient for X, a rank 1 view + /// \param values [in]: values of the batched crs matrix, a rank 2 view + /// \param row_ptr [in]: row offset array of the batched crs matrix, a rank 1 view + /// \param colIndices [in]: column-index array of the batched crs matrix, a rank 1 view + /// \param X [in]: Input vector X, a rank 2 view + /// \param beta [in]: input coefficient for Y (if dobeta != 0), a rank 1 view + /// \param Y [in/out]: Output vector Y, a rank 2 view + /// + /// The matrices are represented 
using a Compressed Row Storage (CRS) format and + /// the shared sparsity pattern is reused from one matrix to the others. + /// + /// Concretely, instead of providing an array of N matrices to the batched SPMV kernel, + /// the user provides one row offset array (1D view), one column-index array (1D view), + /// and one value array (2D view, one dimension for the non-zero indices and one for the + /// matrix indices). + /// + /// A nested parallel_for with TeamThreadRange is used. + /// + + template + struct TeamSpmv { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const MemberType &member, + const alphaViewType &alpha, + const ValuesViewType &values, + const IntView &row_ptr, + const IntView &colIndices, + const xViewType &x, + const betaViewType &beta, + const yViewType &y); + }; + + /// \brief TeamVector Batched SPMV: + /// y_l <- alpha_l * A_l * x_l + beta_l * y_l for all l = 1, ..., N + /// where: + /// * N is the number of matrices, + /// * A_1, ..., A_N are N sparse matrices which share the same sparsity pattern, + /// * x_1, ..., x_N are the N input vectors, + /// * y_1, ..., y_N are the N output vectors, + /// * alpha_1, ..., alpha_N are N scaling factors for x_1, ..., x_N, + /// * beta_1, ..., beta_N are N scaling factors for y_1, ..., y_N. 
+ /// + /// \tparam ValuesViewType: Input type for the values of the batched crs matrix, needs to be a 2D view + /// \tparam IntView: Input type for row offset array and column-index array, needs to be a 1D view + /// \tparam xViewType: Input type for X, needs to be a 2D view + /// \tparam yViewType: Input type for Y, needs to be a 2D view + /// \tparam alphaViewType: Input type for alpha, needs to be a 1D view + /// \tparam betaViewType: Input type for beta, needs to be a 1D view + /// \tparam dobeta: Int which specifies if beta_l * y_l is used or not (if dobeta == 0, beta_l * y_l is not added to the result of alpha_l * A_l * x_l) + /// + /// \param member [in]: TeamPolicy member + /// \param alpha [in]: input coefficient for X, a rank 1 view + /// \param values [in]: values of the batched crs matrix, a rank 2 view + /// \param row_ptr [in]: row offset array of the batched crs matrix, a rank 1 view + /// \param colIndices [in]: column-index array of the batched crs matrix, a rank 1 view + /// \param x [in]: Input vector X, a rank 2 view + /// \param beta [in]: input coefficient for Y (if dobeta != 0), a rank 1 view + /// \param y [in/out]: Output vector Y, a rank 2 view + /// + /// The matrices are represented using a Compressed Row Storage (CRS) format and + /// the shared sparsity pattern is reused from one matrix to the others. + /// + /// Concretely, instead of providing an array of N matrices to the batched SPMV kernel, + /// the user provides one row offset array (1D view), one column-index array (1D view), + /// and one value array (2D view, one dimension for the non-zero indices and one for the + /// matrix indices). + /// + /// Two nested parallel_for with both TeamThreadRange and ThreadVectorRange + /// (or one with TeamVectorRange) are used inside.
+ /// + + template + struct TeamVectorSpmv { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const MemberType &member, + const alphaViewType &alpha, + const ValuesViewType &values, + const IntView &row_ptr, + const IntView &colIndices, + const xViewType &x, + const betaViewType &beta, + const yViewType &y); + }; + + /// \brief Batched SPMV: Selective Interface + /// y_l <- alpha_l * A_l * x_l + beta_l * y_l for all l = 1, ..., N + /// where: + /// * N is the number of matrices, + /// * A_1, ..., A_N are N sparse matrices which share the same sparsity pattern, + /// * x_1, ..., x_N are the N input vectors, + /// * y_1, ..., y_N are the N output vectors, + /// * alpha_1, ..., alpha_N are N scaling factors for x_1, ..., x_N, + /// * beta_1, ..., beta_N are N scaling factors for y_1, ..., y_N. + /// + /// \tparam ValuesViewType: Input type for the values of the batched crs matrix, needs to be a 2D view + /// \tparam IntView: Input type for row offset array and column-index array, needs to be a 1D view + /// \tparam xViewType: Input type for X, needs to be a 2D view + /// \tparam yViewType: Input type for Y, needs to be a 2D view + /// \tparam alphaViewType: Input type for alpha, needs to be a 1D view + /// \tparam betaViewType: Input type for beta, needs to be a 1D view + /// \tparam dobeta: Int which specifies if beta_l * y_l is used or not (if dobeta == 0, beta_l * y_l is not added to the result of alpha_l * A_l * x_l) + /// + /// \param member [in]: TeamPolicy member + /// \param alpha [in]: input coefficient for X, a rank 1 view + /// \param values [in]: values of the batched crs matrix, a rank 2 view + /// \param row_ptr [in]: row offset array of the batched crs matrix, a rank 1 view + /// \param colIndices [in]: column-index array of the batched crs matrix, a rank 1 view + /// \param x [in]: Input vector X, a rank 2 view + /// \param beta [in]: input coefficient for Y (if dobeta != 0), a rank 1 view + /// \param y [in/out]: Output vector Y, a rank 2
view + + template + struct Spmv { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const MemberType &member, + const alphaViewType &alpha, + const ValuesViewType &values, + const IntView &row_ptr, + const IntView &colIndices, + const xViewType &x, + const betaViewType &beta, + const yViewType &y) { + int r_val = 0; + if (std::is_same::value) { + r_val = SerialSpmv::template invoke(alpha, values, row_ptr, colIndices, x, beta, y); + } else if (std::is_same::value) { + r_val = TeamSpmv::template invoke(member, alpha, values, row_ptr, colIndices, x, beta, y); + } else if (std::is_same::value) { + r_val = TeamVectorSpmv::template invoke(member, alpha, values, row_ptr, colIndices, x, beta, y); + } + return r_val; + } + }; + +} + +#include "KokkosBatched_Spmv_Serial_Impl.hpp" +#include "KokkosBatched_Spmv_Team_Impl.hpp" +#include "KokkosBatched_Spmv_TeamVector_Impl.hpp" +#endif diff --git a/src/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp new file mode 100644 index 0000000000..31144a0340 --- /dev/null +++ b/src/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp @@ -0,0 +1,183 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.4 +// Copyright (2021) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +#ifndef __KOKKOSBATCHED_SPMV_SERIAL_IMPL_HPP__ +#define __KOKKOSBATCHED_SPMV_SERIAL_IMPL_HPP__ + + +/// \author Kim Liegeois (knliege@sandia.gov) + +#include "KokkosBatched_Util.hpp" + +namespace KokkosBatched { + + /// + /// Serial Internal Impl + /// ==================== + struct SerialSpmvInternal { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const OrdinalType numMatrices, const OrdinalType numRows, + const ScalarType* KOKKOS_RESTRICT alpha, const OrdinalType alphas0, + const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, const OrdinalType valuess1, + const OrdinalType* KOKKOS_RESTRICT row_ptr, const OrdinalType row_ptrs0, + const OrdinalType* KOKKOS_RESTRICT colIndices, const OrdinalType colIndicess0, + const ValueType* KOKKOS_RESTRICT X, const OrdinalType xs0, const OrdinalType xs1, + const ScalarType* KOKKOS_RESTRICT beta, const OrdinalType betas0, + /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, const OrdinalType ys1) { + + for (OrdinalType iMatrix = 0; iMatrix < numMatrices; ++iMatrix) { + for (OrdinalType iRow = 0; iRow < numRows; ++iRow) { + const OrdinalType rowLength = + row_ptr[(iRow+1)*row_ptrs0] - row_ptr[iRow*row_ptrs0]; + ValueType sum = 0; + #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll + #endif + for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { + sum += values[iMatrix*valuess0+(row_ptr[iRow*row_ptrs0]+iEntry)*valuess1] + * X[iMatrix*xs0+colIndices[(row_ptr[iRow*row_ptrs0]+iEntry)*colIndicess0]*xs1]; + } + + sum *= alpha[iMatrix*alphas0]; + + if (dobeta == 0) { + Y[iMatrix*ys0+iRow*ys1] = sum; + } else { + Y[iMatrix*ys0+iRow*ys1] = + beta[iMatrix*betas0] * Y[iMatrix*ys0+iRow*ys1] + sum; + } + } + } + + return 0; + } + }; + + template<> + struct SerialSpmv { + + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const alphaViewType &alpha,
const ValuesViewType &values, + const IntView &row_ptr, + const IntView &colIndices, + const xViewType &X, + const betaViewType &beta, + const yViewType &Y) { + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert (Kokkos::Impl::is_view::value, "KokkosBatched::spmv: ValuesViewType is not a Kokkos::View."); + static_assert (Kokkos::Impl::is_view::value, "KokkosBatched::spmv: IntView is not a Kokkos::View."); + static_assert (Kokkos::Impl::is_view::value, "KokkosBatched::spmv: xViewType is not a Kokkos::View."); + static_assert (Kokkos::Impl::is_view::value, "KokkosBatched::spmv: yViewType is not a Kokkos::View."); + static_assert (Kokkos::Impl::is_view::value, "KokkosBatched::spmv: alphaViewType is not a Kokkos::View."); + static_assert (Kokkos::Impl::is_view::value, "KokkosBatched::spmv: betaViewType is not a Kokkos::View."); + + static_assert (ValuesViewType::Rank == 2, "KokkosBatched::spmv: ValuesViewType must have rank 2."); + static_assert (IntView::Rank == 1, "KokkosBatched::spmv: IntView must have rank 1."); + static_assert (xViewType::Rank == 2, "KokkosBatched::spmv: xViewType must have rank 2."); + static_assert (yViewType::Rank == 2, "KokkosBatched::spmv: yViewType must have rank 2."); + static_assert (alphaViewType::Rank == 1, "KokkosBatched::spmv: alphaViewType must have rank 1."); + static_assert (betaViewType::Rank == 1, "KokkosBatched::spmv: betaViewType must have rank 1."); + + // Check compatibility of dimensions at run time; each failed check prints a diagnostic and returns 1.
+ if (X.extent(0) != Y.extent(0) || + X.extent(1) != Y.extent(1)) { + printf("KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x %d, Y: %d x %d\n", (int) X.extent(0), (int) X.extent(1), (int) Y.extent(0), (int) Y.extent(1)); + return 1; + } + if (X.extent(0) != alpha.extent(0)) { + printf("KokkosBatched::spmv: First dimension of X and alpha do not match: X: %d x %d, alpha: %d\n", (int) X.extent(0), (int) X.extent(1), (int) alpha.extent(0)); + return 1; + } + if (X.extent(0) != beta.extent(0)) { + printf("KokkosBatched::spmv: First dimension of X and beta do not match: X: %d x %d, beta: %d\n", (int) X.extent(0), (int) X.extent(1), (int) beta.extent(0)); + return 1; + } + if (X.extent(0) != values.extent(0)) { + printf("KokkosBatched::spmv: First dimension of X and the first dimension of values do not match: X: %d x %d, values: %d x %d\n", (int) X.extent(0), (int) X.extent(1), (int) values.extent(0), (int) values.extent(1)); + return 1; + } + if (colIndices.extent(0) != values.extent(1)) { + printf("KokkosBatched::spmv: Dimension of colIndices and the second dimension of values do not match: colIndices: %d , values: %d x %d\n", (int) colIndices.extent(0), (int) values.extent(0), (int) values.extent(1)); + return 1; + } + if (row_ptr.extent(0) - 1 != X.extent(1)) { + printf("KokkosBatched::spmv: Dimension of row_ptr and the second dimension of X do not match: row_ptr (-1): %d , X: %d x %d\n", (int) row_ptr.extent(0) - 1, (int) X.extent(0), (int) X.extent(1)); + return 1; + } +#endif + + return SerialSpmvInternal::template + invoke + (X.extent(0), X.extent(1), + alpha.data(), alpha.stride_0(), + values.data(), values.stride_0(), values.stride_1(), + row_ptr.data(), row_ptr.stride_0(), + colIndices.data(), colIndices.stride_0(), + X.data(), X.stride_0(), X.stride_1(), + beta.data(), beta.stride_0(), + Y.data(), Y.stride_0(), Y.stride_1()); + } + }; + +} + + +#endif diff --git a/src/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp
b/src/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp new file mode 100644 index 0000000000..4d3508a46c --- /dev/null +++ b/src/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp @@ -0,0 +1,210 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.4 +// Copyright (2021) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +#ifndef __KOKKOSBATCHED_SPMV_TEAMVECTOR_IMPL_HPP__ +#define __KOKKOSBATCHED_SPMV_TEAMVECTOR_IMPL_HPP__ + + +/// \author Kim Liegeois (knliege@sandia.gov) + +#include "KokkosBatched_Util.hpp" + +namespace KokkosBatched { + + /// + /// TeamVector Internal Impl + /// ==================== + struct TeamVectorSpmvInternal { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const MemberType &member, + const OrdinalType numMatrices, const OrdinalType numRows, + const ScalarType* KOKKOS_RESTRICT alpha, const OrdinalType alphas0, + const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, const OrdinalType valuess1, + const OrdinalType* KOKKOS_RESTRICT row_ptr, const OrdinalType row_ptrs0, + const OrdinalType* KOKKOS_RESTRICT colIndices, const OrdinalType colIndicess0, + const ValueType* KOKKOS_RESTRICT X, const OrdinalType xs0, const OrdinalType xs1, + const ScalarType* KOKKOS_RESTRICT beta, const OrdinalType betas0, + /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, const OrdinalType ys1); + }; + + template + KOKKOS_INLINE_FUNCTION + int + TeamVectorSpmvInternal:: + invoke(const MemberType &member, + const OrdinalType numMatrices, const OrdinalType numRows, + const ScalarType* KOKKOS_RESTRICT alpha, const OrdinalType alphas0, + const ValueType* KOKKOS_RESTRICT values, 
const OrdinalType valuess0, const OrdinalType valuess1, + const OrdinalType* KOKKOS_RESTRICT row_ptr, const OrdinalType row_ptrs0, + const OrdinalType* KOKKOS_RESTRICT colIndices, const OrdinalType colIndicess0, + const ValueType* KOKKOS_RESTRICT X, const OrdinalType xs0, const OrdinalType xs1, + const ScalarType* KOKKOS_RESTRICT beta, const OrdinalType betas0, + /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, const OrdinalType ys1) { + + + Kokkos::parallel_for( + Kokkos::TeamVectorRange(member, 0, numMatrices * numRows), + [&](const OrdinalType& iTemp) { + OrdinalType iRow, iMatrix; + getIndices(iTemp, numRows, numMatrices, iRow, iMatrix); + + const OrdinalType rowLength = + row_ptr[(iRow+1)*row_ptrs0] - row_ptr[iRow*row_ptrs0]; + ValueType sum = 0; +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { + sum += values[iMatrix*valuess0+(row_ptr[iRow*row_ptrs0]+iEntry)*valuess1] + * X[iMatrix*xs0+colIndices[(row_ptr[iRow*row_ptrs0]+iEntry)*colIndicess0]*xs1]; + } + + sum *= alpha[iMatrix*alphas0]; + + if (dobeta == 0) { + Y[iMatrix*ys0+iRow*ys1] = sum; + } else { + Y[iMatrix*ys0+iRow*ys1] = + beta[iMatrix*betas0] * Y[iMatrix*ys0+iRow*ys1] + sum; + } + }); + + return 0; + } + + template + struct TeamVectorSpmv { + + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const MemberType &member, + const alphaViewType &alpha, + const ValuesViewType &values, + const IntView &row_ptr, + const IntView &colIndices, + const xViewType &X, + const betaViewType &beta, + const yViewType &Y) { + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert (Kokkos::Impl::is_view::value, "KokkosBatched::spmv: ValuesViewType is not a Kokkos::View."); + static_assert (Kokkos::Impl::is_view::value, "KokkosBatched::spmv: IntView is not a Kokkos::View."); + static_assert (Kokkos::Impl::is_view::value, "KokkosBatched::spmv: xViewType is not a Kokkos::View."); + static_assert (Kokkos::Impl::is_view::value,
"KokkosBatched::spmv: yViewType is not a Kokkos::View."); + static_assert (Kokkos::Impl::is_view::value, "KokkosBatched::spmv: alphaViewType is not a Kokkos::View."); + static_assert (Kokkos::Impl::is_view::value, "KokkosBatched::spmv: betaViewType is not a Kokkos::View."); + + static_assert (ValuesViewType::Rank == 2, "KokkosBatched::spmv: ValuesViewType must have rank 2."); + static_assert (IntView::Rank == 1, "KokkosBatched::spmv: IntView must have rank 1."); + static_assert (xViewType::Rank == 2, "KokkosBatched::spmv: xViewType must have rank 2."); + static_assert (yViewType::Rank == 2, "KokkosBatched::spmv: yViewType must have rank 2."); + static_assert (alphaViewType::Rank == 1, "KokkosBatched::spmv: alphaViewType must have rank 1."); + static_assert (betaViewType::Rank == 1, "KokkosBatched::spmv: betaViewType must have rank 1."); + + // Check compatibility of dimensions at run time; each failed check prints a diagnostic and returns 1. + if (X.extent(0) != Y.extent(0) || + X.extent(1) != Y.extent(1)) { + printf("KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x %d, Y: %d x %d\n", (int) X.extent(0), (int) X.extent(1), (int) Y.extent(0), (int) Y.extent(1)); + return 1; + } + if (X.extent(0) != alpha.extent(0)) { + printf("KokkosBatched::spmv: First dimension of X and alpha do not match: X: %d x %d, alpha: %d\n", (int) X.extent(0), (int) X.extent(1), (int) alpha.extent(0)); + return 1; + } + if (X.extent(0) != beta.extent(0)) { + printf("KokkosBatched::spmv: First dimension of X and beta do not match: X: %d x %d, beta: %d\n", (int) X.extent(0), (int) X.extent(1), (int) beta.extent(0)); + return 1; + } + if (X.extent(0) != values.extent(0)) { + printf("KokkosBatched::spmv: First dimension of X and the first dimension of values do not match: X: %d x %d, values: %d x %d\n", (int) X.extent(0), (int) X.extent(1), (int) values.extent(0), (int) values.extent(1)); + return 1; + } + if (colIndices.extent(0) != values.extent(1)) { + printf("KokkosBatched::spmv: Dimension of colIndices and the second dimension
of values do not match: colIndices: %d , values: %d x %d\n", (int) colIndices.extent(0), (int) values.extent(0), (int) values.extent(1)); + return 1; + } + if (row_ptr.extent(0) - 1 != X.extent(1)) { + printf("KokkosBatched::spmv: Dimension of row_ptr and the second dimension of X do not match: row_ptr (-1): %d , X: %d x %d\n", (int) row_ptr.extent(0) - 1, (int) X.extent(0), (int) X.extent(1)); + return 1; + } +#endif + + return TeamVectorSpmvInternal::template + invoke + (member, + X.extent(0), X.extent(1), + alpha.data(), alpha.stride_0(), + values.data(), values.stride_0(), values.stride_1(), + row_ptr.data(), row_ptr.stride_0(), + colIndices.data(), colIndices.stride_0(), + X.data(), X.stride_0(), X.stride_1(), + beta.data(), beta.stride_0(), + Y.data(), Y.stride_0(), Y.stride_1()); + } + }; + +} + +#endif diff --git a/src/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp new file mode 100644 index 0000000000..7fe6aa8072 --- /dev/null +++ b/src/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp @@ -0,0 +1,210 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.4 +// Copyright (2021) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3.
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +#ifndef __KOKKOSBATCHED_SPMV_TEAM_IMPL_HPP__ +#define __KOKKOSBATCHED_SPMV_TEAM_IMPL_HPP__ + + +/// \author Kim Liegeois (knliege@sandia.gov) + +#include "KokkosBatched_Util.hpp" + +namespace KokkosBatched { + + /// + /// Team Internal Impl + /// ==================== + struct TeamSpmvInternal { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const MemberType &member, + const OrdinalType numMatrices, const OrdinalType numRows, + const ScalarType* KOKKOS_RESTRICT alpha, const OrdinalType alphas0, + const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, const OrdinalType valuess1, + const OrdinalType* KOKKOS_RESTRICT row_ptr, const OrdinalType row_ptrs0, + const OrdinalType* KOKKOS_RESTRICT colIndices, const OrdinalType colIndicess0, + const ValueType* KOKKOS_RESTRICT X, const OrdinalType xs0, const OrdinalType xs1, + const ScalarType* KOKKOS_RESTRICT 
beta, const OrdinalType betas0, + /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, const OrdinalType ys1); + }; + + template + KOKKOS_INLINE_FUNCTION + int + TeamSpmvInternal:: + invoke(const MemberType &member, + const OrdinalType numMatrices, const OrdinalType numRows, + const ScalarType* KOKKOS_RESTRICT alpha, const OrdinalType alphas0, + const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, const OrdinalType valuess1, + const OrdinalType* KOKKOS_RESTRICT row_ptr, const OrdinalType row_ptrs0, + const OrdinalType* KOKKOS_RESTRICT colIndices, const OrdinalType colIndicess0, + const ValueType* KOKKOS_RESTRICT X, const OrdinalType xs0, const OrdinalType xs1, + const ScalarType* KOKKOS_RESTRICT beta, const OrdinalType betas0, + /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, const OrdinalType ys1) { + + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, 0, numMatrices * numRows), + [&](const OrdinalType& iTemp) { + OrdinalType iRow, iMatrix; + getIndices(iTemp, numRows, numMatrices, iRow, iMatrix); + + const OrdinalType rowLength = + row_ptr[(iRow+1)*row_ptrs0] - row_ptr[iRow*row_ptrs0]; + ValueType sum = 0; +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { + sum += values[iMatrix*valuess0+(row_ptr[iRow*row_ptrs0]+iEntry)*valuess1] + * X[iMatrix*xs0+colIndices[(row_ptr[iRow*row_ptrs0]+iEntry)*colIndicess0]*xs1]; + } + + sum *= alpha[iMatrix*alphas0]; + + if (dobeta == 0) { + Y[iMatrix*ys0+iRow*ys1] = sum; + } else { + Y[iMatrix*ys0+iRow*ys1] = + beta[iMatrix*betas0] * Y[iMatrix*ys0+iRow*ys1] + sum; + } + }); + + return 0; + } + + template + struct TeamSpmv { + + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const MemberType &member, + const alphaViewType &alpha, + const ValuesViewType &values, + const IntView &row_ptr, + const IntView &colIndices, + const xViewType &X, + const betaViewType &beta, + const yViewType &Y) { + +#if
(KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert (Kokkos::Impl::is_view::value, "KokkosBatched::spmv: ValuesViewType is not a Kokkos::View."); + static_assert (Kokkos::Impl::is_view::value, "KokkosBatched::spmv: IntView is not a Kokkos::View."); + static_assert (Kokkos::Impl::is_view::value, "KokkosBatched::spmv: xViewType is not a Kokkos::View."); + static_assert (Kokkos::Impl::is_view::value, "KokkosBatched::spmv: yViewType is not a Kokkos::View."); + static_assert (Kokkos::Impl::is_view::value, "KokkosBatched::spmv: alphaViewType is not a Kokkos::View."); + static_assert (Kokkos::Impl::is_view::value, "KokkosBatched::spmv: betaViewType is not a Kokkos::View."); + + static_assert (ValuesViewType::Rank == 2, "KokkosBatched::spmv: ValuesViewType must have rank 2."); + static_assert (IntView::Rank == 1, "KokkosBatched::spmv: IntView must have rank 1."); + static_assert (xViewType::Rank == 2, "KokkosBatched::spmv: xViewType must have rank 2."); + static_assert (yViewType::Rank == 2, "KokkosBatched::spmv: yViewType must have rank 2."); + static_assert (alphaViewType::Rank == 1, "KokkosBatched::spmv: alphaViewType must have rank 1."); + static_assert (betaViewType::Rank == 1, "KokkosBatched::spmv: betaViewType must have rank 1."); + + // Check compatibility of dimensions at run time; each failed check prints a diagnostic and returns 1.
+ if (X.extent(0) != Y.extent(0) || + X.extent(1) != Y.extent(1)) { + printf("KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x %d, Y: %d x %d\n", (int) X.extent(0), (int) X.extent(1), (int) Y.extent(0), (int) Y.extent(1)); + return 1; + } + if (X.extent(0) != alpha.extent(0)) { + printf("KokkosBatched::spmv: First dimension of X and alpha do not match: X: %d x %d, alpha: %d\n", (int) X.extent(0), (int) X.extent(1), (int) alpha.extent(0)); + return 1; + } + if (X.extent(0) != beta.extent(0)) { + printf("KokkosBatched::spmv: First dimension of X and beta do not match: X: %d x %d, beta: %d\n", (int) X.extent(0), (int) X.extent(1), (int) beta.extent(0)); + return 1; + } + if (X.extent(0) != values.extent(0)) { + printf("KokkosBatched::spmv: First dimension of X and the first dimension of values do not match: X: %d x %d, values: %d x %d\n", (int) X.extent(0), (int) X.extent(1), (int) values.extent(0), (int) values.extent(1)); + return 1; + } + if (colIndices.extent(0) != values.extent(1)) { + printf("KokkosBatched::spmv: Dimension of colIndices and the second dimension of values do not match: colIndices: %d , values: %d x %d\n", (int) colIndices.extent(0), (int) values.extent(0), (int) values.extent(1)); + return 1; + } + if (row_ptr.extent(0) - 1 != X.extent(1)) { + printf("KokkosBatched::spmv: Dimension of row_ptr and the second dimension of X do not match: row_ptr (-1): %d , X: %d x %d\n", (int) row_ptr.extent(0) - 1, (int) X.extent(0), (int) X.extent(1)); + return 1; + } +#endif + + return TeamSpmvInternal::template + invoke + (member, + X.extent(0), X.extent(1), + alpha.data(), alpha.stride_0(), + values.data(), values.stride_0(), values.stride_1(), + row_ptr.data(), row_ptr.stride_0(), + colIndices.data(), colIndices.stride_0(), + X.data(), X.stride_0(), X.stride_1(), + beta.data(), beta.stride_0(), + Y.data(), Y.stride_0(), Y.stride_1()); + } + }; + +} + +#endif diff --git a/unit_test/batched/dense/Test_Batched_Dense.hpp
b/unit_test/batched/dense/Test_Batched_Dense.hpp index 60dcda20cc..bf56daec65 100644 --- a/unit_test/batched/dense/Test_Batched_Dense.hpp +++ b/unit_test/batched/dense/Test_Batched_Dense.hpp @@ -2,6 +2,9 @@ #define TEST_BATCHED_DENSE_HPP // Serial kernels +#include "Test_Batched_SerialAxpy.hpp" +#include "Test_Batched_SerialAxpy_Real.hpp" +#include "Test_Batched_SerialAxpy_Complex.hpp" #include "Test_Batched_SerialEigendecomposition.hpp" #include "Test_Batched_SerialEigendecomposition_Real.hpp" #include "Test_Batched_SerialGemm.hpp" @@ -40,6 +43,9 @@ #include "Test_Batched_SerialSVD.hpp" // Team Kernels +#include "Test_Batched_TeamAxpy.hpp" +#include "Test_Batched_TeamAxpy_Real.hpp" +#include "Test_Batched_TeamAxpy_Complex.hpp" #include "Test_Batched_TeamGemm.hpp" #include "Test_Batched_TeamGemm_Real.hpp" #include "Test_Batched_TeamGemm_Complex.hpp" @@ -66,6 +72,9 @@ #include "Test_Batched_TeamTrsv_Complex.hpp" // TeamVector Kernels +#include "Test_Batched_TeamVectorAxpy.hpp" +#include "Test_Batched_TeamVectorAxpy_Real.hpp" +#include "Test_Batched_TeamVectorAxpy_Complex.hpp" #include "Test_Batched_TeamVectorEigendecomposition.hpp" #include "Test_Batched_TeamVectorEigendecomposition_Real.hpp" #include "Test_Batched_TeamVectorGemm.hpp" diff --git a/unit_test/batched/dense/Test_Batched_SerialAxpy.hpp b/unit_test/batched/dense/Test_Batched_SerialAxpy.hpp new file mode 100644 index 0000000000..8f58347c1d --- /dev/null +++ b/unit_test/batched/dense/Test_Batched_SerialAxpy.hpp @@ -0,0 +1,142 @@ +/// \author Kim Liegeois (knliege@sandia.gov) + +#include "gtest/gtest.h" +#include "Kokkos_Core.hpp" +#include "Kokkos_Random.hpp" + +#include "KokkosBatched_Axpy.hpp" + +#include "KokkosKernels_TestUtils.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace Axpy { + + template + struct Functor_TestBatchedSerialAxpy { + const alphaViewType _alpha; + const ViewType _X; + const ViewType _Y; + + KOKKOS_INLINE_FUNCTION + Functor_TestBatchedSerialAxpy(const 
alphaViewType &alpha, + const ViewType &X, + const ViewType &Y) + : _alpha(alpha), _X(X), _Y(Y) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int k) const { + auto alpha = Kokkos::subview(_alpha,Kokkos::make_pair(k,k+1)); + auto x = Kokkos::subview(_X,Kokkos::make_pair(k,k+1),Kokkos::ALL); + auto y = Kokkos::subview(_Y,Kokkos::make_pair(k,k+1),Kokkos::ALL); + + KokkosBatched::SerialAxpy::template invoke + (alpha, x, y); + } + + inline + void run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::SerialAxpy"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? "::ComplexFloat" : + std::is_same >::value ? "::ComplexDouble" : "::UnknownValueType" ); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion(name.c_str() ); + Kokkos::RangePolicy policy(0, _X.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + } + }; + + template + void impl_test_batched_axpy(const int N, const int BlkSize) { + typedef typename ViewType::value_type value_type; + typedef Kokkos::Details::ArithTraits ats; + + ViewType X0("x0", N, BlkSize), X1("x1", N, BlkSize), Y0("y0", N, BlkSize), Y1("y1", N, BlkSize); + + alphaViewType alpha("alpha", N); + + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(X0, random, value_type(1.0)); + Kokkos::fill_random(Y0, random, value_type(1.0)); + Kokkos::fill_random(alpha, random, value_type(1.0)); + + Kokkos::fence(); + + Kokkos::deep_copy(X1, X0); + Kokkos::deep_copy(Y1, Y0); + + /// test body + auto alpha_host = Kokkos::create_mirror_view(alpha); + auto X0_host = Kokkos::create_mirror_view(X0); + auto Y0_host = Kokkos::create_mirror_view(Y0); + + Kokkos::deep_copy(alpha_host, alpha); + Kokkos::deep_copy(X0_host, X0); + Kokkos::deep_copy(Y0_host, Y0); + + for (int l=0;l + (alpha, X1, Y1).run(); + + Kokkos::fence(); + + /// for 
comparison send it to host + auto Y1_host = Kokkos::create_mirror_view(Y1); + + Kokkos::deep_copy(Y1_host, Y1); + + /// check c0 = c1 ; this eps is about 10^-14 + typedef typename ats::mag_type mag_type; + mag_type sum(1), diff(0); + const mag_type eps = 1.0e3 * ats::epsilon(); + + for (int l=0;l +int test_batched_axpy() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View ViewType; + typedef Kokkos::View alphaViewType; + + for (int i=3;i<10;++i) { + Test::Axpy::impl_test_batched_axpy(1024, i); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View ViewType; + typedef Kokkos::View alphaViewType; + + for (int i=3;i<10;++i) { + Test::Axpy::impl_test_batched_axpy(1024, i); + } + } +#endif + + return 0; +} + diff --git a/unit_test/batched/dense/Test_Batched_SerialAxpy_Complex.hpp b/unit_test/batched/dense/Test_Batched_SerialAxpy_Complex.hpp new file mode 100644 index 0000000000..2c7f510c8a --- /dev/null +++ b/unit_test/batched/dense/Test_Batched_SerialAxpy_Complex.hpp @@ -0,0 +1,10 @@ + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F( TestCategory, batched_scalar_serial_axpy_nt_dcomplex_dcomplex ) { + test_batched_axpy,Kokkos::complex>(); +} + +TEST_F( TestCategory, batched_scalar_serial_axpy_nt_dcomplex_double ) { + test_batched_axpy,double>(); +} +#endif diff --git a/unit_test/batched/dense/Test_Batched_SerialAxpy_Real.hpp b/unit_test/batched/dense/Test_Batched_SerialAxpy_Real.hpp new file mode 100644 index 0000000000..d38caab20c --- /dev/null +++ b/unit_test/batched/dense/Test_Batched_SerialAxpy_Real.hpp @@ -0,0 +1,12 @@ + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F( TestCategory, batched_scalar_serial_axpy_nt_float_float ) { + test_batched_axpy(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F( TestCategory, batched_scalar_serial_axpy_nt_double_double ) { + test_batched_axpy(); +} +#endif diff --git a/unit_test/batched/dense/Test_Batched_TeamAxpy.hpp 
b/unit_test/batched/dense/Test_Batched_TeamAxpy.hpp new file mode 100644 index 0000000000..b717fd9c8b --- /dev/null +++ b/unit_test/batched/dense/Test_Batched_TeamAxpy.hpp @@ -0,0 +1,150 @@ +/// \author Kim Liegeois (knliege@sandia.gov) + +#include "gtest/gtest.h" +#include "Kokkos_Core.hpp" +#include "Kokkos_Random.hpp" + +#include "KokkosBatched_Axpy.hpp" + +#include "KokkosKernels_TestUtils.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace TeamAxpy { + + template + struct Functor_TestBatchedTeamAxpy { + const alphaViewType _alpha; + const ViewType _X; + const ViewType _Y; + const int _N_team; + + KOKKOS_INLINE_FUNCTION + Functor_TestBatchedTeamAxpy(const alphaViewType &alpha, + const ViewType &X, + const ViewType &Y, + const int N_team) + : _alpha(alpha), _X(X), _Y(Y), _N_team(N_team) {} + + template + KOKKOS_INLINE_FUNCTION + void operator()(const MemberType &member) const { + const int first_matrix = + static_cast(member.league_rank()) * _N_team; + const int N = _X.extent(0); + const int last_matrix = (static_cast(member.league_rank() + 1) * _N_team < N ? static_cast(member.league_rank() + 1) * _N_team : N ); + + auto alpha = Kokkos::subview(_alpha,Kokkos::make_pair(first_matrix,last_matrix)); + auto x = Kokkos::subview(_X,Kokkos::make_pair(first_matrix,last_matrix),Kokkos::ALL); + auto y = Kokkos::subview(_Y,Kokkos::make_pair(first_matrix,last_matrix),Kokkos::ALL); + + KokkosBatched::TeamAxpy::template invoke + (member, alpha, x, y); + } + + inline + void run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::TeamAxpy"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? "::ComplexFloat" : + std::is_same >::value ? 
"::ComplexDouble" : "::UnknownValueType" ); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion(name.c_str() ); + Kokkos::TeamPolicy policy(_X.extent(0)/_N_team, Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + } + }; + + template + void impl_test_batched_axpy(const int N, const int BlkSize, const int N_team) { + typedef typename ViewType::value_type value_type; + typedef Kokkos::Details::ArithTraits ats; + + ViewType X0("x0", N, BlkSize), X1("x1", N, BlkSize), Y0("y0", N, BlkSize), Y1("y1", N, BlkSize); + + alphaViewType alpha("alpha", N); + + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(X0, random, value_type(1.0)); + Kokkos::fill_random(Y0, random, value_type(1.0)); + Kokkos::fill_random(alpha, random, value_type(1.0)); + + Kokkos::fence(); + + Kokkos::deep_copy(X1, X0); + Kokkos::deep_copy(Y1, Y0); + + /// test body + auto alpha_host = Kokkos::create_mirror_view(alpha); + auto X0_host = Kokkos::create_mirror_view(X0); + auto Y0_host = Kokkos::create_mirror_view(Y0); + + Kokkos::deep_copy(alpha_host, alpha); + Kokkos::deep_copy(X0_host, X0); + Kokkos::deep_copy(Y0_host, Y0); + + for (int l=0;l + (alpha, X1, Y1, N_team).run(); + + Kokkos::fence(); + + /// for comparison send it to host + auto Y1_host = Kokkos::create_mirror_view(Y1); + + Kokkos::deep_copy(Y1_host, Y1); + + /// check c0 = c1 ; this eps is about 10^-14 + typedef typename ats::mag_type mag_type; + mag_type sum(1), diff(0); + const mag_type eps = 1.0e3 * ats::epsilon(); + + for (int l=0;l +int test_batched_team_axpy() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View ViewType; + typedef Kokkos::View alphaViewType; + + for (int i=3;i<10;++i) { + Test::TeamAxpy::impl_test_batched_axpy(1024, i, 2); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View ViewType; + typedef Kokkos::View alphaViewType; + + for (int 
i=3;i<10;++i) { + Test::TeamAxpy::impl_test_batched_axpy(1024, i, 2); + } + } +#endif + + return 0; +} + diff --git a/unit_test/batched/dense/Test_Batched_TeamAxpy_Complex.hpp b/unit_test/batched/dense/Test_Batched_TeamAxpy_Complex.hpp new file mode 100644 index 0000000000..aa14c15e93 --- /dev/null +++ b/unit_test/batched/dense/Test_Batched_TeamAxpy_Complex.hpp @@ -0,0 +1,10 @@ + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F( TestCategory, batched_scalar_team_axpy_nt_dcomplex_dcomplex ) { + test_batched_team_axpy,Kokkos::complex>(); +} + +TEST_F( TestCategory, batched_scalar_team_axpy_nt_dcomplex_double ) { + test_batched_team_axpy,double>(); +} +#endif diff --git a/unit_test/batched/dense/Test_Batched_TeamAxpy_Real.hpp b/unit_test/batched/dense/Test_Batched_TeamAxpy_Real.hpp new file mode 100644 index 0000000000..928f1aee19 --- /dev/null +++ b/unit_test/batched/dense/Test_Batched_TeamAxpy_Real.hpp @@ -0,0 +1,12 @@ + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F( TestCategory, batched_scalar_team_axpy_nt_float_float ) { + test_batched_team_axpy(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F( TestCategory, batched_scalar_team_axpy_nt_double_double ) { + test_batched_team_axpy(); +} +#endif diff --git a/unit_test/batched/dense/Test_Batched_TeamVectorAxpy.hpp b/unit_test/batched/dense/Test_Batched_TeamVectorAxpy.hpp new file mode 100644 index 0000000000..29273aed84 --- /dev/null +++ b/unit_test/batched/dense/Test_Batched_TeamVectorAxpy.hpp @@ -0,0 +1,150 @@ +/// \author Kim Liegeois (knliege@sandia.gov) + +#include "gtest/gtest.h" +#include "Kokkos_Core.hpp" +#include "Kokkos_Random.hpp" + +#include "KokkosBatched_Axpy.hpp" + +#include "KokkosKernels_TestUtils.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace TeamVectorAxpy { + + template + struct Functor_TestBatchedTeamVectorAxpy { + const alphaViewType _alpha; + const ViewType _X; + const ViewType _Y; + const int _N_team; + + KOKKOS_INLINE_FUNCTION + 
Functor_TestBatchedTeamVectorAxpy(const alphaViewType &alpha, + const ViewType &X, + const ViewType &Y, + const int N_team) + : _alpha(alpha), _X(X), _Y(Y), _N_team(N_team) {} + + template + KOKKOS_INLINE_FUNCTION + void operator()(const MemberType &member) const { + const int first_matrix = + static_cast(member.league_rank()) * _N_team; + const int N = _X.extent(0); + const int last_matrix = (static_cast(member.league_rank() + 1) * _N_team < N ? static_cast(member.league_rank() + 1) * _N_team : N ); + + auto alpha = Kokkos::subview(_alpha,Kokkos::make_pair(first_matrix,last_matrix)); + auto x = Kokkos::subview(_X,Kokkos::make_pair(first_matrix,last_matrix),Kokkos::ALL); + auto y = Kokkos::subview(_Y,Kokkos::make_pair(first_matrix,last_matrix),Kokkos::ALL); + + KokkosBatched::TeamVectorAxpy::template invoke + (member, alpha, x, y); + } + + inline + void run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::TeamVectorAxpy"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? "::ComplexFloat" : + std::is_same >::value ? 
"::ComplexDouble" : "::UnknownValueType" ); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion(name.c_str() ); + Kokkos::TeamPolicy policy(_X.extent(0)/_N_team, Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + } + }; + + template + void impl_test_batched_axpy(const int N, const int BlkSize, const int N_team) { + typedef typename ViewType::value_type value_type; + typedef Kokkos::Details::ArithTraits ats; + + ViewType X0("x0", N, BlkSize), X1("x1", N, BlkSize), Y0("y0", N, BlkSize), Y1("y1", N, BlkSize); + + alphaViewType alpha("alpha", N); + + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(X0, random, value_type(1.0)); + Kokkos::fill_random(Y0, random, value_type(1.0)); + Kokkos::fill_random(alpha, random, value_type(1.0)); + + Kokkos::fence(); + + Kokkos::deep_copy(X1, X0); + Kokkos::deep_copy(Y1, Y0); + + /// test body + auto alpha_host = Kokkos::create_mirror_view(alpha); + auto X0_host = Kokkos::create_mirror_view(X0); + auto Y0_host = Kokkos::create_mirror_view(Y0); + + Kokkos::deep_copy(alpha_host, alpha); + Kokkos::deep_copy(X0_host, X0); + Kokkos::deep_copy(Y0_host, Y0); + + for (int l=0;l + (alpha, X1, Y1, N_team).run(); + + Kokkos::fence(); + + /// for comparison send it to host + auto Y1_host = Kokkos::create_mirror_view(Y1); + + Kokkos::deep_copy(Y1_host, Y1); + + /// check c0 = c1 ; this eps is about 10^-14 + typedef typename ats::mag_type mag_type; + mag_type sum(1), diff(0); + const mag_type eps = 1.0e3 * ats::epsilon(); + + for (int l=0;l +int test_batched_teamvector_axpy() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View ViewType; + typedef Kokkos::View alphaViewType; + + for (int i=3;i<10;++i) { + Test::TeamVectorAxpy::impl_test_batched_axpy(1024, i, 2); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View ViewType; + typedef Kokkos::View alphaViewType; + + for 
(int i=3;i<10;++i) { + Test::TeamVectorAxpy::impl_test_batched_axpy(1024, i, 2); + } + } +#endif + + return 0; +} + diff --git a/unit_test/batched/dense/Test_Batched_TeamVectorAxpy_Complex.hpp b/unit_test/batched/dense/Test_Batched_TeamVectorAxpy_Complex.hpp new file mode 100644 index 0000000000..d29f932d44 --- /dev/null +++ b/unit_test/batched/dense/Test_Batched_TeamVectorAxpy_Complex.hpp @@ -0,0 +1,10 @@ + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F( TestCategory, batched_scalar_teamvector_axpy_nt_dcomplex_dcomplex ) { + test_batched_teamvector_axpy,Kokkos::complex>(); +} + +TEST_F( TestCategory, batched_scalar_teamvector_axpy_nt_dcomplex_double ) { + test_batched_teamvector_axpy,double>(); +} +#endif diff --git a/unit_test/batched/dense/Test_Batched_TeamVectorAxpy_Real.hpp b/unit_test/batched/dense/Test_Batched_TeamVectorAxpy_Real.hpp new file mode 100644 index 0000000000..1f96cb6e22 --- /dev/null +++ b/unit_test/batched/dense/Test_Batched_TeamVectorAxpy_Real.hpp @@ -0,0 +1,12 @@ + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F( TestCategory, batched_scalar_teamvector_axpy_nt_float_float ) { + test_batched_teamvector_axpy(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F( TestCategory, batched_scalar_teamvector_axpy_nt_double_double ) { + test_batched_teamvector_axpy(); +} +#endif diff --git a/unit_test/batched/sparse/Test_Batched_SerialSpmv.hpp b/unit_test/batched/sparse/Test_Batched_SerialSpmv.hpp new file mode 100644 index 0000000000..148ff6f6e0 --- /dev/null +++ b/unit_test/batched/sparse/Test_Batched_SerialSpmv.hpp @@ -0,0 +1,237 @@ +/// \author Kim Liegeois (knliege@sandia.gov) + +#include "gtest/gtest.h" +#include "Kokkos_Core.hpp" +#include "Kokkos_Random.hpp" + +//#include "KokkosBatched_Vector.hpp" + +#include "KokkosBatched_Spmv.hpp" +#include "KokkosBatched_Spmv_Serial_Impl.hpp" + +#include "KokkosKernels_TestUtils.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace Spmv { + + template + struct ParamTag { + 
typedef T trans; + }; + + template + struct Functor_TestBatchedSerialSpmv { + const alphaViewType _alpha; + const ValuesViewType _D; + const IntView _r; + const IntView _c; + const xViewType _X; + const betaViewType _beta; + const yViewType _Y; + + KOKKOS_INLINE_FUNCTION + Functor_TestBatchedSerialSpmv(const alphaViewType &alpha, + const ValuesViewType &D, + const IntView &r, + const IntView &c, + const xViewType &X, + const betaViewType &beta, + const yViewType &Y) + : _alpha(alpha), _D(D), _r(r), _c(c), _X(X), _beta(beta), _Y(Y) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const ParamTagType &, const int k) const { + auto alpha = Kokkos::subview(_alpha,Kokkos::make_pair(k,k+1)); + auto d = Kokkos::subview(_D,Kokkos::make_pair(k,k+1),Kokkos::ALL); + auto x = Kokkos::subview(_X,Kokkos::make_pair(k,k+1),Kokkos::ALL); + auto beta = Kokkos::subview(_beta,Kokkos::make_pair(k,k+1)); + auto y = Kokkos::subview(_Y,Kokkos::make_pair(k,k+1),Kokkos::ALL); + + KokkosBatched::SerialSpmv::template invoke + (alpha, d, _r, _c, x, beta, y); + } + + inline + void run() { + typedef typename ValuesViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::SerialSpmv"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? "::ComplexFloat" : + std::is_same >::value ? 
"::ComplexDouble" : "::UnknownValueType" ); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion(name.c_str() ); + Kokkos::RangePolicy policy(0, _D.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + } + }; + + template + void impl_test_batched_spmv(const int N, const int BlkSize) { + typedef typename ValuesViewType::value_type value_type; + typedef Kokkos::Details::ArithTraits ats; + + const int nnz = (BlkSize-2) * 3 + 2 * 2; + + xViewType X0("x0", N, BlkSize), X1("x1", N, BlkSize); + yViewType Y0("y0", N, BlkSize), Y1("y1", N, BlkSize); + ValuesViewType D("D", N, nnz); + IntView r("r", BlkSize+1); + IntView c("c", nnz); + + alphaViewType alpha("alpha", N); + betaViewType beta("beta", N); + + Kokkos::deep_copy(alpha, value_type(1.0)); + Kokkos::deep_copy(beta, value_type(1.0)); + + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(X0, random, value_type(1.0)); + Kokkos::fill_random(Y0, random, value_type(1.0)); + + auto D_host = Kokkos::create_mirror_view(D); + auto r_host = Kokkos::create_mirror_view(r); + auto c_host = Kokkos::create_mirror_view(c); + + r_host(0) = 0; + + int current_col = 0; + + for (int i=0;i + (alpha, D, r, c, X1, beta, Y1).run(); + + Kokkos::fence(); + + /// for comparison send it to host + auto Y1_host = Kokkos::create_mirror_view(Y1); + + Kokkos::deep_copy(Y1_host, Y1); + + /// check c0 = c1 ; this eps is about 10^-14 + typedef typename ats::mag_type mag_type; + mag_type sum(1), diff(0); + const mag_type eps = 1.0e3 * ats::epsilon(); + + for (int l=0;l +int test_batched_spmv() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View ViewType; + typedef Kokkos::View IntView; + typedef Kokkos::View alphaViewType; + + for (int i=3;i<10;++i) { + Test::Spmv::impl_test_batched_spmv(1024, i); + } + for (int i=3;i<10;++i) { + Test::Spmv::impl_test_batched_spmv(1024, i); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + 
typedef Kokkos::View ViewType; + typedef Kokkos::View IntView; + typedef Kokkos::View alphaViewType; + + for (int i=3;i<10;++i) { + Test::Spmv::impl_test_batched_spmv(1024, i); + } + for (int i=3;i<10;++i) { + Test::Spmv::impl_test_batched_spmv(1024, i); + } + } +#endif + + return 0; +} + diff --git a/unit_test/batched/sparse/Test_Batched_SerialSpmv_Real.hpp b/unit_test/batched/sparse/Test_Batched_SerialSpmv_Real.hpp new file mode 100644 index 0000000000..1cc6c3688e --- /dev/null +++ b/unit_test/batched/sparse/Test_Batched_SerialSpmv_Real.hpp @@ -0,0 +1,14 @@ + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F( TestCategory, batched_scalar_serial_spmv_nt_float_float ) { + typedef ::Test::Spmv::ParamTag param_tag_type; + test_batched_spmv(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F( TestCategory, batched_scalar_serial_spmv_nt_double_double ) { + typedef ::Test::Spmv::ParamTag param_tag_type; + test_batched_spmv(); +} +#endif diff --git a/unit_test/batched/sparse/Test_Batched_Sparse.hpp b/unit_test/batched/sparse/Test_Batched_Sparse.hpp index ee18463706..a3ceb52a5b 100644 --- a/unit_test/batched/sparse/Test_Batched_Sparse.hpp +++ b/unit_test/batched/sparse/Test_Batched_Sparse.hpp @@ -2,13 +2,16 @@ #define TEST_BATCHED_SPARSE_HPP // Serial kernels - +#include "Test_Batched_SerialSpmv.hpp" +#include "Test_Batched_SerialSpmv_Real.hpp" // Team Kernels - +#include "Test_Batched_TeamSpmv.hpp" +#include "Test_Batched_TeamSpmv_Real.hpp" // TeamVector Kernels - +#include "Test_Batched_TeamVectorSpmv.hpp" +#include "Test_Batched_TeamVectorSpmv_Real.hpp" // Vector Kernels diff --git a/unit_test/batched/sparse/Test_Batched_TeamSpmv.hpp b/unit_test/batched/sparse/Test_Batched_TeamSpmv.hpp new file mode 100644 index 0000000000..4ede79995e --- /dev/null +++ b/unit_test/batched/sparse/Test_Batched_TeamSpmv.hpp @@ -0,0 +1,245 @@ +/// \author Kim Liegeois (knliege@sandia.gov) + +#include "gtest/gtest.h" +#include "Kokkos_Core.hpp" +#include "Kokkos_Random.hpp" + 
+//#include "KokkosBatched_Vector.hpp" + +#include "KokkosBatched_Spmv.hpp" +#include "KokkosBatched_Spmv_Team_Impl.hpp" + +#include "KokkosKernels_TestUtils.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace TeamSpmv { + + template + struct ParamTag { + typedef T trans; + }; + + template + struct Functor_TestBatchedTeamSpmv { + const alphaViewType _alpha; + const ValuesViewType _D; + const IntView _r; + const IntView _c; + const xViewType _X; + const betaViewType _beta; + const yViewType _Y; + const int _N_team; + + KOKKOS_INLINE_FUNCTION + Functor_TestBatchedTeamSpmv(const alphaViewType &alpha, + const ValuesViewType &D, + const IntView &r, + const IntView &c, + const xViewType &X, + const betaViewType &beta, + const yViewType &Y, + const int N_team) + : _alpha(alpha), _D(D), _r(r), _c(c), _X(X), _beta(beta), _Y(Y), _N_team(N_team) {} + + template + KOKKOS_INLINE_FUNCTION + void operator()(const ParamTagType &, const MemberType &member) const { + const int first_matrix = + static_cast(member.league_rank()) * _N_team; + const int N = _D.extent(0); + const int last_matrix = (static_cast(member.league_rank() + 1) * _N_team < N ? static_cast(member.league_rank() + 1) * _N_team : N ); + + auto alpha = Kokkos::subview(_alpha,Kokkos::make_pair(first_matrix,last_matrix)); + auto d = Kokkos::subview(_D,Kokkos::make_pair(first_matrix,last_matrix),Kokkos::ALL); + auto x = Kokkos::subview(_X,Kokkos::make_pair(first_matrix,last_matrix),Kokkos::ALL); + auto beta = Kokkos::subview(_beta,Kokkos::make_pair(first_matrix,last_matrix)); + auto y = Kokkos::subview(_Y,Kokkos::make_pair(first_matrix,last_matrix),Kokkos::ALL); + + KokkosBatched::TeamSpmv::template invoke (member, alpha, d, _r, _c, x, beta, y); + } + + inline + void run() { + typedef typename ValuesViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::TeamSpmv"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? 
"::Double" : + std::is_same >::value ? "::ComplexFloat" : + std::is_same >::value ? "::ComplexDouble" : "::UnknownValueType" ); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion(name.c_str() ); + Kokkos::TeamPolicy policy(_D.extent(0)/_N_team, Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + } + }; + + template + void impl_test_batched_spmv(const int N, const int BlkSize, const int N_team) { + typedef typename ValuesViewType::value_type value_type; + typedef Kokkos::Details::ArithTraits ats; + + const int nnz = (BlkSize-2) * 3 + 2 * 2; + + xViewType X0("x0", N, BlkSize), X1("x1", N, BlkSize); + yViewType Y0("y0", N, BlkSize), Y1("y1", N, BlkSize); + ValuesViewType D("D", N, nnz); + IntView r("r", BlkSize+1); + IntView c("c", nnz); + + alphaViewType alpha("alpha", N); + betaViewType beta("beta", N); + + Kokkos::deep_copy(alpha, value_type(1.0)); + Kokkos::deep_copy(beta, value_type(1.0)); + + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(X0, random, value_type(1.0)); + Kokkos::fill_random(Y0, random, value_type(1.0)); + + auto D_host = Kokkos::create_mirror_view(D); + auto r_host = Kokkos::create_mirror_view(r); + auto c_host = Kokkos::create_mirror_view(c); + + r_host(0) = 0; + + int current_col = 0; + + for (int i=0;i + (alpha, D, r, c, X1, beta, Y1, N_team).run(); + + Kokkos::fence(); + + /// for comparison send it to host + auto Y1_host = Kokkos::create_mirror_view(Y1); + + Kokkos::deep_copy(Y1_host, Y1); + + /// check c0 = c1 ; this eps is about 10^-14 + typedef typename ats::mag_type mag_type; + mag_type sum(1), diff(0); + const mag_type eps = 1.0e3 * ats::epsilon(); + + for (int l=0;l +int test_batched_team_spmv() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View ViewType; + typedef Kokkos::View IntView; + typedef Kokkos::View alphaViewType; + + for (int i=3;i<10;++i) { + 
Test::TeamSpmv::impl_test_batched_spmv(1024, i, 2); + } + for (int i=3;i<10;++i) { + Test::TeamSpmv::impl_test_batched_spmv(1024, i, 2); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View ViewType; + typedef Kokkos::View IntView; + typedef Kokkos::View alphaViewType; + + for (int i=3;i<10;++i) { + Test::TeamSpmv::impl_test_batched_spmv(1024, i, 2); + } + + for (int i=3;i<10;++i) { + Test::TeamSpmv::impl_test_batched_spmv(1024, i, 2); + } + } +#endif + + return 0; +} + diff --git a/unit_test/batched/sparse/Test_Batched_TeamSpmv_Real.hpp b/unit_test/batched/sparse/Test_Batched_TeamSpmv_Real.hpp new file mode 100644 index 0000000000..713d7354ae --- /dev/null +++ b/unit_test/batched/sparse/Test_Batched_TeamSpmv_Real.hpp @@ -0,0 +1,14 @@ + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F( TestCategory, batched_scalar_team_spmv_nt_float_float ) { + typedef ::Test::Spmv::ParamTag param_tag_type; + test_batched_team_spmv(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F( TestCategory, batched_scalar_team_spmv_nt_double_double ) { + typedef ::Test::Spmv::ParamTag param_tag_type; + test_batched_team_spmv(); +} +#endif diff --git a/unit_test/batched/sparse/Test_Batched_TeamVectorSpmv.hpp b/unit_test/batched/sparse/Test_Batched_TeamVectorSpmv.hpp new file mode 100644 index 0000000000..9adf6843ad --- /dev/null +++ b/unit_test/batched/sparse/Test_Batched_TeamVectorSpmv.hpp @@ -0,0 +1,244 @@ +/// \author Kim Liegeois (knliege@sandia.gov) + +#include "gtest/gtest.h" +#include "Kokkos_Core.hpp" +#include "Kokkos_Random.hpp" + +//#include "KokkosBatched_Vector.hpp" + +#include "KokkosBatched_Spmv.hpp" +#include "KokkosBatched_Spmv_TeamVector_Impl.hpp" + +#include "KokkosKernels_TestUtils.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace TeamVectorSpmv { + + template + struct ParamTag { + typedef T trans; + }; + + template + struct Functor_TestBatchedTeamVectorSpmv { + const alphaViewType _alpha; + const 
ValuesViewType _D; + const IntView _r; + const IntView _c; + const xViewType _X; + const betaViewType _beta; + const yViewType _Y; + const int _N_team; + + KOKKOS_INLINE_FUNCTION + Functor_TestBatchedTeamVectorSpmv(const alphaViewType &alpha, + const ValuesViewType &D, + const IntView &r, + const IntView &c, + const xViewType &X, + const betaViewType &beta, + const yViewType &Y, + const int N_team) + : _alpha(alpha), _D(D), _r(r), _c(c), _X(X), _beta(beta), _Y(Y), _N_team(N_team) {} + + template + KOKKOS_INLINE_FUNCTION + void operator()(const ParamTagType &, const MemberType &member) const { + const int first_matrix = + static_cast(member.league_rank()) * _N_team; + const int N = _D.extent(0); + const int last_matrix = (static_cast(member.league_rank() + 1) * _N_team < N ? static_cast(member.league_rank() + 1) * _N_team : N ); + + auto alpha = Kokkos::subview(_alpha,Kokkos::make_pair(first_matrix,last_matrix)); + auto d = Kokkos::subview(_D,Kokkos::make_pair(first_matrix,last_matrix),Kokkos::ALL); + auto x = Kokkos::subview(_X,Kokkos::make_pair(first_matrix,last_matrix),Kokkos::ALL); + auto beta = Kokkos::subview(_beta,Kokkos::make_pair(first_matrix,last_matrix)); + auto y = Kokkos::subview(_Y,Kokkos::make_pair(first_matrix,last_matrix),Kokkos::ALL); + + KokkosBatched::TeamVectorSpmv::template invoke (member, alpha, d, _r, _c, x, beta, y); + } + + inline + void run() { + typedef typename ValuesViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::TeamVectorSpmv"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? "::ComplexFloat" : + std::is_same >::value ? 
"::ComplexDouble" : "::UnknownValueType" ); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion(name.c_str() ); + Kokkos::TeamPolicy policy(_D.extent(0)/_N_team, Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + } + }; + + template + void impl_test_batched_spmv(const int N, const int BlkSize, const int N_team) { + typedef typename ValuesViewType::value_type value_type; + typedef Kokkos::Details::ArithTraits ats; + + const int nnz = (BlkSize-2) * 3 + 2 * 2; + + xViewType X0("x0", N, BlkSize), X1("x1", N, BlkSize); + yViewType Y0("y0", N, BlkSize), Y1("y1", N, BlkSize); + ValuesViewType D("D", N, nnz); + IntView r("r", BlkSize+1); + IntView c("c", nnz); + + alphaViewType alpha("alpha", N); + betaViewType beta("beta", N); + + Kokkos::deep_copy(alpha, value_type(1.0)); + Kokkos::deep_copy(beta, value_type(1.0)); + + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(X0, random, value_type(1.0)); + Kokkos::fill_random(Y0, random, value_type(1.0)); + + auto D_host = Kokkos::create_mirror_view(D); + auto r_host = Kokkos::create_mirror_view(r); + auto c_host = Kokkos::create_mirror_view(c); + + r_host(0) = 0; + + int current_col = 0; + + for (int i=0;i + (alpha, D, r, c, X1, beta, Y1, N_team).run(); + + Kokkos::fence(); + + /// for comparison send it to host + auto Y1_host = Kokkos::create_mirror_view(Y1); + + Kokkos::deep_copy(Y1_host, Y1); + + /// check c0 = c1 ; this eps is about 10^-14 + typedef typename ats::mag_type mag_type; + mag_type sum(1), diff(0); + const mag_type eps = 1.0e3 * ats::epsilon(); + + for (int l=0;l +int test_batched_teamvector_spmv() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View ViewType; + typedef Kokkos::View IntView; + typedef Kokkos::View alphaViewType; + + for (int i=3;i<10;++i) { + Test::TeamVectorSpmv::impl_test_batched_spmv(1024, i, 2); + } + for (int i=3;i<10;++i) { + 
Test::TeamVectorSpmv::impl_test_batched_spmv(1024, i, 2); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View ViewType; + typedef Kokkos::View IntView; + typedef Kokkos::View alphaViewType; + + for (int i=3;i<10;++i) { + Test::TeamVectorSpmv::impl_test_batched_spmv(1024, i, 2); + } + for (int i=3;i<10;++i) { + Test::TeamVectorSpmv::impl_test_batched_spmv(1024, i, 2); + } + } +#endif + + return 0; +} + diff --git a/unit_test/batched/sparse/Test_Batched_TeamVectorSpmv_Real.hpp b/unit_test/batched/sparse/Test_Batched_TeamVectorSpmv_Real.hpp new file mode 100644 index 0000000000..58be5de533 --- /dev/null +++ b/unit_test/batched/sparse/Test_Batched_TeamVectorSpmv_Real.hpp @@ -0,0 +1,14 @@ + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F( TestCategory, batched_scalar_teamvector_spmv_nt_float_float ) { + typedef ::Test::Spmv::ParamTag param_tag_type; + test_batched_teamvector_spmv(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F( TestCategory, batched_scalar_teamvector_spmv_nt_double_double ) { + typedef ::Test::Spmv::ParamTag param_tag_type; + test_batched_teamvector_spmv(); +} +#endif From 74d9a3b68d31470fa19baff4b2b2a3b98fa225bc Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 12 Oct 2021 09:49:21 -0600 Subject: [PATCH 010/300] Revert "gmres: disable examples for builds with ibm/xl" This reverts commit adaa5510f9db1c9fbac8fef8e13afa6f23f0d383. 
--- example/gmres/CMakeLists.txt | 6 ------ 1 file changed, 6 deletions(-) diff --git a/example/gmres/CMakeLists.txt b/example/gmres/CMakeLists.txt index 15bfaac95d..05b712af18 100644 --- a/example/gmres/CMakeLists.txt +++ b/example/gmres/CMakeLists.txt @@ -1,8 +1,6 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) -# Workaround https://github.com/kokkos/kokkos/issues/4376 for ibm/xl -IF (NOT ${KOKKOS_COMPILER_IBM}) KOKKOSKERNELS_ADD_EXECUTABLE( gmres_ex_real_A SOURCES ex_real_A.cpp @@ -23,7 +21,3 @@ KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( SOURCES test_prec.cpp ) -ELSE () - MESSAGE (STATUS "SKIPPING gmres examples - Kokkos::complex unsupported with ibm/xlC as host compiler") -ENDIF () - From 0ffb820881335beb0e78463c19988a0c61044705 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 12 Oct 2021 13:56:29 -0600 Subject: [PATCH 011/300] Apply clang-format to code base. .github/workflows: Add clang-format-check action --- .github/workflows/main.yml | 34 + perf_test/PerfTestUtilities.hpp | 5 +- .../KokkosBatched_Test_BlockCrs_Cuda.cpp | 73 +- .../KokkosBatched_Test_BlockCrs_Host.cpp | 59 +- ...okkosBatched_Test_BlockJacobi_Tutorial.cpp | 372 +- .../KokkosBatched_Test_BlockTridiagDirect.cpp | 784 +-- .../KokkosBatched_Test_BlockTridiagJacobi.cpp | 709 +-- .../KokkosBatched_Test_Gemm_Cuda.cpp | 1163 ++-- .../KokkosBatched_Test_Gemm_Host.hpp | 963 ++-- .../KokkosBatched_Test_Gemm_Host_Complex.cpp | 27 +- .../KokkosBatched_Test_Gemm_Host_Real.cpp | 24 +- .../KokkosBatched_Test_Gemv_Host.hpp | 485 +- .../KokkosBatched_Test_Gemv_Host_Real.cpp | 25 +- .../do-not-use/KokkosBatched_Test_LU_Cuda.cpp | 948 +-- .../do-not-use/KokkosBatched_Test_LU_Host.hpp | 566 +- .../KokkosBatched_Test_LU_Host_Real.cpp | 21 +- .../KokkosBatched_Test_Trsm_Cuda.cpp | 1292 +++-- .../KokkosBatched_Test_Trsm_Host.hpp | 1119 ++-- .../KokkosBatched_Test_Trsm_Host_Real.cpp | 58 +- perf_test/blas/KokkosBlas_blas1.cpp | 151 +- 
perf_test/blas/KokkosBlas_blas1_MV.cpp | 306 +- .../blas/blas1/KokkosBlas_dot_perf_test.cpp | 2 +- .../blas/blas1/KokkosBlas_dot_perf_test.hpp | 4 +- .../KokkosBlas_dot_tracked_perf_test.cpp | 2 +- .../KokkosBlas_team_dot_tracked_perf_test.cpp | 7 +- perf_test/blas/blas1/tracked_testing.hpp | 4 +- .../blas/blas2/KokkosBlas2_gemv_perf_test.hpp | 60 +- .../KokkosBlas2_gemv_tracked_perf_test.cpp | 1 - perf_test/blas/blas2/tracked_testing.hpp | 25 +- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 89 +- .../KokkosBlas3_gemm_standalone_perf_test.cpp | 141 +- .../KokkosBlas3_gemm_tracked_perf_test.hpp | 1 - .../blas/blas3/KokkosBlas3_perf_test.cpp | 22 +- .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 36 +- .../blas/blas3/KokkosBlas_trtri_perf_test.hpp | 139 +- perf_test/graph/KokkosGraph_color.cpp | 669 ++- perf_test/graph/KokkosGraph_color_d2.cpp | 1023 ++-- perf_test/graph/KokkosGraph_mis_d2.cpp | 484 +- .../graph/KokkosGraph_multimem_triangle.hpp | 338 +- perf_test/graph/KokkosGraph_run_triangle.hpp | 330 +- perf_test/graph/KokkosGraph_triangle.cpp | 365 +- perf_test/performance/performance_example.cpp | 38 +- .../performance/performance_validate.cpp | 617 +- perf_test/sparse/KokkosSparse_block_pcg.cpp | 660 ++- perf_test/sparse/KokkosSparse_gs.cpp | 350 +- perf_test/sparse/KokkosSparse_kk_spmv.cpp | 197 +- .../sparse/KokkosSparse_multimem_spgemm.hpp | 375 +- perf_test/sparse/KokkosSparse_pcg.cpp | 309 +- perf_test/sparse/KokkosSparse_pcg.hpp | 510 +- perf_test/sparse/KokkosSparse_run_spgemm.hpp | 328 +- .../sparse/KokkosSparse_run_spgemm_jacobi.hpp | 825 +-- perf_test/sparse/KokkosSparse_spadd.cpp | 543 +- perf_test/sparse/KokkosSparse_spgemm.cpp | 400 +- .../sparse/KokkosSparse_spgemm_jacobi.cpp | 323 +- perf_test/sparse/KokkosSparse_spiluk.cpp | 549 +- perf_test/sparse/KokkosSparse_spmv.cpp | 95 +- perf_test/sparse/KokkosSparse_spmv_merge.cpp | 276 +- perf_test/sparse/KokkosSparse_spmv_struct.cpp | 406 +- .../KokkosSparse_spmv_struct_tuning.cpp | 817 +-- 
perf_test/sparse/KokkosSparse_spmv_test.cpp | 26 +- perf_test/sparse/KokkosSparse_spmv_test.hpp | 70 +- perf_test/sparse/KokkosSparse_sptrsv.cpp | 1581 ++--- perf_test/sparse/KokkosSparse_sptrsv_aux.hpp | 574 +- .../sparse/KokkosSparse_sptrsv_cholmod.cpp | 486 +- .../sparse/KokkosSparse_sptrsv_superlu.cpp | 853 +-- .../sparse/KokkosSparse_sptrsv_supernode.cpp | 372 +- perf_test/sparse/spmv/ArmPL_SPMV.hpp | 32 +- perf_test/sparse/spmv/CuSparse_SPMV.hpp | 74 +- perf_test/sparse/spmv/KokkosKernels_SPMV.hpp | 56 +- .../sparse/spmv/KokkosKernels_spmv_data.hpp | 31 +- perf_test/sparse/spmv/Kokkos_SPMV.hpp | 185 +- .../sparse/spmv/Kokkos_SPMV_Inspector.hpp | 168 +- perf_test/sparse/spmv/MKL_SPMV.hpp | 77 +- perf_test/sparse/spmv/OpenMPDynamic_SPMV.hpp | 18 +- .../sparse/spmv/OpenMPSmartStatic_SPMV.hpp | 82 +- perf_test/sparse/spmv/OpenMPStatic_SPMV.hpp | 29 +- perf_test/sparse/spmv/matrix_market.hpp | 466 +- perf_test/sparse/tracked_testing.hpp | 8 +- perf_test/test_crsmatrix.cpp | 821 +-- perf_test/test_mv.cpp | 355 +- src/KokkosKernels_Half.hpp | 36 +- src/KokkosKernels_Macros.hpp | 3 +- src/KokkosLinAlg_config.h | 2 +- src/Kokkos_ArithTraits.hpp | 3194 +++++------ src/Kokkos_InnerProductSpaceTraits.hpp | 154 +- src/batched/KokkosBatched_Util.cpp | 32 +- src/batched/KokkosBatched_Util.hpp | 1459 ++--- .../dense/KokkosBatched_AddRadial_Decl.hpp | 63 +- .../KokkosBatched_ApplyHouseholder_Decl.hpp | 64 +- .../dense/KokkosBatched_ApplyPivot_Decl.hpp | 41 +- .../dense/KokkosBatched_ApplyQ_Decl.hpp | 157 +- src/batched/dense/KokkosBatched_Axpy.hpp | 164 +- src/batched/dense/KokkosBatched_Copy_Decl.hpp | 170 +- .../KokkosBatched_Eigendecomposition_Decl.hpp | 99 +- src/batched/dense/KokkosBatched_Gemm_Decl.hpp | 8 +- src/batched/dense/KokkosBatched_Gemv_Decl.hpp | 252 +- .../dense/KokkosBatched_Householder_Decl.hpp | 61 +- .../KokkosBatched_InnerGemmFixA_Decl.hpp | 60 +- .../KokkosBatched_InnerGemmFixB_Decl.hpp | 59 +- .../KokkosBatched_InnerGemmFixC_Decl.hpp | 120 +- 
.../dense/KokkosBatched_InnerLU_Decl.hpp | 48 +- ...osBatched_InnerMultipleDotProduct_Decl.hpp | 60 +- .../dense/KokkosBatched_InnerTrsm_Decl.hpp | 208 +- .../dense/KokkosBatched_InverseLU_Decl.hpp | 86 +- .../dense/KokkosBatched_Kernel_Handle.hpp | 5 +- src/batched/dense/KokkosBatched_LU_Decl.hpp | 89 +- src/batched/dense/KokkosBatched_QR_Decl.hpp | 133 +- ...kkosBatched_QR_WithColumnPivoting_Decl.hpp | 40 +- src/batched/dense/KokkosBatched_SVD_Decl.hpp | 101 +- .../dense/KokkosBatched_Scale_Decl.hpp | 80 +- .../dense/KokkosBatched_SetIdentity_Decl.hpp | 85 +- src/batched/dense/KokkosBatched_Set_Decl.hpp | 80 +- .../dense/KokkosBatched_SolveLU_Decl.hpp | 156 +- .../dense/KokkosBatched_SolveUTV_Decl.hpp | 81 +- .../KokkosBatched_Test_BlockCrs_Util.hpp | 1732 +++--- src/batched/dense/KokkosBatched_Trmm_Decl.hpp | 27 +- src/batched/dense/KokkosBatched_Trsm_Decl.hpp | 129 +- src/batched/dense/KokkosBatched_Trsv_Decl.hpp | 352 +- .../dense/KokkosBatched_Trtri_Decl.hpp | 19 +- src/batched/dense/KokkosBatched_UTV_Decl.hpp | 93 +- src/batched/dense/KokkosBatched_Vector.hpp | 438 +- .../dense/KokkosBatched_Vector_SIMD.hpp | 1515 ++--- .../impl/KokkosBatched_AddRadial_Impl.hpp | 66 +- .../impl/KokkosBatched_AddRadial_Internal.hpp | 100 +- ...kosBatched_ApplyGivens_Serial_Internal.hpp | 274 +- ...osBatched_ApplyHouseholder_Serial_Impl.hpp | 74 +- ...tched_ApplyHouseholder_Serial_Internal.hpp | 189 +- ...tched_ApplyHouseholder_TeamVector_Impl.hpp | 93 +- ...d_ApplyHouseholder_TeamVector_Internal.hpp | 266 +- .../impl/KokkosBatched_ApplyPivot_Impl.hpp | 319 +- .../KokkosBatched_ApplyPivot_Internal.hpp | 369 +- .../impl/KokkosBatched_ApplyQ_Serial_Impl.hpp | 104 +- .../KokkosBatched_ApplyQ_Serial_Internal.hpp | 369 +- .../KokkosBatched_ApplyQ_TeamVector_Impl.hpp | 125 +- ...kkosBatched_ApplyQ_TeamVector_Internal.hpp | 374 +- .../dense/impl/KokkosBatched_Axpy_Impl.hpp | 546 +- .../dense/impl/KokkosBatched_Copy_Impl.hpp | 189 +- .../impl/KokkosBatched_Copy_Internal.hpp | 226 +- 
.../dense/impl/KokkosBatched_Dot_Internal.hpp | 183 +- ...Batched_Eigendecomposition_Serial_Impl.hpp | 80 +- ...hed_Eigendecomposition_Serial_Internal.hpp | 742 ++- ...hed_Eigendecomposition_TeamVector_Impl.hpp | 83 +- ...Eigendecomposition_TeamVector_Internal.hpp | 179 +- ...kkosBatched_Eigenvalue_Serial_Internal.hpp | 268 +- .../impl/KokkosBatched_FindAmax_Internal.hpp | 109 +- .../KokkosBatched_Francis_Serial_Internal.hpp | 356 +- .../impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 2 +- .../KokkosBatched_Gemm_Serial_Internal.hpp | 239 +- .../KokkosBatched_Gemm_TeamVector_Impl.hpp | 226 +- ...KokkosBatched_Gemm_TeamVector_Internal.hpp | 115 +- .../impl/KokkosBatched_Gemm_Team_Impl.hpp | 416 +- .../impl/KokkosBatched_Gemm_Team_Internal.hpp | 288 +- .../impl/KokkosBatched_Gemv_Serial_Impl.hpp | 361 +- .../KokkosBatched_Gemv_Serial_Internal.hpp | 160 +- .../KokkosBatched_Gemv_TeamVector_Impl.hpp | 198 +- ...KokkosBatched_Gemv_TeamVector_Internal.hpp | 114 +- .../impl/KokkosBatched_Gemv_Team_Impl.hpp | 198 +- .../impl/KokkosBatched_Gemv_Team_Internal.hpp | 198 +- .../KokkosBatched_Givens_Serial_Internal.hpp | 102 +- ...atched_HessenbergFormQ_Serial_Internal.hpp | 82 +- ...HessenbergQR_WithShift_Serial_Internal.hpp | 230 +- ...kkosBatched_Hessenberg_Serial_Internal.hpp | 168 +- .../KokkosBatched_Householder_Serial_Impl.hpp | 32 +- ...kosBatched_Householder_Serial_Internal.hpp | 126 +- ...kosBatched_Householder_TeamVector_Impl.hpp | 34 +- ...atched_Householder_TeamVector_Internal.hpp | 145 +- ...okkosBatched_InnerGemmFixA_Serial_Impl.hpp | 2380 ++++---- ...okkosBatched_InnerGemmFixB_Serial_Impl.hpp | 2233 ++++---- ...okkosBatched_InnerGemmFixC_Serial_Impl.hpp | 2502 ++++---- .../KokkosBatched_InnerGemmFixC_Team_Impl.hpp | 84 +- .../KokkosBatched_InnerLU_Serial_Impl.hpp | 612 +- ...ed_InnerMultipleDotProduct_Serial_Impl.hpp | 499 +- .../KokkosBatched_InnerTrsm_Serial_Impl.hpp | 2755 ++++----- .../KokkosBatched_InverseLU_Serial_Impl.hpp | 100 +- 
.../impl/KokkosBatched_LU_Serial_Impl.hpp | 120 +- .../impl/KokkosBatched_LU_Serial_Internal.hpp | 223 +- .../dense/impl/KokkosBatched_LU_Team_Impl.hpp | 74 +- .../impl/KokkosBatched_LU_Team_Internal.hpp | 274 +- ...ftEigenvectorFromSchur_Serial_Internal.hpp | 255 +- .../impl/KokkosBatched_Normalize_Internal.hpp | 90 +- ...KokkosBatched_QR_FormQ_Serial_Internal.hpp | 83 +- ...osBatched_QR_FormQ_TeamVector_Internal.hpp | 86 +- .../impl/KokkosBatched_QR_Serial_Impl.hpp | 35 +- .../impl/KokkosBatched_QR_Serial_Internal.hpp | 108 +- .../impl/KokkosBatched_QR_TeamVector_Impl.hpp | 46 +- .../KokkosBatched_QR_TeamVector_Internal.hpp | 115 +- ..._QR_WithColumnPivoting_TeamVector_Impl.hpp | 54 +- ...WithColumnPivoting_TeamVector_Internal.hpp | 306 +- ...htEigenvectorFromSchur_Serial_Internal.hpp | 241 +- .../impl/KokkosBatched_SVD_Serial_Impl.hpp | 64 +- .../KokkosBatched_SVD_Serial_Internal.hpp | 611 +- .../dense/impl/KokkosBatched_Scale_Impl.hpp | 87 +- .../impl/KokkosBatched_Scale_Internal.hpp | 230 +- ...KokkosBatched_Schur2x2_Serial_Internal.hpp | 221 +- .../KokkosBatched_Schur_Serial_Internal.hpp | 327 +- .../impl/KokkosBatched_SetIdentity_Impl.hpp | 57 +- .../KokkosBatched_SetIdentity_Internal.hpp | 118 +- .../KokkosBatched_SetTriangular_Internal.hpp | 85 +- .../dense/impl/KokkosBatched_Set_Impl.hpp | 95 +- .../dense/impl/KokkosBatched_Set_Internal.hpp | 239 +- ...kosBatched_ShiftedTrsv_Serial_Internal.hpp | 259 +- ...KokkosBatched_SolveUTV_TeamVector_Impl.hpp | 85 +- ...osBatched_SolveUTV_TeamVector_Internal.hpp | 272 +- .../impl/KokkosBatched_Trmm_Serial_Impl.hpp | 400 +- .../KokkosBatched_Trmm_Serial_Internal.hpp | 594 +- .../impl/KokkosBatched_Trsm_Serial_Impl.hpp | 835 ++- .../KokkosBatched_Trsm_Serial_Internal.hpp | 434 +- .../KokkosBatched_Trsm_TeamVector_Impl.hpp | 247 +- ...KokkosBatched_Trsm_TeamVector_Internal.hpp | 251 +- .../impl/KokkosBatched_Trsm_Team_Impl.hpp | 422 +- .../impl/KokkosBatched_Trsm_Team_Internal.hpp | 531 +- 
.../impl/KokkosBatched_Trsv_Serial_Impl.hpp | 686 +-- .../KokkosBatched_Trsv_Serial_Internal.hpp | 387 +- .../KokkosBatched_Trsv_TeamVector_Impl.hpp | 200 +- ...KokkosBatched_Trsv_TeamVector_Internal.hpp | 244 +- .../impl/KokkosBatched_Trsv_Team_Impl.hpp | 336 +- .../impl/KokkosBatched_Trsv_Team_Internal.hpp | 448 +- .../impl/KokkosBatched_Trtri_Serial_Impl.hpp | 44 +- .../KokkosBatched_Trtri_Serial_Internal.hpp | 209 +- .../KokkosBatched_UTV_TeamVector_Impl.hpp | 56 +- .../KokkosBatched_UTV_TeamVector_Internal.hpp | 108 +- .../KokkosBatched_UpdateGivens_Internal.hpp | 44 +- .../impl/KokkosBatched_Vector_SIMD_Arith.hpp | 1467 +++-- .../KokkosBatched_Vector_SIMD_Logical.hpp | 163 +- .../impl/KokkosBatched_Vector_SIMD_Math.hpp | 455 +- .../impl/KokkosBatched_Vector_SIMD_Misc.hpp | 335 +- .../KokkosBatched_Vector_SIMD_Relation.hpp | 103 +- .../impl/KokkosBatched_Vector_SIMD_View.hpp | 394 +- ...Batched_WilkinsonShift_Serial_Internal.hpp | 95 +- src/batched/sparse/KokkosBatched_Spmv.hpp | 439 +- .../impl/KokkosBatched_Spmv_Serial_Impl.hpp | 228 +- .../KokkosBatched_Spmv_TeamVector_Impl.hpp | 273 +- .../impl/KokkosBatched_Spmv_Team_Impl.hpp | 273 +- src/blas/KokkosBlas.hpp | 36 +- src/blas/KokkosBlas1_abs.hpp | 82 +- src/blas/KokkosBlas1_axpby.hpp | 101 +- src/blas/KokkosBlas1_dot.hpp | 232 +- src/blas/KokkosBlas1_fill.hpp | 13 +- src/blas/KokkosBlas1_iamax.hpp | 154 +- src/blas/KokkosBlas1_mult.hpp | 106 +- src/blas/KokkosBlas1_nrm1.hpp | 158 +- src/blas/KokkosBlas1_nrm2.hpp | 141 +- src/blas/KokkosBlas1_nrm2_squared.hpp | 141 +- src/blas/KokkosBlas1_nrm2w.hpp | 136 +- src/blas/KokkosBlas1_nrm2w_squared.hpp | 136 +- src/blas/KokkosBlas1_nrminf.hpp | 151 +- src/blas/KokkosBlas1_reciprocal.hpp | 83 +- src/blas/KokkosBlas1_scal.hpp | 85 +- src/blas/KokkosBlas1_sum.hpp | 120 +- src/blas/KokkosBlas1_team_abs.hpp | 16 +- src/blas/KokkosBlas1_team_axpby.hpp | 32 +- src/blas/KokkosBlas1_team_dot.hpp | 16 +- src/blas/KokkosBlas1_team_mult.hpp | 18 +- 
src/blas/KokkosBlas1_team_nrm2.hpp | 16 +- src/blas/KokkosBlas1_team_scal.hpp | 18 +- src/blas/KokkosBlas1_team_update.hpp | 20 +- src/blas/KokkosBlas1_update.hpp | 125 +- src/blas/KokkosBlas2_gemv.hpp | 111 +- src/blas/KokkosBlas2_team_gemv.hpp | 30 +- src/blas/KokkosBlas3_gemm.hpp | 235 +- src/blas/KokkosBlas3_trmm.hpp | 151 +- src/blas/KokkosBlas3_trsm.hpp | 135 +- src/blas/KokkosBlas_gesv.hpp | 153 +- src/blas/KokkosBlas_trtri.hpp | 89 +- src/blas/impl/KokkosBlas1_abs_impl.hpp | 220 +- src/blas/impl/KokkosBlas1_abs_spec.hpp | 242 +- src/blas/impl/KokkosBlas1_axpby_impl.hpp | 291 +- src/blas/impl/KokkosBlas1_axpby_mv_impl.hpp | 811 +-- src/blas/impl/KokkosBlas1_axpby_spec.hpp | 553 +- src/blas/impl/KokkosBlas1_dot_impl.hpp | 61 +- src/blas/impl/KokkosBlas1_dot_mv_impl.hpp | 622 +- src/blas/impl/KokkosBlas1_dot_spec.hpp | 733 +-- src/blas/impl/KokkosBlas1_iamax_impl.hpp | 248 +- src/blas/impl/KokkosBlas1_iamax_spec.hpp | 410 +- src/blas/impl/KokkosBlas1_mult_impl.hpp | 178 +- src/blas/impl/KokkosBlas1_mult_spec.hpp | 345 +- src/blas/impl/KokkosBlas1_nrm1_impl.hpp | 186 +- src/blas/impl/KokkosBlas1_nrm1_spec.hpp | 265 +- src/blas/impl/KokkosBlas1_nrm2_impl.hpp | 208 +- src/blas/impl/KokkosBlas1_nrm2_spec.hpp | 265 +- src/blas/impl/KokkosBlas1_nrm2w_impl.hpp | 221 +- src/blas/impl/KokkosBlas1_nrm2w_spec.hpp | 261 +- src/blas/impl/KokkosBlas1_nrminf_impl.hpp | 195 +- src/blas/impl/KokkosBlas1_nrminf_spec.hpp | 268 +- src/blas/impl/KokkosBlas1_reciprocal_impl.hpp | 224 +- src/blas/impl/KokkosBlas1_reciprocal_spec.hpp | 244 +- src/blas/impl/KokkosBlas1_scal_impl.hpp | 114 +- src/blas/impl/KokkosBlas1_scal_mv_impl.hpp | 369 +- src/blas/impl/KokkosBlas1_scal_spec.hpp | 405 +- src/blas/impl/KokkosBlas1_sum_impl.hpp | 247 +- src/blas/impl/KokkosBlas1_sum_spec.hpp | 250 +- src/blas/impl/KokkosBlas1_team_abs_spec.hpp | 25 +- src/blas/impl/KokkosBlas1_team_axpby_spec.hpp | 38 +- src/blas/impl/KokkosBlas1_team_dot_spec.hpp | 41 +- src/blas/impl/KokkosBlas1_team_mult_spec.hpp 
| 36 +- src/blas/impl/KokkosBlas1_team_nrm2_spec.hpp | 55 +- src/blas/impl/KokkosBlas1_team_scal_spec.hpp | 30 +- .../impl/KokkosBlas1_team_update_spec.hpp | 40 +- src/blas/impl/KokkosBlas1_update_impl.hpp | 430 +- src/blas/impl/KokkosBlas1_update_spec.hpp | 413 +- src/blas/impl/KokkosBlas2_gemv_impl.hpp | 1031 ++-- src/blas/impl/KokkosBlas2_gemv_spec.hpp | 185 +- src/blas/impl/KokkosBlas2_team_gemv_spec.hpp | 94 +- .../impl/KokkosBlas3_gemm_dotbased_impl.hpp | 199 +- src/blas/impl/KokkosBlas3_gemm_impl.hpp | 740 ++- src/blas/impl/KokkosBlas3_gemm_spec.hpp | 468 +- src/blas/impl/KokkosBlas3_trmm_spec.hpp | 168 +- src/blas/impl/KokkosBlas3_trsm_impl.hpp | 485 +- src/blas/impl/KokkosBlas3_trsm_spec.hpp | 177 +- src/blas/impl/KokkosBlas_gesv_impl.hpp | 8 +- src/blas/impl/KokkosBlas_gesv_spec.hpp | 139 +- src/blas/impl/KokkosBlas_trtri_impl.hpp | 68 +- src/blas/impl/KokkosBlas_trtri_spec.hpp | 136 +- src/common/KokkosKernels_BitUtils.hpp | 252 +- src/common/KokkosKernels_Controls.hpp | 127 +- src/common/KokkosKernels_ExecSpaceUtils.hpp | 133 +- src/common/KokkosKernels_Handle.hpp | 750 +-- .../KokkosKernels_HashmapAccumulator.hpp | 747 ++- src/common/KokkosKernels_IOUtils.hpp | 1476 +++-- src/common/KokkosKernels_PrintUtils.hpp | 100 +- src/common/KokkosKernels_SimpleUtils.hpp | 289 +- src/common/KokkosKernels_Sorting.hpp | 1438 +++-- src/common/KokkosKernels_SparseUtils.hpp | 2538 ++++---- .../KokkosKernels_SparseUtils_cusparse.hpp | 91 +- ...Kernels_Uniform_Initialized_MemoryPool.hpp | 322 +- src/common/KokkosKernels_Utils.hpp | 1616 +++--- src/common/KokkosKernels_VectorUtils.hpp | 92 +- src/common/KokkosKernels_default_types.hpp | 47 +- src/graph/KokkosGraph_Distance1Color.hpp | 125 +- .../KokkosGraph_Distance1ColorHandle.hpp | 831 +-- src/graph/KokkosGraph_Distance2Color.hpp | 240 +- .../KokkosGraph_Distance2ColorHandle.hpp | 798 +-- src/graph/KokkosGraph_ExplicitCoarsening.hpp | 102 +- src/graph/KokkosGraph_GraphColorHandle.hpp | 1 - 
src/graph/KokkosGraph_MIS2.hpp | 59 +- src/graph/KokkosGraph_RCM.hpp | 30 +- src/graph/KokkosGraph_Triangle.hpp | 529 +- src/graph/KokkosGraph_graph_color.hpp | 1 - src/graph/impl/KokkosGraph_BFS_impl.hpp | 115 +- .../impl/KokkosGraph_Distance1Color_impl.hpp | 3096 +++++----- .../impl/KokkosGraph_Distance2Color_impl.hpp | 3128 +++++----- .../impl/KokkosGraph_Distance2MIS_impl.hpp | 1054 ++-- .../KokkosGraph_ExplicitCoarsening_impl.hpp | 334 +- src/impl/KokkosKernels_helpers.hpp | 28 +- .../tpls/KokkosBlas1_abs_tpl_spec_avail.hpp | 6 +- .../tpls/KokkosBlas1_abs_tpl_spec_decl.hpp | 5 +- .../tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp | 78 +- .../tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp | 643 ++- .../tpls/KokkosBlas1_dot_tpl_spec_avail.hpp | 79 +- .../tpls/KokkosBlas1_dot_tpl_spec_decl.hpp | 631 +- .../tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp | 131 +- .../tpls/KokkosBlas1_iamax_tpl_spec_decl.hpp | 1129 ++-- .../tpls/KokkosBlas1_mult_tpl_spec_avail.hpp | 6 +- .../tpls/KokkosBlas1_mult_tpl_spec_decl.hpp | 5 +- .../tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp | 75 +- .../tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp | 591 +- .../tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp | 75 +- .../tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp | 643 ++- .../tpls/KokkosBlas1_nrm2w_tpl_spec_avail.hpp | 6 +- .../tpls/KokkosBlas1_nrm2w_tpl_spec_decl.hpp | 5 +- .../KokkosBlas1_nrminf_tpl_spec_avail.hpp | 42 +- .../tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp | 356 +- .../KokkosBlas1_reciprocal_tpl_spec_avail.hpp | 6 +- .../KokkosBlas1_reciprocal_tpl_spec_decl.hpp | 5 +- .../tpls/KokkosBlas1_scal_tpl_spec_avail.hpp | 87 +- .../tpls/KokkosBlas1_scal_tpl_spec_decl.hpp | 715 +-- .../tpls/KokkosBlas1_sum_tpl_spec_avail.hpp | 6 +- .../tpls/KokkosBlas1_sum_tpl_spec_decl.hpp | 5 +- .../KokkosBlas1_update_tpl_spec_avail.hpp | 6 +- .../tpls/KokkosBlas1_update_tpl_spec_decl.hpp | 5 +- .../tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp | 118 +- .../tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp | 796 +-- 
.../tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp | 152 +- .../tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp | 1304 +++-- .../tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp | 135 +- .../tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp | 552 +- .../tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp | 129 +- .../tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp | 1354 +++-- src/impl/tpls/KokkosBlas_Cuda_tpl.cpp | 2 +- src/impl/tpls/KokkosBlas_Cuda_tpl.hpp | 53 +- src/impl/tpls/KokkosBlas_Host_tpl.cpp | 1778 +++--- src/impl/tpls/KokkosBlas_Host_tpl.hpp | 160 +- .../tpls/KokkosBlas_gesv_tpl_spec_avail.hpp | 97 +- .../tpls/KokkosBlas_gesv_tpl_spec_decl.hpp | 876 +-- src/impl/tpls/KokkosBlas_tpl_spec.hpp | 19 +- .../tpls/KokkosBlas_trtri_tpl_spec_avail.hpp | 113 +- .../tpls/KokkosBlas_trtri_tpl_spec_decl.hpp | 263 +- .../tpls/KokkosKernels_tpl_handles_decl.hpp | 12 +- .../tpls/KokkosKernels_tpl_handles_def.hpp | 20 +- ...kkosSparse_gauss_seidel_tpl_spec_avail.hpp | 13 +- ...okkosSparse_gauss_seidel_tpl_spec_decl.hpp | 5 +- .../KokkosSparse_spgemm_tpl_spec_avail.hpp | 30 +- .../KokkosSparse_spgemm_tpl_spec_decl.hpp | 5 +- ...osSparse_spiluk_numeric_tpl_spec_avail.hpp | 19 +- ...kosSparse_spiluk_numeric_tpl_spec_decl.hpp | 5 +- ...sSparse_spiluk_symbolic_tpl_spec_avail.hpp | 15 +- ...osSparse_spiluk_symbolic_tpl_spec_decl.hpp | 5 +- ...okkosSparse_spmv_struct_tpl_spec_avail.hpp | 16 +- ...KokkosSparse_spmv_struct_tpl_spec_decl.hpp | 7 +- .../tpls/KokkosSparse_spmv_tpl_spec_avail.hpp | 219 +- .../tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 923 +-- ...kkosSparse_sptrsv_solve_tpl_spec_avail.hpp | 13 +- ...okkosSparse_sptrsv_solve_tpl_spec_decl.hpp | 5 +- ...sSparse_sptrsv_symbolic_tpl_spec_avail.hpp | 9 +- ...osSparse_sptrsv_symbolic_tpl_spec_decl.hpp | 5 +- .../tpls/KokkosSparse_trsv_tpl_spec_avail.hpp | 10 +- .../tpls/KokkosSparse_trsv_tpl_spec_decl.hpp | 5 +- src/sparse/KokkosSparse.hpp | 1 - src/sparse/KokkosSparse_BlockCrsMatrix.hpp | 724 +-- src/sparse/KokkosSparse_BsrMatrix.hpp | 83 +- 
src/sparse/KokkosSparse_CrsMatrix.hpp | 529 +- src/sparse/KokkosSparse_OrdinalTraits.hpp | 44 +- src/sparse/KokkosSparse_findRelOffset.hpp | 235 +- src/sparse/KokkosSparse_gauss_seidel.hpp | 1491 ++--- .../KokkosSparse_gauss_seidel_handle.hpp | 1279 ++--- src/sparse/KokkosSparse_getDiagCopy.hpp | 57 +- src/sparse/KokkosSparse_spadd.hpp | 358 +- src/sparse/KokkosSparse_spadd_handle.hpp | 79 +- src/sparse/KokkosSparse_spgemm.hpp | 12 +- src/sparse/KokkosSparse_spgemm_handle.hpp | 652 ++- src/sparse/KokkosSparse_spgemm_jacobi.hpp | 408 +- src/sparse/KokkosSparse_spgemm_numeric.hpp | 379 +- src/sparse/KokkosSparse_spgemm_symbolic.hpp | 221 +- src/sparse/KokkosSparse_spiluk.hpp | 901 +-- src/sparse/KokkosSparse_spiluk_handle.hpp | 137 +- src/sparse/KokkosSparse_spmv.hpp | 1151 ++-- src/sparse/KokkosSparse_sptrsv.hpp | 629 +- src/sparse/KokkosSparse_sptrsv_cholmod.hpp | 412 +- src/sparse/KokkosSparse_sptrsv_handle.hpp | 872 ++- src/sparse/KokkosSparse_sptrsv_superlu.hpp | 633 +- src/sparse/KokkosSparse_sptrsv_supernode.hpp | 2508 ++++---- src/sparse/KokkosSparse_trsv.hpp | 128 +- .../impl/KokkosSparse_BsrMatrix_impl.hpp | 2 +- ...KokkosSparse_cluster_gauss_seidel_impl.hpp | 1732 +++--- .../impl/KokkosSparse_gauss_seidel_impl.hpp | 3366 +++++------ .../impl/KokkosSparse_gauss_seidel_spec.hpp | 864 ++- ...kkosSparse_getDiagCopyWithOffsets_impl.hpp | 123 +- .../impl/KokkosSparse_partitioning_impl.hpp | 266 +- .../impl/KokkosSparse_sor_sequential_impl.hpp | 237 +- .../impl/KokkosSparse_spgemm_CUSP_impl.hpp | 243 +- .../KokkosSparse_spgemm_cuSPARSE_impl.hpp | 413 +- .../impl/KokkosSparse_spgemm_imp_outer.hpp | 979 ++-- src/sparse/impl/KokkosSparse_spgemm_impl.hpp | 965 ++-- .../impl/KokkosSparse_spgemm_impl_color.hpp | 888 +-- .../KokkosSparse_spgemm_impl_compression.hpp | 1564 ++--- .../impl/KokkosSparse_spgemm_impl_def.hpp | 344 +- .../impl/KokkosSparse_spgemm_impl_kkmem.hpp | 2554 +++++---- .../KokkosSparse_spgemm_impl_memaccess.hpp | 1138 ++-- 
.../impl/KokkosSparse_spgemm_impl_seq.hpp | 265 +- .../impl/KokkosSparse_spgemm_impl_speed.hpp | 776 ++- .../KokkosSparse_spgemm_impl_symbolic.hpp | 3782 ++++++------ .../KokkosSparse_spgemm_impl_triangle.hpp | 2927 +++++----- ...se_spgemm_impl_triangle_no_compression.hpp | 1662 +++--- ...kkosSparse_spgemm_jacobi_denseacc_impl.hpp | 482 +- .../KokkosSparse_spgemm_jacobi_seq_impl.hpp | 186 +- ...kosSparse_spgemm_jacobi_sparseacc_impl.hpp | 3009 +++++----- .../impl/KokkosSparse_spgemm_jacobi_spec.hpp | 487 +- .../KokkosSparse_spgemm_mkl2phase_impl.hpp | 845 +-- .../impl/KokkosSparse_spgemm_mkl_impl.hpp | 874 ++- .../impl/KokkosSparse_spgemm_numeric_spec.hpp | 711 ++- .../KokkosSparse_spgemm_symbolic_spec.hpp | 363 +- .../KokkosSparse_spgemm_viennaCL_impl.hpp | 279 +- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 598 +- .../impl/KokkosSparse_spiluk_numeric_spec.hpp | 379 +- .../KokkosSparse_spiluk_symbolic_impl.hpp | 599 +- .../KokkosSparse_spiluk_symbolic_spec.hpp | 293 +- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 1900 +++--- .../impl/KokkosSparse_spmv_impl_omp.hpp | 61 +- src/sparse/impl/KokkosSparse_spmv_spec.hpp | 466 +- .../impl/KokkosSparse_spmv_struct_impl.hpp | 2095 +++---- .../impl/KokkosSparse_spmv_struct_spec.hpp | 690 +-- .../KokkosSparse_sptrsv_cuSPARSE_impl.hpp | 403 +- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 4369 +++++++------- .../impl/KokkosSparse_sptrsv_solve_spec.hpp | 292 +- .../KokkosSparse_sptrsv_symbolic_impl.hpp | 1836 +++--- .../KokkosSparse_sptrsv_symbolic_spec.hpp | 166 +- src/sparse/impl/KokkosSparse_trsv_impl.hpp | 567 +- src/sparse/impl/KokkosSparse_trsv_spec.hpp | 245 +- ...okkosSparse_twostage_gauss_seidel_impl.hpp | 1885 +++--- src/stage/blas3/Kokkos_Blas3.hpp | 341 +- src/stage/blas3/Kokkos_Blas3_impl.hpp | 2533 ++++---- src/stage/blas3/blas3_UnitTest_01.cpp | 6 +- unit_test/Test_Main.cpp | 9 +- .../batched/dense/Test_Batched_Dense.hpp | 3 +- .../batched/dense/Test_Batched_SerialAxpy.hpp | 206 +- 
.../dense/Test_Batched_SerialAxpy_Complex.hpp | 9 +- .../dense/Test_Batched_SerialAxpy_Real.hpp | 8 +- .../Test_Batched_SerialEigendecomposition.hpp | 65 +- ..._Batched_SerialEigendecomposition_Real.hpp | 1 - .../batched/dense/Test_Batched_SerialGemm.hpp | 35 +- .../dense/Test_Batched_SerialGemm_Complex.hpp | 80 +- .../dense/Test_Batched_SerialGemm_Real.hpp | 76 +- .../batched/dense/Test_Batched_SerialGemv.hpp | 247 +- .../dense/Test_Batched_SerialGemv_Complex.hpp | 20 +- .../dense/Test_Batched_SerialGemv_Real.hpp | 20 +- .../dense/Test_Batched_SerialInverseLU.hpp | 358 +- .../Test_Batched_SerialInverseLU_Complex.hpp | 14 +- .../Test_Batched_SerialInverseLU_Real.hpp | 22 +- .../batched/dense/Test_Batched_SerialLU.hpp | 154 +- .../dense/Test_Batched_SerialLU_Complex.hpp | 4 +- .../dense/Test_Batched_SerialLU_Real.hpp | 10 +- .../dense/Test_Batched_SerialMatUtil.hpp | 251 +- .../Test_Batched_SerialMatUtil_Complex.hpp | 20 +- .../dense/Test_Batched_SerialMatUtil_Real.hpp | 19 +- .../batched/dense/Test_Batched_SerialSVD.hpp | 412 +- .../dense/Test_Batched_SerialSolveLU.hpp | 429 +- .../Test_Batched_SerialSolveLU_Complex.hpp | 14 +- .../dense/Test_Batched_SerialSolveLU_Real.hpp | 22 +- .../batched/dense/Test_Batched_SerialTrmm.hpp | 534 +- .../dense/Test_Batched_SerialTrmm_Complex.hpp | 416 +- .../dense/Test_Batched_SerialTrmm_Real.hpp | 398 +- .../batched/dense/Test_Batched_SerialTrsm.hpp | 248 +- .../dense/Test_Batched_SerialTrsm_Complex.hpp | 195 +- .../dense/Test_Batched_SerialTrsm_Real.hpp | 182 +- .../batched/dense/Test_Batched_SerialTrsv.hpp | 349 +- .../dense/Test_Batched_SerialTrsv_Complex.hpp | 66 +- .../dense/Test_Batched_SerialTrsv_Real.hpp | 67 +- .../dense/Test_Batched_SerialTrtri.hpp | 553 +- .../Test_Batched_SerialTrtri_Complex.hpp | 82 +- .../dense/Test_Batched_SerialTrtri_Real.hpp | 74 +- .../batched/dense/Test_Batched_TeamAxpy.hpp | 227 +- .../dense/Test_Batched_TeamAxpy_Complex.hpp | 9 +- .../dense/Test_Batched_TeamAxpy_Real.hpp | 8 +- 
.../batched/dense/Test_Batched_TeamGemm.hpp | 231 +- .../dense/Test_Batched_TeamGemm_Complex.hpp | 84 +- .../dense/Test_Batched_TeamGemm_Real.hpp | 75 +- .../batched/dense/Test_Batched_TeamGemv.hpp | 260 +- .../dense/Test_Batched_TeamGemv_Complex.hpp | 20 +- .../dense/Test_Batched_TeamGemv_Real.hpp | 22 +- .../dense/Test_Batched_TeamInverseLU.hpp | 402 +- .../Test_Batched_TeamInverseLU_Complex.hpp | 14 +- .../dense/Test_Batched_TeamInverseLU_Real.hpp | 22 +- .../batched/dense/Test_Batched_TeamLU.hpp | 175 +- .../dense/Test_Batched_TeamLU_Complex.hpp | 4 +- .../dense/Test_Batched_TeamLU_Real.hpp | 10 +- .../dense/Test_Batched_TeamMatUtil.hpp | 272 +- .../Test_Batched_TeamMatUtil_Complex.hpp | 20 +- .../dense/Test_Batched_TeamMatUtil_Real.hpp | 23 +- .../dense/Test_Batched_TeamSolveLU.hpp | 469 +- .../Test_Batched_TeamSolveLU_Complex.hpp | 14 +- .../dense/Test_Batched_TeamSolveLU_Real.hpp | 22 +- .../batched/dense/Test_Batched_TeamTrsm.hpp | 262 +- .../dense/Test_Batched_TeamTrsm_Complex.hpp | 227 +- .../dense/Test_Batched_TeamTrsm_Real.hpp | 182 +- .../batched/dense/Test_Batched_TeamTrsv.hpp | 229 +- .../dense/Test_Batched_TeamTrsv_Complex.hpp | 42 +- .../dense/Test_Batched_TeamTrsv_Real.hpp | 41 +- .../dense/Test_Batched_TeamVectorAxpy.hpp | 228 +- .../Test_Batched_TeamVectorAxpy_Complex.hpp | 10 +- .../Test_Batched_TeamVectorAxpy_Real.hpp | 8 +- ...t_Batched_TeamVectorEigendecomposition.hpp | 80 +- .../dense/Test_Batched_TeamVectorGemm.hpp | 237 +- .../Test_Batched_TeamVectorGemm_Complex.hpp | 90 +- .../Test_Batched_TeamVectorGemm_Real.hpp | 92 +- .../dense/Test_Batched_TeamVectorQR.hpp | 289 +- .../dense/Test_Batched_TeamVectorQR_Real.hpp | 10 +- ...atched_TeamVectorQR_WithColumnPivoting.hpp | 332 +- ...d_TeamVectorQR_WithColumnPivoting_Real.hpp | 12 +- .../dense/Test_Batched_TeamVectorSolveUTV.hpp | 409 +- .../Test_Batched_TeamVectorSolveUTV2.hpp | 423 +- .../Test_Batched_TeamVectorSolveUTV2_Real.hpp | 9 +- .../Test_Batched_TeamVectorSolveUTV_Real.hpp | 9 +- 
.../dense/Test_Batched_TeamVectorUTV.hpp | 463 +- .../dense/Test_Batched_TeamVectorUTV_Real.hpp | 9 +- .../dense/Test_Batched_VectorArithmatic.hpp | 472 +- .../dense/Test_Batched_VectorLogical.hpp | 135 +- .../batched/dense/Test_Batched_VectorMath.hpp | 195 +- .../batched/dense/Test_Batched_VectorMisc.hpp | 252 +- .../dense/Test_Batched_VectorRelation.hpp | 161 +- .../batched/dense/Test_Batched_VectorView.hpp | 511 +- .../sparse/Test_Batched_SerialSpmv.hpp | 379 +- .../sparse/Test_Batched_SerialSpmv_Real.hpp | 8 +- .../batched/sparse/Test_Batched_Sparse.hpp | 2 +- .../batched/sparse/Test_Batched_TeamSpmv.hpp | 409 +- .../sparse/Test_Batched_TeamSpmv_Real.hpp | 8 +- .../sparse/Test_Batched_TeamVectorSpmv.hpp | 404 +- .../Test_Batched_TeamVectorSpmv_Real.hpp | 8 +- unit_test/blas/Test_Blas.hpp | 2 +- unit_test/blas/Test_Blas1_abs.hpp | 371 +- unit_test/blas/Test_Blas1_asum.hpp | 142 +- unit_test/blas/Test_Blas1_axpby.hpp | 382 +- unit_test/blas/Test_Blas1_axpy.hpp | 384 +- unit_test/blas/Test_Blas1_dot.hpp | 286 +- unit_test/blas/Test_Blas1_iamax.hpp | 390 +- unit_test/blas/Test_Blas1_mult.hpp | 334 +- unit_test/blas/Test_Blas1_nrm1.hpp | 268 +- unit_test/blas/Test_Blas1_nrm2.hpp | 247 +- unit_test/blas/Test_Blas1_nrm2_squared.hpp | 277 +- unit_test/blas/Test_Blas1_nrminf.hpp | 251 +- unit_test/blas/Test_Blas1_reciprocal.hpp | 437 +- unit_test/blas/Test_Blas1_scal.hpp | 319 +- unit_test/blas/Test_Blas1_sum.hpp | 234 +- unit_test/blas/Test_Blas1_team_abs.hpp | 510 +- unit_test/blas/Test_Blas1_team_axpby.hpp | 503 +- unit_test/blas/Test_Blas1_team_axpy.hpp | 499 +- unit_test/blas/Test_Blas1_team_dot.hpp | 637 +- unit_test/blas/Test_Blas1_team_mult.hpp | 674 ++- unit_test/blas/Test_Blas1_team_nrm2.hpp | 204 +- unit_test/blas/Test_Blas1_team_scal.hpp | 615 +- unit_test/blas/Test_Blas1_team_update.hpp | 677 ++- unit_test/blas/Test_Blas1_update.hpp | 541 +- unit_test/blas/Test_Blas2_gemv.hpp | 430 +- unit_test/blas/Test_Blas2_team_gemv.hpp | 396 +- 
unit_test/blas/Test_Blas3_gemm.hpp | 381 +- unit_test/blas/Test_Blas3_trmm.hpp | 857 +-- unit_test/blas/Test_Blas3_trsm.hpp | 757 +-- unit_test/blas/Test_Blas_gesv.hpp | 583 +- unit_test/blas/Test_Blas_trtri.hpp | 514 +- unit_test/common/Test_Common.hpp | 10 +- unit_test/common/Test_Common_ArithTraits.hpp | 1066 ++-- unit_test/common/Test_Common_Sorting.hpp | 802 ++- unit_test/common/Test_Common_Transpose.hpp | 159 +- unit_test/common/Test_Common_float128.hpp | 58 +- .../common/Test_Common_set_bit_count.hpp | 236 +- unit_test/cuda/Test_Cuda.hpp | 12 +- unit_test/cuda/Test_Cuda_Batched_Dense.cpp | 6 +- unit_test/cuda/Test_Cuda_Batched_Sparse.cpp | 6 +- unit_test/cuda/Test_Cuda_Blas.cpp | 6 +- unit_test/cuda/Test_Cuda_Common.cpp | 6 +- unit_test/cuda/Test_Cuda_Graph.cpp | 6 +- unit_test/cuda/Test_Cuda_Sparse.cpp | 6 +- unit_test/graph/Test_Graph.hpp | 2 +- unit_test/graph/Test_Graph_graph_color.hpp | 172 +- .../Test_Graph_graph_color_deterministic.hpp | 232 +- .../Test_Graph_graph_color_distance2.hpp | 521 +- unit_test/graph/Test_Graph_mis2.hpp | 318 +- unit_test/graph/Test_Graph_rcm.hpp | 183 +- unit_test/hip/Test_HIP.hpp | 12 +- unit_test/hip/Test_HIP_Batched_Dense.cpp | 2 +- unit_test/hip/Test_HIP_Batched_Sparse.cpp | 2 +- unit_test/hip/Test_HIP_Blas.cpp | 2 +- unit_test/hip/Test_HIP_Common.cpp | 2 +- unit_test/hip/Test_HIP_Graph.cpp | 2 +- unit_test/hip/Test_HIP_Sparse.cpp | 2 +- unit_test/openmp/Test_OpenMP.hpp | 12 +- .../openmp/Test_OpenMP_Batched_Dense.cpp | 6 +- .../openmp/Test_OpenMP_Batched_Sparse.cpp | 6 +- unit_test/openmp/Test_OpenMP_Blas.cpp | 6 +- unit_test/openmp/Test_OpenMP_Common.cpp | 6 +- unit_test/openmp/Test_OpenMP_Graph.cpp | 6 +- unit_test/openmp/Test_OpenMP_Sparse.cpp | 6 +- unit_test/openmptarget/Test_OpenMPTarget.hpp | 12 +- .../Test_OpenMPTarget_Batched_Dense.cpp | 2 +- .../Test_OpenMPTarget_Batched_Sparse.cpp | 2 +- .../openmptarget/Test_OpenMPTarget_Blas.cpp | 2 +- .../openmptarget/Test_OpenMPTarget_Common.cpp | 2 +- 
.../openmptarget/Test_OpenMPTarget_Graph.cpp | 2 +- .../openmptarget/Test_OpenMPTarget_Sparse.cpp | 2 +- unit_test/serial/Test_Serial.hpp | 12 +- .../serial/Test_Serial_Batched_Dense.cpp | 6 +- .../serial/Test_Serial_Batched_Sparse.cpp | 6 +- unit_test/serial/Test_Serial_Blas.cpp | 6 +- unit_test/serial/Test_Serial_Common.cpp | 6 +- unit_test/serial/Test_Serial_Graph.cpp | 6 +- unit_test/serial/Test_Serial_Sparse.cpp | 6 +- unit_test/sparse/Test_Sparse.hpp | 2 +- .../sparse/Test_Sparse_BlockCrsMatrix.hpp | 629 +- unit_test/sparse/Test_Sparse_BsrMatrix.hpp | 632 +- unit_test/sparse/Test_Sparse_CrsMatrix.hpp | 473 +- .../sparse/Test_Sparse_Utils_cusparse.hpp | 11 +- .../sparse/Test_Sparse_block_gauss_seidel.hpp | 507 +- .../sparse/Test_Sparse_findRelOffset.hpp | 735 ++- unit_test/sparse/Test_Sparse_gauss_seidel.hpp | 808 +-- .../sparse/Test_Sparse_replaceSumInto.hpp | 529 +- unit_test/sparse/Test_Sparse_spadd.hpp | 473 +- unit_test/sparse/Test_Sparse_spgemm.hpp | 569 +- .../sparse/Test_Sparse_spgemm_jacobi.hpp | 342 +- unit_test/sparse/Test_Sparse_spiluk.hpp | 333 +- unit_test/sparse/Test_Sparse_spmv.hpp | 1615 +++--- unit_test/sparse/Test_Sparse_sptrsv.hpp | 630 +- unit_test/sparse/Test_Sparse_trsv.hpp | 606 +- unit_test/sparse/matrixIssue402.hpp | 5097 +++++++++-------- unit_test/standalone/main.cpp | 16 +- unit_test/sycl/Test_SYCL_Batched_Dense.cpp | 6 +- unit_test/sycl/Test_SYCL_Batched_Sparse.cpp | 6 +- unit_test/sycl/Test_SYCL_Blas.cpp | 6 +- unit_test/sycl/Test_SYCL_Common.cpp | 6 +- unit_test/sycl/Test_SYCL_Graph.cpp | 6 +- unit_test/sycl/Test_SYCL_Sparse.cpp | 6 +- unit_test/threads/Test_Threads.hpp | 12 +- .../threads/Test_Threads_Batched_Dense.cpp | 6 +- .../threads/Test_Threads_Batched_Sparse.cpp | 6 +- unit_test/threads/Test_Threads_Blas.cpp | 6 +- unit_test/threads/Test_Threads_Common.cpp | 6 +- unit_test/threads/Test_Threads_Graph.cpp | 6 +- unit_test/threads/Test_Threads_Sparse.cpp | 6 +- 686 files changed, 121291 insertions(+), 116578 deletions(-) 
create mode 100644 .github/workflows/main.yml diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000000..12db392c3a --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,34 @@ +name: CI + +on: + pull_request: + branches: + - master + - develop +#dsadsadsa +jobs: + clang-format-check: + runs-on: ubuntu-18.04 + steps: + - uses: actions/checkout@v2 + + - name: Install Dependencies + run: sudo apt install clang-format-8 + + - name: check + run: | + # Fetch from the default remote (origin) + git fetch &> /dev/null + + # For every file changed, apply clang-format + for file in $(git diff --name-only origin/$GITHUB_BASE_REF | egrep '*.cpp|*.hpp|*.h'); do + clang-format-8 -i -style=file $file + git add $file + done + + # If any diffs exist, error out + if [[ ! -z $(git status -s -uno . -- ':!.github') ]]; then + echo "The following files require formatting changes:" + git status -s -uno . -- ':!.github' + exit 1 + fi diff --git a/perf_test/PerfTestUtilities.hpp b/perf_test/PerfTestUtilities.hpp index 828c0d285a..743df53502 100644 --- a/perf_test/PerfTestUtilities.hpp +++ b/perf_test/PerfTestUtilities.hpp @@ -46,8 +46,9 @@ inline std::vector get_directories(std::string path) { while ((dir = readdir(d)) != NULL) { std::string nname = std::string(dir->d_name); // Check to see if item is a directory - //if (isDirectory(path + '/' + nname)) - if(nname != "." && nname != ".." && isDirectory(path + '/' + dir->d_name)) + // if (isDirectory(path + '/' + nname)) + if (nname != "." && nname != ".." 
&& + isDirectory(path + '/' + dir->d_name)) // std::vector::emplace_back: insert a new element to the end of vector paths.emplace_back(dir->d_name); } diff --git a/perf_test/batched/KokkosBatched_Test_BlockCrs_Cuda.cpp b/perf_test/batched/KokkosBatched_Test_BlockCrs_Cuda.cpp index 2930aa4e79..50f15cf719 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockCrs_Cuda.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockCrs_Cuda.cpp @@ -22,7 +22,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Timer.hpp" -#if defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_CUDA) #define __KOKKOSBATCHED_TEST_ENABLE_CUDA__ #include "KokkosBatched_Util.hpp" @@ -30,16 +30,16 @@ #define KOKKOSBATCHED_USE_UNBLOCKED_ALGO 1 //#define KOKKOSBATCHED_USE_BLOCKED_ALGO 1 -#if defined (KOKKOSBATCHED_USE_UNBLOCKED_ALGO) -typedef KokkosBatched::Algo::LU::Unblocked AlgoLU; +#if defined(KOKKOSBATCHED_USE_UNBLOCKED_ALGO) +typedef KokkosBatched::Algo::LU::Unblocked AlgoLU; typedef KokkosBatched::Algo::Trsm::Unblocked AlgoTrsm; typedef KokkosBatched::Algo::Gemm::Unblocked AlgoGemm; typedef KokkosBatched::Algo::Trsv::Unblocked AlgoTrsv; typedef KokkosBatched::Algo::Gemv::Unblocked AlgoGemv; #endif -#if defined (KOKKOSBATCHED_USE_BLOCKED_ALGO) -typedef KokkosBatched::Algo::LU::Blocked AlgoLU; +#if defined(KOKKOSBATCHED_USE_BLOCKED_ALGO) +typedef KokkosBatched::Algo::LU::Blocked AlgoLU; typedef KokkosBatched::Algo::Trsm::Blocked AlgoTrsm; typedef KokkosBatched::Algo::Gemm::Blocked AlgoGemm; @@ -51,8 +51,8 @@ typedef KokkosBatched::Algo::Gemv::Blocked AlgoGemv; using namespace KokkosBatched; -int main (int argc, char *argv[]) { - Kokkos::initialize(argc, argv); +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); typedef Kokkos::DefaultExecutionSpace DeviceSpaceType; @@ -60,40 +60,53 @@ int main (int argc, char *argv[]) { Kokkos::print_configuration(std::cout, detail); - enum : int { VectorLength = DefaultVectorLength::value, - RangeTagOper = 0, - TeamTagOper = 1 }; - + enum : int { + 
VectorLength = + DefaultVectorLength::value, + RangeTagOper = 0, + TeamTagOper = 1 + }; + // Unit tests bool profile = false; - for (int i=1;i( 3, 4, 2, 25, 2); - // Test::run(44, 63, 15, 4, 1); - // Test::run( 2, 2, 15, 3, 3); - // Test::run( 1, 1, 2, 63, 8); - + // Test::run( + // 3, 4, 2, 25, 2); + // Test::run(44, + // 63, 15, 4, 1); + // Test::run( + // 2, 2, 15, 3, 3); + // Test::run( + // 1, 1, 2, 63, 8); + // for (int nrhs=1;nrhs<=33;++nrhs) - // Test::run(2, 2, 15, 3, nrhs); + // Test::run(2, + // 2, 15, 3, nrhs); // } // std::cout << " Unit Test::Range :: End\n"; - + std::cout << " Unit Test::Team :: Begin\n"; { - Test::run( 3, 4, 2, 25, 2); - Test::run(44, 63, 15, 4, 1); - Test::run( 2, 2, 15, 3, 3); - Test::run( 1, 1, 2, 63, 8); - - for (int nrhs=1;nrhs<=33;++nrhs) - Test::run(2, 2, 15, 3, nrhs); + Test::run( + 3, 4, 2, 25, 2); + Test::run( + 44, 63, 15, 4, 1); + Test::run( + 2, 2, 15, 3, 3); + Test::run( + 1, 1, 2, 63, 8); + + for (int nrhs = 1; nrhs <= 33; ++nrhs) + Test::run(2, 2, 15, 3, nrhs); } std::cout << " Unit Test::Team :: End\n"; } @@ -101,9 +114,9 @@ int main (int argc, char *argv[]) { // Performance tests std::cout << " Perf Test:: Begin\n"; { - const Test::Input input(argc, argv); - Test::run(input); - } + const Test::Input input(argc, argv); + Test::run(input); + } std::cout << " Perf Test:: End\n"; Kokkos::finalize(); diff --git a/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp b/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp index f682e1e119..1319fa03db 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp @@ -27,16 +27,16 @@ //#define KOKKOSBATCHED_USE_UNBLOCKED_ALGO 1 #define KOKKOSBATCHED_USE_BLOCKED_ALGO 1 -#if defined (KOKKOSBATCHED_USE_UNBLOCKED_ALGO) -typedef KokkosBatched::Algo::LU::Unblocked AlgoLU; +#if defined(KOKKOSBATCHED_USE_UNBLOCKED_ALGO) +typedef KokkosBatched::Algo::LU::Unblocked AlgoLU; typedef 
KokkosBatched::Algo::Trsm::Unblocked AlgoTrsm; typedef KokkosBatched::Algo::Gemm::Unblocked AlgoGemm; typedef KokkosBatched::Algo::Trsv::Unblocked AlgoTrsv; typedef KokkosBatched::Algo::Gemv::Unblocked AlgoGemv; #endif -#if defined (KOKKOSBATCHED_USE_BLOCKED_ALGO) -typedef KokkosBatched::Algo::LU::Blocked AlgoLU; +#if defined(KOKKOSBATCHED_USE_BLOCKED_ALGO) +typedef KokkosBatched::Algo::LU::Blocked AlgoLU; typedef KokkosBatched::Algo::Trsm::Blocked AlgoTrsm; typedef KokkosBatched::Algo::Gemm::Blocked AlgoGemm; @@ -48,8 +48,8 @@ typedef KokkosBatched::Algo::Gemv::Blocked AlgoGemv; using namespace KokkosBatched; -int main (int argc, char *argv[]) { - Kokkos::initialize(argc, argv); +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; @@ -57,53 +57,60 @@ int main (int argc, char *argv[]) { Kokkos::print_configuration(std::cout, detail); - enum : int { VectorLength = DefaultVectorLength::value, - RangeTagOper = 0 }; + enum : int { + VectorLength = + DefaultVectorLength::value, + RangeTagOper = 0 + }; // vector type - typedef Vector,VectorLength> VectorType; + typedef Vector, VectorLength> VectorType; // Unit tests bool profile = false; - for (int i=1;i( 3, 4, 2, 25, 2); - Test::run(44, 63, 15, 4, 1); - Test::run( 2, 2, 15, 3, 3); - - for (int nrhs=1;nrhs<=33;++nrhs) - Test::run(2, 2, 15, 3, nrhs); + Test::run(3, 4, 2, + 25, 2); + Test::run( + 44, 63, 15, 4, 1); + Test::run(2, 2, 15, + 3, 3); + + for (int nrhs = 1; nrhs <= 33; ++nrhs) + Test::run( + 2, 2, 15, 3, nrhs); } std::cout << " Unit Test::Range::Vector :: End\n"; } - + // MKL #if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) std::cout << " Perf Test::CompactMKL Begin\n"; { const bool test_mkl = true; - const Test::Input input(argc, argv); - Test::run(input, test_mkl); - } - std::cout << " Perf Test::CompactMKL End\n"; + const Test::Input input(argc, argv); + 
Test::run(input, test_mkl); + } + std::cout << " Perf Test::CompactMKL End\n"; #endif // Performance tests std::cout << " Perf Test::Vector Begin\n"; { - const Test::Input input(argc, argv); - Test::run(input); - } + const Test::Input input(argc, argv); + Test::run(input); + } std::cout << " Perf Test::Vector End\n"; #endif diff --git a/perf_test/batched/KokkosBatched_Test_BlockJacobi_Tutorial.cpp b/perf_test/batched/KokkosBatched_Test_BlockJacobi_Tutorial.cpp index cd2e0015a0..f3237d9b4f 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockJacobi_Tutorial.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockJacobi_Tutorial.cpp @@ -3,16 +3,15 @@ #include "Kokkos_Timer.hpp" #include "Kokkos_Random.hpp" - -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) #if !defined(KOKKOS_ENABLE_CUDA) || (8000 <= CUDA_VERSION) -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) -#define KOKKOSBATCHED_TEST_BLOCKJACOBI -#endif +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) +#define KOKKOSBATCHED_TEST_BLOCKJACOBI +#endif #endif #endif -#if defined(KOKKOSBATCHED_TEST_BLOCKJACOBI) +#if defined(KOKKOSBATCHED_TEST_BLOCKJACOBI) /// KokkosKernels headers #include "KokkosBatched_Util.hpp" @@ -35,57 +34,52 @@ #include "cuda_profiler_api.h" #endif - -using exec_space_type = Kokkos::DefaultExecutionSpace; +using exec_space_type = Kokkos::DefaultExecutionSpace; using memory_space_type = typename exec_space_type::memory_space; -using host_space = Kokkos::DefaultHostExecutionSpace; +using host_space = Kokkos::DefaultHostExecutionSpace; -using val_type = double; +using val_type = double; using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; using namespace KokkosBatched; -template -val_type computeResidual(const ManyMatrixType &A, - const ManyVectorType &x, - const ManyVectorType &b, - const ManyVectorType &r) { +template +val_type computeResidual(const ManyMatrixType &A, const ManyVectorType &x, + const ManyVectorType &b, 
const ManyVectorType &r) { /// compute residual val_type residual(0); { - policy_type policy(A.extent(0), Kokkos::AUTO()); + policy_type policy(A.extent(0), Kokkos::AUTO()); Kokkos::deep_copy(r, b); - Kokkos::parallel_reduce - ("compute-residual", - policy, KOKKOS_LAMBDA(const member_type &member, val_type &update) { - const int i = member.league_rank(); - const val_type one(1); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto xx = Kokkos::subview(x, i, Kokkos::ALL()); - auto rr = Kokkos::subview(r, i, Kokkos::ALL()); - - TeamGemv - ::invoke(member, -one, AA, xx, one, rr); - - val_type sum(0); - Kokkos::parallel_reduce - (Kokkos::TeamThreadRange(member, rr.extent(0)), - [&](const int &k, val_type &lsum) { - lsum += Kokkos::ArithTraits::abs(rr(k)); - }, sum); - Kokkos::single(Kokkos::PerTeam(member), [&]() { - update += sum; - }); - }, residual); + Kokkos::parallel_reduce( + "compute-residual", policy, + KOKKOS_LAMBDA(const member_type &member, val_type &update) { + const int i = member.league_rank(); + const val_type one(1); + auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + auto xx = Kokkos::subview(x, i, Kokkos::ALL()); + auto rr = Kokkos::subview(r, i, Kokkos::ALL()); + + TeamGemv::invoke(member, -one, AA, xx, one, + rr); + + val_type sum(0); + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(member, rr.extent(0)), + [&](const int &k, val_type &lsum) { + lsum += Kokkos::ArithTraits::abs(rr(k)); + }, + sum); + Kokkos::single(Kokkos::PerTeam(member), [&]() { update += sum; }); + }, + residual); } return residual; } -int main(int argc, char* argv[]) { +int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); { #if defined(KOKKOS_ENABLE_CUDA) @@ -98,16 +92,15 @@ int main(int argc, char* argv[]) { /// /// input arguments parsing /// - int N = 128*128; /// # of problems (batch size) - int Blk = 5; /// block dimension - for (int i=1;i A("block diagonals", N, Blk, Blk); - Kokkos::View T("temporal block diagonals", N, 
Blk, Blk); - Kokkos::View x("x", N, Blk); - Kokkos::View b("b", N, Blk); + Kokkos::View A( + "block diagonals", N, Blk, Blk); + Kokkos::View T( + "temporal block diagonals", N, Blk, Blk); + Kokkos::View x("x", N, + Blk); + Kokkos::View b("b", N, + Blk); /// copy of A to check residual - Kokkos::View Acopy("Acopy", - A.extent(0), - A.extent(1), - A.extent(2)); + Kokkos::View Acopy( + "Acopy", A.extent(0), A.extent(1), A.extent(2)); /// residual vector - Kokkos::View r("r", - b.extent(0), - b.extent(1)); + Kokkos::View r( + "r", b.extent(0), b.extent(1)); - /// The block diagonal matrices are assumed to be extracted from a block sparse matrix. - /// Here we set the blocks with random values + /// The block diagonal matrices are assumed to be extracted from a block + /// sparse matrix. Here we set the blocks with random values Kokkos::Random_XorShift64_Pool random(13245); Kokkos::fill_random(A, random, val_type(1.0)); Kokkos::fill_random(b, random, val_type(1.0)); @@ -143,8 +137,8 @@ int main(int argc, char* argv[]) { /// /// Objective : /// - Construct the inverse of A(i,:,:) for all i. - /// - Solve the equation using matrix vector multiplication. - + /// - Solve the equation using matrix vector multiplication. + /// Task 1. 
Use the so-called standard batch interface /// parallel_for(factorize) /// parallel_For(set identity matrix) @@ -157,90 +151,95 @@ int main(int argc, char* argv[]) { cudaProfilerStart(); #endif Kokkos::deep_copy(A, Acopy); - + /// construction of block jacobi using batched blas interface /// each parallel for is a batch function { - policy_type policy(A.extent(0), Kokkos::AUTO()); - timer.reset(); - Kokkos::parallel_for - ("task1.factorize", - policy, KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - TeamLU::invoke(member,AA); - }); - Kokkos::deep_copy(T, A); - Kokkos::parallel_for - ("task1.set-identity", - policy, KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - TeamSetIdentity::invoke(member, AA); - }); - Kokkos::fence(); - Kokkos::parallel_for - ("task1.solve-lower-triangular", - policy, KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - const val_type one(1); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto TT = Kokkos::subview(T, i, Kokkos::ALL(), Kokkos::ALL()); - TeamTrsm - ::invoke(member, one, TT, AA); - }); - Kokkos::fence(); - Kokkos::parallel_for - ("task1.solve-upper-triangular", - policy, KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - const val_type one(1); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto TT = Kokkos::subview(T, i, Kokkos::ALL(), Kokkos::ALL()); - TeamTrsm - ::invoke(member, one, TT, AA); - }); - Kokkos::fence(); - const double t = timer.seconds(); - printf("task 1: construction of jacobi time = %f , # of constructions per min = %.0f \n", t, 1.0/t*60); + policy_type policy(A.extent(0), Kokkos::AUTO()); + timer.reset(); + Kokkos::parallel_for( + "task1.factorize", policy, + KOKKOS_LAMBDA(const member_type &member) { + const int i 
= member.league_rank(); + auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + TeamLU::invoke(member, AA); + }); + Kokkos::deep_copy(T, A); + Kokkos::parallel_for( + "task1.set-identity", policy, + KOKKOS_LAMBDA(const member_type &member) { + const int i = member.league_rank(); + auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + TeamSetIdentity::invoke(member, AA); + }); + Kokkos::fence(); + Kokkos::parallel_for( + "task1.solve-lower-triangular", policy, + KOKKOS_LAMBDA(const member_type &member) { + const int i = member.league_rank(); + const val_type one(1); + auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + auto TT = Kokkos::subview(T, i, Kokkos::ALL(), Kokkos::ALL()); + TeamTrsm::invoke(member, one, + TT, AA); + }); + Kokkos::fence(); + Kokkos::parallel_for( + "task1.solve-upper-triangular", policy, + KOKKOS_LAMBDA(const member_type &member) { + const int i = member.league_rank(); + const val_type one(1); + auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + auto TT = Kokkos::subview(T, i, Kokkos::ALL(), Kokkos::ALL()); + TeamTrsm::invoke(member, + one, TT, + AA); + }); + Kokkos::fence(); + const double t = timer.seconds(); + printf( + "task 1: construction of jacobi time = %f , # of constructions per " + "min = %.0f \n", + t, 1.0 / t * 60); } - + /// apply block jacobi { - timer.reset(); - policy_type policy(A.extent(0), Kokkos::AUTO()); - Kokkos::parallel_for - ("task1.apply-block-jacobi", - policy, KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - const val_type one(1), zero(0); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto xx = Kokkos::subview(x, i, Kokkos::ALL()); - auto bb = Kokkos::subview(b, i, Kokkos::ALL()); - TeamGemv - ::invoke(member, one, AA, bb, zero, xx); - }); - const double t = timer.seconds(); - printf("task 1: application of jacobi time = %f , # of applications per min = %.0f \n", t, 1.0/t*60); + timer.reset(); + policy_type 
policy(A.extent(0), Kokkos::AUTO()); + Kokkos::parallel_for( + "task1.apply-block-jacobi", policy, + KOKKOS_LAMBDA(const member_type &member) { + const int i = member.league_rank(); + const val_type one(1), zero(0); + auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + auto xx = Kokkos::subview(x, i, Kokkos::ALL()); + auto bb = Kokkos::subview(b, i, Kokkos::ALL()); + TeamGemv::invoke(member, one, AA, bb, + zero, xx); + }); + const double t = timer.seconds(); + printf( + "task 1: application of jacobi time = %f , # of applications per " + "min = %.0f \n", + t, 1.0 / t * 60); } /// check residual { - const double residual = computeResidual(Acopy, x, b, r); - printf("task 1: residual = %f\n", residual); + const double residual = computeResidual(Acopy, x, b, r); + printf("task 1: residual = %f\n", residual); } -#if defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_CUDA) cudaProfilerStop(); -#endif +#endif } - - /// Task 2. Compose a new batch function using kokkos batched team-level interface + + /// Task 2. 
Compose a new batch function using kokkos batched team-level + /// interface /// parallel_for(LU, set identity, solve lower/upper triangular) /// parallel_for(matrix vector multiplication) @@ -249,78 +248,77 @@ int main(int argc, char* argv[]) { cudaProfilerStart(); #endif Kokkos::deep_copy(A, Acopy); - + /// construction of block jacobi using batched blas interface /// each parallel for is a batch function { - policy_type policy(A.extent(0), Kokkos::AUTO()); - timer.reset(); - Kokkos::parallel_for - ("task2.factorize-invert", - policy, KOKKOS_LAMBDA(const member_type &member) { - const val_type one(1); - const int i = member.league_rank(); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto TT = Kokkos::subview(T, i, Kokkos::ALL(), Kokkos::ALL()); - - TeamLU::invoke(member,AA); - TeamCopy::invoke(member, AA, TT); - TeamSetIdentity::invoke(member, AA); - TeamTrsm - ::invoke(member, one, TT, AA); - TeamTrsm - ::invoke(member, one, TT, AA); - }); - Kokkos::fence(); - const double t = timer.seconds(); - printf("task 2: construction of jacobi time = %f , # of constructions per min = %.0f \n", t, 1.0/t*60); + policy_type policy(A.extent(0), Kokkos::AUTO()); + timer.reset(); + Kokkos::parallel_for( + "task2.factorize-invert", policy, + KOKKOS_LAMBDA(const member_type &member) { + const val_type one(1); + const int i = member.league_rank(); + auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + auto TT = Kokkos::subview(T, i, Kokkos::ALL(), Kokkos::ALL()); + + TeamLU::invoke(member, AA); + TeamCopy::invoke(member, AA, TT); + TeamSetIdentity::invoke(member, AA); + TeamTrsm::invoke(member, one, + TT, AA); + TeamTrsm::invoke(member, + one, TT, + AA); + }); + Kokkos::fence(); + const double t = timer.seconds(); + printf( + "task 2: construction of jacobi time = %f , # of constructions per " + "min = %.0f \n", + t, 1.0 / t * 60); } - + /// apply block jacobi { - timer.reset(); - policy_type policy(A.extent(0), Kokkos::AUTO()); - 
Kokkos::parallel_for - ("task2.apply-block-jacobi", - policy, KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - const val_type one(1), zero(0); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto xx = Kokkos::subview(x, i, Kokkos::ALL()); - auto bb = Kokkos::subview(b, i, Kokkos::ALL()); - TeamGemv - ::invoke(member, one, AA, bb, zero, xx); - }); - const double t = timer.seconds(); - printf("task 2: application of jacobi time = %f , # of applications per min = %.0f \n", t, 1.0/t*60); + timer.reset(); + policy_type policy(A.extent(0), Kokkos::AUTO()); + Kokkos::parallel_for( + "task2.apply-block-jacobi", policy, + KOKKOS_LAMBDA(const member_type &member) { + const int i = member.league_rank(); + const val_type one(1), zero(0); + auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + auto xx = Kokkos::subview(x, i, Kokkos::ALL()); + auto bb = Kokkos::subview(b, i, Kokkos::ALL()); + TeamGemv::invoke(member, one, AA, bb, + zero, xx); + }); + const double t = timer.seconds(); + printf( + "task 2: application of jacobi time = %f , # of applications per " + "min = %.0f \n", + t, 1.0 / t * 60); } /// check residual { - const double residual = computeResidual(Acopy, x, b, r); - printf("task 2: residual = %f\n", residual); + const double residual = computeResidual(Acopy, x, b, r); + printf("task 2: residual = %f\n", residual); } -#if defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_CUDA) cudaProfilerStop(); -#endif +#endif } - } Kokkos::finalize(); return 0; } - #else -int main() { - return 0; -} +int main() { return 0; } #endif - diff --git a/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp b/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp index 4183380854..a8b3de209b 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp @@ -3,14 +3,13 @@ #include "Kokkos_Timer.hpp" #include "Kokkos_Random.hpp" -#if 
defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) #if !defined(KOKKOS_ENABLE_CUDA) || (8000 <= CUDA_VERSION) -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) -#define KOKKOSBATCHED_TEST_BLOCKTRIDIAGDIRECT -#endif +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) +#define KOKKOSBATCHED_TEST_BLOCKTRIDIAGDIRECT +#endif #endif #endif - #if defined(KOKKOSBATCHED_TEST_BLOCKTRIDIAGDIRECT) @@ -60,55 +59,57 @@ typedef double value_type; /// using namespace KokkosBatched; -static constexpr int vector_length = DefaultVectorLength::value; +static constexpr int vector_length = + DefaultVectorLength::value; #if defined(KOKKOSBATCHED_USE_128BIT_MEMORY_INST) -static constexpr int internal_vector_length = DefaultInternalVectorLength::value; +static constexpr int internal_vector_length = + DefaultInternalVectorLength::value; #else static constexpr int internal_vector_length = 1; #endif -typedef Vector,vector_length> vector_type; +typedef Vector, vector_length> vector_type; #if defined(KOKKOSBATCHED_USE_128BIT_MEMORY_INST) -typedef Vector,internal_vector_length> internal_vector_type; +typedef Vector, internal_vector_length> internal_vector_type; #else typedef value_type internal_vector_type; #endif -template +template struct FactorizeModeAndAlgo; -template<> +template <> struct FactorizeModeAndAlgo { typedef Mode::Serial mode_type; - typedef Algo::Level3::Blocked algo_type; + typedef Algo::Level3::Blocked algo_type; }; #if defined(KOKKOS_ENABLE_CUDA) -template<> +template <> struct FactorizeModeAndAlgo { typedef Mode::Team mode_type; - typedef Algo::Level3::Unblocked algo_type; + typedef Algo::Level3::Unblocked algo_type; }; #endif -template +template struct SolveModeAndAlgo; -template<> +template <> struct SolveModeAndAlgo { typedef Mode::Serial mode_type; - typedef Algo::Level2::Blocked algo_type; + typedef Algo::Level2::Blocked algo_type; }; #if defined(KOKKOS_ENABLE_CUDA) -template<> +template <> struct SolveModeAndAlgo { typedef Mode::Team mode_type; - 
typedef Algo::Level2::Unblocked algo_type; + typedef Algo::Level2::Unblocked algo_type; }; #endif -int main(int argc, char* argv[]) { +int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) @@ -116,20 +117,20 @@ int main(int argc, char* argv[]) { #endif Kokkos::print_configuration(std::cout); - //typedef Kokkos::Details::ArithTraits ats; + // typedef Kokkos::Details::ArithTraits ats; Kokkos::Timer timer; /// /// input arguments parsing /// - int N = 128*128; /// # of problems (batch size) - int L = 128; /// length of block tridiags - int Blk = 5; /// block dimension - int Nvec = 1; - int S = 0; /// scratch size + int N = 128 * 128; /// # of problems (batch size) + int L = 128; /// length of block tridiags + int Blk = 5; /// block dimension + int Nvec = 1; + int S = 0; /// scratch size int niter = 1; - for (int i=1;i Av("A", - N/vector_length, L, 3, Blk, Blk); + Kokkos::View Av( + "A", N / vector_length, L, 3, Blk, Blk); /// double - Kokkos::View As((value_type*)Av.data(), - Av.extent(0), - Av.extent(1), - Av.extent(2), - Av.extent(3), - Av.extent(4), - vector_length); + Kokkos::View As( + (value_type *)Av.data(), Av.extent(0), Av.extent(1), Av.extent(2), + Av.extent(3), Av.extent(4), vector_length); /// double 2 - Kokkos::View Ai((internal_vector_type*)Av.data(), - Av.extent(0), - Av.extent(1), - Av.extent(2), - Av.extent(3), - Av.extent(4), - vector_length/internal_vector_length); + Kokkos::View + Ai((internal_vector_type *)Av.data(), Av.extent(0), Av.extent(1), + Av.extent(2), Av.extent(3), Av.extent(4), + vector_length / internal_vector_length); /// double 16 - Kokkos::View xv("x", - N/vector_length, Nvec, L, Blk); + Kokkos::View xv( + "x", N / vector_length, Nvec, L, Blk); /// double - Kokkos::View xs((value_type*)xv.data(), - xv.extent(0), - xv.extent(1), - xv.extent(2), - xv.extent(3), - vector_length); + Kokkos::View xs( + (value_type *)xv.data(), xv.extent(0), xv.extent(1), 
xv.extent(2), + xv.extent(3), vector_length); /// double 2 - Kokkos::View xi((internal_vector_type*)xv.data(), - xv.extent(0), - xv.extent(1), - xv.extent(2), - xv.extent(3), - vector_length/internal_vector_length); + Kokkos::View + xi((internal_vector_type *)xv.data(), xv.extent(0), xv.extent(1), + xv.extent(2), xv.extent(3), vector_length / internal_vector_length); /// double 16 - Kokkos::View bv("b", - N/vector_length, Nvec, L, Blk); + Kokkos::View bv( + "b", N / vector_length, Nvec, L, Blk); /// double - Kokkos::View bs((value_type*)bv.data(), - bv.extent(0), - bv.extent(1), - bv.extent(2), - bv.extent(3), - vector_length); + Kokkos::View bs( + (value_type *)bv.data(), bv.extent(0), bv.extent(1), bv.extent(2), + bv.extent(3), vector_length); /// double 2 - Kokkos::View bi((internal_vector_type*)bv.data(), - bv.extent(0), - bv.extent(1), - bv.extent(2), - bv.extent(3), - vector_length/internal_vector_length); - + Kokkos::View + bi((internal_vector_type *)bv.data(), bv.extent(0), bv.extent(1), + bv.extent(2), bv.extent(3), vector_length / internal_vector_length); /// double copy of A - Kokkos::View Acopy("Acopy", - As.extent(0), - As.extent(1), - As.extent(2), - As.extent(3), - As.extent(4), - As.extent(5)); - - Kokkos::View rs("rs", - bs.extent(0), - bs.extent(1), - bs.extent(2), - bs.extent(3), - bs.extent(4)); + Kokkos::View Acopy( + "Acopy", As.extent(0), As.extent(1), As.extent(2), As.extent(3), + As.extent(4), As.extent(5)); + + Kokkos::View rs( + "rs", bs.extent(0), bs.extent(1), bs.extent(2), bs.extent(3), + bs.extent(4)); #if defined(KOKKOSBATCHED_USE_128BIT_MEMORY_INST) auto AA = Ai; @@ -245,17 +220,21 @@ int main(int argc, char* argv[]) { using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; policy_type policy(AA.extent(0), Kokkos::AUTO(), AA.extent(5)); - Kokkos::parallel_for - ("setTridiagToIdentity", - policy, KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - 
Kokkos::parallel_for(Kokkos::TeamThreadRange(member,AA.extent(1)),[&](const int &j) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, AA.extent(5)),[&](const int &v) { - for (int k=0,kend=AA.extent(3);k random(13245); Kokkos::fill_random(As, random, one); Kokkos::fill_random(bs, random, one); - + Kokkos::deep_copy(Acopy, As); } @@ -284,70 +263,76 @@ int main(int argc, char* argv[]) { timer.reset(); using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; - int team_size = 0; - if (Blk < 8) { team_size = 32/AA.extent(5); - } else if (Blk < 12) { team_size = 64/AA.extent(5); - } else { team_size = 128/AA.extent(5); } + int team_size = 0; + if (Blk < 8) { + team_size = 32 / AA.extent(5); + } else if (Blk < 12) { + team_size = 64 / AA.extent(5); + } else { + team_size = 128 / AA.extent(5); + } policy_type policy(AA.extent(0), team_size, AA.extent(5)); - Kokkos::parallel_for - ("factorize", - policy.set_scratch_size(0,Kokkos::PerTeam(S)), - KOKKOS_LAMBDA(const member_type &member) { - typedef FactorizeModeAndAlgo default_mode_and_algo_type; - typedef default_mode_and_algo_type::mode_type mode_type; - typedef default_mode_and_algo_type::algo_type algo_type; - - const int i = member.league_rank(); - - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, AA.extent(5)),[&](const int &v) { - auto AAA = Kokkos::subview(AA, i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), v); - - /// subview patterns - auto A = Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); - auto B = Kokkos::subview(AAA, 0, 2, Kokkos::ALL(), Kokkos::ALL()); - auto C = Kokkos::subview(AAA, 0, 0, Kokkos::ALL(), Kokkos::ALL()); - auto D = Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); - - if (L == 1) { - A.assign_data( &AAA(0, 1, 0, 0) ); - LU::invoke(member, A); - } else { - for (int k=0;k<(L-1);++k) { - A.assign_data( &AAA(k, 1, 0, 0) ); - B.assign_data( &AAA(k, 2, 0, 0) ); - C.assign_data( &AAA(k, 0, 0, 0) ); - D.assign_data( 
&AAA(k+1, 1, 0, 0) ); - - LU - ::invoke(member, A); - Trsm - ::invoke(member, 1.0, A, B); - Trsm - ::invoke(member, 1.0, A, C); - Gemm - ::invoke(member, -1.0, C, B, 1.0, D); - } - LU - ::invoke(member, D); - } - }); - }); + Kokkos::parallel_for( + "factorize", policy.set_scratch_size(0, Kokkos::PerTeam(S)), + KOKKOS_LAMBDA(const member_type &member) { + typedef FactorizeModeAndAlgo< + Kokkos::Impl::ActiveExecutionMemorySpace> + default_mode_and_algo_type; + typedef default_mode_and_algo_type::mode_type mode_type; + typedef default_mode_and_algo_type::algo_type algo_type; + + const int i = member.league_rank(); + + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, AA.extent(5)), + [&](const int &v) { + auto AAA = + Kokkos::subview(AA, i, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), v); + + /// subview patterns + auto A = + Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); + auto B = + Kokkos::subview(AAA, 0, 2, Kokkos::ALL(), Kokkos::ALL()); + auto C = + Kokkos::subview(AAA, 0, 0, Kokkos::ALL(), Kokkos::ALL()); + auto D = + Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); + + if (L == 1) { + A.assign_data(&AAA(0, 1, 0, 0)); + LU::invoke(member, A); + } else { + for (int k = 0; k < (L - 1); ++k) { + A.assign_data(&AAA(k, 1, 0, 0)); + B.assign_data(&AAA(k, 2, 0, 0)); + C.assign_data(&AAA(k, 0, 0, 0)); + D.assign_data(&AAA(k + 1, 1, 0, 0)); + + LU::invoke(member, A); + Trsm::invoke(member, 1.0, A, B); + Trsm::invoke(member, 1.0, A, C); + Gemm::invoke(member, -1.0, C, B, + 1.0, D); + } + LU::invoke(member, D); + } + }); + }); Kokkos::fence(); const double t = timer.seconds(); #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) cudaProfilerStop(); #endif - printf("factorize time = %f , # of factorization per min = %f \n", t, 1.0/t*60); + printf("factorize time = %f , # of factorization per min = %f \n", t, + 1.0 / t * 60); } /// @@ -360,133 +345,144 @@ int main(int argc, char* argv[]) { timer.reset(); using 
policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; - int team_size = 0; - if (Blk < 8) { team_size = 32/AA.extent(5); - } else if (Blk < 12) { team_size = 64/AA.extent(5); - } else { team_size = 128/AA.extent(5); } - - policy_type policy(AA.extent(0), team_size, AA.extent(5)); - for (int iter=0;iter default_mode_and_algo_type; - typedef default_mode_and_algo_type::mode_type mode_type; - typedef default_mode_and_algo_type::algo_type algo_type; + int team_size = 0; + if (Blk < 8) { + team_size = 32 / AA.extent(5); + } else if (Blk < 12) { + team_size = 64 / AA.extent(5); + } else { + team_size = 128 / AA.extent(5); + } - const int i = member.league_rank(); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, AA.extent(5)),[&](const int &v) { - auto A = Kokkos::subview(AA, i, Kokkos::ALL(), 1, Kokkos::ALL(), Kokkos::ALL(), v); - auto B = Kokkos::subview(AA, i, Kokkos::ALL(), 2, Kokkos::ALL(), Kokkos::ALL(), v); - auto C = Kokkos::subview(AA, i, Kokkos::ALL(), 0, Kokkos::ALL(), Kokkos::ALL(), v); - - for (int jvec=0;jvec - ::invoke(member, bk, xb); - member.team_barrier(); - } - } - const int kend = L - 1; - for (int k=0;k - ::invoke(member, bk, xb); - } - - Trsv - ::invoke(member, 1.0, LT, xt); - - Gemv - ::invoke(member, -1.0, LB, xt, 1.0, xb); - } - { - LT.assign_data(&A(kend, 0, 0)); - xt.assign_data(&x(kend, 0)); - Trsv - ::invoke(member, 1.0, LT, xt); - } - } /// end forward substitution - - /// - /// backward substitution - /// - { - auto UT = Kokkos::subview(B, 0, Kokkos::ALL(), Kokkos::ALL()); - auto UB = Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()); - - const int kbegin = L - 1; - for (int k=kbegin;k>0;--k) { - UT.assign_data(&B(k-1, 0, 0)); - UB.assign_data(&A(k, 0, 0)); - - xt.assign_data(&x(k-1, 0)); - xb.assign_data(&x(k, 0)); - - Trsv - ::invoke(member, 1.0, UB, xb); - - Gemv - ::invoke(member, -1.0, UT, xb, 1.0, xt); - } - { - UB.assign_data(&A(0, 0, 0)); - xb.assign_data(&x(0, 0)); - Trsv - 
::invoke(member, 1.0, UB, xb); + policy_type policy(AA.extent(0), team_size, AA.extent(5)); + for (int iter = 0; iter < niter; ++iter) { + Kokkos::parallel_for( + "solve", policy.set_scratch_size(0, Kokkos::PerTeam(S)), + KOKKOS_LAMBDA(const member_type &member) { + typedef SolveModeAndAlgo + default_mode_and_algo_type; + typedef default_mode_and_algo_type::mode_type mode_type; + typedef default_mode_and_algo_type::algo_type algo_type; + + const int i = member.league_rank(); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, AA.extent(5)), + [&](const int &v) { + auto A = Kokkos::subview(AA, i, Kokkos::ALL(), 1, + Kokkos::ALL(), Kokkos::ALL(), v); + auto B = Kokkos::subview(AA, i, Kokkos::ALL(), 2, + Kokkos::ALL(), Kokkos::ALL(), v); + auto C = Kokkos::subview(AA, i, Kokkos::ALL(), 0, + Kokkos::ALL(), Kokkos::ALL(), v); + + for (int jvec = 0; jvec < Nvec; ++jvec) { + auto x = Kokkos::subview(xx, i, jvec, Kokkos::ALL(), + Kokkos::ALL(), v); + auto b = Kokkos::subview(bb, i, jvec, Kokkos::ALL(), + Kokkos::ALL(), v); + + auto xt = Kokkos::subview(x, 0, Kokkos::ALL()); + auto xb = Kokkos::subview(x, 0, Kokkos::ALL()); + + /// + /// forward substitution + /// + { + // const bool is_same_x_and_b = (x.data() == b.data()); + auto LT = + Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()); + auto LB = + Kokkos::subview(C, 0, Kokkos::ALL(), Kokkos::ALL()); + + auto bk = Kokkos::subview(b, 0, Kokkos::ALL()); + { + { // if (!is_same_x_and_b) { + Copy::invoke(member, bk, xb); + member.team_barrier(); + } + } + const int kend = L - 1; + for (int k = 0; k < kend; ++k) { + LT.assign_data(&A(k, 0, 0)); + LB.assign_data(&C(k, 0, 0)); + + xt.assign_data(&x(k, 0)); + xb.assign_data(&x(k + 1, 0)); + + { // if (!is_same_x_and_b) { + bk.assign_data(&b(k + 1, 0)); + Copy::invoke(member, bk, xb); + } + + Trsv::invoke(member, + 1.0, + LT, + xt); + + Gemv::invoke(member, -1.0, LB, xt, 1.0, + xb); + } + { + LT.assign_data(&A(kend, 0, 0)); + xt.assign_data(&x(kend, 0)); + 
Trsv::invoke(member, + 1.0, + LT, + xt); + } + } /// end forward substitution + + /// + /// backward substitution + /// + { + auto UT = + Kokkos::subview(B, 0, Kokkos::ALL(), Kokkos::ALL()); + auto UB = + Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()); + + const int kbegin = L - 1; + for (int k = kbegin; k > 0; --k) { + UT.assign_data(&B(k - 1, 0, 0)); + UB.assign_data(&A(k, 0, 0)); + + xt.assign_data(&x(k - 1, 0)); + xb.assign_data(&x(k, 0)); + + Trsv::invoke(member, 1.0, UB, xb); + + Gemv::invoke(member, -1.0, UT, xb, 1.0, + xt); + } + { + UB.assign_data(&A(0, 0, 0)); + xb.assign_data(&x(0, 0)); + Trsv::invoke(member, 1.0, UB, xb); + } + } // end backward substitution } - } // end backward substitution - } - }); - }); + }); + }); Kokkos::fence(); } const double t = timer.seconds(); #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) cudaProfilerStop(); #endif - printf("solve time = %f , # of solves per min = %f\n", t, 1.0/t*60*niter); + printf("solve time = %f , # of solves per min = %f\n", t, + 1.0 / t * 60 * niter); } - + /// /// compute residual /// @@ -495,105 +491,118 @@ int main(int argc, char* argv[]) { using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; policy_type policy(Acopy.extent(0), Kokkos::AUTO(), Acopy.extent(5)); - Kokkos::parallel_for - ("compute residual", - policy, KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, Acopy.extent(5)),[&](const int &v) { - auto A = Kokkos::subview(Acopy, i, Kokkos::ALL(), 1, Kokkos::ALL(), Kokkos::ALL(), v); - auto B = Kokkos::subview(Acopy, i, Kokkos::ALL(), 2, Kokkos::ALL(), Kokkos::ALL(), v); - auto C = Kokkos::subview(Acopy, i, Kokkos::ALL(), 0, Kokkos::ALL(), Kokkos::ALL(), v); - - for (int jvec=0,jvecend=rs.extent(1);jvec - ::invoke(member, b0, r0); - TeamGemv - ::invoke(member, -1.0, A0, x0, 1.0, r0); - } else { - int k = 0; - { - /// first row - auto A1 = 
Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); - auto B2 = Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); - - auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); - auto x2 = Kokkos::subview(x, k+1, Kokkos::ALL()); - - auto bk = Kokkos::subview(b, k, Kokkos::ALL()); - auto rk = Kokkos::subview(r, k, Kokkos::ALL()); - TeamCopy - ::invoke(member, bk, rk); - member.team_barrier(); - TeamGemv - ::invoke(member, -1.0, A1, x1, 1.0, rk); - TeamGemv - ::invoke(member, -1.0, B2, x2, 1.0, rk); - ++k; - } - for (;k<(L-1);++k) { - auto C0 = Kokkos::subview(C, k-1, Kokkos::ALL(), Kokkos::ALL()); - auto A1 = Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); - auto B2 = Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); - - auto x0 = Kokkos::subview(x, k-1, Kokkos::ALL()); - auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); - auto x2 = Kokkos::subview(x, k+1, Kokkos::ALL()); - - auto bk = Kokkos::subview(b, k, Kokkos::ALL()); - auto rk = Kokkos::subview(r, k, Kokkos::ALL()); - TeamCopy - ::invoke(member, bk, rk); - member.team_barrier(); - TeamGemv - ::invoke(member, -1.0, C0, x0, 1.0, rk); - TeamGemv - ::invoke(member, -1.0, A1, x1, 1.0, rk); - TeamGemv - ::invoke(member, -1.0, B2, x2, 1.0, rk); + Kokkos::parallel_for( + "compute residual", policy, KOKKOS_LAMBDA(const member_type &member) { + const int i = member.league_rank(); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, Acopy.extent(5)), + [&](const int &v) { + auto A = Kokkos::subview(Acopy, i, Kokkos::ALL(), 1, + Kokkos::ALL(), Kokkos::ALL(), v); + auto B = Kokkos::subview(Acopy, i, Kokkos::ALL(), 2, + Kokkos::ALL(), Kokkos::ALL(), v); + auto C = Kokkos::subview(Acopy, i, Kokkos::ALL(), 0, + Kokkos::ALL(), Kokkos::ALL(), v); + + for (int jvec = 0, jvecend = rs.extent(1); jvec < jvecend; + ++jvec) { + auto x = Kokkos::subview(xs, i, jvec, Kokkos::ALL(), + Kokkos::ALL(), v); + auto b = Kokkos::subview(bs, i, jvec, Kokkos::ALL(), + Kokkos::ALL(), v); + auto r = Kokkos::subview(rs, i, jvec, 
Kokkos::ALL(), + Kokkos::ALL(), v); + + if (L == 1) { + auto A0 = + Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()); + auto x0 = Kokkos::subview(x, 0, Kokkos::ALL()); + auto b0 = Kokkos::subview(b, 0, Kokkos::ALL()); + auto r0 = Kokkos::subview(r, 0, Kokkos::ALL()); + + TeamCopy::invoke(member, + b0, r0); + TeamGemv::invoke(member, -1.0, A0, x0, 1.0, + r0); + } else { + int k = 0; + { + /// first row + auto A1 = + Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); + auto B2 = + Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); + + auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); + auto x2 = Kokkos::subview(x, k + 1, Kokkos::ALL()); + + auto bk = Kokkos::subview(b, k, Kokkos::ALL()); + auto rk = Kokkos::subview(r, k, Kokkos::ALL()); + TeamCopy::invoke( + member, bk, rk); + member.team_barrier(); + TeamGemv::invoke(member, -1.0, A1, x1, 1.0, + rk); + TeamGemv::invoke(member, -1.0, B2, x2, 1.0, + rk); + ++k; + } + for (; k < (L - 1); ++k) { + auto C0 = Kokkos::subview(C, k - 1, Kokkos::ALL(), + Kokkos::ALL()); + auto A1 = + Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); + auto B2 = + Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); + + auto x0 = Kokkos::subview(x, k - 1, Kokkos::ALL()); + auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); + auto x2 = Kokkos::subview(x, k + 1, Kokkos::ALL()); + + auto bk = Kokkos::subview(b, k, Kokkos::ALL()); + auto rk = Kokkos::subview(r, k, Kokkos::ALL()); + TeamCopy::invoke( + member, bk, rk); + member.team_barrier(); + TeamGemv::invoke(member, -1.0, C0, x0, 1.0, + rk); + TeamGemv::invoke(member, -1.0, A1, x1, 1.0, + rk); + TeamGemv::invoke(member, -1.0, B2, x2, 1.0, + rk); + } + { + // last row + auto C0 = Kokkos::subview(C, k - 1, Kokkos::ALL(), + Kokkos::ALL()); + auto A1 = + Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); + + auto x0 = Kokkos::subview(x, k - 1, Kokkos::ALL()); + auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); + + auto bk = Kokkos::subview(b, k, Kokkos::ALL()); + auto rk = Kokkos::subview(r, 
k, Kokkos::ALL()); + TeamCopy::invoke( + member, bk, rk); + member.team_barrier(); + TeamGemv::invoke(member, -1.0, C0, x0, 1.0, + rk); + TeamGemv::invoke(member, -1.0, A1, x1, 1.0, + rk); + } + } } - { - // last row - auto C0 = Kokkos::subview(C, k-1, Kokkos::ALL(), Kokkos::ALL()); - auto A1 = Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); - - auto x0 = Kokkos::subview(x, k-1, Kokkos::ALL()); - auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); - - auto bk = Kokkos::subview(b, k, Kokkos::ALL()); - auto rk = Kokkos::subview(r, k, Kokkos::ALL()); - TeamCopy - ::invoke(member, bk, rk); - member.team_barrier(); - TeamGemv - ::invoke(member, -1.0, C0, x0, 1.0, rk); - TeamGemv - ::invoke(member, -1.0, A1, x1, 1.0, rk); - } - } - } - }); - }); + }); + }); Kokkos::fence(); auto rs_host = Kokkos::create_mirror_view(rs); auto bs_host = Kokkos::create_mirror_view(bs); @@ -602,17 +611,19 @@ int main(int argc, char* argv[]) { Kokkos::fence(); { double norm2 = 0, diff2 = 0; - for (int i0=0,i0end=rs.extent(0);i0::value; +static constexpr int vector_length = + DefaultVectorLength::value; #if defined(KOKKOSBATCHED_USE_128BIT_MEMORY_INST) -static constexpr int internal_vector_length = DefaultInternalVectorLength::value; +static constexpr int internal_vector_length = + DefaultInternalVectorLength::value; #else static constexpr int internal_vector_length = 1; #endif -typedef Vector,vector_length> vector_type; +typedef Vector, vector_length> vector_type; #if defined(KOKKOSBATCHED_USE_128BIT_MEMORY_INST) -typedef Vector,internal_vector_length> internal_vector_type; +typedef Vector, internal_vector_length> internal_vector_type; #else typedef value_type internal_vector_type; #endif -template +template struct InverseDiagonalsModeAndAlgo; -template<> +template <> struct InverseDiagonalsModeAndAlgo { typedef Mode::Serial mode_type; - typedef Algo::Level3::Blocked algo_type; + typedef Algo::Level3::Blocked algo_type; }; #if defined(KOKKOS_ENABLE_CUDA) -template<> +template <> struct 
InverseDiagonalsModeAndAlgo { typedef Mode::Team mode_type; - typedef Algo::Level3::Unblocked algo_type; + typedef Algo::Level3::Unblocked algo_type; }; #endif -template +template struct SolveModeAndAlgo; -template<> +template <> struct SolveModeAndAlgo { typedef Mode::Serial mode_type; - typedef Algo::Level2::Blocked algo_type; + typedef Algo::Level2::Blocked algo_type; }; #if defined(KOKKOS_ENABLE_CUDA) -template<> +template <> struct SolveModeAndAlgo { typedef Mode::Team mode_type; - typedef Algo::Level2::Unblocked algo_type; + typedef Algo::Level2::Unblocked algo_type; }; #endif -int main(int argc, char* argv[]) { +int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) @@ -115,21 +117,21 @@ int main(int argc, char* argv[]) { #endif Kokkos::print_configuration(std::cout); - //typedef Kokkos::Details::ArithTraits ats; + // typedef Kokkos::Details::ArithTraits ats; Kokkos::Timer timer; /// /// input arguments parsing /// - int N = 128*128; /// # of problems (batch size) - int L = 128; /// length of block tridiags - int Blk = 5; /// block dimension - int Nvec = 1; - int S = 0; /// scratch size - int niter = 1; - int nsweep = 10; - for (int i=1;i Av("A", - N/vector_length, L, 4, Blk, Blk); + Kokkos::View Av( + "A", N / vector_length, L, 4, Blk, Blk); /// double - Kokkos::View As((value_type*)Av.data(), - Av.extent(0), - Av.extent(1), - Av.extent(2), - Av.extent(3), - Av.extent(4), - vector_length); + Kokkos::View As( + (value_type *)Av.data(), Av.extent(0), Av.extent(1), Av.extent(2), + Av.extent(3), Av.extent(4), vector_length); /// double 2 - Kokkos::View Ai((internal_vector_type*)Av.data(), - Av.extent(0), - Av.extent(1), - Av.extent(2), - Av.extent(3), - Av.extent(4), - vector_length/internal_vector_length); + Kokkos::View + Ai((internal_vector_type *)Av.data(), Av.extent(0), Av.extent(1), + Av.extent(2), Av.extent(3), Av.extent(4), + vector_length / internal_vector_length); /// 
double 16 - Kokkos::View xv("x", - N/vector_length, Nvec, 2, L, Blk); + Kokkos::View xv( + "x", N / vector_length, Nvec, 2, L, Blk); /// double - Kokkos::View xs((value_type*)xv.data(), - xv.extent(0), - xv.extent(1), - xv.extent(2), - xv.extent(3), - xv.extent(4), - vector_length); + Kokkos::View xs( + (value_type *)xv.data(), xv.extent(0), xv.extent(1), xv.extent(2), + xv.extent(3), xv.extent(4), vector_length); /// double 2 - Kokkos::View xi((internal_vector_type*)xv.data(), - xv.extent(0), - xv.extent(1), - xv.extent(2), - xv.extent(3), - xv.extent(4), - vector_length/internal_vector_length); + Kokkos::View + xi((internal_vector_type *)xv.data(), xv.extent(0), xv.extent(1), + xv.extent(2), xv.extent(3), xv.extent(4), + vector_length / internal_vector_length); /// double 16 - Kokkos::View bv("b", - N/vector_length, Nvec, L, Blk); + Kokkos::View bv( + "b", N / vector_length, Nvec, L, Blk); /// double - Kokkos::View bs((value_type*)bv.data(), - bv.extent(0), - bv.extent(1), - bv.extent(2), - bv.extent(3), - vector_length); + Kokkos::View bs( + (value_type *)bv.data(), bv.extent(0), bv.extent(1), bv.extent(2), + bv.extent(3), vector_length); /// double 2 - Kokkos::View bi((internal_vector_type*)bv.data(), - bv.extent(0), - bv.extent(1), - bv.extent(2), - bv.extent(3), - vector_length/internal_vector_length); - + Kokkos::View + bi((internal_vector_type *)bv.data(), bv.extent(0), bv.extent(1), + bv.extent(2), bv.extent(3), vector_length / internal_vector_length); /// double copy of A - Kokkos::View Acopy("Acopy", - As.extent(0), - As.extent(1), - As.extent(2), - As.extent(3), - As.extent(4), - As.extent(5)); - - Kokkos::View rs("rs", - bs.extent(0), - bs.extent(1), - bs.extent(2), - bs.extent(3), - bs.extent(4)); + Kokkos::View Acopy( + "Acopy", As.extent(0), As.extent(1), As.extent(2), As.extent(3), + As.extent(4), As.extent(5)); + + Kokkos::View rs( + "rs", bs.extent(0), bs.extent(1), bs.extent(2), bs.extent(3), + bs.extent(4)); #if 
defined(KOKKOSBATCHED_USE_128BIT_MEMORY_INST) auto AA = Ai; @@ -243,7 +218,7 @@ int main(int argc, char* argv[]) { Kokkos::fill_random(bs, random, value_type(1.0)); /// - /// diagonal dominant + /// diagonal dominant /// if (1) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) @@ -251,18 +226,21 @@ int main(int argc, char* argv[]) { #endif using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; - policy_type policy(AA.extent(0)*L, Kokkos::AUTO(), AA.extent(5)); - Kokkos::parallel_for - ("diagonal dominant", - policy, KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank()/L; - const int k = member.league_rank()%L; - Kokkos::parallel_for(Kokkos::TeamThreadRange(member,Blk),[&](const int &j) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, AA.extent(5)),[&](const int &v) { - AA(i, k, 1, j, j, v) += internal_vector_type(9*Blk); + policy_type policy(AA.extent(0) * L, Kokkos::AUTO(), AA.extent(5)); + Kokkos::parallel_for( + "diagonal dominant", policy, + KOKKOS_LAMBDA(const member_type &member) { + const int i = member.league_rank() / L; + const int k = member.league_rank() % L; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, Blk), [&](const int &j) { + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, AA.extent(5)), + [&](const int &v) { + AA(i, k, 1, j, j, v) += internal_vector_type(9 * Blk); + }); }); - }); - }); + }); Kokkos::fence(); #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) cudaProfilerStop(); @@ -280,61 +258,70 @@ int main(int argc, char* argv[]) { #endif timer.reset(); typedef internal_vector_type scratch_value_type; - typedef Kokkos::View scratch_view_type; - + typedef Kokkos::View + scratch_view_type; + using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; - const int per_team_scratch = scratch_view_type::shmem_size(Blk, Blk, AA.extent(5)); - int team_size = 0; - if (Blk < 8) { team_size = 
32/AA.extent(5); - } else if (Blk < 12) { team_size = 32/AA.extent(5); - } else { team_size = 64/AA.extent(5); } - - policy_type policy(AA.extent(0)*L, team_size, AA.extent(5)); - Kokkos::parallel_for - ("inverse diagonals", - policy.set_scratch_size(0,Kokkos::PerTeam(S < per_team_scratch ? per_team_scratch : S)), - KOKKOS_LAMBDA(const member_type &member) { - typedef InverseDiagonalsModeAndAlgo default_mode_and_algo_type; - typedef default_mode_and_algo_type::mode_type mode_type; - typedef default_mode_and_algo_type::algo_type algo_type; - - const int i = member.league_rank()/L; - const int k = member.league_rank()%L; - - scratch_view_type WW(member.team_scratch(0), Blk, Blk, AA.extent(5)); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, AA.extent(5)),[&](const int &v) { - auto A = Kokkos::subview(AA, i, k, 1, Kokkos::ALL(), Kokkos::ALL(), v); - auto D = Kokkos::subview(AA, i, k, 3, Kokkos::ALL(), Kokkos::ALL(), v); - auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v); - - Copy - ::invoke(member, A, W); - SetIdentity - ::invoke(member, D); - member.team_barrier(); - LU::invoke(member, W); - Trsm - ::invoke(member, 1.0, W, D); - Trsm - ::invoke(member, 1.0, W, D); - }); - }); + const int per_team_scratch = + scratch_view_type::shmem_size(Blk, Blk, AA.extent(5)); + int team_size = 0; + if (Blk < 8) { + team_size = 32 / AA.extent(5); + } else if (Blk < 12) { + team_size = 32 / AA.extent(5); + } else { + team_size = 64 / AA.extent(5); + } + + policy_type policy(AA.extent(0) * L, team_size, AA.extent(5)); + Kokkos::parallel_for( + "inverse diagonals", + policy.set_scratch_size( + 0, Kokkos::PerTeam(S < per_team_scratch ? 
per_team_scratch : S)), + KOKKOS_LAMBDA(const member_type &member) { + typedef InverseDiagonalsModeAndAlgo< + Kokkos::Impl::ActiveExecutionMemorySpace> + default_mode_and_algo_type; + typedef default_mode_and_algo_type::mode_type mode_type; + typedef default_mode_and_algo_type::algo_type algo_type; + + const int i = member.league_rank() / L; + const int k = member.league_rank() % L; + + scratch_view_type WW(member.team_scratch(0), Blk, Blk, + AA.extent(5)); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, AA.extent(5)), + [&](const int &v) { + auto A = Kokkos::subview(AA, i, k, 1, Kokkos::ALL(), + Kokkos::ALL(), v); + auto D = Kokkos::subview(AA, i, k, 3, Kokkos::ALL(), + Kokkos::ALL(), v); + auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v); + + Copy::invoke( + member, A, W); + SetIdentity::invoke(member, D); + member.team_barrier(); + LU::invoke(member, W); + Trsm::invoke(member, 1.0, W, + D); + Trsm::invoke(member, 1.0, + W, D); + }); + }); Kokkos::fence(); const double t = timer.seconds(); #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) cudaProfilerStop(); #endif - printf("inverse time = %f , # of inverse per min = %f \n", t, 1.0/t*60); + printf("inverse time = %f , # of inverse per min = %f \n", t, + 1.0 / t * 60); } /// @@ -346,75 +333,114 @@ int main(int argc, char* argv[]) { #endif timer.reset(); typedef internal_vector_type scratch_value_type; - typedef Kokkos::View scratch_view_type; - const int per_team_scratch = scratch_view_type::shmem_size(Blk, AA.extent(5)); - + typedef Kokkos::View + scratch_view_type; + const int per_team_scratch = + scratch_view_type::shmem_size(Blk, AA.extent(5)); + using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; - int team_size = 0; - if (Blk < 8) { team_size = 32/AA.extent(5); - } else if (Blk < 12) { team_size = 32/AA.extent(5); - } else { team_size = 32/AA.extent(5); } - policy_type policy(AA.extent(0)*L, team_size, AA.extent(5)); - - for 
(int iter=0;iter default_mode_and_algo_type; - typedef default_mode_and_algo_type::mode_type mode_type; - typedef default_mode_and_algo_type::algo_type algo_type; - - scratch_view_type WW(member.team_scratch(0), Blk, AA.extent(5)); - const int i = member.league_rank()/L; //%AA.extent(0); - const int k = member.league_rank()%L; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, AA.extent(5)),[&](const int &v) { - auto A = Kokkos::subview(AA, i, k, 1, Kokkos::ALL(), Kokkos::ALL(), v); - auto D = Kokkos::subview(AA, i, k, 3, Kokkos::ALL(), Kokkos::ALL(), v); - auto B = Kokkos::subview(AA, i, k, 2, Kokkos::ALL(), Kokkos::ALL(), v); - auto C = Kokkos::subview(AA, i, k ? k-1 : 0, 0, Kokkos::ALL(), Kokkos::ALL(), v); - auto u = Kokkos::subview(WW, Kokkos::ALL(), v); - for (int jvec=0;jvec::invoke(member, 1.0, D, b, 0.0, x1); - } else { - Copy::invoke(member, b, u); - if (k == 0) { - Gemv::invoke(member, -1.0, B, x2, 1.0, u); - } else if (k == L-1) { - Gemv::invoke(member, -1.0, C, x0, 1.0, u); - } else { - Gemv::invoke(member, -1.0, B, x2, 1.0, u); - Gemv::invoke(member, -1.0, C, x0, 1.0, u); - } - Gemv::invoke(member, 1.0, D, u, 0.0, y1); - } - } - }); - }); - auto tmp = xxx; xxx = yyy; yyy = tmp; - } - Kokkos::fence(); + int team_size = 0; + if (Blk < 8) { + team_size = 32 / AA.extent(5); + } else if (Blk < 12) { + team_size = 32 / AA.extent(5); + } else { + team_size = 32 / AA.extent(5); + } + policy_type policy(AA.extent(0) * L, team_size, AA.extent(5)); + + for (int iter = 0; iter < niter; ++iter) { + auto xxx = Kokkos::subview(xx, Kokkos::ALL(), Kokkos::ALL(), 0, + Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + auto yyy = Kokkos::subview(xx, Kokkos::ALL(), Kokkos::ALL(), 1, + Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + + for (int nis = 0; nis < nsweep; ++nis) { + Kokkos::parallel_for( + "solve", + policy.set_scratch_size( + 0, + Kokkos::PerTeam(S < per_team_scratch ? 
per_team_scratch : S)), + KOKKOS_LAMBDA(const member_type &member) { + typedef SolveModeAndAlgo< + Kokkos::Impl::ActiveExecutionMemorySpace> + default_mode_and_algo_type; + typedef default_mode_and_algo_type::mode_type mode_type; + typedef default_mode_and_algo_type::algo_type algo_type; + + scratch_view_type WW(member.team_scratch(0), Blk, AA.extent(5)); + const int i = member.league_rank() / L; //%AA.extent(0); + const int k = member.league_rank() % L; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, AA.extent(5)), + [&](const int &v) { + auto A = Kokkos::subview(AA, i, k, 1, Kokkos::ALL(), + Kokkos::ALL(), v); + auto D = Kokkos::subview(AA, i, k, 3, Kokkos::ALL(), + Kokkos::ALL(), v); + auto B = Kokkos::subview(AA, i, k, 2, Kokkos::ALL(), + Kokkos::ALL(), v); + auto C = Kokkos::subview(AA, i, k ? k - 1 : 0, 0, + Kokkos::ALL(), Kokkos::ALL(), v); + auto u = Kokkos::subview(WW, Kokkos::ALL(), v); + for (int jvec = 0; jvec < Nvec; ++jvec) { + auto x0 = Kokkos::subview( + xxx, i, jvec, k == 0 ? 0 : k - 1, Kokkos::ALL(), v); + auto x1 = + Kokkos::subview(xxx, i, jvec, k, Kokkos::ALL(), v); + auto x2 = Kokkos::subview(xxx, i, jvec, + k == L - 1 ? 
0 : k + 1, + Kokkos::ALL(), v); + auto y1 = + Kokkos::subview(yyy, i, jvec, k, Kokkos::ALL(), v); + auto b = + Kokkos::subview(bb, i, jvec, k, Kokkos::ALL(), v); + + if (L == 1) { + Gemv::invoke(member, 1.0, D, b, 0.0, x1); + } else { + Copy::invoke(member, b, u); + if (k == 0) { + Gemv::invoke(member, -1.0, B, x2, 1.0, + u); + } else if (k == L - 1) { + Gemv::invoke(member, -1.0, C, x0, 1.0, + u); + } else { + Gemv::invoke(member, -1.0, B, x2, 1.0, + u); + Gemv::invoke(member, -1.0, C, x0, 1.0, + u); + } + Gemv::invoke(member, 1.0, D, u, 0.0, y1); + } + } + }); + }); + auto tmp = xxx; + xxx = yyy; + yyy = tmp; + } + Kokkos::fence(); } const double t = timer.seconds(); #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) cudaProfilerStop(); #endif - printf("solve time = %f , # of solves per min = %f\n", t, 1.0/t*60*niter); + printf("solve time = %f , # of solves per min = %f\n", t, + 1.0 / t * 60 * niter); } - + /// /// compute residual /// @@ -422,105 +448,142 @@ int main(int argc, char* argv[]) { typedef KokkosBatched::Algo::Level2::Unblocked algo_type; using policy_type = Kokkos::TeamPolicy; policy_type policy(Acopy.extent(0), Kokkos::AUTO(), Acopy.extent(5)); - Kokkos::parallel_for - ("compute residual", - policy, KOKKOS_LAMBDA(const typename policy_type::member_type &member) { - const int i = member.league_rank(); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, Acopy.extent(5)),[&](const int &v) { - auto A = Kokkos::subview(Acopy, i, Kokkos::ALL(), 1, Kokkos::ALL(), Kokkos::ALL(), v); - auto B = Kokkos::subview(Acopy, i, Kokkos::ALL(), 2, Kokkos::ALL(), Kokkos::ALL(), v); - auto C = Kokkos::subview(Acopy, i, Kokkos::ALL(), 0, Kokkos::ALL(), Kokkos::ALL(), v); - - for (int jvec=0,jvecend=rs.extent(1);jvec - ::invoke(member, b0, r0); - TeamGemv - ::invoke(member, -1.0, A0, x0, 1.0, r0); - } else { - int k = 0; - { - /// first row - auto A1 = Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); - auto B2 = Kokkos::subview(B, k, 
Kokkos::ALL(), Kokkos::ALL()); - - auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); - auto x2 = Kokkos::subview(x, k+1, Kokkos::ALL()); - - auto bk = Kokkos::subview(b, k, Kokkos::ALL()); - auto rk = Kokkos::subview(r, k, Kokkos::ALL()); - TeamCopy - ::invoke(member, bk, rk); - member.team_barrier(); - TeamGemv - ::invoke(member, -1.0, A1, x1, 1.0, rk); - TeamGemv - ::invoke(member, -1.0, B2, x2, 1.0, rk); - ++k; + Kokkos::parallel_for( + "compute residual", policy, + KOKKOS_LAMBDA(const typename policy_type::member_type &member) { + const int i = member.league_rank(); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, Acopy.extent(5)), + [&](const int &v) { + auto A = Kokkos::subview(Acopy, i, Kokkos::ALL(), 1, + Kokkos::ALL(), Kokkos::ALL(), v); + auto B = Kokkos::subview(Acopy, i, Kokkos::ALL(), 2, + Kokkos::ALL(), Kokkos::ALL(), v); + auto C = Kokkos::subview(Acopy, i, Kokkos::ALL(), 0, + Kokkos::ALL(), Kokkos::ALL(), v); + + for (int jvec = 0, jvecend = rs.extent(1); jvec < jvecend; + ++jvec) { + auto x = Kokkos::subview(xs, i, jvec, nsweep % 2, + Kokkos::ALL(), Kokkos::ALL(), v); + auto b = Kokkos::subview(bs, i, jvec, Kokkos::ALL(), + Kokkos::ALL(), v); + auto r = Kokkos::subview(rs, i, jvec, Kokkos::ALL(), + Kokkos::ALL(), v); + + if (L == 1) { + auto A0 = + Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()); + auto x0 = Kokkos::subview(x, 0, Kokkos::ALL()); + auto b0 = Kokkos::subview(b, 0, Kokkos::ALL()); + auto r0 = Kokkos::subview(r, 0, Kokkos::ALL()); + + TeamCopy::invoke(member, b0, r0); + TeamGemv::invoke(member, + -1.0, A0, + x0, 1.0, + r0); + } else { + int k = 0; + { + /// first row + auto A1 = + Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); + auto B2 = + Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); + + auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); + auto x2 = Kokkos::subview(x, k + 1, Kokkos::ALL()); + + auto bk = Kokkos::subview(b, k, Kokkos::ALL()); + auto rk = Kokkos::subview(r, k, Kokkos::ALL()); + 
TeamCopy::invoke(member, bk, rk); + member.team_barrier(); + TeamGemv::invoke(member, + -1.0, + A1, x1, + 1.0, + rk); + TeamGemv::invoke(member, + -1.0, + B2, x2, + 1.0, + rk); + ++k; + } + for (; k < (L - 1); ++k) { + auto C0 = Kokkos::subview(C, k - 1, Kokkos::ALL(), + Kokkos::ALL()); + auto A1 = + Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); + auto B2 = + Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); + + auto x0 = Kokkos::subview(x, k - 1, Kokkos::ALL()); + auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); + auto x2 = Kokkos::subview(x, k + 1, Kokkos::ALL()); + + auto bk = Kokkos::subview(b, k, Kokkos::ALL()); + auto rk = Kokkos::subview(r, k, Kokkos::ALL()); + TeamCopy::invoke(member, bk, rk); + member.team_barrier(); + TeamGemv::invoke(member, + -1.0, + C0, x0, + 1.0, + rk); + TeamGemv::invoke(member, + -1.0, + A1, x1, + 1.0, + rk); + TeamGemv::invoke(member, + -1.0, + B2, x2, + 1.0, + rk); + } + { + // last row + auto C0 = Kokkos::subview(C, k - 1, Kokkos::ALL(), + Kokkos::ALL()); + auto A1 = + Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); + + auto x0 = Kokkos::subview(x, k - 1, Kokkos::ALL()); + auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); + + auto bk = Kokkos::subview(b, k, Kokkos::ALL()); + auto rk = Kokkos::subview(r, k, Kokkos::ALL()); + TeamCopy::invoke(member, bk, rk); + member.team_barrier(); + TeamGemv::invoke(member, + -1.0, + C0, x0, + 1.0, + rk); + TeamGemv::invoke(member, + -1.0, + A1, x1, + 1.0, + rk); + } + } } - for (;k<(L-1);++k) { - auto C0 = Kokkos::subview(C, k-1, Kokkos::ALL(), Kokkos::ALL()); - auto A1 = Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); - auto B2 = Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); - - auto x0 = Kokkos::subview(x, k-1, Kokkos::ALL()); - auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); - auto x2 = Kokkos::subview(x, k+1, Kokkos::ALL()); - - auto bk = Kokkos::subview(b, k, Kokkos::ALL()); - auto rk = Kokkos::subview(r, k, Kokkos::ALL()); - TeamCopy - ::invoke(member, bk, rk); - 
member.team_barrier(); - TeamGemv - ::invoke(member, -1.0, C0, x0, 1.0, rk); - TeamGemv - ::invoke(member, -1.0, A1, x1, 1.0, rk); - TeamGemv - ::invoke(member, -1.0, B2, x2, 1.0, rk); - } - { - // last row - auto C0 = Kokkos::subview(C, k-1, Kokkos::ALL(), Kokkos::ALL()); - auto A1 = Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); - - auto x0 = Kokkos::subview(x, k-1, Kokkos::ALL()); - auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); - - auto bk = Kokkos::subview(b, k, Kokkos::ALL()); - auto rk = Kokkos::subview(r, k, Kokkos::ALL()); - TeamCopy - ::invoke(member, bk, rk); - member.team_barrier(); - TeamGemv - ::invoke(member, -1.0, C0, x0, 1.0, rk); - TeamGemv - ::invoke(member, -1.0, A1, x1, 1.0, rk); - } - } - } - }); - }); + }); + }); Kokkos::fence(); auto rs_host = Kokkos::create_mirror_view(rs); auto bs_host = Kokkos::create_mirror_view(bs); @@ -529,17 +592,19 @@ int main(int argc, char* argv[]) { Kokkos::fence(); { double norm2 = 0, diff2 = 0; - for (int i0=0,i0end=rs.extent(0);i0 @@ -24,622 +24,633 @@ #include "KokkosBatched_Gemm_Team_Impl.hpp" namespace KokkosBatched { - namespace PerfTest { +namespace PerfTest { #undef FLOP_MUL #undef FLOP_ADD #define FLOP_MUL 1.0 #define FLOP_ADD 1.0 - typedef double value_type; +typedef double value_type; - double FlopCount(int mm, int nn, int kk) { - double m = (double)mm; double n = (double)nn; double k = (double)kk; - return (FLOP_MUL*(m*n*k) + - FLOP_ADD*(m*n*k)); - } +double FlopCount(int mm, int nn, int kk) { + double m = (double)mm; + double n = (double)nn; + double k = (double)kk; + return (FLOP_MUL * (m * n * k) + FLOP_ADD * (m * n * k)); +} - struct RangeTag {}; - struct TeamTagV1 {}; - struct TeamTagV2 {}; - struct TeamTagV3 {}; - struct TeamTagHandmade {}; - - template - struct Functor { - ConstUnmanagedViewType _a, _b; - UnmanagedViewType _c; - - KOKKOS_INLINE_FUNCTION - Functor() = default; - - KOKKOS_INLINE_FUNCTION - Functor(const ViewType &a, - const ViewType &b, - const ViewType &c) - : _a(a), 
_b(b), _c(c) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const RangeTag &, const int k) const { - auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); - auto cc = Kokkos::subview(_c, k, Kokkos::ALL(), Kokkos::ALL()); - - SerialGemm:: - invoke(1.0, aa, bb, 1.0, cc); - } +struct RangeTag {}; +struct TeamTagV1 {}; +struct TeamTagV2 {}; +struct TeamTagV3 {}; +struct TeamTagHandmade {}; - template - KOKKOS_INLINE_FUNCTION - void operator()(const TeamTagV1 &, const MemberType &member) const { - const int kbeg = (member.league_rank()*(member.team_size()*VectorLength) + - member.team_rank()*VectorLength); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), - [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_c.extent(0))) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - auto cc = Kokkos::subview(_c, kk, Kokkos::ALL(), Kokkos::ALL()); - - SerialGemm:: - invoke(1.0, aa, bb, 1.0, cc); - } - }); - } - - template - KOKKOS_INLINE_FUNCTION - void operator()(const TeamTagV2 &, const MemberType &member) const { - const int kbeg = member.league_rank()*VectorLength; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), - [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_c.extent(0))) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - auto cc = Kokkos::subview(_c, kk, Kokkos::ALL(), Kokkos::ALL()); - - TeamGemm:: - invoke(member, 1.0, aa, bb, 1.0, cc); - } - }); - } +template +struct Functor { + ConstUnmanagedViewType _a, _b; + UnmanagedViewType _c; - template - KOKKOS_INLINE_FUNCTION - void operator()(const TeamTagV3 &, const MemberType &member) const { - const int lvl = 0; - ScratchViewType sa(member.team_scratch(lvl), VectorLength, _a.extent(1), _a.extent(2)); - 
ScratchViewType sb(member.team_scratch(lvl), VectorLength, _b.extent(1), _b.extent(2)); - - const int kbeg = member.league_rank()*VectorLength; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), - [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_c.extent(0))) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - auto cc = Kokkos::subview(_c, kk, Kokkos::ALL(), Kokkos::ALL()); - - auto saa = Kokkos::subview(sa, k, Kokkos::ALL(), Kokkos::ALL()); - auto sbb = Kokkos::subview(sb, k, Kokkos::ALL(), Kokkos::ALL()); - - TeamCopy::invoke(member, aa, saa); - TeamCopy::invoke(member, bb, sbb); - member.team_barrier(); - - TeamGemm:: - invoke(member, 1.0, saa, sbb, 1.0, cc); - } - }); - } - - template - KOKKOS_INLINE_FUNCTION - void operator()(const TeamTagHandmade &, const MemberType &member) const { - const int kbeg = member.league_rank()*VectorLength; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), - [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_c.extent(0))) { - const int m = _c.extent(1), n = _c.extent(2), q = _a.extent(2); - Kokkos::parallel_for(Kokkos::TeamThreadRange(member,0,m*n), - [&](const int &ij) { - const int i = ij%m, j = ij/m; - typename ViewType::non_const_value_type cval = 0; - for (int p=0;p - void Gemm(const int NN, const int BlkSize) { - typedef Kokkos::Schedule ScheduleType; - - constexpr int VectorLength = DefaultVectorLength::value; - const int N = NN/VectorLength; - - { - std::string value_type_name; - if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) value_type_name = "Kokkos::complex"; - - std::cout << "SIMD is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; - } + KOKKOS_INLINE_FUNCTION + Functor() = default; - const double flop = (N*VectorLength)*FlopCount(BlkSize,BlkSize,BlkSize); - const double tmax = 1.0e15; + 
KOKKOS_INLINE_FUNCTION + Functor(const ViewType &a, const ViewType &b, const ViewType &c) + : _a(a), _b(b), _c(c) {} - typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; - typedef typename DeviceSpaceType::memory_space DeviceMemorySpaceType; + KOKKOS_INLINE_FUNCTION + void operator()(const RangeTag &, const int k) const { + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, k, Kokkos::ALL(), Kokkos::ALL()); - const int iter_begin = -3, iter_end = 30; - Kokkos::Timer timer; + SerialGemm::invoke( + 1.0, aa, bb, 1.0, cc); + } - Kokkos::View - amat("amat", N*VectorLength, BlkSize, BlkSize), - bmat("bmat", N*VectorLength, BlkSize, BlkSize), - cref("cref", N*VectorLength, BlkSize, BlkSize); + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV1 &, + const MemberType &member) const { + const int kbeg = + (member.league_rank() * (member.team_size() * VectorLength) + + member.team_rank() * VectorLength); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_c.extent(0))) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, kk, Kokkos::ALL(), Kokkos::ALL()); + + SerialGemm::invoke(1.0, aa, bb, 1.0, cc); + } + }); + } - { - Random random; - for (int k=0;k + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV2 &, + const MemberType &member) const { + const int kbeg = member.league_rank() * VectorLength; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_c.extent(0))) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, kk, Kokkos::ALL(), Kokkos::ALL()); 
+ + TeamGemm::invoke(member, 1.0, aa, bb, 1.0, cc); + } + }); + } + + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV3 &, + const MemberType &member) const { + const int lvl = 0; + ScratchViewType sa(member.team_scratch(lvl), VectorLength, + _a.extent(1), _a.extent(2)); + ScratchViewType sb(member.team_scratch(lvl), VectorLength, + _b.extent(1), _b.extent(2)); + + const int kbeg = member.league_rank() * VectorLength; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_c.extent(0))) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, kk, Kokkos::ALL(), Kokkos::ALL()); + + auto saa = Kokkos::subview(sa, k, Kokkos::ALL(), Kokkos::ALL()); + auto sbb = Kokkos::subview(sb, k, Kokkos::ALL(), Kokkos::ALL()); + + TeamCopy::invoke(member, aa, saa); + TeamCopy::invoke(member, bb, sbb); + member.team_barrier(); + + TeamGemm::invoke(member, 1.0, saa, sbb, 1.0, cc); + } + }); + } + + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagHandmade &, + const MemberType &member) const { + const int kbeg = member.league_rank() * VectorLength; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_c.extent(0))) { + const int m = _c.extent(1), n = _c.extent(2), q = _a.extent(2); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, 0, m * n), [&](const int &ij) { + const int i = ij % m, j = ij / m; + typename ViewType::non_const_value_type cval = 0; + for (int p = 0; p < q; ++p) + cval += _a(kk, i, p) * _b(kk, p, j); + _c(kk, i, j) += cval; + }); + } + }); + } +}; + +template +void Gemm(const int NN, const int BlkSize) { + typedef Kokkos::Schedule ScheduleType; + + constexpr int VectorLength = + DefaultVectorLength::value; + const int N = NN / VectorLength; + + { + 
std::string value_type_name; + if (std::is_same::value) value_type_name = "double"; + if (std::is_same >::value) + value_type_name = "Kokkos::complex"; + + std::cout << "SIMD is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; + } - // P100 L2 cache 4MB per core - constexpr size_t LLC_CAPACITY = 56*4*1024*1024; - Flush flush; + const double flop = (N * VectorLength) * FlopCount(BlkSize, BlkSize, BlkSize); + const double tmax = 1.0e15; + + typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; + typedef typename DeviceSpaceType::memory_space DeviceMemorySpaceType; + + const int iter_begin = -3, iter_end = 30; + Kokkos::Timer timer; + + Kokkos::View amat( + "amat", N * VectorLength, BlkSize, BlkSize), + bmat("bmat", N * VectorLength, BlkSize, BlkSize), + cref("cref", N * VectorLength, BlkSize, BlkSize); + + { + Random random; + for (int k = 0; k < N * VectorLength; ++k) + for (int i = 0; i < BlkSize; ++i) + for (int j = 0; j < BlkSize; ++j) { + amat(k, i, j) = random.value(); + bmat(k, i, j) = random.value(); + } + } + + // P100 L2 cache 4MB per core + constexpr size_t LLC_CAPACITY = 56 * 4 * 1024 * 1024; + Flush flush; #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - if (1) { - /// - /// CUBLAS Strided version - /// - const Kokkos::LayoutStride stride(N*VectorLength, BlkSize*BlkSize, - BlkSize, 1, - BlkSize, BlkSize); + if (1) { + /// + /// CUBLAS Strided version + /// + const Kokkos::LayoutStride stride(N * VectorLength, BlkSize * BlkSize, + BlkSize, 1, BlkSize, BlkSize); + + Kokkos::View a( + "a", stride), + b("b", stride), c("c", stride); + + double tavg = 0, tmin = tmax; - Kokkos::View - a("a", stride), - b("b", stride), - c("c", stride); + cublasStatus_t stat; + cublasHandle_t handle; - double tavg = 0, tmin = tmax; + stat = cublasCreate(&handle); + if (stat != CUBLAS_STATUS_SUCCESS) + Kokkos::abort("CUBLAS initialization failed\n"); - cublasStatus_t stat; - cublasHandle_t handle; + auto amat_device = + 
Kokkos::create_mirror_view(DeviceMemorySpaceType(), amat); + auto bmat_device = + Kokkos::create_mirror_view(DeviceMemorySpaceType(), bmat); - stat = cublasCreate(&handle); - if (stat != CUBLAS_STATUS_SUCCESS) - Kokkos::abort("CUBLAS initialization failed\n"); + Kokkos::deep_copy(amat_device, amat); + Kokkos::deep_copy(bmat_device, bmat); - auto amat_device = Kokkos::create_mirror_view(DeviceMemorySpaceType(), amat); - auto bmat_device = Kokkos::create_mirror_view(DeviceMemorySpaceType(), bmat); + Kokkos::fence(); - Kokkos::deep_copy(amat_device, amat); - Kokkos::deep_copy(bmat_device, bmat); + const double one(1.0), zero(0.0); + { + tavg = 0; + tmin = tmax; + + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat_device); + Kokkos::deep_copy(b, bmat_device); + Kokkos::deep_copy(c, 0); Kokkos::fence(); + timer.reset(); - const double one(1.0), zero(0.0); - { - tavg = 0; tmin = tmax; - - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto csol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); - Kokkos::deep_copy(csol, c); - Kokkos::deep_copy(cref, csol); - - std::cout << std::setw(8) << "CUBLAS" - << std::setw(8) << "Strided" - << " BlkSize = " << std::setw(3) << BlkSize - << " TeamSize = N/A" - << " ScratchSize (KB) = 0" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop/tavg) - << " max flop/s = " << (flop/tmin) - << std::endl; - } - cublasDestroy(handle); + stat = cublasDgemmStridedBatched( + handle, CUBLAS_OP_N, CUBLAS_OP_N, BlkSize, BlkSize, BlkSize, &one, + (const value_type *)a.data(), BlkSize, BlkSize * BlkSize, + (const value_type *)b.data(), BlkSize, BlkSize * BlkSize, &zero, + (value_type *)c.data(), BlkSize, BlkSize * BlkSize, + N * VectorLength); + + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } + tavg /= iter_end; + + auto csol = + 
Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); + Kokkos::deep_copy(csol, c); + Kokkos::deep_copy(cref, csol); + + std::cout << std::setw(8) << "CUBLAS" << std::setw(8) << "Strided" + << " BlkSize = " << std::setw(3) << BlkSize << " TeamSize = N/A" + << " ScratchSize (KB) = 0" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) << std::endl; + } + cublasDestroy(handle); + } #endif - if (1) { - /// - /// Range policy version - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, BlkSize), - c("c", N*VectorLength, BlkSize, BlkSize); - - double tavg = 0, tmin = tmax; - { - typedef Functor functor_type; - const Kokkos::RangePolicy policy(0, N*VectorLength); - - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto csol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); - Kokkos::deep_copy(csol, c); - - double diff = 0; - for (int i=0,iend=cref.extent(0);i functor_type; + const Kokkos::RangePolicy policy( + 0, N * VectorLength); + + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + Kokkos::deep_copy(c, 0); + + Kokkos::fence(); + timer.reset(); + + Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::RangeTag", + policy, functor_type(a, b, c)); + + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; + } + tavg /= iter_end; + + auto csol = + Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); + Kokkos::deep_copy(csol, c); + + double diff = 0; + for (int i = 0, iend = cref.extent(0); i < iend; ++i) + for (int j = 0, jend = cref.extent(1); j < jend; ++j) + for (int k = 0, kend = cref.extent(2); k < kend; ++k) + diff += std::abs(cref(i, j, k) - csol(i, j, k)); + + std::cout << 
std::setw(8) << "Kokkos" << std::setw(8) << "Range" + << " BlkSize = " << std::setw(3) << BlkSize << " TeamSize = N/A" + << " ScratchSize (KB) = 0" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - std::cout << " diff to ref = " << diff; + std::cout << " diff to ref = " << diff; #endif - std::cout << std::endl; - } - } + std::cout << std::endl; + } + } - if (1) { - /// - /// Team policy V1 - almost same scheduling with range policy; - /// expect the same performance as range policy - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, BlkSize), - c("c", N*VectorLength, BlkSize, BlkSize); - - double tavg = 0, tmin = tmax; - { - typedef Kokkos::TeamPolicy policy_type; - - typedef Functor functor_type; - - // 128 is rough estimates - const int team_size = - policy_type(N/32, Kokkos::AUTO, VectorLength).team_size_recommended(functor_type(), Kokkos::ParallelForTag()); - - const policy_type policy(N/team_size, team_size, VectorLength); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto csol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); - Kokkos::deep_copy(csol, c); - - double diff = 0; - for (int i=0,iend=cref.extent(0);i + policy_type; + + typedef Functor functor_type; + + // 128 is rough estimates + const int team_size = + policy_type(N / 32, Kokkos::AUTO, VectorLength) + .team_size_recommended(functor_type(), Kokkos::ParallelForTag()); + + const policy_type policy(N / team_size, team_size, VectorLength); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + Kokkos::deep_copy(c, 0); + + Kokkos::fence(); + timer.reset(); + + Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyV1", + policy, 
functor_type(a, b, c)); + + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; + } + tavg /= iter_end; + + auto csol = + Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); + Kokkos::deep_copy(csol, c); + + double diff = 0; + for (int i = 0, iend = cref.extent(0); i < iend; ++i) + for (int j = 0, jend = cref.extent(1); j < jend; ++j) + for (int k = 0, kend = cref.extent(2); k < kend; ++k) + diff += std::abs(cref(i, j, k) - csol(i, j, k)); + + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V1" + << " BlkSize = " << std::setw(3) << BlkSize + << " TeamSize = " << std::setw(3) << team_size + << " ScratchSize (KB) = 0" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - std::cout << " diff to ref = " << diff; + std::cout << " diff to ref = " << diff; #endif - std::cout << std::endl; - } - } + std::cout << std::endl; + } + } - if (1) { - /// - /// Team policy V2 - team parallel - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, BlkSize), - c("c", N*VectorLength, BlkSize, BlkSize); - - double tavg = 0, tmin = tmax; - { - typedef Kokkos::TeamPolicy policy_type; - typedef Functor functor_type; - - const int - is_blocked_algo = (std::is_same::value), - mb = Algo::Gemm::Blocked::mb(), - mp = BlkSize%mb > 0; - - const int - mblk = is_blocked_algo ? 
(BlkSize/mb + mp) : BlkSize; - - const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength).team_size_max(functor_type(), Kokkos::ParallelForTag()); - const int team_size = std::min(std::max(mblk*mblk,4), max_team_size); - - policy_type policy(N, team_size, VectorLength); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto csol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); - Kokkos::deep_copy(csol, c); - - double diff = 0; - for (int i=0,iend=cref.extent(0);i + policy_type; + typedef Functor functor_type; + + const int is_blocked_algo = + (std::is_same::value), + mb = Algo::Gemm::Blocked::mb(), + mp = BlkSize % mb > 0; + + const int mblk = is_blocked_algo ? (BlkSize / mb + mp) : BlkSize; + + const int max_team_size = + policy_type(N, Kokkos::AUTO, VectorLength) + .team_size_max(functor_type(), Kokkos::ParallelForTag()); + const int team_size = std::min(std::max(mblk * mblk, 4), max_team_size); + + policy_type policy(N, team_size, VectorLength); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + Kokkos::deep_copy(c, 0); + + Kokkos::fence(); + timer.reset(); + + Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyV2", + policy, functor_type(a, b, c)); + + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; + } + tavg /= iter_end; + + auto csol = + Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); + Kokkos::deep_copy(csol, c); + + double diff = 0; + for (int i = 0, iend = cref.extent(0); i < iend; ++i) + for (int j = 0, jend = cref.extent(1); j < jend; ++j) + for (int k = 0, kend = cref.extent(2); k < kend; ++k) + diff += std::abs(cref(i, j, k) - csol(i, j, k)); + + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V2" + << " BlkSize = " << std::setw(3) << BlkSize + << " TeamSize = " << 
std::setw(3) << team_size + << " ScratchSize (KB) = 0" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - std::cout << " diff to ref = " << diff; + std::cout << " diff to ref = " << diff; #endif - std::cout << std::endl; - } - } + std::cout << std::endl; + } + } - if (1) { - /// - /// Team policy V3 - team parallel + scratch - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, BlkSize), - c("c", N*VectorLength, BlkSize, BlkSize); - - double tavg = 0, tmin = tmax; - { - typedef Kokkos::TeamPolicy policy_type; - typedef Functor functor_type; - - const int lvl = 0, per_team_scratch = 2*ScratchViewType::shmem_size(VectorLength, BlkSize, BlkSize); - //std::cout << "per team scratch " << per_team_scratch << "\n"; - if (per_team_scratch/1024 < 48) { - const int - is_blocked_algo = (std::is_same::value), - mb = Algo::Gemm::Blocked::mb(), - mp = BlkSize%mb > 0; - - const int - mblk = is_blocked_algo ? 
(BlkSize/mb + mp) : BlkSize; - - const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength).set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)) - .team_size_max(functor_type(), Kokkos::ParallelForTag()); - const int team_size = std::min(std::max(mblk*mblk,4), max_team_size); - - policy_type policy = policy_type(N, team_size, VectorLength).set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto csol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); - Kokkos::deep_copy(csol, c); - - double diff = 0; - for (int i=0,iend=cref.extent(0);i + policy_type; + typedef Functor functor_type; + + const int lvl = 0, + per_team_scratch = 2 * ScratchViewType::shmem_size( + VectorLength, BlkSize, BlkSize); + // std::cout << "per team scratch " << per_team_scratch << "\n"; + if (per_team_scratch / 1024 < 48) { + const int is_blocked_algo = + (std::is_same::value), + mb = Algo::Gemm::Blocked::mb(), + mp = BlkSize % mb > 0; + + const int mblk = is_blocked_algo ? 
(BlkSize / mb + mp) : BlkSize; + + const int max_team_size = + policy_type(N, Kokkos::AUTO, VectorLength) + .set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)) + .team_size_max(functor_type(), Kokkos::ParallelForTag()); + const int team_size = std::min(std::max(mblk * mblk, 4), max_team_size); + + policy_type policy = + policy_type(N, team_size, VectorLength) + .set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + Kokkos::deep_copy(c, 0); + + Kokkos::fence(); + timer.reset(); + + Kokkos::parallel_for( + "KokkosBatched::PerfTest::GemmCuda::TeamPolicyV3", policy, + functor_type(a, b, c)); + + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; + } + tavg /= iter_end; + + auto csol = Kokkos::create_mirror_view( + typename HostSpaceType::memory_space(), c); + Kokkos::deep_copy(csol, c); + + double diff = 0; + for (int i = 0, iend = cref.extent(0); i < iend; ++i) + for (int j = 0, jend = cref.extent(1); j < jend; ++j) + for (int k = 0, kend = cref.extent(2); k < kend; ++k) + diff += std::abs(cref(i, j, k) - csol(i, j, k)); + + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V3" + << " BlkSize = " << std::setw(3) << BlkSize + << " TeamSize = " << std::setw(3) << team_size + << " ScratchSize (KB) = " << std::setw(3) + << (per_team_scratch / 1024) << " time = " << std::scientific + << tmin << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - std::cout << " diff to ref = " << diff; + std::cout << " diff to ref = " << diff; #endif - std::cout << std::endl; - } else { - std::cout << std::setw(8) << "Kokkos" - << std::setw(8) << "Team V3" - << " Scratch per team is too big:" << std::setw(3) << (per_team_scratch/1024) - << std::endl; - } - } + std::cout << 
std::endl; + } else { + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V3" + << " Scratch per team is too big:" << std::setw(3) + << (per_team_scratch / 1024) << std::endl; } + } + } - if (1) { - /// - /// Team policy - handmade - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, BlkSize), - c("c", N*VectorLength, BlkSize, BlkSize); - - double tavg = 0, tmin = tmax; - { - typedef Kokkos::TeamPolicy policy_type; - typedef Functor functor_type; - - const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength).team_size_max(functor_type(), Kokkos::ParallelForTag()); - - const int team_size = std::min(max_team_size,BlkSize*BlkSize); - - const policy_type policy(N, team_size, VectorLength); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto csol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); - Kokkos::deep_copy(csol, c); - - double diff = 0; - for (int i=0,iend=cref.extent(0);i + policy_type; + typedef Functor functor_type; + + const int max_team_size = + policy_type(N, Kokkos::AUTO, VectorLength) + .team_size_max(functor_type(), Kokkos::ParallelForTag()); + + const int team_size = std::min(max_team_size, BlkSize * BlkSize); + + const policy_type policy(N, team_size, VectorLength); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + Kokkos::deep_copy(c, 0); + + Kokkos::fence(); + timer.reset(); + + Kokkos::parallel_for( + "KokkosBatched::PerfTest::GemmCuda::TeamPolicyHandmade", policy, + functor_type(a, b, c)); + + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; + } + tavg /= iter_end; + + auto csol = + Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); + Kokkos::deep_copy(csol, c); + + double diff = 0; + for (int i = 0, iend 
= cref.extent(0); i < iend; ++i) + for (int j = 0, jend = cref.extent(1); j < jend; ++j) + for (int k = 0, kend = cref.extent(2); k < kend; ++k) + diff += std::abs(cref(i, j, k) - csol(i, j, k)); + + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team HM" + << " BlkSize = " << std::setw(3) << BlkSize + << " TeamSize = " << std::setw(3) << team_size + << " ScratchSize (KB) = 0" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - std::cout << " diff to ref = " << diff; + std::cout << " diff to ref = " << diff; #endif - std::cout << std::endl; - } - } - std::cout << std::endl; } } -} + std::cout << std::endl; +} +} // namespace PerfTest +} // namespace KokkosBatched using namespace KokkosBatched; -template +template void run(const int N, const int B) { typedef Kokkos::DefaultExecutionSpace ExecSpace; @@ -648,27 +659,25 @@ void run(const int N, const int B) { if (B != 0) { PerfTest::Gemm(N, B); } else { - PerfTest::Gemm(N, 3); - PerfTest::Gemm(N, 5); + PerfTest::Gemm(N, 3); + PerfTest::Gemm(N, 5); PerfTest::Gemm(N, 10); PerfTest::Gemm(N, 15); - + // PerfTest::Gemm(N, 4); // PerfTest::Gemm(N, 8); // PerfTest::Gemm(N, 16); // PerfTest::Gemm(N, 18); } - } int main(int argc, char *argv[]) { - Kokkos::initialize(argc, argv); - int N = 128*128, B = 0; + int N = 128 * 128, B = 0; - for (int i=1;i(N, B); - - std::cout << "\n Testing LayoutLeft Algo::Gemm::Blocked\n"; + + std::cout << "\n Testing LayoutLeft Algo::Gemm::Blocked\n"; run(N, B); } diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host.hpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host.hpp index 4e827f34b6..de67d9c804 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host.hpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host.hpp @@ -23,533 +23,512 @@ //#undef __KOKKOSBATCHED_INTEL_MKL_BATCHED__ namespace KokkosBatched { - namespace 
PerfTest { +namespace PerfTest { #undef FLOP_MUL #undef FLOP_ADD -#if defined( KokkosBatched_Test_Gemm_Host_Complex ) +#if defined(KokkosBatched_Test_Gemm_Host_Complex) #define FLOP_MUL 6.0 #define FLOP_ADD 2.0 - typedef Kokkos::complex value_type; +typedef Kokkos::complex value_type; #endif -#if defined( KokkosBatched_Test_Gemm_Host_Real ) +#if defined(KokkosBatched_Test_Gemm_Host_Real) #define FLOP_MUL 1.0 -#define FLOP_ADD 1.0 - typedef double value_type; +#define FLOP_ADD 1.0 +typedef double value_type; #endif - double FlopCount(int mm, int nn, int kk) { - double m = (double)mm; double n = (double)nn; double k = (double)kk; - return (FLOP_MUL*(m*n*k) + - FLOP_ADD*(m*n*k)); - } - - template - void Gemm(const int NN) { - typedef Kokkos::Schedule ScheduleType; - - constexpr int VectorLength = DefaultVectorLength::value; - const int N = NN/VectorLength; - - { - std::string value_type_name; - if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) value_type_name = "Kokkos::complex"; -#if defined(__AVX512F__) - std::cout << "AVX512 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; +double FlopCount(int mm, int nn, int kk) { + double m = (double)mm; + double n = (double)nn; + double k = (double)kk; + return (FLOP_MUL * (m * n * k) + FLOP_ADD * (m * n * k)); +} + +template +void Gemm(const int NN) { + typedef Kokkos::Schedule ScheduleType; + + constexpr int VectorLength = + DefaultVectorLength::value; + const int N = NN / VectorLength; + + { + std::string value_type_name; + if (std::is_same::value) value_type_name = "double"; + if (std::is_same >::value) + value_type_name = "Kokkos::complex"; +#if defined(__AVX512F__) + std::cout << "AVX512 is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; #elif defined(__AVX__) || defined(__AVX2__) - std::cout << "AVX or AVX2 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; + std::cout 
<< "AVX or AVX2 is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; #else - std::cout << "SIMD (compiler vectorization) is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; + std::cout << "SIMD (compiler vectorization) is defined: datatype " + << value_type_name << " a vector length " << VectorLength << "\n"; #endif - } - - const double flop = (N*VectorLength)*FlopCount(BlkSize,BlkSize,BlkSize); - const double tmax = 1.0e15; - - const int iter_begin = -10, iter_end = 100; - Kokkos::Timer timer; - - Kokkos::View cref; - Kokkos::View - amat("amat", N*VectorLength, BlkSize, BlkSize), - bmat("bmat", N*VectorLength, BlkSize, BlkSize); - - Kokkos::Random_XorShift64_Pool random(13718); - Kokkos::fill_random(amat, random, value_type(1.0)); - Kokkos::fill_random(bmat, random, value_type(1.0)); - - typedef Vector,VectorLength> VectorType; - Kokkos::View - amat_simd("amat_simd", N, BlkSize, BlkSize), - bmat_simd("bmat_simd", N, BlkSize, BlkSize); - - Kokkos::parallel_for("KokkosBatched::PerfTest::GemmHost::Pack", - Kokkos::RangePolicy(0, N*VectorLength), - KOKKOS_LAMBDA(const int k) { - const int k0 = k/VectorLength, k1 = k%VectorLength; - for (int i=0;i flush; - - /// - /// Reference version using MKL DGEMM - /// -#if defined(__KOKKOSBATCHED_INTEL_MKL__) - { - Kokkos::View - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, BlkSize), - c("c", N*VectorLength, BlkSize, BlkSize); - - { - const Kokkos::RangePolicy policy(0, N*VectorLength); - - double tavg = 0, tmin = tmax; - for (int iter=iter_begin;iter::value) { - cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, - BlkSize, BlkSize, BlkSize, - one, - (double*)aa.data(), aa.stride_0(), - (double*)bb.data(), bb.stride_0(), - one, - (double*)cc.data(), cc.stride_0()); - } else if (std::is_same >::value) { - cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, - BlkSize, BlkSize, BlkSize, - (void*)&one, - 
(void*)aa.data(), aa.stride_0(), - (void*)bb.data(), bb.stride_0(), - (void*)&one, - (void*)cc.data(), cc.stride_0()); - } - - }); - - HostSpaceType().fence(); - const double t = timer.seconds(); - tmin = std::min(tmin, t); - tavg += (iter >= 0)*t; + } + + const double flop = (N * VectorLength) * FlopCount(BlkSize, BlkSize, BlkSize); + const double tmax = 1.0e15; + + const int iter_begin = -10, iter_end = 100; + Kokkos::Timer timer; + + Kokkos::View cref; + Kokkos::View amat( + "amat", N * VectorLength, BlkSize, BlkSize), + bmat("bmat", N * VectorLength, BlkSize, BlkSize); + + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(amat, random, value_type(1.0)); + Kokkos::fill_random(bmat, random, value_type(1.0)); + + typedef Vector, VectorLength> VectorType; + Kokkos::View amat_simd( + "amat_simd", N, BlkSize, BlkSize), + bmat_simd("bmat_simd", N, BlkSize, BlkSize); + + Kokkos::parallel_for( + "KokkosBatched::PerfTest::GemmHost::Pack", + Kokkos::RangePolicy(0, N * VectorLength), + KOKKOS_LAMBDA(const int k) { + const int k0 = k / VectorLength, k1 = k % VectorLength; + for (int i = 0; i < BlkSize; ++i) + for (int j = 0; j < BlkSize; ++j) { + amat_simd(k0, i, j)[k1] = amat(k, i, j); + bmat_simd(k0, i, j)[k1] = bmat(k, i, j); } - tavg /= iter_end; + }); - std::cout << std::setw(12) << "MKL DGEMM" - << " BlkSize = " << std::setw(3) << BlkSize - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop/tavg) - << " max flop/s = " << (flop/tmin) - << std::endl; + // for KNL (1MB per tile) + constexpr size_t LLC_CAPACITY = 34 * 1024 * 1024; + Flush flush; - cref = c; - } + /// + /// Reference version using MKL DGEMM + /// +#if defined(__KOKKOSBATCHED_INTEL_MKL__) + { + Kokkos::View a( + "a", N * VectorLength, BlkSize, BlkSize), + b("b", N * VectorLength, BlkSize, BlkSize), + c("c", N * VectorLength, BlkSize, BlkSize); + + { + const Kokkos::RangePolicy policy( + 0, N * VectorLength); + + double tavg = 0, tmin = tmax; + for (int iter = 
iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + Kokkos::deep_copy(c, 0); + + HostSpaceType().fence(); + timer.reset(); + + Kokkos::parallel_for( + "KokkosBatched::PerfTest::GemmHost::CblasOpenMP", policy, + KOKKOS_LAMBDA(const int k) { + auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(c, k, Kokkos::ALL(), Kokkos::ALL()); + + const double one = 1.0; + if (std::is_same::value) { + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, BlkSize, + BlkSize, BlkSize, one, (double *)aa.data(), + aa.stride_0(), (double *)bb.data(), bb.stride_0(), + one, (double *)cc.data(), cc.stride_0()); + } else if (std::is_same >::value) { + cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, BlkSize, + BlkSize, BlkSize, (void *)&one, (void *)aa.data(), + aa.stride_0(), (void *)bb.data(), bb.stride_0(), + (void *)&one, (void *)cc.data(), cc.stride_0()); + } + }); + + HostSpaceType().fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } + tavg /= iter_end; + + std::cout << std::setw(12) << "MKL DGEMM" + << " BlkSize = " << std::setw(3) << BlkSize + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) << std::endl; + + cref = c; + } + } #if defined(__KOKKOSBATCHED_INTEL_MKL_BATCHED__) - { - typedef Kokkos::View ViewType; - ViewType - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, BlkSize), - c("c", N*VectorLength, BlkSize, BlkSize); - - value_type - *aa[N*VectorLength], - *bb[N*VectorLength], - *cc[N*VectorLength]; - - for (int k=0;k + ViewType; + ViewType a("a", N * VectorLength, BlkSize, BlkSize), + b("b", N * VectorLength, BlkSize, BlkSize), + c("c", N * VectorLength, BlkSize, BlkSize); + + value_type *aa[N * VectorLength], *bb[N * 
VectorLength], + *cc[N * VectorLength]; + + for (int k = 0; k < N * VectorLength; ++k) { + aa[k] = &a(k, 0, 0); + bb[k] = &b(k, 0, 0); + cc[k] = &c(k, 0, 0); + } - { - double tavg = 0, tmin = tmax; - - MKL_INT blksize[1] = { BlkSize }; - MKL_INT lda[1] = { a.stride_1() }; - MKL_INT ldb[1] = { b.stride_1() }; - MKL_INT ldc[1] = { c.stride_1() }; - - CBLAS_TRANSPOSE transA[1] = { CblasNoTrans }; - CBLAS_TRANSPOSE transB[1] = { CblasNoTrans }; - - double one[1] = { 1.0 }; - MKL_INT size_per_grp[1] = { N*VectorLength }; - - for (int iter=iter_begin;iter::value) { - cblas_dgemm_batch(CblasRowMajor, - transA, - transB, - blksize, blksize, blksize, - one, - (const double**)aa, lda, - (const double**)bb, ldb, - one, - (double**)cc, ldc, - 1, size_per_grp); - } else if (std::is_same >::value) { - cblas_zgemm_batch(CblasRowMajor, - transA, - transB, - blksize, blksize, blksize, - one, - (const void**)aa, lda, - (const void**)bb, ldb, - one, - (void**)cc, ldc, - 1, size_per_grp); - } - - HostSpaceType().fence(); - const double t = timer.seconds(); - tmin = std::min(tmin, t); - tavg += (iter >= 0)*t; - } - tavg /= iter_end; - - double diff = 0; - for (int i=0,iend=cref.extent(0);i zone(1.0); - - MKL_COMPACT_PACK format; - if (std::is_same::value) { - if (VectorLength == 4) format = MKL_COMPACT_AVX; - else if (VectorLength == 8) format = MKL_COMPACT_AVX512; - } else if (std::is_same >::value) { - if (VectorLength == 2) format = MKL_COMPACT_AVX; - else if (VectorLength == 4) format = MKL_COMPACT_AVX512; - } + { + Kokkos::View a( + "a", N, BlkSize, BlkSize), + b("b", N, BlkSize, BlkSize), c("c", N, BlkSize, BlkSize); + + { + double tavg = 0, tmin = tmax; + + double done(1.0); + std::complex zone(1.0); + + MKL_COMPACT_PACK format; + if (std::is_same::value) { + if (VectorLength == 4) + format = MKL_COMPACT_AVX; + else if (VectorLength == 8) + format = MKL_COMPACT_AVX512; + } else if (std::is_same >::value) { + if (VectorLength == 2) + format = MKL_COMPACT_AVX; + else if 
(VectorLength == 4) + format = MKL_COMPACT_AVX512; + } - if (format == MKL_COMPACT_AVX512 || format == MKL_COMPACT_AVX) { - for (int iter=iter_begin;iter::value) { - mkl_dgemm_compact(MKL_ROW_MAJOR, - MKL_NOTRANS, MKL_NOTRANS, - BlkSize, BlkSize, BlkSize, - done, - (const double*)a.data(), (MKL_INT)a.stride_1(), - (const double*)b.data(), (MKL_INT)b.stride_1(), - done, - ( double*)c.data(), (MKL_INT)c.stride_1(), - format, N*VectorLength); - } else if (std::is_same >::value) { - mkl_zgemm_compact(MKL_ROW_MAJOR, - MKL_NOTRANS, MKL_NOTRANS, - BlkSize, BlkSize, BlkSize, - (MKL_Complex16*)&zone, - (const double*)a.data(), (MKL_INT)a.stride_1(), - (const double*)b.data(), (MKL_INT)b.stride_1(), - (MKL_Complex16*)&zone, - ( double*)c.data(), (MKL_INT)c.stride_1(), - format, N*VectorLength); - } - - HostSpaceType().fence(); - const double t = timer.seconds(); - tmin = std::min(tmin, t); - tavg += (iter >= 0)*t; - } - tavg /= iter_end; - - double diff = 0; - for (int i=0,iend=cref.extent(0);i policy(0, N*VectorLength); - - double tavg = 0, tmin = tmax; - - // adjust column major order in xsmm - char transA = 'N', transB = 'N'; - libxsmm_blasint blksize = BlkSize; - double one = 1.0; - - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - // adjust transpose - double diff = 0; - for (int i=0,iend=cref.extent(0);i policy( + 0, N * VectorLength); + + double tavg = 0, tmin = tmax; + + // adjust column major order in xsmm + char transA = 'N', transB = 'N'; + libxsmm_blasint blksize = BlkSize; + double one = 1.0; + + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + Kokkos::deep_copy(c, 0); + + HostSpaceType().fence(); + timer.reset(); + + Kokkos::parallel_for( + "KokkosBatched::PerfTest::GemmHost::libxswmmOpenMP", policy, + KOKKOS_LAMBDA(const int k) { + auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(b, 
k, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(c, k, Kokkos::ALL(), Kokkos::ALL()); + + // column major + libxsmm_gemm((const char *)&transA, (const char *)&transB, + blksize, blksize, blksize, (const double *)&one, + (const double *)bb.data(), + (const libxsmm_blasint *)&ldb, + (const double *)aa.data(), + (const libxsmm_blasint *)&lda, (const double *)&one, + (double *)cc.data(), (const libxsmm_blasint *)&ldc); + }); + + HostSpaceType().fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } + tavg /= iter_end; + + // adjust transpose + double diff = 0; + for (int i = 0, iend = cref.extent(0); i < iend; ++i) + for (int j = 0, jend = cref.extent(1); j < jend; ++j) + for (int k = 0, kend = cref.extent(2); k < kend; ++k) + diff += abs(cref(i, j, k) - c(i, j, k)); + + std::cout << std::setw(12) << "libxsmm" + << " BlkSize = " << std::setw(3) << BlkSize + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) + << " diff to ref = " << diff << std::endl; + } + libxsmm_finalize(); + } #endif - /// - /// Do not test this. 
Test Compact vs MKL - /// KK Scalar version (comparable to micro BLAS version) - /// - // if (!std::is_same::value) { - // Kokkos::View - // a("a", N*VectorLength, BlkSize, BlkSize), - // b("b", N*VectorLength, BlkSize, BlkSize), - // c("c", N*VectorLength, BlkSize, BlkSize); - - // { - // const Kokkos::RangePolicy policy(0, N*VectorLength); - - // double tavg = 0, tmin = tmax; - - // for (int iter=iter_begin;iter:: - // invoke(1.0, aa, bb, 1.0, cc); - // }); - - // HostSpaceType().fence(); - // const double t = timer.seconds(); - // tmin = std::min(tmin, t); - // tavg += (iter >= 0)*t; - // } - // tavg /= iter_end; - - // double diff = 0; - // for (int i=0,iend=cref.extent(2);i policy(0, N); - - double tavg = 0, tmin = tmax; - - for (int iter=iter_begin;iter:: - invoke(1.0, aa, bb, 1.0, cc); - }); - - HostSpaceType().fence(); - const double t = timer.seconds(); - tmin = std::min(tmin, t); - tavg += (iter >= 0)*t; - } - tavg /= iter_end; - - double diff = 0; - for (int i=0,iend=cref.extent(0);i policy(0, + // N*VectorLength); + + // double tavg = 0, tmin = tmax; + + // for (int iter=iter_begin;iter:: + // invoke(1.0, aa, bb, 1.0, cc); + // }); + + // HostSpaceType().fence(); + // const double t = timer.seconds(); + // tmin = std::min(tmin, t); + // tavg += (iter >= 0)*t; + // } + // tavg /= iter_end; + + // double diff = 0; + // for (int i=0,iend=cref.extent(2);i policy(0, N); + + double tavg = 0, tmin = tmax; + + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat_simd); + Kokkos::deep_copy(b, bmat_simd); + Kokkos::deep_copy(c, 0); + + HostSpaceType().fence(); + timer.reset(); + + Kokkos::parallel_for( + "KokkosBatched::PerfTest::GemmHost::SIMDSerialOpenMP", policy, + KOKKOS_LAMBDA(const int k) { + auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(c, k, Kokkos::ALL(), 
Kokkos::ALL()); + + SerialGemm::invoke(1.0, aa, bb, 1.0, cc); + }); + + HostSpaceType().fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } - std::cout << std::endl; + tavg /= iter_end; + + double diff = 0; + for (int i = 0, iend = cref.extent(0); i < iend; ++i) + for (int j = 0, jend = cref.extent(1); j < jend; ++j) + for (int k = 0, kend = cref.extent(2); k < kend; ++k) + diff += abs(cref(i, j, k) - + c(i / VectorLength, j, k)[i % VectorLength]); + + std::cout << std::setw(12) << "KK Vector" + << " BlkSize = " << std::setw(3) << BlkSize + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) + << " diff to ref = " << diff << std::endl; } - - } // end perftest -} // end batched + } + std::cout << std::endl; +} +} // namespace PerfTest +} // namespace KokkosBatched diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp index 2fffa06855..484c519b1c 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp @@ -6,7 +6,7 @@ using namespace KokkosBatched; -template +template void run(const int N) { typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; @@ -19,35 +19,35 @@ void run(const int N) { // Test::Gemm<32, AlgoTagType>(N); // Test::Gemm<64, AlgoTagType>(N); - PerfTest::Gemm< 3, HostSpaceType, AlgoTagType>(N); - PerfTest::Gemm< 5, HostSpaceType, AlgoTagType>(N); + PerfTest::Gemm<3, HostSpaceType, AlgoTagType>(N); + PerfTest::Gemm<5, HostSpaceType, AlgoTagType>(N); PerfTest::Gemm<10, HostSpaceType, AlgoTagType>(N); PerfTest::Gemm<15, HostSpaceType, AlgoTagType>(N); } -int main(int argc, char *argv[]) { - +int main(int argc, char* argv[]) { Kokkos::initialize(argc, argv); #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) const int ntest = 
1; - //const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; - int N[1] = { 128*128 }; + // const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; + int N[1] = {128 * 128}; - for (int i=1;i(N[i]); - + std::cout << "\n Testing Algo::Gemm::Blocked\n"; run(N[i]); @@ -55,7 +55,6 @@ int main(int argc, char *argv[]) { std::cout << "\n Testing Algo::Gemm::CompactMKL\n"; run(N[i]); #endif - } } diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp index 031909d540..b062942341 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp @@ -5,7 +5,7 @@ using namespace KokkosBatched; -template +template void run(const int N) { typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; @@ -18,32 +18,31 @@ void run(const int N) { // Test::Gemm<32, AlgoTagType>(N); // Test::Gemm<64, AlgoTagType>(N); - PerfTest::Gemm< 3, HostSpaceType, AlgoTagType>(N); - PerfTest::Gemm< 5, HostSpaceType, AlgoTagType>(N); + PerfTest::Gemm<3, HostSpaceType, AlgoTagType>(N); + PerfTest::Gemm<5, HostSpaceType, AlgoTagType>(N); PerfTest::Gemm<10, HostSpaceType, AlgoTagType>(N); PerfTest::Gemm<15, HostSpaceType, AlgoTagType>(N); } -int main(int argc, char *argv[]) { - +int main(int argc, char* argv[]) { Kokkos::initialize(argc, argv); #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) const int ntest = 1; - //const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; - int N[1] = { 128*128 }; + // const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; + int N[1] = {128 * 128}; - for (int i=1;i(N[i]); - + std::cout << "\n Testing Algo::Gemm::Blocked\n"; run(N[i]); @@ -51,7 +50,6 @@ int main(int argc, char *argv[]) { std::cout << "\n Testing Algo::Gemm::CompactMKL\n"; run(N[i]); #endif - } } diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host.hpp 
b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host.hpp index 0a45a0b56b..f913aa5740 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host.hpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host.hpp @@ -22,267 +22,270 @@ #undef __KOKKOSBATCHED_INTEL_MKL_BATCHED__ namespace KokkosBatched { - namespace PerfTest { +namespace PerfTest { #undef FLOP_MUL #undef FLOP_ADD -#if defined( KokkosBatched_Test_Gemv_Host_Complex ) +#if defined(KokkosBatched_Test_Gemv_Host_Complex) #define FLOP_MUL 6.0 #define FLOP_ADD 2.0 - typedef Kokkos::complex value_type; +typedef Kokkos::complex value_type; #endif -#if defined( KokkosBatched_Test_Gemv_Host_Real ) +#if defined(KokkosBatched_Test_Gemv_Host_Real) #define FLOP_MUL 1.0 -#define FLOP_ADD 1.0 - typedef double value_type; +#define FLOP_ADD 1.0 +typedef double value_type; #endif - double FlopCount(int mm, int nn) { - double m = (double)mm; double n = (double)nn; - return (FLOP_MUL*(m*n) + - FLOP_ADD*(m*n)); - } - - template - void Gemv(const int NN) { - typedef Kokkos::Schedule ScheduleType; - //typedef Kokkos::Schedule ScheduleType; - - constexpr int VectorLength = DefaultVectorLength::value; - const int N = NN/VectorLength; - - { - std::string value_type_name; - if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) value_type_name = "Kokkos::complex"; - -#if defined(__AVX512F__) - std::cout << "AVX512 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; +double FlopCount(int mm, int nn) { + double m = (double)mm; + double n = (double)nn; + return (FLOP_MUL * (m * n) + FLOP_ADD * (m * n)); +} + +template +void Gemv(const int NN) { + typedef Kokkos::Schedule ScheduleType; + // typedef Kokkos::Schedule ScheduleType; + + constexpr int VectorLength = + DefaultVectorLength::value; + const int N = NN / VectorLength; + + { + std::string value_type_name; + if (std::is_same::value) value_type_name = "double"; + if (std::is_same >::value) + 
value_type_name = "Kokkos::complex"; + +#if defined(__AVX512F__) + std::cout << "AVX512 is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; #elif defined(__AVX__) || defined(__AVX2__) - std::cout << "AVX or AVX2 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; + std::cout << "AVX or AVX2 is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; #else - std::cout << "SIMD (compiler vectorization) is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; + std::cout << "SIMD (compiler vectorization) is defined: datatype " + << value_type_name << " a vector length " << VectorLength << "\n"; #endif - } + } + + const double flop = + (N * VectorLength) * FlopCount(BlkSize, BlkSize) * NumVecs; + // const double tmax = 1.0e15; - const double flop = (N*VectorLength)*FlopCount(BlkSize,BlkSize)*NumVecs; - //const double tmax = 1.0e15; - - const int iter_begin = -10, iter_end = 100; - Kokkos::Timer timer; - - Kokkos::View yref; - Kokkos::View - amat("amat", N*VectorLength, BlkSize, BlkSize); - Kokkos::View - xvec("xvec", N*VectorLength, NumVecs, BlkSize); - - Kokkos::Random_XorShift64_Pool random(13718); - Kokkos::fill_random(xvec, random, value_type(1.0)); - Kokkos::fill_random(amat, random, value_type(1.0)); - - // for KNL - constexpr size_t LLC_CAPACITY = 34*1024*1024; - Flush flush; - - /// - /// Reference version using MKL DGEMM - /// + const int iter_begin = -10, iter_end = 100; + Kokkos::Timer timer; + + Kokkos::View yref; + Kokkos::View amat( + "amat", N * VectorLength, BlkSize, BlkSize); + Kokkos::View xvec( + "xvec", N * VectorLength, NumVecs, BlkSize); + + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(xvec, random, value_type(1.0)); + Kokkos::fill_random(amat, random, value_type(1.0)); + + // for KNL + constexpr size_t LLC_CAPACITY = 34 * 1024 * 1024; + Flush flush; + + /// + /// Reference version 
using MKL DGEMM + /// #if defined(__KOKKOSBATCHED_INTEL_MKL__) - { - Kokkos::View - a("a", N*VectorLength, BlkSize, BlkSize), - x("x", N*VectorLength, NumVecs, BlkSize), - y("y", N*VectorLength, NumVecs, BlkSize); - - { - const Kokkos::RangePolicy policy(0, N*VectorLength); - - double t = 0; - for (int iter=iter_begin;iter= 0)*timer.seconds(); - } - t /= iter_end; - - std::cout << std::setw(12) << "MKL DGEMV" - << " BlkSize = " << std::setw(3) << BlkSize - << " NumVecs = " << std::setw(3) << NumVecs - << " time = " << std::scientific << t - << " flop/s = " << (flop/t) - << std::endl; - - yref = y; - } + { + Kokkos::View a( + "a", N * VectorLength, BlkSize, BlkSize), + x("x", N * VectorLength, NumVecs, BlkSize), + y("y", N * VectorLength, NumVecs, BlkSize); + + { + const Kokkos::RangePolicy policy( + 0, N * VectorLength); + + double t = 0; + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(x, xvec); + Kokkos::deep_copy(y, 0); + + HostSpaceType().fence(); + timer.reset(); + + Kokkos::parallel_for( + "KokkosBatched::PerfTest::GemvHost::CblasOpenMP", policy, + KOKKOS_LAMBDA(const int k) { + auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + for (int j = 0; j < NumVecs; ++j) { + auto xx = Kokkos::subview(x, k, j, Kokkos::ALL()); + auto yy = Kokkos::subview(y, k, j, Kokkos::ALL()); + + cblas_dgemv(CblasRowMajor, CblasNoTrans, BlkSize, BlkSize, 1.0, + (double*)aa.data(), aa.stride_0(), + (double*)xx.data(), xx.stride_0(), 1.0, + (double*)yy.data(), yy.stride_0()); + } + }); + + HostSpaceType().fence(); + t += (iter >= 0) * timer.seconds(); } + t /= iter_end; + + std::cout << std::setw(12) << "MKL DGEMV" + << " BlkSize = " << std::setw(3) << BlkSize + << " NumVecs = " << std::setw(3) << NumVecs + << " time = " << std::scientific << t + << " flop/s = " << (flop / t) << std::endl; + + yref = y; + } + } #endif - - /// - /// Plain version (comparable to 
micro BLAS version) - /// - { - Kokkos::View - a("a", N*VectorLength, BlkSize, BlkSize), - x("x", N*VectorLength, NumVecs, BlkSize), - y("y", N*VectorLength, NumVecs, BlkSize); - - { - const Kokkos::RangePolicy policy(0, N*VectorLength); - - double t = 0; - for (int iter=iter_begin;iter:: - invoke(1.0, aa, xx, 1.0, yy); - } - }); - - HostSpaceType().fence(); - t += (iter >= 0)*timer.seconds(); - } - t /= iter_end; - - double diff = 0; - for (int i=0,iend=yref.extent(0);i policy( + 0, N * VectorLength); + + double t = 0; + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(x, xvec); + Kokkos::deep_copy(y, 0); + + HostSpaceType().fence(); + timer.reset(); + + Kokkos::parallel_for( + "KokkosBatched::PerfTest::GemvHost::SerialOpenMP", policy, + KOKKOS_LAMBDA(const int k) { + auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + + for (int j = 0; j < NumVecs; ++j) { + auto xx = Kokkos::subview(x, k, j, Kokkos::ALL()); + auto yy = Kokkos::subview(y, k, j, Kokkos::ALL()); + + SerialGemv::invoke(1.0, aa, xx, + 1.0, yy); + } + }); + + HostSpaceType().fence(); + t += (iter >= 0) * timer.seconds(); + } + t /= iter_end; + + double diff = 0; + for (int i = 0, iend = yref.extent(0); i < iend; ++i) + for (int j = 0, jend = yref.extent(1); j < jend; ++j) + for (int k = 0, kend = yref.extent(2); k < kend; ++k) + diff += std::abs(yref(i, j, k) - y(i, j, k)); + + std::cout << std::setw(12) << "Plain" + << " BlkSize = " << std::setw(3) << BlkSize + << " NumVecs = " << std::setw(3) << NumVecs + << " time = " << std::scientific << t + << " flop/s = " << (flop / t) << " diff to ref = " << diff + << std::endl; + } + } + + typedef Vector, VectorLength> VectorType; + Kokkos::View amat_simd( + "amat_simd", N, BlkSize, BlkSize), + xvec_simd("xvec_simd", N, NumVecs, BlkSize); + + for (int k0 = 0; k0 < N; ++k0) + for (int k1 = 0; k1 < VectorLength; ++k1) + for (int i = 0; i < 
BlkSize; ++i) { + for (int j = 0; j < NumVecs; ++j) + xvec_simd(k0, j, i)[k1] = xvec(k0 * VectorLength + k1, j, i); + for (int j = 0; j < BlkSize; ++j) + amat_simd(k0, i, j)[k1] = amat(k0 * VectorLength + k1, i, j); } - - typedef Vector,VectorLength> VectorType; - Kokkos::View - amat_simd("amat_simd", N, BlkSize, BlkSize), - xvec_simd("xvec_simd", N, NumVecs, BlkSize); - - for (int k0=0;k0 - a("a", N, BlkSize, BlkSize), - x("x", N, NumVecs, BlkSize), - y("y", N, NumVecs, BlkSize); - - { - const Kokkos::RangePolicy policy(0, N); - - double t = 0; - for (int iter=iter_begin;iter:: - invoke(1.0, aa, xx, 1.0, yy); - } - }); - - HostSpaceType().fence(); - t += (iter >= 0)*timer.seconds(); - } - t /= iter_end; - - double diff = 0; - for (int i=0,iend=yref.extent(0);i policy(0, N); + + double t = 0; + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat_simd); + Kokkos::deep_copy(x, xvec_simd); + Kokkos::deep_copy(y, 0); + + HostSpaceType().fence(); + timer.reset(); + + Kokkos::parallel_for( + "KokkosBatched::PerfTest::GemvHost::SIMDSerialOpenMP", policy, + KOKKOS_LAMBDA(const int k) { + auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + + for (int j = 0; j < NumVecs; ++j) { + auto xx = Kokkos::subview(x, k, j, Kokkos::ALL()); + auto yy = Kokkos::subview(y, k, j, Kokkos::ALL()); + + SerialGemv::invoke(1.0, aa, xx, + 1.0, yy); + } + }); + + HostSpaceType().fence(); + t += (iter >= 0) * timer.seconds(); } + t /= iter_end; + + double diff = 0; + for (int i = 0, iend = yref.extent(0); i < iend; ++i) + for (int j = 0, jend = yref.extent(1); j < jend; ++j) + for (int k = 0, kend = yref.extent(2); k < kend; ++k) + diff += std::abs(yref(i, j, k) - + y(i / VectorLength, j, k)[i % VectorLength]); + + std::cout << std::setw(12) << "Serial SIMD" + << " BlkSize = " << std::setw(3) << BlkSize + << " NumVecs = " << std::setw(3) << NumVecs + << " time = " << std::scientific << t + << " flop/s 
= " << (flop / t) << " diff to ref = " << diff + << std::endl; } - } } + +} // namespace PerfTest +} // namespace KokkosBatched diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp index 56ade7a446..75f4bca4c0 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp @@ -5,7 +5,7 @@ using namespace KokkosBatched; -template +template void run(const int N) { typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; @@ -18,33 +18,32 @@ void run(const int N) { // PerfTest::Gemv<32, 1, ExecSpace,AlgoTagType>(N); // PerfTest::Gemv<64, 1, ExecSpace,AlgoTagType>(N); - PerfTest::Gemv< 3, 1, HostSpaceType,AlgoTagType>(N); - PerfTest::Gemv< 5, 1, HostSpaceType,AlgoTagType>(N); - PerfTest::Gemv<10, 1, HostSpaceType,AlgoTagType>(N); - PerfTest::Gemv<15, 1, HostSpaceType,AlgoTagType>(N); + PerfTest::Gemv<3, 1, HostSpaceType, AlgoTagType>(N); + PerfTest::Gemv<5, 1, HostSpaceType, AlgoTagType>(N); + PerfTest::Gemv<10, 1, HostSpaceType, AlgoTagType>(N); + PerfTest::Gemv<15, 1, HostSpaceType, AlgoTagType>(N); } int main(int argc, char *argv[]) { - Kokkos::initialize(argc, argv); #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) const int ntest = 1; - //const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; - const int N[1] = { 128*128 }; + // const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; + const int N[1] = {128 * 128}; - { - for (int i=0;i(N[i]); - + std::cout << "\n Testing Algo::Gemv::Blocked\n"; run(N[i]); } } #endif Kokkos::finalize(); - + return 0; } diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Cuda.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Cuda.cpp index dcd60af9f0..1514bb9fcd 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Cuda.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Cuda.cpp @@ -3,7 +3,7 @@ 
#include "Kokkos_Core.hpp" #include "Kokkos_Timer.hpp" -#if defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_CUDA) #include @@ -23,551 +23,561 @@ #include "KokkosBatched_LU_Team_Impl.hpp" namespace KokkosBatched { - namespace PerfTest { - +namespace PerfTest { + #define FLOP_MUL 1.0 #define FLOP_ADD 1.0 - typedef double value_type; - - double FlopCount(int mm, int nn) { - double m = (double)mm; double n = (double)nn; - if (m > n) - return (FLOP_MUL*(0.5*m*n*n-(1.0/6.0)*n*n*n+0.5*m*n-0.5*n*n+(2.0/3.0)*n) + - FLOP_ADD*(0.5*m*n*n-(1.0/6.0)*n*n*n-0.5*m*n+ (1.0/6.0)*n)); - else - return (FLOP_MUL*(0.5*n*m*m-(1.0/6.0)*m*m*m+0.5*n*m-0.5*m*m+(2.0/3.0)*m) + - FLOP_ADD*(0.5*n*m*m-(1.0/6.0)*m*m*m-0.5*n*m+ (1.0/6.0)*m)); - } +typedef double value_type; + +double FlopCount(int mm, int nn) { + double m = (double)mm; + double n = (double)nn; + if (m > n) + return (FLOP_MUL * (0.5 * m * n * n - (1.0 / 6.0) * n * n * n + + 0.5 * m * n - 0.5 * n * n + (2.0 / 3.0) * n) + + FLOP_ADD * (0.5 * m * n * n - (1.0 / 6.0) * n * n * n - + 0.5 * m * n + (1.0 / 6.0) * n)); + else + return (FLOP_MUL * (0.5 * n * m * m - (1.0 / 6.0) * m * m * m + + 0.5 * n * m - 0.5 * m * m + (2.0 / 3.0) * m) + + FLOP_ADD * (0.5 * n * m * m - (1.0 / 6.0) * m * m * m - + 0.5 * n * m + (1.0 / 6.0) * m)); +} - struct RangeTag {}; - struct TeamTagV1 {}; - struct TeamTagV2 {}; - struct TeamTagV3 {}; - struct TeamTagHandmade {}; +struct RangeTag {}; +struct TeamTagV1 {}; +struct TeamTagV2 {}; +struct TeamTagV3 {}; +struct TeamTagHandmade {}; - template - struct Functor { - UnmanagedViewType _a; +template +struct Functor { + UnmanagedViewType _a; - KOKKOS_INLINE_FUNCTION - Functor() = default; + KOKKOS_INLINE_FUNCTION + Functor() = default; - KOKKOS_INLINE_FUNCTION - Functor(const ViewType &a) - : _a(a) {} + KOKKOS_INLINE_FUNCTION + Functor(const ViewType &a) : _a(a) {} - KOKKOS_INLINE_FUNCTION - void operator()(const RangeTag &, const int k) const { - auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), 
Kokkos::ALL()); - SerialLU::invoke(aa); - } + KOKKOS_INLINE_FUNCTION + void operator()(const RangeTag &, const int k) const { + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + SerialLU::invoke(aa); + } - template - KOKKOS_INLINE_FUNCTION - void operator()(const TeamTagV1 &, const MemberType &member) const { - const int kbeg = (member.league_rank()*(member.team_size()*VectorLength) + - member.team_rank()*VectorLength); - Kokkos::parallel_for - (Kokkos::ThreadVectorRange(member, VectorLength), - [&](const int &k) { - const int kk = kbeg + k; - if (kk < _a.extent_int(0)) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - SerialLU::invoke(aa); - } - }); - } + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV1 &, + const MemberType &member) const { + const int kbeg = + (member.league_rank() * (member.team_size() * VectorLength) + + member.team_rank() * VectorLength); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < _a.extent_int(0)) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + SerialLU::invoke(aa); + } + }); + } - template - KOKKOS_INLINE_FUNCTION - void operator()(const TeamTagV2 &, const MemberType &member) const { - const int kbeg = member.league_rank()*VectorLength; - Kokkos::parallel_for - (Kokkos::ThreadVectorRange(member, VectorLength), - [&](const int &k) { - const int kk = kbeg + k; - if (kk < _a.extent_int(0)) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - TeamLU::invoke(member, aa); - } - }); - } + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV2 &, + const MemberType &member) const { + const int kbeg = member.league_rank() * VectorLength; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < _a.extent_int(0)) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + 
TeamLU::invoke(member, aa); + } + }); + } - template - KOKKOS_INLINE_FUNCTION - void operator()(const TeamTagV3 &, const MemberType &member) const { - const int lvl = 0; - ScratchViewType sa(member.team_scratch(lvl), VectorLength, _a.extent(1), _a.extent(2)); - - const int kbeg = member.league_rank()*VectorLength; - Kokkos::parallel_for - (Kokkos::ThreadVectorRange(member, VectorLength), - [&](const int &k) { - const int kk = kbeg + k; - if (kk < _a.extent_int(0)) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto saa = Kokkos::subview(sa, k, Kokkos::ALL(), Kokkos::ALL()); - - TeamCopy::invoke(member, aa, saa); - member.team_barrier(); - TeamLU::invoke(member, saa); - member.team_barrier(); - TeamCopy::invoke(member, saa, aa); - } - }); - } + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV3 &, + const MemberType &member) const { + const int lvl = 0; + ScratchViewType sa(member.team_scratch(lvl), VectorLength, + _a.extent(1), _a.extent(2)); + + const int kbeg = member.league_rank() * VectorLength; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < _a.extent_int(0)) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto saa = Kokkos::subview(sa, k, Kokkos::ALL(), Kokkos::ALL()); + + TeamCopy::invoke(member, aa, saa); + member.team_barrier(); + TeamLU::invoke(member, saa); + member.team_barrier(); + TeamCopy::invoke(member, saa, aa); + } + }); + } +}; - }; - - template - void LU(const int NN, const int BlkSize) { - typedef Kokkos::Schedule ScheduleType; - - constexpr int VectorLength = DefaultVectorLength::value; - const int N = NN/VectorLength; - - { - std::string value_type_name; - if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) value_type_name = "Kokkos::complex"; - - std::cout << "SIMD is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; - } +template 
+void LU(const int NN, const int BlkSize) { + typedef Kokkos::Schedule ScheduleType; - const double flop = (N*VectorLength)*FlopCount(BlkSize,BlkSize); - const double tmax = 1.0e15; - - typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; - typedef typename DeviceSpaceType::memory_space DeviceMemorySpaceType; - - const int iter_begin = -3, iter_end = 50; - Kokkos::Timer timer; - - Kokkos::View - amat("amat", N*VectorLength, BlkSize, BlkSize), - aref("aref", N*VectorLength, BlkSize, BlkSize); - - { - Random random; - for (int k=0;k::value; + const int N = NN / VectorLength; + + { + std::string value_type_name; + if (std::is_same::value) value_type_name = "double"; + if (std::is_same >::value) + value_type_name = "Kokkos::complex"; + + std::cout << "SIMD is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; + } + + const double flop = (N * VectorLength) * FlopCount(BlkSize, BlkSize); + const double tmax = 1.0e15; - // value_type d[BlkSize], v[BlkSize][BlkSize]; - // for (int i=0;i amat( + "amat", N * VectorLength, BlkSize, BlkSize), + aref("aref", N * VectorLength, BlkSize, BlkSize); + + { + Random random; + for (int k = 0; k < N * VectorLength; ++k) { + // use tridiagonal matrices; for now we just check elementwise l/u factors + // do not allow pivots + for (int i = 0; i < BlkSize; ++i) { + amat(k, i, i) = random.value() + 10.0; + if ((i + 1) < BlkSize) { + amat(k, i, i + 1) = random.value() + 1.0; + amat(k, i + 1, i) = random.value() + 1.0; } } - constexpr size_t LLC_CAPACITY = 56*4*1024*1024; - Flush flush; - + // value_type d[BlkSize], v[BlkSize][BlkSize]; + // for (int i=0;i flush; + #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - if (1) { - /// - /// CUBLAS Batch version - /// - const Kokkos::LayoutStride stride(N*VectorLength, BlkSize*BlkSize, - BlkSize, 1, - BlkSize, BlkSize); - - Kokkos::View a("a", stride); - Kokkos::View info("info", N*VectorLength); - - cublasStatus_t stat; - cublasHandle_t handle; - - stat = 
cublasCreate(&handle); - if (stat != CUBLAS_STATUS_SUCCESS) - Kokkos::abort("CUBLAS initialization failed\n"); - - auto amat_device = Kokkos::create_mirror_view(typename DeviceSpaceType::memory_space(), amat); - Kokkos::deep_copy(amat_device, amat); + if (1) { + /// + /// CUBLAS Batch version + /// + const Kokkos::LayoutStride stride(N * VectorLength, BlkSize * BlkSize, + BlkSize, 1, BlkSize, BlkSize); + + Kokkos::View a( + "a", stride); + Kokkos::View info("info", N * VectorLength); + + cublasStatus_t stat; + cublasHandle_t handle; + + stat = cublasCreate(&handle); + if (stat != CUBLAS_STATUS_SUCCESS) + Kokkos::abort("CUBLAS initialization failed\n"); + + auto amat_device = Kokkos::create_mirror_view( + typename DeviceSpaceType::memory_space(), amat); + Kokkos::deep_copy(amat_device, amat); + + Kokkos::fence(); + { + double tavg = 0, tmin = tmax; + value_type *aa[N * VectorLength]; + + for (int k = 0; k < N * VectorLength; ++k) { + aa[k] = a.data() + k * a.stride_0(); + } + value_type **aa_device; + if (cudaMalloc(&aa_device, N * VectorLength * sizeof(value_type *)) != + cudaSuccess) { + Kokkos::abort("CUDA memory allocation failed\n"); + } + if (cudaMemcpy(aa_device, aa, sizeof(value_type *) * N * VectorLength, + cudaMemcpyHostToDevice) != cudaSuccess) { + Kokkos::abort("CUDA memcpy failed\n"); + } + Kokkos::fence(); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrix + Kokkos::deep_copy(a, amat_device); Kokkos::fence(); - { - double tavg = 0, tmin = tmax; - value_type *aa[N*VectorLength]; + timer.reset(); - for (int k=0;k= 0)*t; - } - tavg /= iter_end; - - auto asol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); - Kokkos::deep_copy(asol, a); - Kokkos::deep_copy(aref, asol); - - if (cudaFree(aa_device) != cudaSuccess) { - Kokkos::abort("CUDA memory free failed\n"); - } - - std::cout << std::setw(8) << "CUBLAS" - << std::setw(8) << "Batch" - << " BlkSize = " << std::setw(3) << 
BlkSize - << " TeamSize = N/A" - << " ScratchSize (KB) = N/A" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop/tavg) - << " max flop/s = " << (flop/tmin) - << std::endl; + stat = cublasDgetrfBatched(handle, BlkSize, (value_type **)aa_device, + BlkSize, NULL, (int *)info.data(), + N * VectorLength); + if (stat != CUBLAS_STATUS_SUCCESS) { + Kokkos::abort("CUBLAS LU Batched failed\n"); } + + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } + tavg /= iter_end; + + auto asol = + Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); + Kokkos::deep_copy(asol, a); + Kokkos::deep_copy(aref, asol); + + if (cudaFree(aa_device) != cudaSuccess) { + Kokkos::abort("CUDA memory free failed\n"); + } + + std::cout << std::setw(8) << "CUBLAS" << std::setw(8) << "Batch" + << " BlkSize = " << std::setw(3) << BlkSize << " TeamSize = N/A" + << " ScratchSize (KB) = N/A" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) << std::endl; + } + } #endif - if (1) { - /// - /// Range policy version - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize); + if (1) { + /// + /// Range policy version + /// + typedef Kokkos::View view_type; + view_type a("a", N * VectorLength, BlkSize, BlkSize); - double tavg = 0, tmin = tmax; - { - typedef Functor functor_type; - const Kokkos::RangePolicy policy(0, N*VectorLength); + double tavg = 0, tmin = tmax; + { + typedef Functor functor_type; + const Kokkos::RangePolicy policy( + 0, N * VectorLength); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto asol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); - Kokkos::deep_copy(asol, a); - - double diff = 0; - for (int i=0,iend=aref.extent(0);i + policy_type; + typedef Functor functor_type; + + const int team_size = + policy_type(N / 32, Kokkos::AUTO, 
VectorLength) + .team_size_recommended(functor_type(), Kokkos::ParallelForTag()); + + const policy_type policy(N / team_size, team_size, VectorLength); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrix + Kokkos::deep_copy(a, amat); + + Kokkos::fence(); + timer.reset(); + + Kokkos::parallel_for("KokkosBatched::PerfTest::LUCuda::TeamTagV1", + policy, functor_type(a)); + + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } - if (1) { - /// - /// Team V1 - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize); - - double tavg = 0, tmin = tmax; - { - typedef Kokkos::TeamPolicy policy_type; - typedef Functor functor_type; - - const int team_size = - policy_type(N/32, Kokkos::AUTO, VectorLength).team_size_recommended(functor_type(), Kokkos::ParallelForTag()); - - const policy_type policy(N/team_size, team_size, VectorLength); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto asol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); - Kokkos::deep_copy(asol, a); - - double diff = 0; - for (int i=0,iend=aref.extent(0);i + policy_type; + typedef Functor functor_type; + + const int is_blocked_algo = + (std::is_same::value), + mb = Algo::LU::Blocked::mb(); + // mp = BlkSize%mb > 0; + + const int + // mblk = is_blocked_algo ? (BlkSize/mb + mp) : BlkSize; + mblk = is_blocked_algo ? 
(BlkSize - mb) : (BlkSize - 1); + + const int max_team_size = + policy_type(N, Kokkos::AUTO, VectorLength) + .team_size_max(functor_type(), Kokkos::ParallelForTag()); + const int team_size = std::min(std::max(mblk * 2, 1), max_team_size); + + const policy_type policy(N, team_size, VectorLength); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrix + Kokkos::deep_copy(a, amat); + + Kokkos::fence(); + timer.reset(); + + Kokkos::parallel_for("KokkosBatched::PerfTest::LUCuda::TeamTagV2", + policy, functor_type(a)); + + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } - if (1) { - /// - /// Team V2 - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize); - - double tavg = 0, tmin = tmax; - { - typedef Kokkos::TeamPolicy policy_type; - typedef Functor functor_type; - - const int - is_blocked_algo = (std::is_same::value), - mb = Algo::LU::Blocked::mb(); - //mp = BlkSize%mb > 0; - - const int - //mblk = is_blocked_algo ? 
(BlkSize/mb + mp) : BlkSize; + tavg /= iter_end; + + auto asol = + Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); + Kokkos::deep_copy(asol, a); + + double diff = 0; + for (int i = 0, iend = aref.extent(0); i < iend; ++i) + for (int j = 0, jend = aref.extent(1); j < jend; ++j) + for (int k = 0, kend = aref.extent(2); k < kend; ++k) + diff += std::abs(aref(i, j, k) - asol(i, j, k)); + + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V2" + << " BlkSize = " << std::setw(3) << BlkSize + << " TeamSize = " << std::setw(3) << team_size + << " ScratchSize (KB) = 0" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin); +#if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) + std::cout << " diff to ref = " << diff; +#endif + std::cout << std::endl; + } + } + if (1) { + /// + /// Team V3 + /// + typedef Kokkos::View view_type; + view_type a("a", N * VectorLength, BlkSize, BlkSize); + + double tavg = 0, tmin = tmax; + { + typedef Kokkos::TeamPolicy + policy_type; + typedef Functor functor_type; + + const int lvl = 0, + per_team_scratch = ScratchViewType::shmem_size( + VectorLength, BlkSize, BlkSize); + if (per_team_scratch / 1024 < 48) { + const int is_blocked_algo = + (std::is_same::value), + mb = Algo::LU::Blocked::mb(); + // mp = BlkSize%mb > 0; + + const int + // mblk = is_blocked_algo ? (BlkSize/mb + mp) : BlkSize; mblk = is_blocked_algo ? 
(BlkSize - mb) : (BlkSize - 1); - const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength).team_size_max(functor_type(), Kokkos::ParallelForTag()); - const int team_size = std::min(std::max(mblk*2,1), max_team_size); + const int max_team_size = + policy_type(N, Kokkos::AUTO, VectorLength) + .set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)) + .team_size_max(functor_type(), Kokkos::ParallelForTag()); + const int team_size = std::min(std::max(mblk * 2, 1), max_team_size); - const policy_type policy(N, team_size, VectorLength); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto asol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); - Kokkos::deep_copy(asol, a); - - double diff = 0; - for (int i=0,iend=aref.extent(0);i policy_type; - typedef Functor functor_type; - - const int lvl = 0, per_team_scratch = ScratchViewType::shmem_size(VectorLength, BlkSize, BlkSize); - if (per_team_scratch/1024 < 48) { - const int - is_blocked_algo = (std::is_same::value), - mb = Algo::LU::Blocked::mb(); - // mp = BlkSize%mb > 0; - - const int - //mblk = is_blocked_algo ? (BlkSize/mb + mp) : BlkSize; - mblk = is_blocked_algo ? 
(BlkSize - mb) : (BlkSize - 1); - - const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength).set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)) - .team_size_max(functor_type(), Kokkos::ParallelForTag()); - const int team_size = std::min(std::max(mblk*2,1), max_team_size); - - policy_type policy(N, team_size, VectorLength); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto asol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); - Kokkos::deep_copy(asol, a); - - double diff = 0; - for (int i=0,iend=aref.extent(0);i +template void run(const int N, const int B) { typedef Kokkos::DefaultExecutionSpace ExecSpace; Kokkos::print_configuration(std::cout, false); if (B != 0) { - PerfTest::LU(N,B); + PerfTest::LU(N, B); } else { PerfTest::LU(N, 3); PerfTest::LU(N, 5); - PerfTest::LU(N,10); - PerfTest::LU(N,15); + PerfTest::LU(N, 10); + PerfTest::LU(N, 15); } } int main(int argc, char *argv[]) { - Kokkos::initialize(argc, argv); - int N = 128*128, B = 0; + int N = 128 * 128, B = 0; - for (int i=1;i(N,B); - + run(N, B); + std::cout << "\n Testing LayoutLeft Algo::LU::Blocked\n"; - run(N,B); + run(N, B); } Kokkos::finalize(); diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host.hpp b/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host.hpp index 33cbd78b6c..68daa24eb1 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host.hpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host.hpp @@ -20,312 +20,324 @@ #undef __KOKKOSBATCHED_INTEL_MKL_BATCHED__ namespace KokkosBatched { - namespace PerfTest { +namespace PerfTest { #undef FLOP_MUL #undef FLOP_ADD - // no complex yet -#if defined( KokkosBatched_Test_LU_Host_Complex ) +// no complex yet +#if defined(KokkosBatched_Test_LU_Host_Complex) #define FLOP_MUL 6.0 #define FLOP_ADD 2.0 - typedef Kokkos::complex value_type; +typedef Kokkos::complex value_type; #endif -#if defined( KokkosBatched_Test_LU_Host_Real ) +#if 
defined(KokkosBatched_Test_LU_Host_Real) #define FLOP_MUL 1.0 -#define FLOP_ADD 1.0 - typedef double value_type; +#define FLOP_ADD 1.0 +typedef double value_type; #endif - double FlopCount(int mm, int nn) { - double m = (double)mm; double n = (double)nn; - if (m > n) - return (FLOP_MUL*(0.5*m*n*n-(1.0/6.0)*n*n*n+0.5*m*n-0.5*n*n+(2.0/3.0)*n) + - FLOP_ADD*(0.5*m*n*n-(1.0/6.0)*n*n*n-0.5*m*n+ (1.0/6.0)*n)); - else - return (FLOP_MUL*(0.5*n*m*m-(1.0/6.0)*m*m*m+0.5*n*m-0.5*m*m+(2.0/3.0)*m) + - FLOP_ADD*(0.5*n*m*m-(1.0/6.0)*m*m*m-0.5*n*m+ (1.0/6.0)*m)); - } - - template - void LU(const int NN) { - typedef Kokkos::Schedule ScheduleType; - //typedef Kokkos::Schedule ScheduleType; - - constexpr int VectorLength = DefaultVectorLength::value; - const int N = NN/VectorLength; - - { - std::string value_type_name; - if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) value_type_name = "Kokkos::complex"; - -#if defined(__AVX512F__) - std::cout << "AVX512 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; +double FlopCount(int mm, int nn) { + double m = (double)mm; + double n = (double)nn; + if (m > n) + return (FLOP_MUL * (0.5 * m * n * n - (1.0 / 6.0) * n * n * n + + 0.5 * m * n - 0.5 * n * n + (2.0 / 3.0) * n) + + FLOP_ADD * (0.5 * m * n * n - (1.0 / 6.0) * n * n * n - + 0.5 * m * n + (1.0 / 6.0) * n)); + else + return (FLOP_MUL * (0.5 * n * m * m - (1.0 / 6.0) * m * m * m + + 0.5 * n * m - 0.5 * m * m + (2.0 / 3.0) * m) + + FLOP_ADD * (0.5 * n * m * m - (1.0 / 6.0) * m * m * m - + 0.5 * n * m + (1.0 / 6.0) * m)); +} + +template +void LU(const int NN) { + typedef Kokkos::Schedule ScheduleType; + // typedef Kokkos::Schedule ScheduleType; + + constexpr int VectorLength = + DefaultVectorLength::value; + const int N = NN / VectorLength; + + { + std::string value_type_name; + if (std::is_same::value) value_type_name = "double"; + if (std::is_same >::value) + value_type_name = "Kokkos::complex"; + +#if 
defined(__AVX512F__) + std::cout << "AVX512 is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; #elif defined(__AVX__) || defined(__AVX2__) - std::cout << "AVX or AVX2 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; + std::cout << "AVX or AVX2 is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; #else - std::cout << "SIMD (compiler vectorization) is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; + std::cout << "SIMD (compiler vectorization) is defined: datatype " + << value_type_name << " a vector length " << VectorLength << "\n"; #endif + } + + const double flop = (N * VectorLength) * FlopCount(BlkSize, BlkSize); + const double tmax = 1.0e15; + + const int iter_begin = -10, iter_end = 100; + Kokkos::Timer timer; + + /// + /// Reference version using MKL DGETRF + /// + Kokkos::View aref; + Kokkos::View amat( + "amat", N * VectorLength, BlkSize, BlkSize); + + Random random; + + for (int k = 0; k < N * VectorLength; ++k) { + // use tridiagonal matrices; for now we just check elementwise l/u factors + // do not allow pivots + for (int i = 0; i < BlkSize; ++i) { + amat(k, i, i) = random.value() + 10.0; + if ((i + 1) < BlkSize) { + amat(k, i, i + 1) = random.value() + 1.0; + amat(k, i + 1, i) = random.value() + 1.0; } - - const double flop = (N*VectorLength)*FlopCount(BlkSize,BlkSize); - const double tmax = 1.0e15; - - const int iter_begin = -10, iter_end = 100; - Kokkos::Timer timer; - - /// - /// Reference version using MKL DGETRF - /// - Kokkos::View aref; - Kokkos::View - amat("amat", N*VectorLength, BlkSize, BlkSize); - - Random random; - - for (int k=0;k, VectorLength> VectorType; + Kokkos::View amat_simd( + "amat_simd", N, BlkSize, BlkSize); //, a("a", N, BlkSize, BlkSize); + + Kokkos::parallel_for( + "KokkosBatched::PerfTest::LUHost::Pack", + Kokkos::RangePolicy(0, N * VectorLength), + 
KOKKOS_LAMBDA(const int k) { + const int k0 = k / VectorLength, k1 = k % VectorLength; + for (int i = 0; i < BlkSize; ++i) + for (int j = 0; j < BlkSize; ++j) { + amat_simd(k0, i, j)[k1] = amat(k0 * VectorLength + k1, i, j); } - } - } + }); - typedef Vector,VectorLength> VectorType; - Kokkos::View - amat_simd("amat_simd", N, BlkSize, BlkSize); //, a("a", N, BlkSize, BlkSize); - - Kokkos::parallel_for("KokkosBatched::PerfTest::LUHost::Pack", - Kokkos::RangePolicy(0, N*VectorLength), - KOKKOS_LAMBDA(const int k) { - const int k0 = k/VectorLength, k1 = k%VectorLength; - for (int i=0;i flush; - - /// - /// Reference version using MKL DGETRF - /// -#if defined(__KOKKOSBATCHED_INTEL_MKL__) - { - Kokkos::View a("a", N*VectorLength, BlkSize, BlkSize); - Kokkos::View p("p", N*VectorLength, BlkSize); - { - double tavg = 0, tmin = tmax; - for (int iter=iter_begin;iter policy(0, N*VectorLength); - Kokkos::parallel_for("KokkosBatched::PerfTest::LUHost::LAPACKE_dgetrfOpenMP", - policy, - KOKKOS_LAMBDA(const int k) { - auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); - auto pp = Kokkos::subview(p, k, Kokkos::ALL()); - LAPACKE_dgetrf(LAPACK_ROW_MAJOR, - BlkSize, BlkSize, - (double*)aa.data(), aa.stride_0(), - (int*)pp.data()); - }); - - HostSpaceType().fence(); - const double t = timer.seconds(); - tmin = std::min(tmin, t); - tavg += (iter >= 0)*t; - } - tavg /= iter_end; - - std::cout << std::setw(10) << "MKL LU" - << " BlkSize = " << std::setw(3) << BlkSize - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop/tavg) - << " max flop/s = " << (flop/tmin) - << std::endl; - } + // for KNL + constexpr size_t LLC_CAPACITY = 34 * 1024 * 1024; + Flush flush; - aref = a; + /// + /// Reference version using MKL DGETRF + /// +#if defined(__KOKKOSBATCHED_INTEL_MKL__) + { + Kokkos::View a( + "a", N * VectorLength, BlkSize, BlkSize); + Kokkos::View p( + "p", N * VectorLength, BlkSize); + { + double tavg = 0, tmin = tmax; + for (int iter = iter_begin; iter < 
iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrix + Kokkos::deep_copy(a, amat); + + HostSpaceType().fence(); + timer.reset(); + + Kokkos::RangePolicy policy( + 0, N * VectorLength); + Kokkos::parallel_for( + "KokkosBatched::PerfTest::LUHost::LAPACKE_dgetrfOpenMP", policy, + KOKKOS_LAMBDA(const int k) { + auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + auto pp = Kokkos::subview(p, k, Kokkos::ALL()); + LAPACKE_dgetrf(LAPACK_ROW_MAJOR, BlkSize, BlkSize, + (double*)aa.data(), aa.stride_0(), + (int*)pp.data()); + }); + + HostSpaceType().fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } + tavg /= iter_end; + + std::cout << std::setw(10) << "MKL LU" + << " BlkSize = " << std::setw(3) << BlkSize + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) << std::endl; + } + + aref = a; + } #if defined(__KOKKOSBATCHED_INTEL_MKL_BATCHED__) #endif #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__) - { - Kokkos::View - a("a", N, BlkSize, BlkSize); - - { - double tavg = 0, tmin = tmax; - MKL_COMPACT_PACK format; - if (VectorLength == 8) format = MKL_COMPACT_AVX512; - else if (VectorLength == 4) format = MKL_COMPACT_AVX; - - if (format == MKL_COMPACT_AVX512 || format == MKL_COMPACT_AVX) { - int info; - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - double diff = 0; - for (int i=0,iend=aref.extent(0);i= 0) * t; } + tavg /= iter_end; + + double diff = 0; + for (int i = 0, iend = aref.extent(0); i < iend; ++i) + for (int j = 0, jend = aref.extent(1); j < jend; ++j) + for (int k = 0, kend = aref.extent(2); k < kend; ++k) + diff += abs(aref(i, j, k) - + a(i / VectorLength, j, k)[i % VectorLength]); + + std::cout << std::setw(10) << "MKL Cmpt" + << " BlkSize = " << std::setw(3) << BlkSize + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) + << 
" diff to ref = " << diff << std::endl; } + } + } #endif #endif - // /// - // /// Plain version (comparable to micro BLAS version) - // /// - - // { - // Kokkos::View - // a("a", N*VectorLength, BlkSize, BlkSize); - - // { - // double tavg = 0, tmin = tmax; - // for (int iter=iter_begin;iter policy(0, N*VectorLength); - // Kokkos::parallel_for - // (policy, - // KOKKOS_LAMBDA(const int k) { - // auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); - - // SerialLU::invoke(aa); - // }); - - // HostSpaceType().fence(); - // const double t = timer.seconds(); - // tmin = std::min(tmin, t); - // tavg += (iter >= 0)*t; - // } - // tavg /= iter_end; - - // double diff = 0; - // for (int i=0,iend=aref.extent(0);i policy(0, N); - Kokkos::parallel_for("KokkosBatched::PerfTest::LUHost::SIMDSerialOpenMP", - policy, - KOKKOS_LAMBDA(const int k) { - auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); - - SerialLU::invoke(aa); - }); - - HostSpaceType().fence(); - const double t = timer.seconds(); - tmin = std::min(tmin, t); - tavg += (iter >= 0)*t; - } - tavg /= iter_end; - - double diff = 0; - for (int i=0,iend=aref.extent(0);i policy(0, + // N*VectorLength); Kokkos::parallel_for + // (policy, + // KOKKOS_LAMBDA(const int k) { + // auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + + // SerialLU::invoke(aa); + // }); + + // HostSpaceType().fence(); + // const double t = timer.seconds(); + // tmin = std::min(tmin, t); + // tavg += (iter >= 0)*t; + // } + // tavg /= iter_end; + + // double diff = 0; + // for (int i=0,iend=aref.extent(0);i policy(0, N); + Kokkos::parallel_for( + "KokkosBatched::PerfTest::LUHost::SIMDSerialOpenMP", policy, + KOKKOS_LAMBDA(const int k) { + auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + + SerialLU::invoke(aa); + }); + + HostSpaceType().fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } + tavg /= iter_end; + + double diff = 0; + for (int i = 0, iend = 
aref.extent(0); i < iend; ++i) + for (int j = 0, jend = aref.extent(1); j < jend; ++j) + for (int k = 0, kend = aref.extent(2); k < kend; ++k) + diff += abs(aref(i, j, k) - + a(i / VectorLength, j, k)[i % VectorLength]); + std::cout << std::setw(10) << "SIMD" + << " BlkSize = " << std::setw(3) << BlkSize + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) + << " diff to ref = " << diff << std::endl; } + } +} - } // namespace PerfTest -} // namespace KokkosBatched - - +} // namespace PerfTest +} // namespace KokkosBatched diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp index 7d352283c6..6c0736501d 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp @@ -5,36 +5,35 @@ using namespace KokkosBatched; -template +template void run(const int N) { typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; Kokkos::print_configuration(std::cout, false); - PerfTest::LU< 3, HostSpaceType,AlgoTagType>(N); - PerfTest::LU< 5, HostSpaceType,AlgoTagType>(N); - PerfTest::LU<10, HostSpaceType,AlgoTagType>(N); - PerfTest::LU<15, HostSpaceType,AlgoTagType>(N); + PerfTest::LU<3, HostSpaceType, AlgoTagType>(N); + PerfTest::LU<5, HostSpaceType, AlgoTagType>(N); + PerfTest::LU<10, HostSpaceType, AlgoTagType>(N); + PerfTest::LU<15, HostSpaceType, AlgoTagType>(N); } -int main(int argc, char *argv[]) { - +int main(int argc, char* argv[]) { Kokkos::initialize(argc, argv); #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) - int N = 128*128; + int N = 128 * 128; - for (int i=1;i(N); - + std::cout << "\n Testing Algo::LU::Blocked\n"; run(N); diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp index 807b7a884e..de6fb2582e 100644 --- 
a/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp @@ -3,7 +3,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Timer.hpp" -#if defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_CUDA) #include @@ -24,755 +24,742 @@ #include "KokkosBatched_Trsm_Team_Impl.hpp" namespace KokkosBatched { - namespace PerfTest { - +namespace PerfTest { + #undef FLOP_MUL #undef FLOP_ADD #define FLOP_MUL 1.0 #define FLOP_ADD 1.0 - typedef double value_type; +typedef double value_type; - double FlopCountLower(int mm, int nn) { - double m = (double)mm; double n = (double)nn; - return (FLOP_MUL*(0.5*m*n*(n+1.0)) + - FLOP_ADD*(0.5*m*n*(n-1.0))); - } - - double FlopCountUpper(int mm, int nn) { - double m = (double)mm; double n = (double)nn; - return (FLOP_MUL*(0.5*m*n*(n+1.0)) + - FLOP_ADD*(0.5*m*n*(n-1.0))); +double FlopCountLower(int mm, int nn) { + double m = (double)mm; + double n = (double)nn; + return (FLOP_MUL * (0.5 * m * n * (n + 1.0)) + + FLOP_ADD * (0.5 * m * n * (n - 1.0))); +} + +double FlopCountUpper(int mm, int nn) { + double m = (double)mm; + double n = (double)nn; + return (FLOP_MUL * (0.5 * m * n * (n + 1.0)) + + FLOP_ADD * (0.5 * m * n * (n - 1.0))); +} + +struct RangeTag {}; +struct TeamTagV1 {}; +struct TeamTagV2 {}; +struct TeamTagV3 {}; +struct TeamTagHandmade {}; + +template +struct Functor { + ConstUnmanagedViewType _a; + UnmanagedViewType _b; + + KOKKOS_INLINE_FUNCTION + Functor() = default; + + KOKKOS_INLINE_FUNCTION + Functor(const ViewType &a, const ViewType &b) : _a(a), _b(b) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const RangeTag &, const int k) const { + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); + + switch (test) { + case 0: + SerialTrsm::invoke(1.0, aa, bb); + break; + case 1: + SerialTrsm::invoke(1.0, aa, bb); + break; + case 2: + SerialTrsm::invoke(1.0, aa, bb); + break; + case 3: + 
SerialTrsm::invoke(1.0, aa, bb); + break; + case 4: + SerialTrsm::invoke(1.0, aa, bb); + break; } + } - struct RangeTag {}; - struct TeamTagV1 {}; - struct TeamTagV2 {}; - struct TeamTagV3 {}; - struct TeamTagHandmade {}; - - template - struct Functor { - ConstUnmanagedViewType _a; - UnmanagedViewType _b; - - KOKKOS_INLINE_FUNCTION - Functor() = default; - - KOKKOS_INLINE_FUNCTION - Functor(const ViewType &a, - const ViewType &b) - : _a(a), _b(b) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const RangeTag &, const int k) const { - auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV1 &, + const MemberType &member) const { + const int kbeg = + (member.league_rank() * (member.team_size() * VectorLength) + + member.team_rank() * VectorLength); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_b.extent(0))) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - switch (test) { - case 0: - SerialTrsm:: - invoke(1.0, aa, bb); - break; - case 1: - SerialTrsm:: - invoke(1.0, aa, bb); - break; - case 2: - SerialTrsm:: - invoke(1.0, aa, bb); - break; - case 3: - SerialTrsm:: - invoke(1.0, aa, bb); - break; - case 4: - SerialTrsm:: - invoke(1.0, aa, bb); - break; - } - } - - template - KOKKOS_INLINE_FUNCTION - void operator()(const TeamTagV1 &, const MemberType &member) const { - const int kbeg = (member.league_rank()*(member.team_size()*VectorLength) + - member.team_rank()*VectorLength); - Kokkos::parallel_for - (Kokkos::ThreadVectorRange(member, VectorLength), - [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_b.extent(0))) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), 
Kokkos::ALL()); - - switch (test) { - case 0: - SerialTrsm:: - invoke(1.0, aa, bb); + switch (test) { + case 0: + SerialTrsm::invoke(1.0, aa, bb); break; case 1: - SerialTrsm:: - invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, bb); break; case 2: - SerialTrsm:: - invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, bb); break; case 3: - SerialTrsm:: - invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, bb); break; case 4: - SerialTrsm:: - invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, bb); break; - } } - }); - } - - template - KOKKOS_INLINE_FUNCTION - void operator()(const TeamTagV2 &, const MemberType &member) const { - const int kbeg = member.league_rank()*VectorLength; - Kokkos::parallel_for - (Kokkos::ThreadVectorRange(member, VectorLength), - [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_b.extent(0))) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - - switch (test) { - case 0: - TeamTrsm:: - invoke(member, 1.0, aa, bb); + } + }); + } + + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV2 &, + const MemberType &member) const { + const int kbeg = member.league_rank() * VectorLength; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_b.extent(0))) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); + + switch (test) { + case 0: + TeamTrsm::invoke(member, 1.0, aa, bb); break; case 1: - TeamTrsm:: - invoke(member, 1.0, aa, bb); + TeamTrsm::invoke(member, 1.0, aa, bb); break; case 2: - TeamTrsm:: - invoke(member, 1.0, aa, bb); + TeamTrsm::invoke(member, 1.0, aa, bb); break; case 3: - TeamTrsm:: - invoke(member, 1.0, aa, bb); + TeamTrsm::invoke(member, 1.0, aa, bb); break; case 4: - TeamTrsm:: - invoke(member, 1.0, aa, bb); + TeamTrsm::invoke(member, 1.0, aa, bb); break; - } } 
- }); - } + } + }); + } + + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV3 &, + const MemberType &member) const { + const int lvl = 0; + ScratchViewType sa(member.team_scratch(lvl), VectorLength, + _a.extent(1), _a.extent(2)); + // ScratchViewType sb(member.team_scratch(lvl), VectorLength, + // _b.extent(1), _b.extent(2)); - template - KOKKOS_INLINE_FUNCTION - void operator()(const TeamTagV3 &, const MemberType &member) const { - const int lvl = 0; - ScratchViewType sa(member.team_scratch(lvl), VectorLength, _a.extent(1), _a.extent(2)); - //ScratchViewType sb(member.team_scratch(lvl), VectorLength, _b.extent(1), _b.extent(2)); - - const int kbeg = member.league_rank()*VectorLength; - Kokkos::parallel_for - (Kokkos::ThreadVectorRange(member, VectorLength), - [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_b.extent(0))) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - - auto saa = Kokkos::subview(sa, k, Kokkos::ALL(), Kokkos::ALL()); - - TeamCopy::invoke(member, aa, saa); - member.team_barrier(); - - switch (test) { - case 0: - TeamTrsm:: - invoke(member, 1.0, saa, bb); + const int kbeg = member.league_rank() * VectorLength; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_b.extent(0))) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); + + auto saa = Kokkos::subview(sa, k, Kokkos::ALL(), Kokkos::ALL()); + + TeamCopy::invoke(member, aa, saa); + member.team_barrier(); + + switch (test) { + case 0: + TeamTrsm::invoke(member, 1.0, saa, bb); break; case 1: - TeamTrsm:: - invoke(member, 1.0, saa, bb); + TeamTrsm::invoke(member, 1.0, saa, bb); break; case 2: - TeamTrsm:: - invoke(member, 1.0, saa, bb); + TeamTrsm::invoke(member, 1.0, saa, bb); break; case 3: - TeamTrsm:: - 
invoke(member, 1.0, saa, bb); + TeamTrsm::invoke(member, 1.0, saa, bb); break; case 4: - TeamTrsm:: - invoke(member, 1.0, saa, bb); + TeamTrsm::invoke(member, 1.0, saa, bb); break; - } } - }); - } + } + }); + } +}; - }; +template +void Trsm(const int NN, const int BlkSize, const int NumCols) { + typedef Kokkos::Schedule ScheduleType; + constexpr int VectorLength = + DefaultVectorLength::value; + const int N = NN / VectorLength; - template - void Trsm(const int NN, const int BlkSize, const int NumCols) { - typedef Kokkos::Schedule ScheduleType; + { + std::string value_type_name; + if (std::is_same::value) value_type_name = "double"; + if (std::is_same >::value) + value_type_name = "Kokkos::complex"; - constexpr int VectorLength = DefaultVectorLength::value; - const int N = NN/VectorLength; - - { - std::string value_type_name; - if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) value_type_name = "Kokkos::complex"; - - std::cout << "SIMD is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; - } + std::cout << "SIMD is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; + } - switch (test) { - case 0: std::cout << "TestID = Left, Lower, NoTrans, UnitDiag\n"; break; - case 1: std::cout << "TestID = Left, Lower, NoTrans, NonUnitDiag\n"; break; - case 2: std::cout << "TestID = Right, Upper, NoTrans, UnitDiag\n"; break; - case 3: std::cout << "TestID = Right, Upper, NoTrans, NonUnitDiag\n"; break; - case 4: std::cout << "TestID = Left, Upper, NoTrans, NonUnitDiag\n"; break; - } + switch (test) { + case 0: std::cout << "TestID = Left, Lower, NoTrans, UnitDiag\n"; break; + case 1: std::cout << "TestID = Left, Lower, NoTrans, NonUnitDiag\n"; break; + case 2: std::cout << "TestID = Right, Upper, NoTrans, UnitDiag\n"; break; + case 3: std::cout << "TestID = Right, Upper, NoTrans, NonUnitDiag\n"; break; + case 4: std::cout << "TestID = Left, Upper, NoTrans, NonUnitDiag\n"; 
break; + } - // when m == n, lower upper does not matter (unit and nonunit) - double flop = 0; - switch (test) { - case 0: - case 1: - flop = FlopCountLower(BlkSize,NumCols); - break; - case 2: - case 3: - case 4: - flop = FlopCountUpper(BlkSize,NumCols); - break; - } - flop *= (N*VectorLength); - const double tmax = 1.0e15; - - typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; - typedef typename DeviceSpaceType::memory_space DeviceMemorySpaceType; - - const int iter_begin = -3, iter_end = 30; - Kokkos::Timer timer; - - Kokkos::View - amat("amat", N*VectorLength, BlkSize, BlkSize), - bmat("bmat", N*VectorLength, BlkSize, NumCols), - bref("bmat", N*VectorLength, BlkSize, NumCols); - - { - Random random; - for (int k=0;k flush; + // when m == n, lower upper does not matter (unit and nonunit) + double flop = 0; + switch (test) { + case 0: + case 1: flop = FlopCountLower(BlkSize, NumCols); break; + case 2: + case 3: + case 4: flop = FlopCountUpper(BlkSize, NumCols); break; + } + flop *= (N * VectorLength); + const double tmax = 1.0e15; -#if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - if (1) { - /// - /// CUBLAS Batch version - /// - const Kokkos::LayoutStride stride(N*VectorLength, BlkSize*BlkSize, - BlkSize, 1, - BlkSize, BlkSize); + typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; + typedef typename DeviceSpaceType::memory_space DeviceMemorySpaceType; - Kokkos::View - a("a", stride), - b("b", stride); + const int iter_begin = -3, iter_end = 30; + Kokkos::Timer timer; - cublasStatus_t stat; - cublasHandle_t handle; + Kokkos::View amat( + "amat", N * VectorLength, BlkSize, BlkSize), + bmat("bmat", N * VectorLength, BlkSize, NumCols), + bref("bmat", N * VectorLength, BlkSize, NumCols); - stat = cublasCreate(&handle); - if (stat != CUBLAS_STATUS_SUCCESS) - Kokkos::abort("CUBLAS initialization failed\n"); + { + Random random; + for (int k = 0; k < N * VectorLength; ++k) { + for (int i = 0; i < BlkSize; ++i) + for (int j = 0; j < BlkSize; ++j) + amat(k, i, j) 
= random.value() + 4.0 * (i == j); + for (int i = 0; i < BlkSize; ++i) + for (int j = 0; j < NumCols; ++j) bmat(k, i, j) = random.value(); + } + } - auto amat_device = Kokkos::create_mirror_view(typename DeviceSpaceType::memory_space(), amat); - auto bmat_device = Kokkos::create_mirror_view(typename DeviceSpaceType::memory_space(), bmat); + // P100 L2 cache 4MB per core + constexpr size_t LLC_CAPACITY = 56 * 4 * 1024 * 1024; + Flush flush; - Kokkos::deep_copy(amat_device, amat); - Kokkos::deep_copy(bmat_device, bmat); +#if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) + if (1) { + /// + /// CUBLAS Batch version + /// + const Kokkos::LayoutStride stride(N * VectorLength, BlkSize * BlkSize, + BlkSize, 1, BlkSize, BlkSize); + + Kokkos::View a( + "a", stride), + b("b", stride); + + cublasStatus_t stat; + cublasHandle_t handle; + + stat = cublasCreate(&handle); + if (stat != CUBLAS_STATUS_SUCCESS) + Kokkos::abort("CUBLAS initialization failed\n"); + + auto amat_device = Kokkos::create_mirror_view( + typename DeviceSpaceType::memory_space(), amat); + auto bmat_device = Kokkos::create_mirror_view( + typename DeviceSpaceType::memory_space(), bmat); + + Kokkos::deep_copy(amat_device, amat); + Kokkos::deep_copy(bmat_device, bmat); + + Kokkos::fence(); + + const double one(1.0); //, zero(0.0); + { + double tavg = 0, tmin = tmax; + value_type *aa[N * VectorLength], *bb[N * VectorLength]; + for (int k = 0; k < N * VectorLength; ++k) { + aa[k] = a.data() + k * a.stride_0(); + bb[k] = b.data() + k * b.stride_0(); + } + value_type **aa_device, **bb_device; + if (cudaMalloc(&aa_device, N * VectorLength * sizeof(value_type *)) != + cudaSuccess || + cudaMalloc(&bb_device, N * VectorLength * sizeof(value_type *)) != + cudaSuccess) { + Kokkos::abort("CUDA memory allocation failed\n"); + } + if (cudaMemcpy(aa_device, aa, sizeof(value_type *) * N * VectorLength, + cudaMemcpyHostToDevice) != cudaSuccess || + cudaMemcpy(bb_device, bb, sizeof(value_type *) * N * VectorLength, + 
cudaMemcpyHostToDevice) != cudaSuccess) { + Kokkos::abort("CUDA memcpy failed\n"); + } + Kokkos::fence(); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); - Kokkos::fence(); + // initialize matrices + Kokkos::deep_copy(a, amat_device); + Kokkos::deep_copy(b, bmat_device); - const double one(1.0); //, zero(0.0); - { - double tavg = 0, tmin = tmax; - value_type - *aa[N*VectorLength], - *bb[N*VectorLength]; - for (int k=0;k= 0)*t; + case 3: { + // Right, Upper, NoTrans, NonUnitDiag + stat = cublasDtrsmBatched( + handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, + CUBLAS_DIAG_NON_UNIT, BlkSize, NumCols, &one, + (const value_type **)aa_device, BlkSize, + (value_type **)bb_device, BlkSize, N * VectorLength); + break; } - tavg /= iter_end; - - auto bsol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); - Kokkos::deep_copy(bsol, b); - Kokkos::deep_copy(bref, bsol); - - if (cudaFree(aa_device) != cudaSuccess || - cudaFree(bb_device) != cudaSuccess) { - Kokkos::abort("CUDA memory free failed\n"); + case 4: { + // Left, Upper, NoTrans, NonUnitDiag + stat = cublasDtrsmBatched( + handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, + CUBLAS_DIAG_NON_UNIT, BlkSize, NumCols, &one, + (const value_type **)aa_device, BlkSize, + (value_type **)bb_device, BlkSize, N * VectorLength); + break; } - - std::cout << std::setw(8) << "CUBLAS" - << std::setw(8) << "Batched" - << " BlkSize = " << std::setw(3) << BlkSize - << " NumCols = " << std::setw(3) << NumCols - << " TeamSize = N/A" - << " ScratchSize (KB) = N/A" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop/tavg) - << " max flop/s = " << (flop/tmin) - << std::endl; } - cublasDestroy(handle); + + if (stat != CUBLAS_STATUS_SUCCESS) { + Kokkos::abort("CUBLAS Trsm Batched failed\n"); + } + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } + tavg /= iter_end; + + auto bsol = + 
Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); + Kokkos::deep_copy(bsol, b); + Kokkos::deep_copy(bref, bsol); + + if (cudaFree(aa_device) != cudaSuccess || + cudaFree(bb_device) != cudaSuccess) { + Kokkos::abort("CUDA memory free failed\n"); + } + + std::cout << std::setw(8) << "CUBLAS" << std::setw(8) << "Batched" + << " BlkSize = " << std::setw(3) << BlkSize + << " NumCols = " << std::setw(3) << NumCols << " TeamSize = N/A" + << " ScratchSize (KB) = N/A" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) << std::endl; + } + cublasDestroy(handle); + } #endif - if (1) { - /// - /// Range policy version - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, NumCols); - - double tavg = 0, tmin = tmax; - { - typedef Functor functor_type; - const Kokkos::RangePolicy policy(0, N*VectorLength); - - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto bsol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); - Kokkos::deep_copy(bsol, b); - - double diff = 0; - for (int i=0,iend=bref.extent(0);i functor_type; + const Kokkos::RangePolicy policy( + 0, N * VectorLength); + + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + + Kokkos::fence(); + timer.reset(); + + Kokkos::parallel_for("KokkosBatched::PerfTest::RangeTag", policy, + functor_type(a, b)); + + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; + } + tavg /= iter_end; + + auto bsol = + Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); + Kokkos::deep_copy(bsol, b); + + double diff = 0; + for (int i = 0, iend = bref.extent(0); i < iend; ++i) + for (int j = 0, jend = bref.extent(1); j < jend; ++j) + for (int k = 0, kend = 
bref.extent(2); k < kend; ++k) + diff += std::abs(bref(i, j, k) - bsol(i, j, k)); + + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Range" + << " BlkSize = " << std::setw(3) << BlkSize + << " NumCols = " << std::setw(3) << NumCols << " TeamSize = N/A" + << " ScratchSize (KB) = 0" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - std::cout << " diff to ref = " << diff; + std::cout << " diff to ref = " << diff; #endif - std::cout << std::endl; - } - } + std::cout << std::endl; + } + } - if (1) { - /// - /// Team policy V1 - almost same scheduling with range policy - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, NumCols); - - double tavg = 0, tmin = tmax; - { - typedef Kokkos::TeamPolicy policy_type; - typedef Functor functor_type; - - const int team_size = - policy_type(N/32, Kokkos::AUTO, VectorLength).team_size_recommended(functor_type(), Kokkos::ParallelForTag()); - - const policy_type policy(N/team_size, team_size, VectorLength); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto bsol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); - Kokkos::deep_copy(bsol, b); - - double diff = 0; - for (int i=0,iend=bref.extent(0);i + policy_type; + typedef Functor functor_type; + + const int team_size = + policy_type(N / 32, Kokkos::AUTO, VectorLength) + .team_size_recommended(functor_type(), Kokkos::ParallelForTag()); + + const policy_type policy(N / team_size, team_size, VectorLength); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + + Kokkos::fence(); + timer.reset(); + + Kokkos::parallel_for("KokkosBatched::PerfTest::TeamTagV1", policy, + functor_type(a, b)); + + Kokkos::fence(); + const double t = 
timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; + } + tavg /= iter_end; + + auto bsol = + Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); + Kokkos::deep_copy(bsol, b); + + double diff = 0; + for (int i = 0, iend = bref.extent(0); i < iend; ++i) + for (int j = 0, jend = bref.extent(1); j < jend; ++j) + for (int k = 0, kend = bref.extent(2); k < kend; ++k) + diff += std::abs(bref(i, j, k) - bsol(i, j, k)); + + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V1" + << " BlkSize = " << std::setw(3) << BlkSize + << " NumCols = " << std::setw(3) << NumCols + << " TeamSize = " << std::setw(3) << team_size + << " ScratchSize (KB) = 0" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - std::cout << " diff to ref = " << diff; + std::cout << " diff to ref = " << diff; #endif - std::cout << std::endl; - } - } + std::cout << std::endl; + } + } - if (1) { - /// - /// Team policy V2 - team parallel - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, NumCols); - - double tavg = 0, tmin = tmax; - { - typedef Kokkos::TeamPolicy policy_type; - typedef Functor functor_type; - - const int - is_blocked_algo = (std::is_same::value), - mb = Algo::Trsm::Blocked::mb(), - mp = BlkSize%mb > 0; - - const int - mblk = is_blocked_algo ? 
(BlkSize/mb + mp) : BlkSize; - - const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength).team_size_max(functor_type(), Kokkos::ParallelForTag()); - const int team_size = std::min(std::max(NumCols,(mblk-1)*mblk), max_team_size); - - const policy_type policy(N, team_size, VectorLength); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto bsol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); - Kokkos::deep_copy(bsol, b); - - double diff = 0; - for (int i=0,iend=bref.extent(0);i + policy_type; + typedef Functor functor_type; + + const int is_blocked_algo = + (std::is_same::value), + mb = Algo::Trsm::Blocked::mb(), + mp = BlkSize % mb > 0; + + const int mblk = is_blocked_algo ? (BlkSize / mb + mp) : BlkSize; + + const int max_team_size = + policy_type(N, Kokkos::AUTO, VectorLength) + .team_size_max(functor_type(), Kokkos::ParallelForTag()); + const int team_size = + std::min(std::max(NumCols, (mblk - 1) * mblk), max_team_size); + + const policy_type policy(N, team_size, VectorLength); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + + DeviceSpaceType().fence(); + timer.reset(); + + Kokkos::parallel_for("KokkosBatched::PerfTest::TeamTagV2", policy, + functor_type(a, b)); + + DeviceSpaceType().fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; + } + tavg /= iter_end; + + auto bsol = + Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); + Kokkos::deep_copy(bsol, b); + + double diff = 0; + for (int i = 0, iend = bref.extent(0); i < iend; ++i) + for (int j = 0, jend = bref.extent(1); j < jend; ++j) + for (int k = 0, kend = bref.extent(2); k < kend; ++k) + diff += std::abs(bref(i, j, k) - bsol(i, j, k)); + + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V2" + << " BlkSize = " << std::setw(3) << BlkSize + << " 
NumCols = " << std::setw(3) << NumCols + << " TeamSize = " << std::setw(3) << team_size + << " ScratchSize (KB) = 0" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - std::cout << " diff to ref = " << diff; + std::cout << " diff to ref = " << diff; #endif - std::cout << std::endl; - } - } + std::cout << std::endl; + } + } - if (1) { - /// - /// Team policy V3 - team parallel + sratch - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, NumCols); - - double tavg = 0, tmin = tmax; - { - typedef Kokkos::TeamPolicy policy_type; - typedef Functor functor_type; - - const int lvl = 0, per_team_scratch - = ScratchViewType::shmem_size(VectorLength, BlkSize, BlkSize); - - if (per_team_scratch/1024 < 48) { - const int - is_blocked_algo = (std::is_same::value), - mb = Algo::Trsm::Blocked::mb(), - mp = BlkSize%mb > 0; - - const int - mblk = is_blocked_algo ? (BlkSize/mb + mp) : BlkSize; - - const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength).set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)) - .team_size_max(functor_type(), Kokkos::ParallelForTag()); - const int team_size = std::min(std::max(NumCols,(mblk-1)*mblk), max_team_size); - - policy_type policy(N, team_size, VectorLength); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto bsol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); - Kokkos::deep_copy(bsol, b); - - double diff = 0; - for (int i=0,iend=bref.extent(0);i + policy_type; + typedef Functor functor_type; + + const int lvl = 0, + per_team_scratch = ScratchViewType::shmem_size( + VectorLength, BlkSize, BlkSize); + + if (per_team_scratch / 1024 < 48) { + const int is_blocked_algo = + (std::is_same::value), + mb = Algo::Trsm::Blocked::mb(), + mp = BlkSize % mb > 0; + + const int mblk = is_blocked_algo ? 
(BlkSize / mb + mp) : BlkSize; + + const int max_team_size = + policy_type(N, Kokkos::AUTO, VectorLength) + .set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)) + .team_size_max(functor_type(), Kokkos::ParallelForTag()); + const int team_size = + std::min(std::max(NumCols, (mblk - 1) * mblk), max_team_size); + + policy_type policy(N, team_size, VectorLength); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + + DeviceSpaceType().fence(); + timer.reset(); + + Kokkos::parallel_for("KokkosBatched::PerfTest::TeamTagV3", policy, + functor_type(a, b)); + + DeviceSpaceType().fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; + } + tavg /= iter_end; + + auto bsol = Kokkos::create_mirror_view( + typename HostSpaceType::memory_space(), b); + Kokkos::deep_copy(bsol, b); + + double diff = 0; + for (int i = 0, iend = bref.extent(0); i < iend; ++i) + for (int j = 0, jend = bref.extent(1); j < jend; ++j) + for (int k = 0, kend = bref.extent(2); k < kend; ++k) + diff += std::abs(bref(i, j, k) - bsol(i, j, k)); + + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V3" + << " BlkSize = " << std::setw(3) << BlkSize + << " NumCols = " << std::setw(3) << NumCols + << " TeamSize = " << std::setw(3) << team_size + << " ScratchSize (KB) = " << std::setw(3) + << (per_team_scratch / 1024) << " time = " << std::scientific + << tmin << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - std::cout << " diff to ref = " << diff; + std::cout << " diff to ref = " << diff; #endif - std::cout << std::endl; - } else { - std::cout << std::setw(8) << "Kokkos" - << std::setw(8) << "Team V3" - << " Scratch per team is too big (KB): " << (per_team_scratch/1024) - << std::endl; - } - } + std::cout << std::endl; + } else { + std::cout << std::setw(8) << 
"Kokkos" << std::setw(8) << "Team V3" + << " Scratch per team is too big (KB): " + << (per_team_scratch / 1024) << std::endl; } - std::cout << "\n\n"; } } + std::cout << "\n\n"; } +} // namespace PerfTest +} // namespace KokkosBatched using namespace KokkosBatched; -template +template void run(const int N, const int B, const int R) { typedef Kokkos::DefaultExecutionSpace ExecSpace; Kokkos::print_configuration(std::cout, false); if (B != 0 && R != 0) { - PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N,B,R); + PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N, B, R); } else { - std::cout << "\n\n Used for Factorization \n\n"; /// Left, Lower, NoTrans, UnitDiag (used in LU factorization and LU solve) PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N, 3, 3); PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N, 5, 5); - PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N,10,10); - PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N,15,15); + PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N, 10, 10); + PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N, 15, 15); /// Left, Lower, NoTrans, NonUnitDiag PerfTest::Trsm<1, ExecSpace, AlgoTagType>(N, 3, 3); PerfTest::Trsm<1, ExecSpace, AlgoTagType>(N, 5, 5); - PerfTest::Trsm<1, ExecSpace, AlgoTagType>(N,10,10); - PerfTest::Trsm<1, ExecSpace, AlgoTagType>(N,15,15); + PerfTest::Trsm<1, ExecSpace, AlgoTagType>(N, 10, 10); + PerfTest::Trsm<1, ExecSpace, AlgoTagType>(N, 15, 15); /// Right, Upper, NoTrans, UnitDiag PerfTest::Trsm<2, ExecSpace, AlgoTagType>(N, 3, 3); PerfTest::Trsm<2, ExecSpace, AlgoTagType>(N, 5, 5); - PerfTest::Trsm<2, ExecSpace, AlgoTagType>(N,10,10); - PerfTest::Trsm<2, ExecSpace, AlgoTagType>(N,15,15); + PerfTest::Trsm<2, ExecSpace, AlgoTagType>(N, 10, 10); + PerfTest::Trsm<2, ExecSpace, AlgoTagType>(N, 15, 15); /// Right, Upper, NoTrans, NonUnitDiag (used in LU factorization) PerfTest::Trsm<3, ExecSpace, AlgoTagType>(N, 3, 3); PerfTest::Trsm<3, ExecSpace, AlgoTagType>(N, 5, 5); - PerfTest::Trsm<3, ExecSpace, AlgoTagType>(N,10,10); - PerfTest::Trsm<3, ExecSpace, 
AlgoTagType>(N,15,15); + PerfTest::Trsm<3, ExecSpace, AlgoTagType>(N, 10, 10); + PerfTest::Trsm<3, ExecSpace, AlgoTagType>(N, 15, 15); std::cout << "\n\n Used for Solve \n\n"; @@ -780,26 +767,25 @@ void run(const int N, const int B, const int R) { PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N, 3, 1); PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N, 5, 1); - PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N,10, 1); - PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N,15, 1); + PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N, 10, 1); + PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N, 15, 1); /// Left, Upper, Notrans, NonUnitDiag (user in LU solve) PerfTest::Trsm<4, ExecSpace, AlgoTagType>(N, 3, 1); PerfTest::Trsm<4, ExecSpace, AlgoTagType>(N, 5, 1); - PerfTest::Trsm<4, ExecSpace, AlgoTagType>(N,10, 1); - PerfTest::Trsm<4, ExecSpace, AlgoTagType>(N,15, 1); + PerfTest::Trsm<4, ExecSpace, AlgoTagType>(N, 10, 1); + PerfTest::Trsm<4, ExecSpace, AlgoTagType>(N, 15, 1); } } int main(int argc, char *argv[]) { - Kokkos::initialize(argc, argv); - int N = 128*128, B = 0, R = 0; + int N = 128 * 128, B = 0, R = 0; - for (int i=1;i(N,B,R); + run(N, B, R); std::cout << "\n Testing LayoutLeft Algo::Trsm::Blocked\n"; - run(N,B,R); + run(N, B, R); } Kokkos::finalize(); @@ -822,7 +808,7 @@ int main(int argc, char *argv[]) { return 0; } -#else +#else int main(int argc, char *argv[]) { std::cout << "Kokkos::Cuda is not enabled\n"; diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Host.hpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Host.hpp index 0e14fe0cf9..6b57b534b7 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Host.hpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Host.hpp @@ -18,641 +18,600 @@ //#undef __KOKKOSBATCHED_INTEL_MKL_BATCHED__ namespace KokkosBatched { - namespace PerfTest { +namespace PerfTest { #undef FLOP_MUL #undef FLOP_ADD - // no complex yet -#if defined( KokkosBatched_Test_Trsm_Host_Complex ) +// no complex yet +#if 
defined(KokkosBatched_Test_Trsm_Host_Complex) #define FLOP_MUL 6.0 #define FLOP_ADD 2.0 - typedef Kokkos::complex value_type; +typedef Kokkos::complex value_type; #endif -#if defined( KokkosBatched_Test_Trsm_Host_Real ) +#if defined(KokkosBatched_Test_Trsm_Host_Real) #define FLOP_MUL 1.0 -#define FLOP_ADD 1.0 - typedef double value_type; +#define FLOP_ADD 1.0 +typedef double value_type; #endif - double FlopCountLower(int mm, int nn) { - double m = (double)mm; double n = (double)nn; - return (FLOP_MUL*(0.5*m*n*(n+1.0)) + - FLOP_ADD*(0.5*m*n*(n-1.0))); - } - - double FlopCountUpper(int mm, int nn) { - double m = (double)mm; double n = (double)nn; - return (FLOP_MUL*(0.5*m*n*(n+1.0)) + - FLOP_ADD*(0.5*m*n*(n-1.0))); - } - - template - void Trsm(const int NN) { - typedef Kokkos::Schedule ScheduleType; +double FlopCountLower(int mm, int nn) { + double m = (double)mm; + double n = (double)nn; + return (FLOP_MUL * (0.5 * m * n * (n + 1.0)) + + FLOP_ADD * (0.5 * m * n * (n - 1.0))); +} - constexpr int VectorLength = DefaultVectorLength::value; - const int N = NN/VectorLength; +double FlopCountUpper(int mm, int nn) { + double m = (double)mm; + double n = (double)nn; + return (FLOP_MUL * (0.5 * m * n * (n + 1.0)) + + FLOP_ADD * (0.5 * m * n * (n - 1.0))); +} - { - std::string value_type_name; - if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) value_type_name = "Kokkos::complex"; -#if defined(__AVX512F__) - std::cout << "AVX512 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; +template +void Trsm(const int NN) { + typedef Kokkos::Schedule ScheduleType; + + constexpr int VectorLength = + DefaultVectorLength::value; + const int N = NN / VectorLength; + + { + std::string value_type_name; + if (std::is_same::value) value_type_name = "double"; + if (std::is_same >::value) + value_type_name = "Kokkos::complex"; +#if defined(__AVX512F__) + std::cout << "AVX512 is defined: datatype " << value_type_name + << " a 
vector length " << VectorLength << "\n"; #elif defined(__AVX__) || defined(__AVX2__) - std::cout << "AVX or AVX2 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; + std::cout << "AVX or AVX2 is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; #else - std::cout << "SIMD (compiler vectorization) is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; + std::cout << "SIMD (compiler vectorization) is defined: datatype " + << value_type_name << " a vector length " << VectorLength << "\n"; #endif - } + } + + switch (test) { + case 0: std::cout << "TestID = Left, Lower, NoTrans, UnitDiag\n"; break; + case 1: std::cout << "TestID = Left, Lower, NoTrans, NonUnitDiag\n"; break; + case 2: std::cout << "TestID = Right, Upper, NoTrans, UnitDiag\n"; break; + case 3: std::cout << "TestID = Right, Upper, NoTrans, NonUnitDiag\n"; break; + case 4: std::cout << "TestID = Left, Upper, NoTrans, NonUnitDiag\n"; break; + } - switch (test) { - case 0: std::cout << "TestID = Left, Lower, NoTrans, UnitDiag\n"; break; - case 1: std::cout << "TestID = Left, Lower, NoTrans, NonUnitDiag\n"; break; - case 2: std::cout << "TestID = Right, Upper, NoTrans, UnitDiag\n"; break; - case 3: std::cout << "TestID = Right, Upper, NoTrans, NonUnitDiag\n"; break; - case 4: std::cout << "TestID = Left, Upper, NoTrans, NonUnitDiag\n"; break; + // when m == n, lower upper does not matter (unit and nonunit) + double flop = 0; + switch (test) { + case 0: + case 1: flop = FlopCountLower(BlkSize, NumCols); break; + case 2: + case 3: + case 4: flop = FlopCountUpper(BlkSize, NumCols); break; + } + flop *= (N * VectorLength); + + const double tmax = 1.0e15; + + const int iter_begin = -10, iter_end = 100; + Kokkos::Timer timer; + + /// + /// Reference version using MKL DTRSM + /// + Kokkos::View bref; + Kokkos::View amat( + "amat", N * VectorLength, BlkSize, BlkSize), + bmat("bmat", N * VectorLength, 
BlkSize, NumCols); + + typedef Vector, VectorLength> VectorType; + Kokkos::View amat_simd( + "amat_simd", N, BlkSize, BlkSize), + bmat_simd("bmat_simd", N, BlkSize, NumCols); + + Random random; + + for (int k = 0; k < N * VectorLength; ++k) { + const int k0 = k / VectorLength, k1 = k % VectorLength; + for (int i = 0; i < BlkSize; ++i) + for (int j = 0; j < BlkSize; ++j) { + amat(k, i, j) = random.value() + 4.0 * (i == j); + amat_simd(k0, i, j)[k1] = amat(k, i, j); + } + for (int i = 0; i < BlkSize; ++i) + for (int j = 0; j < NumCols; ++j) { + bmat(k, i, j) = random.value(); + bmat_simd(k0, i, j)[k1] = bmat(k, i, j); } + } + + // for KNL + constexpr size_t LLC_CAPACITY = 34 * 1024 * 1024; + Flush flush; - // when m == n, lower upper does not matter (unit and nonunit) - double flop = 0; - switch (test) { - case 0: - case 1: - flop = FlopCountLower(BlkSize,NumCols); - break; - case 2: - case 3: - case 4: - flop = FlopCountUpper(BlkSize,NumCols); - break; + /// + /// Reference version using MKL DTRSM + /// +#if defined(__KOKKOSBATCHED_INTEL_MKL__) + { + Kokkos::View a( + "a", N * VectorLength, BlkSize, BlkSize), + b("b", N * VectorLength, BlkSize, NumCols); + + { + double tavg = 0, tmin = tmax; + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + + HostSpaceType().fence(); + timer.reset(); + + Kokkos::RangePolicy policy( + 0, N * VectorLength); + Kokkos::parallel_for( + "KokkosBatched::PerfTest::TrsmHost::MKLOpenMP", policy, + KOKKOS_LAMBDA(const int k) { + auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); + + switch (test) { + case 0: + cblas_dtrsm(CblasRowMajor, CblasLeft, CblasLower, + CblasNoTrans, CblasUnit, BlkSize, NumCols, 1.0, + (double *)aa.data(), aa.stride_0(), + (double *)bb.data(), bb.stride_0()); + break; + case 1: + cblas_dtrsm(CblasRowMajor, CblasLeft, 
CblasLower, + CblasNoTrans, CblasNonUnit, BlkSize, NumCols, 1.0, + (double *)aa.data(), aa.stride_0(), + (double *)bb.data(), bb.stride_0()); + break; + case 2: + cblas_dtrsm(CblasRowMajor, CblasRight, CblasUpper, + CblasNoTrans, CblasUnit, BlkSize, NumCols, 1.0, + (double *)aa.data(), aa.stride_0(), + (double *)bb.data(), bb.stride_0()); + break; + case 3: + cblas_dtrsm(CblasRowMajor, CblasRight, CblasUpper, + CblasNoTrans, CblasNonUnit, BlkSize, NumCols, 1.0, + (double *)aa.data(), aa.stride_0(), + (double *)bb.data(), bb.stride_0()); + break; + case 4: + cblas_dtrsm(CblasRowMajor, CblasLeft, CblasUpper, + CblasNoTrans, CblasNonUnit, BlkSize, NumCols, 1.0, + (double *)aa.data(), aa.stride_0(), + (double *)bb.data(), bb.stride_0()); + break; + } + }); + + HostSpaceType().fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } - flop *= (N*VectorLength); - - const double tmax = 1.0e15; - - const int iter_begin = -10, iter_end = 100; - Kokkos::Timer timer; - - /// - /// Reference version using MKL DTRSM - /// - Kokkos::View bref; - Kokkos::View - amat("amat", N*VectorLength, BlkSize, BlkSize), - bmat("bmat", N*VectorLength, BlkSize, NumCols); - - typedef Vector,VectorLength> VectorType; - Kokkos::View - amat_simd("amat_simd", N, BlkSize, BlkSize), - bmat_simd("bmat_simd", N, BlkSize, NumCols); - - Random random; - - for (int k=0;k flush; - - /// - /// Reference version using MKL DTRSM - /// -#if defined(__KOKKOSBATCHED_INTEL_MKL__) - { - Kokkos::View - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, NumCols); - - { - double tavg = 0, tmin = tmax; - for (int iter=iter_begin;iter policy(0, N*VectorLength); - Kokkos::parallel_for("KokkosBatched::PerfTest::TrsmHost::MKLOpenMP", policy, - KOKKOS_LAMBDA(const int k) { - auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); - - switch (test) { - case 0: - 
cblas_dtrsm(CblasRowMajor, - CblasLeft, CblasLower, CblasNoTrans, CblasUnit, - BlkSize, NumCols, - 1.0, - (double*)aa.data(), aa.stride_0(), - (double*)bb.data(), bb.stride_0()); - break; - case 1: - cblas_dtrsm(CblasRowMajor, - CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, - BlkSize, NumCols, - 1.0, - (double*)aa.data(), aa.stride_0(), - (double*)bb.data(), bb.stride_0()); - break; - case 2: - cblas_dtrsm(CblasRowMajor, - CblasRight, CblasUpper, CblasNoTrans, CblasUnit, - BlkSize, NumCols, - 1.0, - (double*)aa.data(), aa.stride_0(), - (double*)bb.data(), bb.stride_0()); - break; - case 3: - cblas_dtrsm(CblasRowMajor, - CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, - BlkSize, NumCols, - 1.0, - (double*)aa.data(), aa.stride_0(), - (double*)bb.data(), bb.stride_0()); - break; - case 4: - cblas_dtrsm(CblasRowMajor, - CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, - BlkSize, NumCols, - 1.0, - (double*)aa.data(), aa.stride_0(), - (double*)bb.data(), bb.stride_0()); - break; - } - }); - - HostSpaceType().fence(); - const double t = timer.seconds(); - tmin = std::min(tmin, t); - tavg += (iter >= 0)*t; + case 2: { + CBLAS_SIDE side[1] = {CblasRight}; + CBLAS_UPLO uplo[1] = {CblasUpper}; + CBLAS_TRANSPOSE transA[1] = {CblasNoTrans}; + CBLAS_DIAG diag[1] = {CblasUnit}; + + cblas_dtrsm_batch(CblasRowMajor, side, uplo, transA, diag, blksize, + numcols, one, (const double **)aa, lda, + (double **)bb, ldb, 1, size_per_grp); + break; + } + case 3: { + CBLAS_SIDE side[1] = {CblasRight}; + CBLAS_UPLO uplo[1] = {CblasUpper}; + CBLAS_TRANSPOSE transA[1] = {CblasNoTrans}; + CBLAS_DIAG diag[1] = {CblasNonUnit}; + + cblas_dtrsm_batch(CblasRowMajor, side, uplo, transA, diag, blksize, + numcols, one, (const double **)aa, lda, + (double **)bb, ldb, 1, size_per_grp); + break; + } + case 4: { + CBLAS_SIDE side[1] = {CblasLeft}; + CBLAS_UPLO uplo[1] = {CblasUpper}; + CBLAS_TRANSPOSE transA[1] = {CblasNoTrans}; + CBLAS_DIAG diag[1] = {CblasNonUnit}; + + 
cblas_dtrsm_batch(CblasRowMajor, side, uplo, transA, diag, blksize, + numcols, one, (const double **)aa, lda, + (double **)bb, ldb, 1, size_per_grp); + break; } - tavg /= iter_end; - - double sum = 0; - for (int i=0,iend=b.extent(0);i= 0)*t; } - tavg /= iter_end; - - double diff = 0; - for (int i=0,iend=bref.extent(0);i= 0)*t; - } - tavg /= iter_end; - - double diff = 0; - for (int i=0,iend=bref.extent(0);i policy(0, N*VectorLength); - // Kokkos::parallel_for - // (policy, - // KOKKOS_LAMBDA(const int k) { - // auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); - // auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); - - // switch (test) { - // case 0: - // SerialTrsm:: - // invoke(1.0, aa, bb); - // break; - // case 1: - // SerialTrsm:: - // invoke(1.0, aa, bb); - // break; - // case 2: - // SerialTrsm:: - // invoke(1.0, aa, bb); - // break; - // case 3: - // SerialTrsm:: - // invoke(1.0, aa, bb); - // break; - // case 4: - // SerialTrsm:: - // invoke(1.0, aa, bb); - // break; - // } - // }); - - // HostSpaceType().fence(); - // const double t = timer.seconds(); - // tmin = std::min(tmin, t); - // tavg += (iter >= 0)*t; - // } - // tavg /= iter_end; - - // double diff = 0; - // for (int i=0,iend=bref.extent(0);i policy(0, N); - Kokkos::parallel_for("KokkosBatched::PerfTest::TrsmHost::SIMDSerialOpenMP", - policy, - KOKKOS_LAMBDA(const int k) { - auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); - - switch (test) { - case 0: - SerialTrsm:: - invoke(1.0, aa, bb); - break; - case 1: - SerialTrsm:: - invoke(1.0, aa, bb); - break; - case 2: - SerialTrsm:: - invoke(1.0, aa, bb); - break; - case 3: - SerialTrsm:: - invoke(1.0, aa, bb); - break; - case 4: - SerialTrsm:: - invoke(1.0, aa, bb); - break; - } - }); - - HostSpaceType().fence(); - const double t = timer.seconds(); - tmin = std::min(tmin, t); - tavg += (iter >= 0)*t; - } - tavg /= iter_end; - - double diff = 0; - 
for (int i=0,iend=bref.extent(0);i policy(0, + // N*VectorLength); Kokkos::parallel_for + // (policy, + // KOKKOS_LAMBDA(const int k) { + // auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + // auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); + + // switch (test) { + // case 0: + // SerialTrsm:: + // invoke(1.0, aa, bb); + // break; + // case 1: + // SerialTrsm:: + // invoke(1.0, aa, bb); + // break; + // case 2: + // SerialTrsm:: + // invoke(1.0, aa, bb); + // break; + // case 3: + // SerialTrsm:: + // invoke(1.0, aa, bb); + // break; + // case 4: + // SerialTrsm:: + // invoke(1.0, aa, bb); + // break; + // } + // }); + + // HostSpaceType().fence(); + // const double t = timer.seconds(); + // tmin = std::min(tmin, t); + // tavg += (iter >= 0)*t; + // } + // tavg /= iter_end; + + // double diff = 0; + // for (int i=0,iend=bref.extent(0);i policy(0, N); + Kokkos::parallel_for( + "KokkosBatched::PerfTest::TrsmHost::SIMDSerialOpenMP", policy, + KOKKOS_LAMBDA(const int k) { + auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); + + switch (test) { + case 0: + SerialTrsm::invoke(1.0, aa, bb); + break; + case 1: + SerialTrsm::invoke(1.0, aa, bb); + break; + case 2: + SerialTrsm::invoke(1.0, aa, bb); + break; + case 3: + SerialTrsm::invoke(1.0, aa, bb); + break; + case 4: + SerialTrsm::invoke(1.0, aa, bb); + break; + } + }); + + HostSpaceType().fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } - std::cout << "\n\n"; + tavg /= iter_end; + + double diff = 0; + for (int i = 0, iend = bref.extent(0); i < iend; ++i) + for (int j = 0, jend = bref.extent(1); j < jend; ++j) + for (int k = 0, kend = bref.extent(2); k < kend; ++k) + diff += std::abs(bref(i, j, k) - + b(i / VectorLength, j, k)[i % VectorLength]); + + std::cout << std::setw(10) << "KK Vector" + << " BlkSize = " << std::setw(3) << BlkSize + << " NumCols = " << 
std::setw(3) << NumCols + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) + << " diff to ref = " << diff << std::endl; } } + std::cout << "\n\n"; } - +} // namespace PerfTest +} // namespace KokkosBatched diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Host_Real.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Host_Real.cpp index bb82e0e56d..3d45195bb1 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Host_Real.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Host_Real.cpp @@ -5,7 +5,7 @@ using namespace KokkosBatched; -template +template void run(const int N) { typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; @@ -15,56 +15,55 @@ void run(const int N) { /// Left, Lower, NoTrans, UnitDiag (used in LU factorization and LU solve) - PerfTest::Trsm<0, 3, 3, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<0, 5, 5, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<0,10,10, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<0,15,15, HostSpaceType,AlgoTagType>(N); + PerfTest::Trsm<0, 3, 3, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<0, 5, 5, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<0, 10, 10, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<0, 15, 15, HostSpaceType, AlgoTagType>(N); /// Left, Lower, NoTrans, NonUnitDiag - PerfTest::Trsm<1, 3, 3, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<1, 5, 5, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<1,10,10, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<1,15,15, HostSpaceType,AlgoTagType>(N); + PerfTest::Trsm<1, 3, 3, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<1, 5, 5, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<1, 10, 10, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<1, 15, 15, HostSpaceType, AlgoTagType>(N); /// Right, Upper, NoTrans, UnitDiag - PerfTest::Trsm<2, 3, 3, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<2, 5, 5, HostSpaceType,AlgoTagType>(N); - 
PerfTest::Trsm<2,10,10, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<2,15,15, HostSpaceType,AlgoTagType>(N); + PerfTest::Trsm<2, 3, 3, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<2, 5, 5, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<2, 10, 10, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<2, 15, 15, HostSpaceType, AlgoTagType>(N); /// Right, Upper, NoTrans, NonUnitDiag (used in LU factorization) - PerfTest::Trsm<3, 3, 3, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<3, 5, 5, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<3,10,10, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<3,15,15, HostSpaceType,AlgoTagType>(N); + PerfTest::Trsm<3, 3, 3, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<3, 5, 5, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<3, 10, 10, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<3, 15, 15, HostSpaceType, AlgoTagType>(N); std::cout << "\n\n Used for Solve \n\n"; /// Left, Lower, NoTrans, UnitDiag (used in LU solve) - PerfTest::Trsm<0, 3, 1, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<0, 5, 1, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<0,10, 1, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<0,15, 1, HostSpaceType,AlgoTagType>(N); + PerfTest::Trsm<0, 3, 1, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<0, 5, 1, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<0, 10, 1, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<0, 15, 1, HostSpaceType, AlgoTagType>(N); /// Left, Upper, Notrans, NonUnitDiag (user in LU solve) - PerfTest::Trsm<4, 3, 1, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<4, 5, 1, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<4,10, 1, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<4,15, 1, HostSpaceType,AlgoTagType>(N); + PerfTest::Trsm<4, 3, 1, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<4, 5, 1, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<4, 10, 1, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<4, 15, 1, HostSpaceType, AlgoTagType>(N); } -int main(int argc, char *argv[]) { - +int 
main(int argc, char* argv[]) { Kokkos::initialize(argc, argv); #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) - int N = 128*128; + int N = 128 * 128; - for (int i=1;i(N); #endif - } #endif Kokkos::finalize(); diff --git a/perf_test/blas/KokkosBlas_blas1.cpp b/perf_test/blas/KokkosBlas_blas1.cpp index 01c6c430fa..764f800f39 100644 --- a/perf_test/blas/KokkosBlas_blas1.cpp +++ b/perf_test/blas/KokkosBlas_blas1.cpp @@ -46,10 +46,10 @@ #include #include #ifdef HAVE_MPI -# include +#include #else -# include -#endif // HAVE_MPI +#include +#endif // HAVE_MPI using Teuchos::Comm; using Teuchos::CommandLineProcessor; @@ -60,61 +60,58 @@ using Teuchos::TimeMonitor; // Create a new timer with the given name if it hasn't already been // created, else get the previously created timer with that name. -RCP