From 6e6d6df6a431fc560da8acf16c42685d5afe6cb8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 20 Jul 2024 22:07:03 +0900 Subject: [PATCH 1/3] Bump actions/dependency-review-action from 4.3.3 to 4.3.4 (#2279) Bumps [actions/dependency-review-action](https://github.com/actions/dependency-review-action) from 4.3.3 to 4.3.4. - [Release notes](https://github.com/actions/dependency-review-action/releases) - [Commits](https://github.com/actions/dependency-review-action/compare/72eb03d02c7872a771aacd928f3123ac62ad6d3a...5a2ce3f5b92ee19cbb1541a4984c76d921601d7c) --- updated-dependencies: - dependency-name: actions/dependency-review-action dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/dependency-review.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index 1792f0181c..56d5770ba5 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -24,4 +24,4 @@ jobs: - name: 'Checkout Repository' uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - name: 'Dependency Review' - uses: actions/dependency-review-action@72eb03d02c7872a771aacd928f3123ac62ad6d3a # v4.3.3 + uses: actions/dependency-review-action@5a2ce3f5b92ee19cbb1541a4984c76d921601d7c # v4.3.4 From 8ffebaaa4f974ed7baf255f3bffe789d0af29115 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 20 Jul 2024 22:07:31 +0900 Subject: [PATCH 2/3] Bump github/codeql-action from 3.25.11 to 3.25.12 (#2278) Bumps [github/codeql-action](https://github.com/github/codeql-action) from 3.25.11 to 3.25.12. 
- [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/b611370bb5703a7efb587f9d136a52ea24c5c38c...4fa2a7953630fd2f3fb380f21be14ede0169dd4f) --- updated-dependencies: - dependency-name: github/codeql-action dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/codeql.yml | 4 ++-- .github/workflows/scorecards.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index e1f8aa51f8..49ce377c26 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -44,7 +44,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@b611370bb5703a7efb587f9d136a52ea24c5c38c # v3.25.11 + uses: github/codeql-action/init@4fa2a7953630fd2f3fb380f21be14ede0169dd4f # v3.25.12 with: languages: c-cpp # If you wish to specify custom queries, you can do so here or in a config file. @@ -100,6 +100,6 @@ jobs: run: make -j2 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@b611370bb5703a7efb587f9d136a52ea24c5c38c # v3.25.11 + uses: github/codeql-action/analyze@4fa2a7953630fd2f3fb380f21be14ede0169dd4f # v3.25.12 with: category: "/language:c-cpp" diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index bf06213b40..d81ed0e1bc 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -73,6 +73,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. 
- name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@b611370bb5703a7efb587f9d136a52ea24c5c38c # v3.25.11 + uses: github/codeql-action/upload-sarif@4fa2a7953630fd2f3fb380f21be14ede0169dd4f # v3.25.12 with: sarif_file: results.sarif From 985c3a9e7343c2f612560024cae4d968f800c8ac Mon Sep 17 00:00:00 2001 From: James Foucar Date: Sat, 20 Jul 2024 07:10:45 -0600 Subject: [PATCH 3/3] Propose increasing column limit to 120. (#2255) * Change key files * Full reformat * Update format.yml * Update ubuntu version for format checker --- .clang-format | 1 + .git-blame-ignore-revs | 17 + .github/workflows/format.yml | 6 +- batched/KokkosBatched_Util.hpp | 276 +- .../impl/KokkosBatched_AddRadial_Impl.hpp | 17 +- .../impl/KokkosBatched_AddRadial_Internal.hpp | 9 +- ...kosBatched_ApplyGivens_Serial_Internal.hpp | 42 +- ...osBatched_ApplyHouseholder_Serial_Impl.hpp | 30 +- ...tched_ApplyHouseholder_Serial_Internal.hpp | 20 +- ...tched_ApplyHouseholder_TeamVector_Impl.hpp | 34 +- ...d_ApplyHouseholder_TeamVector_Internal.hpp | 76 +- .../impl/KokkosBatched_ApplyPivot_Impl.hpp | 88 +- .../KokkosBatched_ApplyPivot_Internal.hpp | 86 +- .../impl/KokkosBatched_ApplyQ_Serial_Impl.hpp | 48 +- .../KokkosBatched_ApplyQ_Serial_Internal.hpp | 42 +- .../KokkosBatched_ApplyQ_TeamVector_Impl.hpp | 60 +- ...kkosBatched_ApplyQ_TeamVector_Internal.hpp | 48 +- .../dense/impl/KokkosBatched_Axpy_Impl.hpp | 238 +- .../dense/impl/KokkosBatched_Copy_Impl.hpp | 184 +- .../impl/KokkosBatched_Copy_Internal.hpp | 80 +- .../dense/impl/KokkosBatched_Dot_Internal.hpp | 277 +- ...Batched_Eigendecomposition_Serial_Impl.hpp | 36 +- ...hed_Eigendecomposition_Serial_Internal.hpp | 47 +- ...hed_Eigendecomposition_TeamVector_Impl.hpp | 41 +- ...Eigendecomposition_TeamVector_Internal.hpp | 22 +- ...kkosBatched_Eigenvalue_Serial_Internal.hpp | 38 +- .../impl/KokkosBatched_FindAmax_Internal.hpp | 11 +- .../KokkosBatched_Francis_Serial_Internal.hpp | 55 +- .../impl/KokkosBatched_Gemm_Serial_Impl.hpp | 
268 +- .../KokkosBatched_Gemm_Serial_Internal.hpp | 35 +- .../KokkosBatched_Gemm_TeamVector_Impl.hpp | 64 +- ...KokkosBatched_Gemm_TeamVector_Internal.hpp | 72 +- .../impl/KokkosBatched_Gemm_Team_Impl.hpp | 144 +- .../impl/KokkosBatched_Gemm_Team_Internal.hpp | 83 +- .../KokkosBatched_Gemv_TeamVector_Impl.hpp | 63 +- ...KokkosBatched_Gemv_TeamVector_Internal.hpp | 69 +- .../impl/KokkosBatched_Gemv_Team_Impl.hpp | 71 +- .../impl/KokkosBatched_Gemv_Team_Internal.hpp | 67 +- .../dense/impl/KokkosBatched_Gesv_Impl.hpp | 343 +- .../KokkosBatched_Givens_Serial_Internal.hpp | 11 +- .../KokkosBatched_HadamardProduct_Impl.hpp | 158 +- ...atched_HessenbergFormQ_Serial_Internal.hpp | 17 +- ...HessenbergQR_WithShift_Serial_Internal.hpp | 30 +- ...kkosBatched_Hessenberg_Serial_Internal.hpp | 16 +- ...okkosBatched_HostLevel_Gemm_Armpl_Impl.hpp | 58 +- ...kkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp | 401 +- .../KokkosBatched_HostLevel_Gemm_Impl.hpp | 176 +- ...kkosBatched_HostLevel_Gemm_Serial_Impl.hpp | 43 +- .../KokkosBatched_HostLevel_Gemm_Spec.hpp | 279 +- .../KokkosBatched_Householder_Serial_Impl.hpp | 8 +- ...kosBatched_Householder_Serial_Internal.hpp | 8 +- ...kosBatched_Householder_TeamVector_Impl.hpp | 9 +- ...atched_Householder_TeamVector_Internal.hpp | 16 +- ...okkosBatched_InnerGemmFixA_Serial_Impl.hpp | 517 +- ...okkosBatched_InnerGemmFixB_Serial_Impl.hpp | 504 +- ...okkosBatched_InnerGemmFixC_Serial_Impl.hpp | 538 +- .../KokkosBatched_InnerGemmFixC_Team_Impl.hpp | 51 +- .../KokkosBatched_InnerLU_Serial_Impl.hpp | 76 +- .../KokkosBatched_InnerTrsm_Serial_Impl.hpp | 472 +- .../KokkosBatched_InverseLU_Serial_Impl.hpp | 35 +- .../impl/KokkosBatched_LU_Serial_Impl.hpp | 37 +- .../impl/KokkosBatched_LU_Serial_Internal.hpp | 31 +- .../dense/impl/KokkosBatched_LU_Team_Impl.hpp | 16 +- .../impl/KokkosBatched_LU_Team_Internal.hpp | 80 +- ...ftEigenvectorFromSchur_Serial_Internal.hpp | 34 +- .../impl/KokkosBatched_Normalize_Internal.hpp | 9 +- 
.../impl/KokkosBatched_Pttrf_Serial_Impl.hpp | 23 +- .../KokkosBatched_Pttrf_Serial_Internal.hpp | 27 +- ...KokkosBatched_QR_FormQ_Serial_Internal.hpp | 12 +- ...osBatched_QR_FormQ_TeamVector_Internal.hpp | 17 +- .../impl/KokkosBatched_QR_Serial_Impl.hpp | 7 +- .../impl/KokkosBatched_QR_Serial_Internal.hpp | 11 +- .../impl/KokkosBatched_QR_TeamVector_Impl.hpp | 7 +- .../KokkosBatched_QR_TeamVector_Internal.hpp | 11 +- ..._QR_WithColumnPivoting_TeamVector_Impl.hpp | 16 +- ...WithColumnPivoting_TeamVector_Internal.hpp | 36 +- ...htEigenvectorFromSchur_Serial_Internal.hpp | 30 +- .../impl/KokkosBatched_SVD_Serial_Impl.hpp | 60 +- .../KokkosBatched_SVD_Serial_Internal.hpp | 139 +- ...KokkosBatched_Schur2x2_Serial_Internal.hpp | 39 +- .../KokkosBatched_Schur_Serial_Internal.hpp | 80 +- .../impl/KokkosBatched_SetIdentity_Impl.hpp | 9 +- .../KokkosBatched_SetIdentity_Internal.hpp | 20 +- .../KokkosBatched_SetTriangular_Internal.hpp | 21 +- ...kosBatched_ShiftedTrsv_Serial_Internal.hpp | 28 +- ...KokkosBatched_SolveUTV_TeamVector_Impl.hpp | 31 +- ...osBatched_SolveUTV_TeamVector_Internal.hpp | 77 +- .../impl/KokkosBatched_Tbsv_Serial_Impl.hpp | 71 +- .../KokkosBatched_Tbsv_Serial_Internal.hpp | 73 +- .../impl/KokkosBatched_Trmm_Serial_Impl.hpp | 144 +- .../KokkosBatched_Trmm_Serial_Internal.hpp | 218 +- .../impl/KokkosBatched_Trsm_Serial_Impl.hpp | 321 +- .../KokkosBatched_Trsm_Serial_Internal.hpp | 92 +- .../KokkosBatched_Trsm_TeamVector_Impl.hpp | 60 +- ...KokkosBatched_Trsm_TeamVector_Internal.hpp | 92 +- .../impl/KokkosBatched_Trsm_Team_Impl.hpp | 196 +- .../impl/KokkosBatched_Trsm_Team_Internal.hpp | 177 +- .../impl/KokkosBatched_Trsv_Serial_Impl.hpp | 222 +- .../KokkosBatched_Trsv_Serial_Internal.hpp | 99 +- .../KokkosBatched_Trsv_TeamVector_Impl.hpp | 52 +- ...KokkosBatched_Trsv_TeamVector_Internal.hpp | 71 +- .../impl/KokkosBatched_Trsv_Team_Impl.hpp | 103 +- .../impl/KokkosBatched_Trsv_Team_Internal.hpp | 89 +- .../impl/KokkosBatched_Trtri_Serial_Impl.hpp | 10 +- 
.../KokkosBatched_Trtri_Serial_Internal.hpp | 46 +- .../KokkosBatched_UTV_TeamVector_Impl.hpp | 17 +- .../KokkosBatched_UTV_TeamVector_Internal.hpp | 33 +- .../KokkosBatched_UpdateGivens_Internal.hpp | 5 +- .../impl/KokkosBatched_Vector_SIMD_Arith.hpp | 445 +- .../KokkosBatched_Vector_SIMD_Logical.hpp | 41 +- .../impl/KokkosBatched_Vector_SIMD_Math.hpp | 45 +- .../impl/KokkosBatched_Vector_SIMD_Misc.hpp | 76 +- .../KokkosBatched_Vector_SIMD_Relation.hpp | 40 +- .../impl/KokkosBatched_Vector_SIMD_View.hpp | 178 +- ...Batched_WilkinsonShift_Serial_Internal.hpp | 10 +- .../dense/impl/KokkosBatched_Xpay_Impl.hpp | 193 +- .../src/KokkosBatched_AddRadial_Decl.hpp | 7 +- .../KokkosBatched_ApplyHouseholder_Decl.hpp | 13 +- .../src/KokkosBatched_ApplyPivot_Decl.hpp | 7 +- .../dense/src/KokkosBatched_ApplyQ_Decl.hpp | 52 +- batched/dense/src/KokkosBatched_Axpy.hpp | 12 +- batched/dense/src/KokkosBatched_Copy_Decl.hpp | 48 +- batched/dense/src/KokkosBatched_Dot.hpp | 12 +- .../KokkosBatched_Eigendecomposition_Decl.hpp | 18 +- batched/dense/src/KokkosBatched_Gemm_Decl.hpp | 49 +- batched/dense/src/KokkosBatched_Gemv_Decl.hpp | 110 +- batched/dense/src/KokkosBatched_Gesv.hpp | 17 +- .../src/KokkosBatched_HadamardProduct.hpp | 27 +- .../src/KokkosBatched_HostLevel_Gemm.hpp | 33 +- .../KokkosBatched_HostLevel_Gemm_Handle.hpp | 39 +- .../src/KokkosBatched_Householder_Decl.hpp | 7 +- .../src/KokkosBatched_InnerGemmFixA_Decl.hpp | 16 +- .../src/KokkosBatched_InnerGemmFixB_Decl.hpp | 16 +- .../src/KokkosBatched_InnerGemmFixC_Decl.hpp | 34 +- .../dense/src/KokkosBatched_InnerLU_Decl.hpp | 6 +- .../src/KokkosBatched_InnerTrsm_Decl.hpp | 36 +- .../src/KokkosBatched_InverseLU_Decl.hpp | 20 +- .../dense/src/KokkosBatched_Kernel_Handle.hpp | 14 +- batched/dense/src/KokkosBatched_LU_Decl.hpp | 10 +- batched/dense/src/KokkosBatched_Pttrf.hpp | 3 +- batched/dense/src/KokkosBatched_QR_Decl.hpp | 18 +- ...kkosBatched_QR_WithColumnPivoting_Decl.hpp | 10 +- 
batched/dense/src/KokkosBatched_SVD_Decl.hpp | 12 +- .../dense/src/KokkosBatched_Scale_Decl.hpp | 50 +- .../src/KokkosBatched_SetIdentity_Decl.hpp | 6 +- batched/dense/src/KokkosBatched_Set_Decl.hpp | 50 +- .../dense/src/KokkosBatched_SolveLU_Decl.hpp | 48 +- .../dense/src/KokkosBatched_SolveUTV_Decl.hpp | 12 +- batched/dense/src/KokkosBatched_Tbsv.hpp | 6 +- batched/dense/src/KokkosBatched_Trmm_Decl.hpp | 7 +- batched/dense/src/KokkosBatched_Trsm_Decl.hpp | 38 +- batched/dense/src/KokkosBatched_Trsv_Decl.hpp | 231 +- batched/dense/src/KokkosBatched_UTV_Decl.hpp | 10 +- batched/dense/src/KokkosBatched_Vector.hpp | 34 +- .../dense/src/KokkosBatched_Vector_SIMD.hpp | 77 +- batched/dense/src/KokkosBatched_Xpay.hpp | 12 +- .../unit_test/Test_Batched_BatchedGemm.hpp | 242 +- .../Test_Batched_BatchedGemm_Complex.hpp | 118 +- .../Test_Batched_BatchedGemm_Real.hpp | 182 +- .../unit_test/Test_Batched_DenseUtils.hpp | 14 +- .../unit_test/Test_Batched_SerialAxpy.hpp | 31 +- .../Test_Batched_SerialAxpy_Complex.hpp | 3 +- .../Test_Batched_SerialAxpy_Real.hpp | 8 +- .../unit_test/Test_Batched_SerialGemm.hpp | 142 +- .../Test_Batched_SerialGemm_Complex.hpp | 48 +- .../Test_Batched_SerialGemm_Real.hpp | 112 +- .../unit_test/Test_Batched_SerialGesv.hpp | 48 +- .../Test_Batched_SerialInverseLU.hpp | 62 +- .../Test_Batched_SerialInverseLU_Complex.hpp | 6 +- .../dense/unit_test/Test_Batched_SerialLU.hpp | 14 +- .../unit_test/Test_Batched_SerialPttrf.hpp | 167 +- .../unit_test/Test_Batched_SerialSVD.hpp | 122 +- .../unit_test/Test_Batched_SerialSolveLU.hpp | 48 +- .../Test_Batched_SerialSolveLU_Complex.hpp | 6 +- .../unit_test/Test_Batched_SerialTbsv.hpp | 119 +- .../Test_Batched_SerialTbsv_Complex.hpp | 64 +- .../Test_Batched_SerialTbsv_Real.hpp | 80 +- .../unit_test/Test_Batched_SerialTrmm.hpp | 144 +- .../Test_Batched_SerialTrmm_Complex.hpp | 270 +- .../Test_Batched_SerialTrmm_Real.hpp | 198 +- .../unit_test/Test_Batched_SerialTrsm.hpp | 62 +- .../Test_Batched_SerialTrsm_Complex.hpp 
| 112 +- .../Test_Batched_SerialTrsm_Real.hpp | 110 +- .../unit_test/Test_Batched_SerialTrsv.hpp | 69 +- .../Test_Batched_SerialTrsv_Complex.hpp | 48 +- .../Test_Batched_SerialTrsv_Real.hpp | 36 +- .../unit_test/Test_Batched_SerialTrtri.hpp | 89 +- .../Test_Batched_SerialTrtri_Complex.hpp | 28 +- .../Test_Batched_SerialTrtri_Real.hpp | 12 +- .../dense/unit_test/Test_Batched_TeamAxpy.hpp | 48 +- .../Test_Batched_TeamAxpy_Complex.hpp | 3 +- .../unit_test/Test_Batched_TeamAxpy_Real.hpp | 4 +- .../dense/unit_test/Test_Batched_TeamGemm.hpp | 148 +- .../Test_Batched_TeamGemm_Complex.hpp | 52 +- .../unit_test/Test_Batched_TeamGemm_Real.hpp | 120 +- .../dense/unit_test/Test_Batched_TeamGesv.hpp | 61 +- .../unit_test/Test_Batched_TeamGesv_Real.hpp | 6 +- .../unit_test/Test_Batched_TeamInverseLU.hpp | 72 +- .../Test_Batched_TeamInverseLU_Complex.hpp | 6 +- .../dense/unit_test/Test_Batched_TeamLU.hpp | 26 +- .../unit_test/Test_Batched_TeamSolveLU.hpp | 60 +- .../Test_Batched_TeamSolveLU_Complex.hpp | 6 +- .../dense/unit_test/Test_Batched_TeamTrsm.hpp | 77 +- .../Test_Batched_TeamTrsm_Complex.hpp | 150 +- .../unit_test/Test_Batched_TeamTrsm_Real.hpp | 140 +- .../dense/unit_test/Test_Batched_TeamTrsv.hpp | 60 +- .../unit_test/Test_Batched_TeamVectorAxpy.hpp | 49 +- .../Test_Batched_TeamVectorAxpy_Complex.hpp | 3 +- .../unit_test/Test_Batched_TeamVectorGemm.hpp | 166 +- .../Test_Batched_TeamVectorGemm_Complex.hpp | 50 +- .../Test_Batched_TeamVectorGemm_Real.hpp | 100 +- .../unit_test/Test_Batched_TeamVectorGesv.hpp | 65 +- .../Test_Batched_TeamVectorGesv_Real.hpp | 12 +- .../unit_test/Test_Batched_TeamVectorQR.hpp | 66 +- ...atched_TeamVectorQR_WithColumnPivoting.hpp | 90 +- .../Test_Batched_TeamVectorSolveUTV.hpp | 105 +- .../Test_Batched_TeamVectorSolveUTV2.hpp | 115 +- .../unit_test/Test_Batched_TeamVectorUTV.hpp | 133 +- .../Test_Batched_VectorArithmatic.hpp | 138 +- .../unit_test/Test_Batched_VectorLogical.hpp | 54 +- .../unit_test/Test_Batched_VectorMath.hpp | 69 +- 
.../unit_test/Test_Batched_VectorMisc.hpp | 27 +- .../unit_test/Test_Batched_VectorRelation.hpp | 38 +- .../unit_test/Test_Batched_VectorView.hpp | 270 +- .../impl/KokkosBatched_CG_TeamVector_Impl.hpp | 91 +- .../impl/KokkosBatched_CG_Team_Impl.hpp | 92 +- .../impl/KokkosBatched_GMRES_Serial_Impl.hpp | 93 +- .../KokkosBatched_GMRES_TeamVector_Impl.hpp | 355 +- .../impl/KokkosBatched_GMRES_Team_Impl.hpp | 348 +- .../impl/KokkosBatched_Spmv_Serial_Impl.hpp | 205 +- .../KokkosBatched_Spmv_TeamVector_Impl.hpp | 371 +- .../impl/KokkosBatched_Spmv_Team_Impl.hpp | 307 +- batched/sparse/src/KokkosBatched_CG.hpp | 18 +- .../sparse/src/KokkosBatched_CrsMatrix.hpp | 42 +- batched/sparse/src/KokkosBatched_GMRES.hpp | 21 +- batched/sparse/src/KokkosBatched_Identity.hpp | 17 +- .../sparse/src/KokkosBatched_JacobiPrec.hpp | 40 +- .../src/KokkosBatched_Krylov_Handle.hpp | 46 +- .../src/KokkosBatched_Krylov_Solvers.hpp | 121 +- batched/sparse/src/KokkosBatched_Spmv.hpp | 140 +- .../unit_test/Test_Batched_SerialGMRES.hpp | 91 +- .../Test_Batched_SerialGMRES_Real.hpp | 8 +- .../unit_test/Test_Batched_SerialSpmv.hpp | 74 +- .../unit_test/Test_Batched_SparseUtils.hpp | 19 +- .../sparse/unit_test/Test_Batched_TeamCG.hpp | 76 +- .../unit_test/Test_Batched_TeamCG_Real.hpp | 8 +- .../unit_test/Test_Batched_TeamGMRES.hpp | 102 +- .../unit_test/Test_Batched_TeamGMRES_Real.hpp | 8 +- .../unit_test/Test_Batched_TeamSpmv.hpp | 116 +- .../unit_test/Test_Batched_TeamVectorCG.hpp | 81 +- .../Test_Batched_TeamVectorCG_Real.hpp | 8 +- .../Test_Batched_TeamVectorGMRES.hpp | 104 +- .../Test_Batched_TeamVectorGMRES_Real.hpp | 8 +- .../unit_test/Test_Batched_TeamVectorSpmv.hpp | 122 +- blas/impl/KokkosBlas1_abs_impl.hpp | 9 +- blas/impl/KokkosBlas1_abs_spec.hpp | 137 +- blas/impl/KokkosBlas1_axpby_impl.hpp | 116 +- blas/impl/KokkosBlas1_axpby_mv_impl.hpp | 212 +- blas/impl/KokkosBlas1_axpby_spec.hpp | 434 +- ...Blas1_axpby_unification_attempt_traits.hpp | 734 ++- blas/impl/KokkosBlas1_dot_impl.hpp | 18 
+- blas/impl/KokkosBlas1_dot_mv_impl.hpp | 48 +- blas/impl/KokkosBlas1_dot_spec.hpp | 542 +- blas/impl/KokkosBlas1_iamax_impl.hpp | 15 +- blas/impl/KokkosBlas1_iamax_spec.hpp | 307 +- blas/impl/KokkosBlas1_mult_impl.hpp | 29 +- blas/impl/KokkosBlas1_mult_spec.hpp | 213 +- blas/impl/KokkosBlas1_nrm1_impl.hpp | 47 +- blas/impl/KokkosBlas1_nrm1_spec.hpp | 172 +- blas/impl/KokkosBlas1_nrm2_impl.hpp | 69 +- blas/impl/KokkosBlas1_nrm2_spec.hpp | 187 +- blas/impl/KokkosBlas1_nrm2w_impl.hpp | 76 +- blas/impl/KokkosBlas1_nrm2w_spec.hpp | 179 +- blas/impl/KokkosBlas1_nrminf_impl.hpp | 9 +- blas/impl/KokkosBlas1_nrminf_spec.hpp | 168 +- blas/impl/KokkosBlas1_reciprocal_impl.hpp | 9 +- blas/impl/KokkosBlas1_reciprocal_spec.hpp | 150 +- blas/impl/KokkosBlas1_rot_impl.hpp | 7 +- blas/impl/KokkosBlas1_rot_spec.hpp | 76 +- blas/impl/KokkosBlas1_rotg_impl.hpp | 46 +- blas/impl/KokkosBlas1_rotg_spec.hpp | 78 +- blas/impl/KokkosBlas1_rotm_impl.hpp | 29 +- blas/impl/KokkosBlas1_rotm_spec.hpp | 77 +- blas/impl/KokkosBlas1_rotmg_impl.hpp | 15 +- blas/impl/KokkosBlas1_rotmg_spec.hpp | 91 +- blas/impl/KokkosBlas1_scal_impl.hpp | 33 +- blas/impl/KokkosBlas1_scal_mv_impl.hpp | 184 +- blas/impl/KokkosBlas1_scal_spec.hpp | 278 +- blas/impl/KokkosBlas1_serial_scal_impl.hpp | 9 +- blas/impl/KokkosBlas1_set_impl.hpp | 71 +- blas/impl/KokkosBlas1_sum_impl.hpp | 49 +- blas/impl/KokkosBlas1_sum_spec.hpp | 148 +- blas/impl/KokkosBlas1_swap_impl.hpp | 3 +- blas/impl/KokkosBlas1_swap_spec.hpp | 70 +- blas/impl/KokkosBlas1_team_abs_spec.hpp | 12 +- blas/impl/KokkosBlas1_team_axpby_spec.hpp | 20 +- blas/impl/KokkosBlas1_team_dot_spec.hpp | 17 +- blas/impl/KokkosBlas1_team_mult_spec.hpp | 24 +- blas/impl/KokkosBlas1_team_nrm2_spec.hpp | 23 +- blas/impl/KokkosBlas1_team_scal_impl.hpp | 63 +- blas/impl/KokkosBlas1_team_scal_spec.hpp | 16 +- blas/impl/KokkosBlas1_team_update_spec.hpp | 27 +- blas/impl/KokkosBlas1_update_impl.hpp | 144 +- blas/impl/KokkosBlas1_update_spec.hpp | 252 +- 
blas/impl/KokkosBlas2_gemv_impl.hpp | 438 +- blas/impl/KokkosBlas2_gemv_spec.hpp | 108 +- blas/impl/KokkosBlas2_ger_impl.hpp | 118 +- blas/impl/KokkosBlas2_ger_spec.hpp | 87 +- blas/impl/KokkosBlas2_serial_gemv_impl.hpp | 100 +- ...osBlas2_serial_gemv_inner_multiple_dot.hpp | 134 +- .../impl/KokkosBlas2_serial_gemv_internal.hpp | 43 +- blas/impl/KokkosBlas2_syr2_impl.hpp | 231 +- blas/impl/KokkosBlas2_syr2_spec.hpp | 110 +- blas/impl/KokkosBlas2_syr_impl.hpp | 150 +- blas/impl/KokkosBlas2_syr_spec.hpp | 90 +- blas/impl/KokkosBlas2_team_gemv_impl.hpp | 131 +- blas/impl/KokkosBlas2_team_gemv_spec.hpp | 177 +- blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp | 56 +- blas/impl/KokkosBlas3_gemm_impl.hpp | 611 +-- blas/impl/KokkosBlas3_gemm_spec.hpp | 299 +- blas/impl/KokkosBlas3_trmm_impl.hpp | 69 +- blas/impl/KokkosBlas3_trmm_spec.hpp | 94 +- blas/impl/KokkosBlas3_trsm_impl.hpp | 287 +- blas/impl/KokkosBlas3_trsm_spec.hpp | 111 +- blas/impl/KokkosBlas_serial_axpy.hpp | 20 +- blas/impl/KokkosBlas_serial_nrm2.hpp | 19 +- blas/impl/KokkosBlas_util.hpp | 9 +- blas/src/KokkosBlas1_abs.hpp | 40 +- blas/src/KokkosBlas1_axpby.hpp | 97 +- blas/src/KokkosBlas1_dot.hpp | 156 +- blas/src/KokkosBlas1_fill.hpp | 3 +- blas/src/KokkosBlas1_iamax.hpp | 78 +- blas/src/KokkosBlas1_mult.hpp | 69 +- blas/src/KokkosBlas1_nrm1.hpp | 92 +- blas/src/KokkosBlas1_nrm2.hpp | 112 +- blas/src/KokkosBlas1_nrm2_squared.hpp | 88 +- blas/src/KokkosBlas1_nrm2w.hpp | 75 +- blas/src/KokkosBlas1_nrm2w_squared.hpp | 90 +- blas/src/KokkosBlas1_nrminf.hpp | 88 +- blas/src/KokkosBlas1_reciprocal.hpp | 40 +- blas/src/KokkosBlas1_rot.hpp | 63 +- blas/src/KokkosBlas1_rotg.hpp | 40 +- blas/src/KokkosBlas1_rotm.hpp | 63 +- blas/src/KokkosBlas1_rotmg.hpp | 47 +- blas/src/KokkosBlas1_scal.hpp | 69 +- blas/src/KokkosBlas1_set.hpp | 21 +- blas/src/KokkosBlas1_sum.hpp | 64 +- blas/src/KokkosBlas1_swap.hpp | 42 +- blas/src/KokkosBlas1_team_abs.hpp | 3 +- blas/src/KokkosBlas1_team_axpby.hpp | 17 +- 
blas/src/KokkosBlas1_team_dot.hpp | 6 +- blas/src/KokkosBlas1_team_mult.hpp | 10 +- blas/src/KokkosBlas1_team_nrm2.hpp | 6 +- blas/src/KokkosBlas1_team_scal.hpp | 5 +- blas/src/KokkosBlas1_team_update.hpp | 12 +- blas/src/KokkosBlas1_update.hpp | 66 +- blas/src/KokkosBlas2_gemv.hpp | 148 +- blas/src/KokkosBlas2_ger.hpp | 83 +- blas/src/KokkosBlas2_serial_gemv.hpp | 17 +- blas/src/KokkosBlas2_syr.hpp | 69 +- blas/src/KokkosBlas2_syr2.hpp | 93 +- blas/src/KokkosBlas2_team_gemv.hpp | 61 +- blas/src/KokkosBlas3_gemm.hpp | 129 +- blas/src/KokkosBlas3_trmm.hpp | 61 +- blas/src/KokkosBlas3_trsm.hpp | 61 +- blas/src/KokkosBlas_trtri.hpp | 3 +- .../tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp | 69 +- blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp | 583 +-- blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp | 74 +- blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp | 398 +- .../tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp | 189 +- blas/tpls/KokkosBlas1_iamax_tpl_spec_decl.hpp | 708 ++- blas/tpls/KokkosBlas1_mult_tpl_spec_avail.hpp | 3 +- blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp | 136 +- blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp | 481 +- blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp | 71 +- blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp | 568 +-- .../KokkosBlas1_nrminf_tpl_spec_avail.hpp | 33 +- .../tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp | 328 +- blas/tpls/KokkosBlas1_rot_tpl_spec_avail.hpp | 62 +- blas/tpls/KokkosBlas1_rot_tpl_spec_decl.hpp | 512 +- blas/tpls/KokkosBlas1_rotg_tpl_spec_avail.hpp | 195 +- blas/tpls/KokkosBlas1_rotg_tpl_spec_decl.hpp | 946 ++-- blas/tpls/KokkosBlas1_rotm_tpl_spec_avail.hpp | 105 +- blas/tpls/KokkosBlas1_rotm_tpl_spec_decl.hpp | 396 +- .../tpls/KokkosBlas1_rotmg_tpl_spec_avail.hpp | 108 +- blas/tpls/KokkosBlas1_rotmg_tpl_spec_decl.hpp | 443 +- blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp | 122 +- blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp | 458 +- blas/tpls/KokkosBlas1_swap_tpl_spec_avail.hpp | 149 +- blas/tpls/KokkosBlas1_swap_tpl_spec_decl.hpp | 733 
++- .../KokkosBlas1_update_tpl_spec_avail.hpp | 3 +- blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp | 175 +- blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp | 1172 ++--- blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp | 218 +- .../KokkosBlas2_ger_tpl_spec_decl_blas.hpp | 485 +- .../KokkosBlas2_ger_tpl_spec_decl_cublas.hpp | 513 +- .../KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp | 482 +- .../KokkosBlas2_serial_gemv_tpl_spec_decl.hpp | 76 +- blas/tpls/KokkosBlas2_syr2_tpl_spec_avail.hpp | 222 +- .../KokkosBlas2_syr2_tpl_spec_decl_blas.hpp | 466 +- .../KokkosBlas2_syr2_tpl_spec_decl_cublas.hpp | 562 +-- ...KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp | 504 +- blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp | 203 +- .../KokkosBlas2_syr_tpl_spec_decl_blas.hpp | 404 +- .../KokkosBlas2_syr_tpl_spec_decl_cublas.hpp | 505 +- .../KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp | 452 +- blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp | 175 +- blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp | 689 ++- blas/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp | 113 +- blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp | 508 +- blas/tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp | 113 +- blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp | 1348 +++-- blas/tpls/KokkosBlas_Cuda_tpl.hpp | 3 +- blas/tpls/KokkosBlas_Host_tpl.cpp | 837 ++- blas/tpls/KokkosBlas_Host_tpl.hpp | 61 +- blas/tpls/KokkosBlas_Rocm_tpl.hpp | 3 +- blas/tpls/KokkosBlas_tpl_spec.hpp | 53 +- blas/unit_test/Test_Blas1_abs.hpp | 56 +- blas/unit_test/Test_Blas1_asum.hpp | 29 +- blas/unit_test/Test_Blas1_axpby.hpp | 71 +- .../Test_Blas1_axpby_unification.hpp | 999 ++-- blas/unit_test/Test_Blas1_axpy.hpp | 63 +- blas/unit_test/Test_Blas1_dot.hpp | 67 +- blas/unit_test/Test_Blas1_iamax.hpp | 64 +- blas/unit_test/Test_Blas1_mult.hpp | 141 +- blas/unit_test/Test_Blas1_nrm1.hpp | 57 +- blas/unit_test/Test_Blas1_nrm2.hpp | 47 +- blas/unit_test/Test_Blas1_nrm2_squared.hpp | 56 +- blas/unit_test/Test_Blas1_nrm2w.hpp | 53 +- blas/unit_test/Test_Blas1_nrm2w_squared.hpp | 50 +- 
blas/unit_test/Test_Blas1_nrminf.hpp | 47 +- blas/unit_test/Test_Blas1_reciprocal.hpp | 83 +- blas/unit_test/Test_Blas1_rot.hpp | 12 +- blas/unit_test/Test_Blas1_rotg.hpp | 15 +- blas/unit_test/Test_Blas1_rotm.hpp | 15 +- blas/unit_test/Test_Blas1_rotmg.hpp | 19 +- blas/unit_test/Test_Blas1_scal.hpp | 62 +- blas/unit_test/Test_Blas1_serial_setscal.hpp | 89 +- blas/unit_test/Test_Blas1_sum.hpp | 41 +- blas/unit_test/Test_Blas1_swap.hpp | 12 +- blas/unit_test/Test_Blas1_team_abs.hpp | 139 +- blas/unit_test/Test_Blas1_team_axpby.hpp | 150 +- blas/unit_test/Test_Blas1_team_axpy.hpp | 147 +- blas/unit_test/Test_Blas1_team_dot.hpp | 179 +- blas/unit_test/Test_Blas1_team_mult.hpp | 244 +- blas/unit_test/Test_Blas1_team_nrm2.hpp | 49 +- blas/unit_test/Test_Blas1_team_scal.hpp | 188 +- blas/unit_test/Test_Blas1_team_setscal.hpp | 103 +- blas/unit_test/Test_Blas1_team_update.hpp | 258 +- blas/unit_test/Test_Blas1_update.hpp | 152 +- blas/unit_test/Test_Blas2_gemv.hpp | 189 +- blas/unit_test/Test_Blas2_gemv_util.hpp | 129 +- blas/unit_test/Test_Blas2_ger.hpp | 800 +-- blas/unit_test/Test_Blas2_serial_gemv.hpp | 42 +- blas/unit_test/Test_Blas2_syr.hpp | 873 ++-- blas/unit_test/Test_Blas2_syr2.hpp | 980 ++-- blas/unit_test/Test_Blas2_team_gemv.hpp | 21 +- blas/unit_test/Test_Blas2_teamvector_gemv.hpp | 28 +- blas/unit_test/Test_Blas3_gemm.hpp | 198 +- blas/unit_test/Test_Blas3_trmm.hpp | 299 +- blas/unit_test/Test_Blas3_trsm.hpp | 290 +- blas/unit_test/Test_Blas_Newton.hpp | 15 +- blas/unit_test/Test_Blas_rocblas.hpp | 3 +- blas/unit_test/Test_Blas_serial_axpy.hpp | 67 +- blas/unit_test/Test_Blas_serial_nrm2.hpp | 89 +- common/impl/KokkosKernels_Iota.hpp | 10 +- common/impl/KokkosKernels_NaN.hpp | 7 +- common/impl/KokkosKernels_SafeCompare.hpp | 3 +- common/impl/KokkosKernels_ViewUtils.hpp | 8 +- common/src/KokkosKernels_BitUtils.hpp | 3 +- .../KokkosKernels_BlockHashmapAccumulator.hpp | 117 +- common/src/KokkosKernels_BlockUtils.hpp | 44 +- common/src/KokkosKernels_Error.hpp | 
26 +- common/src/KokkosKernels_ExecSpaceUtils.hpp | 121 +- .../src/KokkosKernels_HashmapAccumulator.hpp | 173 +- common/src/KokkosKernels_IOUtils.hpp | 28 +- common/src/KokkosKernels_LowerBound.hpp | 95 +- common/src/KokkosKernels_Macros.hpp | 11 +- common/src/KokkosKernels_Predicates.hpp | 39 +- .../src/KokkosKernels_PrintConfiguration.hpp | 11 +- common/src/KokkosKernels_PrintUtils.hpp | 34 +- common/src/KokkosKernels_SimpleUtils.hpp | 177 +- common/src/KokkosKernels_Sorting.hpp | 398 +- common/src/KokkosKernels_TplsVersion.hpp | 6 +- ...Kernels_Uniform_Initialized_MemoryPool.hpp | 26 +- common/src/KokkosKernels_UpperBound.hpp | 13 +- common/src/KokkosKernels_Utils.hpp | 866 ++-- common/src/KokkosKernels_VectorUtils.hpp | 35 +- common/src/KokkosKernels_helpers.hpp | 40 +- common/src/Kokkos_ArithTraits.hpp | 716 +-- common/src/Kokkos_InnerProductSpaceTraits.hpp | 61 +- common/unit_test/Test_Common_AlignPtrTo.hpp | 29 +- common/unit_test/Test_Common_ArithTraits.hpp | 194 +- common/unit_test/Test_Common_Error.hpp | 3 +- common/unit_test/Test_Common_Iota.hpp | 18 +- common/unit_test/Test_Common_LowerBound.hpp | 67 +- .../Test_Common_PrintConfiguration.hpp | 4 +- common/unit_test/Test_Common_Sorting.hpp | 138 +- common/unit_test/Test_Common_UpperBound.hpp | 67 +- common/unit_test/Test_Common_Version.hpp | 3 +- common/unit_test/Test_Common_float128.hpp | 13 +- .../unit_test/Test_Common_set_bit_count.hpp | 58 +- example/batched_solve/examples_helper.hpp | 45 +- example/batched_solve/static_pivoting.cpp | 31 +- example/batched_solve/team_GMRES.cpp | 140 +- example/gmres/ex_real_A.cpp | 62 +- example/gmres/test_prec.cpp | 73 +- ...kosKernels_Example_Distance2GraphColor.cpp | 178 +- example/graph/PartitioningExample.cpp | 2 +- example/half/xpy.cpp | 21 +- ...kkosKernels_Example_HashmapAccumulator.cpp | 79 +- example/wiki/blas/abs/abs.cpp | 3 +- .../graph/KokkosGraph_wiki_9pt_stencil.hpp | 10 +- .../graph/KokkosGraph_wiki_coarsening.cpp | 5 +- 
.../wiki/graph/KokkosGraph_wiki_coloring.cpp | 17 +- example/wiki/graph/KokkosGraph_wiki_mis2.cpp | 19 +- example/wiki/graph/KokkosGraph_wiki_rcm.cpp | 21 +- .../sparse/KokkosSparse_wiki_bsrmatrix.cpp | 34 +- .../sparse/KokkosSparse_wiki_bsrmatrix_2.cpp | 71 +- .../sparse/KokkosSparse_wiki_crsmatrix.cpp | 22 +- .../sparse/KokkosSparse_wiki_gauss_seidel.cpp | 34 +- .../wiki/sparse/KokkosSparse_wiki_spadd.cpp | 22 +- .../wiki/sparse/KokkosSparse_wiki_spgemm.cpp | 22 +- .../wiki/sparse/KokkosSparse_wiki_spmv.cpp | 24 +- graph/impl/KokkosGraph_BFS_impl.hpp | 41 +- .../impl/KokkosGraph_Distance1Color_impl.hpp | 1136 ++--- .../impl/KokkosGraph_Distance2Color_impl.hpp | 410 +- graph/impl/KokkosGraph_Distance2MIS_impl.hpp | 365 +- .../KokkosGraph_ExplicitCoarsening_impl.hpp | 166 +- graph/impl/KokkosGraph_color_d1_spec.hpp | 89 +- graph/src/KokkosGraph_CoarsenConstruct.hpp | 1050 ++-- graph/src/KokkosGraph_CoarsenHeuristics.hpp | 229 +- graph/src/KokkosGraph_Distance1Color.hpp | 45 +- .../src/KokkosGraph_Distance1ColorHandle.hpp | 292 +- graph/src/KokkosGraph_Distance2Color.hpp | 80 +- .../src/KokkosGraph_Distance2ColorHandle.hpp | 167 +- graph/src/KokkosGraph_ExplicitCoarsening.hpp | 68 +- graph/src/KokkosGraph_MIS2.hpp | 49 +- graph/src/KokkosGraph_Triangle.hpp | 231 +- graph/unit_test/Test_Graph_coarsen.hpp | 221 +- graph/unit_test/Test_Graph_graph_color.hpp | 118 +- .../Test_Graph_graph_color_deterministic.hpp | 75 +- .../Test_Graph_graph_color_distance2.hpp | 291 +- graph/unit_test/Test_Graph_mis2.hpp | 189 +- graph/unit_test/Test_Graph_rcm.hpp | 154 +- lapack/impl/KokkosLapack_gesv_spec.hpp | 88 +- lapack/impl/KokkosLapack_svd_spec.hpp | 114 +- lapack/impl/KokkosLapack_trtri_impl.hpp | 23 +- lapack/impl/KokkosLapack_trtri_spec.hpp | 57 +- lapack/src/KokkosLapack_gesv.hpp | 71 +- lapack/src/KokkosLapack_svd.hpp | 114 +- lapack/src/KokkosLapack_trtri.hpp | 25 +- lapack/tpls/KokkosLapack_Cuda_tpl.hpp | 3 +- lapack/tpls/KokkosLapack_Host_tpl.cpp | 128 +- 
lapack/tpls/KokkosLapack_Host_tpl.hpp | 14 +- lapack/tpls/KokkosLapack_cusolver.hpp | 34 +- .../tpls/KokkosLapack_gesv_tpl_spec_avail.hpp | 147 +- .../tpls/KokkosLapack_gesv_tpl_spec_decl.hpp | 546 +- .../tpls/KokkosLapack_svd_tpl_spec_avail.hpp | 174 +- .../tpls/KokkosLapack_svd_tpl_spec_decl.hpp | 617 +-- .../KokkosLapack_trtri_tpl_spec_avail.hpp | 91 +- .../tpls/KokkosLapack_trtri_tpl_spec_decl.hpp | 253 +- lapack/unit_test/Test_Lapack_gesv.hpp | 177 +- lapack/unit_test/Test_Lapack_svd.hpp | 105 +- lapack/unit_test/Test_Lapack_trtri.hpp | 107 +- ode/impl/KokkosODE_BDF_impl.hpp | 183 +- ode/impl/KokkosODE_Newton_impl.hpp | 29 +- ode/impl/KokkosODE_RungeKuttaTables_impl.hpp | 97 +- ode/impl/KokkosODE_RungeKutta_impl.hpp | 42 +- ode/src/KokkosODE_BDF.hpp | 43 +- ode/src/KokkosODE_Newton.hpp | 15 +- ode/src/KokkosODE_RungeKutta.hpp | 14 +- ode/src/KokkosODE_Types.hpp | 14 +- ode/unit_test/Test_ODE_BDF.hpp | 272 +- ode/unit_test/Test_ODE_Newton.hpp | 152 +- ode/unit_test/Test_ODE_RK.hpp | 197 +- ode/unit_test/Test_ODE_RK_chem.hpp | 50 +- perf_test/Benchmark_Context.hpp | 42 +- .../KokkosKernels_perf_test_instantiation.hpp | 7 +- .../KokkosKernels_perf_test_utilities.hpp | 69 +- perf_test/PerfTestUtilities.cpp | 4 +- perf_test/PerfTestUtilities.hpp | 37 +- ...okkosBatched_Test_BlockJacobi_Tutorial.cpp | 92 +- .../KokkosBatched_Test_BlockTridiagDirect.cpp | 530 +- .../KokkosBatched_Test_BlockTridiagJacobi.cpp | 471 +- .../KokkosBatched_Test_Gemm_Cuda.cpp | 330 +- .../KokkosBatched_Test_Gemm_Host.hpp | 185 +- .../KokkosBatched_Test_Gemv_Host.hpp | 110 +- .../do-not-use/KokkosBatched_Test_LU_Cuda.cpp | 233 +- .../do-not-use/KokkosBatched_Test_LU_Host.hpp | 101 +- .../KokkosBatched_Test_Trsm_Cuda.cpp | 476 +- .../KokkosBatched_Test_Trsm_Host.hpp | 204 +- .../CG/Functor_TestBatchedTeamVectorCG_1.hpp | 35 +- .../CG/Functor_TestBatchedTeamVectorCG_2.hpp | 58 +- .../CG/Functor_TestBatchedTeamVectorCG_3.hpp | 57 +- .../sparse/CG/KokkosBatched_Test_CG.cpp | 172 +- 
.../Functor_TestBatchedTeamVectorGMRES_1.hpp | 56 +- .../Functor_TestBatchedTeamVectorGMRES_2.hpp | 99 +- .../Functor_TestBatchedTeamVectorGMRES_3.hpp | 100 +- .../sparse/GMRES/KokkosBatched_Test_GMRES.cpp | 290 +- .../KokkosBatched_Test_Sparse_Helper.hpp | 30 +- .../sparse/SPMV/KokkosBatched_SPMV_View.hpp | 212 +- .../sparse/SPMV/KokkosBatched_Test_SPMV.cpp | 247 +- .../KokkosBatched_Test_cusolverDn.cpp | 73 +- .../KokkosBatched_Test_cusolverSp.cpp | 168 +- perf_test/blas/KokkosBlas_blas1.cpp | 34 +- perf_test/blas/KokkosBlas_blas1_MV.cpp | 69 +- .../blas1/KokkosBlas_dot_mv_perf_test.cpp | 28 +- .../KokkosBlas_dot_mv_perf_test_benchmark.cpp | 18 +- .../blas/blas1/KokkosBlas_dot_perf_test.cpp | 22 +- .../blas/blas1/KokkosBlas_dot_perf_test.hpp | 6 +- .../KokkosBlas_dot_perf_test_benchmark.cpp | 15 +- .../KokkosBlas_dot_tracked_perf_test.cpp | 7 +- perf_test/blas/blas1/KokkosBlas_perf_test.cpp | 54 +- .../blas1/KokkosBlas_team_dot_perf_test.cpp | 29 +- .../blas1/KokkosBlas_team_dot_perf_test.hpp | 28 +- ...okkosBlas_team_dot_perf_test_benchmark.cpp | 25 +- .../KokkosBlas_team_dot_tracked_perf_test.cpp | 15 +- perf_test/blas/blas1/tracked_testing.hpp | 9 +- .../blas/blas2/KokkosBlas2_gemv_perf_test.cpp | 43 +- .../blas/blas2/KokkosBlas2_gemv_perf_test.hpp | 15 +- .../KokkosBlas2_gemv_perf_test_benchmark.cpp | 51 +- .../KokkosBlas2_gemv_tracked_perf_test.cpp | 4 +- .../KokkosBlas2_ger_perf_test_benchmark.cpp | 139 +- perf_test/blas/blas2/tracked_testing.hpp | 6 +- perf_test/blas/blas3/KokkosBlas3_common.hpp | 13 +- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 1340 ++--- .../KokkosBlas3_gemm_standalone_perf_test.cpp | 24 +- ...s3_gemm_standalone_perf_test_benchmark.cpp | 70 +- .../KokkosBlas3_gemm_tracked_perf_test.cpp | 15 +- .../KokkosBlas3_gemm_tracked_perf_test.hpp | 18 +- .../blas/blas3/KokkosBlas3_perf_test.cpp | 92 +- .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 308 +- .../blas/blas3/KokkosBlas_trtri_perf_test.hpp | 186 +- 
perf_test/blas/blas3/tracked_testing.hpp | 9 +- perf_test/graph/KokkosGraph_color.cpp | 260 +- perf_test/graph/KokkosGraph_color_d2.cpp | 288 +- perf_test/graph/KokkosGraph_mis_d2.cpp | 61 +- perf_test/graph/KokkosGraph_triangle.cpp | 120 +- .../lapack/KokkosLapack_SVD_benchmark.cpp | 26 +- perf_test/ode/KokkosODE_BDF.cpp | 64 +- perf_test/ode/KokkosODE_RK.cpp | 113 +- perf_test/performance/performance_example.cpp | 30 +- .../performance/performance_validate.cpp | 166 +- perf_test/sparse/KokkosSparse_block_pcg.cpp | 194 +- perf_test/sparse/KokkosSparse_gs.cpp | 124 +- perf_test/sparse/KokkosSparse_kk_spmv.cpp | 38 +- perf_test/sparse/KokkosSparse_mdf.cpp | 69 +- perf_test/sparse/KokkosSparse_par_ilut.cpp | 144 +- perf_test/sparse/KokkosSparse_pcg.cpp | 70 +- perf_test/sparse/KokkosSparse_pcg.hpp | 136 +- .../sparse/KokkosSparse_run_spgemm_jacobi.hpp | 258 +- perf_test/sparse/KokkosSparse_spadd.cpp | 255 +- perf_test/sparse/KokkosSparse_spgemm.cpp | 223 +- .../sparse/KokkosSparse_spgemm_jacobi.cpp | 63 +- perf_test/sparse/KokkosSparse_spiluk.cpp | 205 +- perf_test/sparse/KokkosSparse_spmv.cpp | 51 +- .../sparse/KokkosSparse_spmv_benchmark.cpp | 59 +- perf_test/sparse/KokkosSparse_spmv_bsr.cpp | 169 +- .../KokkosSparse_spmv_bsr_benchmark.cpp | 123 +- perf_test/sparse/KokkosSparse_spmv_merge.cpp | 72 +- perf_test/sparse/KokkosSparse_spmv_struct.cpp | 99 +- .../KokkosSparse_spmv_struct_tuning.cpp | 280 +- perf_test/sparse/KokkosSparse_spmv_test.cpp | 29 +- perf_test/sparse/KokkosSparse_spmv_test.hpp | 78 +- perf_test/sparse/KokkosSparse_sptrsv.cpp | 273 +- perf_test/sparse/KokkosSparse_sptrsv_aux.hpp | 296 +- .../sparse/KokkosSparse_sptrsv_cholmod.cpp | 175 +- .../sparse/KokkosSparse_sptrsv_superlu.cpp | 299 +- .../sparse/KokkosSparse_sptrsv_supernode.cpp | 114 +- perf_test/sparse/spmv/CuSparse_SPMV.hpp | 33 +- perf_test/sparse/spmv/KokkosKernels_SPMV.hpp | 24 +- .../sparse/spmv/KokkosKernels_spmv_data.hpp | 6 +- perf_test/sparse/spmv/Kokkos_SPMV.hpp | 112 +- 
.../sparse/spmv/Kokkos_SPMV_Inspector.hpp | 56 +- perf_test/sparse/spmv/MKL_SPMV.hpp | 22 +- perf_test/sparse/spmv/OpenMPDynamic_SPMV.hpp | 3 +- .../sparse/spmv/OpenMPSmartStatic_SPMV.hpp | 17 +- perf_test/sparse/spmv/OpenMPStatic_SPMV.hpp | 3 +- perf_test/sparse/spmv/matrix_market.hpp | 61 +- perf_test/sparse/tracked_testing.hpp | 3 +- perf_test/test_crsmatrix.cpp | 177 +- perf_test/test_mv.cpp | 59 +- sparse/impl/KokkosSparse_bspgemm_impl.hpp | 92 +- sparse/impl/KokkosSparse_bspgemm_impl_def.hpp | 30 +- .../impl/KokkosSparse_bspgemm_impl_kkmem.hpp | 961 ++-- sparse/impl/KokkosSparse_bspgemm_impl_seq.hpp | 68 +- .../impl/KokkosSparse_bspgemm_impl_speed.hpp | 382 +- .../KokkosSparse_bspgemm_numeric_spec.hpp | 457 +- sparse/impl/KokkosSparse_bsr_to_crs_impl.hpp | 24 +- ...KokkosSparse_cluster_gauss_seidel_impl.hpp | 463 +- sparse/impl/KokkosSparse_coo2crs_impl.hpp | 82 +- .../KokkosSparse_crs_detect_block_size.hpp | 14 +- sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp | 47 +- .../impl/KokkosSparse_gauss_seidel_impl.hpp | 1342 ++--- .../impl/KokkosSparse_gauss_seidel_spec.hpp | 656 +-- ...kkosSparse_getDiagCopyWithOffsets_impl.hpp | 61 +- sparse/impl/KokkosSparse_gmres_impl.hpp | 115 +- sparse/impl/KokkosSparse_gmres_spec.hpp | 135 +- sparse/impl/KokkosSparse_mdf_impl.hpp | 376 +- sparse/impl/KokkosSparse_merge_matrix.hpp | 64 +- .../KokkosSparse_par_ilut_numeric_impl.hpp | 500 +- .../KokkosSparse_par_ilut_numeric_spec.hpp | 258 +- .../KokkosSparse_par_ilut_symbolic_impl.hpp | 10 +- .../KokkosSparse_par_ilut_symbolic_spec.hpp | 149 +- .../impl/KokkosSparse_partitioning_impl.hpp | 104 +- .../impl/KokkosSparse_sor_sequential_impl.hpp | 48 +- .../impl/KokkosSparse_spadd_numeric_impl.hpp | 157 +- .../impl/KokkosSparse_spadd_numeric_spec.hpp | 251 +- .../impl/KokkosSparse_spadd_symbolic_impl.hpp | 380 +- .../impl/KokkosSparse_spadd_symbolic_spec.hpp | 168 +- sparse/impl/KokkosSparse_spgemm_imp_outer.hpp | 371 +- sparse/impl/KokkosSparse_spgemm_impl.hpp | 486 +- 
.../impl/KokkosSparse_spgemm_impl_color.hpp | 592 +-- .../KokkosSparse_spgemm_impl_compression.hpp | 829 ++- sparse/impl/KokkosSparse_spgemm_impl_def.hpp | 140 +- .../impl/KokkosSparse_spgemm_impl_kkmem.hpp | 1044 ++-- .../KokkosSparse_spgemm_impl_memaccess.hpp | 476 +- sparse/impl/KokkosSparse_spgemm_impl_seq.hpp | 81 +- .../impl/KokkosSparse_spgemm_impl_speed.hpp | 355 +- .../KokkosSparse_spgemm_impl_symbolic.hpp | 1823 +++---- .../KokkosSparse_spgemm_impl_triangle.hpp | 1499 +++--- ...se_spgemm_impl_triangle_no_compression.hpp | 811 ++- ...kkosSparse_spgemm_jacobi_denseacc_impl.hpp | 226 +- .../KokkosSparse_spgemm_jacobi_seq_impl.hpp | 58 +- ...kosSparse_spgemm_jacobi_sparseacc_impl.hpp | 1042 ++-- .../impl/KokkosSparse_spgemm_jacobi_spec.hpp | 299 +- .../impl/KokkosSparse_spgemm_noreuse_spec.hpp | 114 +- .../impl/KokkosSparse_spgemm_numeric_spec.hpp | 256 +- .../KokkosSparse_spgemm_symbolic_spec.hpp | 178 +- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 401 +- .../impl/KokkosSparse_spiluk_numeric_spec.hpp | 327 +- .../KokkosSparse_spiluk_symbolic_impl.hpp | 106 +- .../KokkosSparse_spiluk_symbolic_spec.hpp | 203 +- .../impl/KokkosSparse_spmv_bsrmatrix_impl.hpp | 956 ++-- .../KokkosSparse_spmv_bsrmatrix_impl_v42.hpp | 27 +- .../impl/KokkosSparse_spmv_bsrmatrix_spec.hpp | 350 +- sparse/impl/KokkosSparse_spmv_impl.hpp | 778 ++- sparse/impl/KokkosSparse_spmv_impl_merge.hpp | 178 +- sparse/impl/KokkosSparse_spmv_impl_omp.hpp | 19 +- sparse/impl/KokkosSparse_spmv_spec.hpp | 295 +- sparse/impl/KokkosSparse_spmv_struct_impl.hpp | 1016 ++-- sparse/impl/KokkosSparse_spmv_struct_spec.hpp | 257 +- sparse/impl/KokkosSparse_spmv_team_impl.hpp | 132 +- sparse/impl/KokkosSparse_spmv_team_spec.hpp | 48 +- .../KokkosSparse_sptrsv_cuSPARSE_impl.hpp | 388 +- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 1117 ++-- .../impl/KokkosSparse_sptrsv_solve_spec.hpp | 258 +- .../KokkosSparse_sptrsv_symbolic_impl.hpp | 210 +- .../KokkosSparse_sptrsv_symbolic_spec.hpp | 102 +- 
sparse/impl/KokkosSparse_trsv_impl.hpp | 110 +- sparse/impl/KokkosSparse_trsv_spec.hpp | 184 +- ...okkosSparse_twostage_gauss_seidel_impl.hpp | 369 +- sparse/src/KokkosKernels_Controls.hpp | 38 +- sparse/src/KokkosKernels_Handle.hpp | 390 +- sparse/src/KokkosSparse_BsrMatrix.hpp | 333 +- sparse/src/KokkosSparse_CcsMatrix.hpp | 67 +- sparse/src/KokkosSparse_CooMatrix.hpp | 33 +- sparse/src/KokkosSparse_CrsMatrix.hpp | 245 +- sparse/src/KokkosSparse_IOUtils.hpp | 426 +- sparse/src/KokkosSparse_LUPrec.hpp | 68 +- sparse/src/KokkosSparse_MatrixPrec.hpp | 8 +- sparse/src/KokkosSparse_OrdinalTraits.hpp | 16 +- sparse/src/KokkosSparse_Preconditioner.hpp | 8 +- sparse/src/KokkosSparse_SortCrs.hpp | 432 +- sparse/src/KokkosSparse_Utils.hpp | 1390 ++--- sparse/src/KokkosSparse_Utils_cusparse.hpp | 55 +- sparse/src/KokkosSparse_Utils_mkl.hpp | 139 +- sparse/src/KokkosSparse_Utils_rocsparse.hpp | 61 +- sparse/src/KokkosSparse_ccs2crs.hpp | 69 +- sparse/src/KokkosSparse_coo2crs.hpp | 36 +- sparse/src/KokkosSparse_crs2ccs.hpp | 69 +- sparse/src/KokkosSparse_crs2coo.hpp | 82 +- sparse/src/KokkosSparse_findRelOffset.hpp | 12 +- sparse/src/KokkosSparse_gauss_seidel.hpp | 756 ++- .../src/KokkosSparse_gauss_seidel_handle.hpp | 401 +- sparse/src/KokkosSparse_getDiagCopy.hpp | 35 +- sparse/src/KokkosSparse_gmres.hpp | 122 +- sparse/src/KokkosSparse_gmres_handle.hpp | 26 +- sparse/src/KokkosSparse_mdf.hpp | 126 +- sparse/src/KokkosSparse_mdf_handle.hpp | 18 +- sparse/src/KokkosSparse_par_ilut.hpp | 468 +- sparse/src/KokkosSparse_par_ilut_handle.hpp | 72 +- sparse/src/KokkosSparse_spadd.hpp | 275 +- sparse/src/KokkosSparse_spadd_handle.hpp | 18 +- sparse/src/KokkosSparse_spgemm.hpp | 120 +- sparse/src/KokkosSparse_spgemm_handle.hpp | 264 +- sparse/src/KokkosSparse_spgemm_jacobi.hpp | 141 +- sparse/src/KokkosSparse_spgemm_numeric.hpp | 157 +- sparse/src/KokkosSparse_spgemm_symbolic.hpp | 94 +- sparse/src/KokkosSparse_spiluk.hpp | 1005 ++-- sparse/src/KokkosSparse_spiluk_handle.hpp | 69 +- 
sparse/src/KokkosSparse_spmv.hpp | 674 +-- sparse/src/KokkosSparse_spmv_deprecated.hpp | 229 +- sparse/src/KokkosSparse_spmv_handle.hpp | 113 +- sparse/src/KokkosSparse_spmv_team.hpp | 90 +- sparse/src/KokkosSparse_sptrsv.hpp | 534 +- sparse/src/KokkosSparse_sptrsv_cholmod.hpp | 84 +- sparse/src/KokkosSparse_sptrsv_handle.hpp | 294 +- sparse/src/KokkosSparse_sptrsv_superlu.hpp | 94 +- sparse/src/KokkosSparse_sptrsv_supernode.hpp | 547 +- sparse/src/KokkosSparse_trsv.hpp | 70 +- sparse/tpls/KokkosKernels_tpl_handles_def.hpp | 4 +- ...kkosSparse_gauss_seidel_tpl_spec_avail.hpp | 7 +- .../KokkosSparse_gmres_tpl_spec_avail.hpp | 3 +- ...Sparse_par_ilut_numeric_tpl_spec_avail.hpp | 6 +- ...parse_par_ilut_symbolic_tpl_spec_avail.hpp | 3 +- ...kkosSparse_spadd_numeric_tpl_spec_decl.hpp | 420 +- ...kosSparse_spadd_symbolic_tpl_spec_decl.hpp | 345 +- .../KokkosSparse_spadd_tpl_spec_avail.hpp | 168 +- ...kosSparse_spgemm_jacobi_tpl_spec_avail.hpp | 7 +- ...osSparse_spgemm_noreuse_tpl_spec_avail.hpp | 45 +- ...kosSparse_spgemm_noreuse_tpl_spec_decl.hpp | 199 +- ...osSparse_spgemm_numeric_tpl_spec_avail.hpp | 180 +- ...kosSparse_spgemm_numeric_tpl_spec_decl.hpp | 660 +-- ...sSparse_spgemm_symbolic_tpl_spec_avail.hpp | 118 +- ...osSparse_spgemm_symbolic_tpl_spec_decl.hpp | 606 +-- ...osSparse_spiluk_numeric_tpl_spec_avail.hpp | 7 +- ...sSparse_spiluk_symbolic_tpl_spec_avail.hpp | 5 +- ...osSparse_spmv_bsrmatrix_tpl_spec_avail.hpp | 334 +- ...kosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 1017 ++-- .../KokkosSparse_spmv_mv_tpl_spec_avail.hpp | 165 +- .../KokkosSparse_spmv_mv_tpl_spec_decl.hpp | 399 +- .../tpls/KokkosSparse_spmv_tpl_spec_avail.hpp | 280 +- .../tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 725 +-- ...kkosSparse_sptrsv_solve_tpl_spec_avail.hpp | 4 +- .../tpls/KokkosSparse_trsv_tpl_spec_avail.hpp | 3 +- sparse/unit_test/Test_Sparse_BsrMatrix.hpp | 86 +- sparse/unit_test/Test_Sparse_Controls.hpp | 3 +- sparse/unit_test/Test_Sparse_CrsMatrix.hpp | 92 +- 
sparse/unit_test/Test_Sparse_IOUtils.hpp | 69 +- sparse/unit_test/Test_Sparse_MergeMatrix.hpp | 45 +- sparse/unit_test/Test_Sparse_SortCrs.hpp | 278 +- .../Test_Sparse_TestUtils_RandCsMat.hpp | 9 +- sparse/unit_test/Test_Sparse_Transpose.hpp | 180 +- sparse/unit_test/Test_Sparse_Utils.hpp | 47 +- .../Test_Sparse_block_gauss_seidel.hpp | 190 +- sparse/unit_test/Test_Sparse_bspgemm.hpp | 116 +- sparse/unit_test/Test_Sparse_ccs2crs.hpp | 51 +- sparse/unit_test/Test_Sparse_coo2crs.hpp | 111 +- sparse/unit_test/Test_Sparse_crs2ccs.hpp | 51 +- sparse/unit_test/Test_Sparse_crs2coo.hpp | 42 +- sparse/unit_test/Test_Sparse_csc2csr.hpp | 37 +- .../Test_Sparse_extractCrsDiagonalBlocks.hpp | 39 +- .../unit_test/Test_Sparse_findRelOffset.hpp | 98 +- sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 576 +-- sparse/unit_test/Test_Sparse_gmres.hpp | 41 +- sparse/unit_test/Test_Sparse_mdf.hpp | 133 +- sparse/unit_test/Test_Sparse_par_ilut.hpp | 125 +- .../Test_Sparse_removeCrsMatrixZeros.hpp | 123 +- .../unit_test/Test_Sparse_replaceSumInto.hpp | 66 +- .../Test_Sparse_replaceSumIntoLonger.hpp | 135 +- sparse/unit_test/Test_Sparse_rocsparse.hpp | 7 +- sparse/unit_test/Test_Sparse_spadd.hpp | 147 +- sparse/unit_test/Test_Sparse_spgemm.hpp | 258 +- .../unit_test/Test_Sparse_spgemm_jacobi.hpp | 108 +- sparse/unit_test/Test_Sparse_spiluk.hpp | 292 +- sparse/unit_test/Test_Sparse_spmv.hpp | 607 +-- sparse/unit_test/Test_Sparse_spmv_bsr.hpp | 227 +- sparse/unit_test/Test_Sparse_sptrsv.hpp | 101 +- sparse/unit_test/Test_Sparse_trsv.hpp | 81 +- sparse/unit_test/Test_vector_fixtures.hpp | 40 +- sparse/unit_test/matrixIssue402.hpp | 4476 +++++++---------- test_common/KokkosKernels_MatrixConverter.cpp | 48 +- test_common/KokkosKernels_MyCRSMatrix.hpp | 80 +- test_common/KokkosKernels_TestUtils.hpp | 294 +- .../KokkosKernels_Test_Structured_Matrix.hpp | 423 +- ...KokkosKernels_WriteBinaryFromBinSrcDst.cpp | 41 +- test_common/Kokkos_Performance.hpp | 138 +- .../Test_Common_Test_All_Type_Combos.hpp 
| 51 +- test_common/Test_Cuda.hpp | 6 +- test_common/Test_HIP.hpp | 8 +- 873 files changed, 51793 insertions(+), 85746 deletions(-) create mode 100644 .git-blame-ignore-revs diff --git a/.clang-format b/.clang-format index db5f94fa2e..ca42ad54e7 100644 --- a/.clang-format +++ b/.clang-format @@ -4,3 +4,4 @@ SortIncludes: false AlignConsecutiveAssignments: true AllowShortCaseLabelsOnASingleLine: true AllowShortIfStatementsOnASingleLine: true +ColumnLimit: 120 diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 0000000000..07973335d9 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,17 @@ +# +# Add formatting and other uninteresting commits here and +# 'git blame $file' will skip them IF +# A) you have it configured to do so globally (dangerous, this will break git blame on repos that don't have a .git-blame-ignore-revs file. +# git config --global blame.ignoreRevsFile .git-blame-ignore-revs +# OR +# B) you have it configured to do so locally (tedious, you have to remember to do this in every KK clone) +# git config blame.ignoreRevsFile .git-blame-ignore-revs +# OR +# C) you have an aliased blame command for KokkosKernels: +# git config --global alias.kkblame 'blame --ignore-revs-file=.git-blame-ignore-revs' +# NOTE: this implies you run 'git kkblame $file' +# OR +# D) you explicitly tell blame to skip them +# git blame --ignore-revs-file=.git-blame-ignore-revs $file +# +# List skips here: diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml index 08b541587f..2819fd1554 100644 --- a/.github/workflows/format.yml +++ b/.github/workflows/format.yml @@ -11,12 +11,12 @@ permissions: jobs: clang-format-check: - runs-on: ubuntu-20.04 + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - name: Install Dependencies - run: sudo apt install clang-format-8 + run: sudo apt install clang-format-16 - name: check run: | @@ -26,7 +26,7 @@ jobs: # For every file changed, 
apply clang-format for file in $(git diff --name-only origin/$GITHUB_BASE_REF | egrep '.*\.cpp$|.*\.hpp$|.*\.h$'); do if [ -e $file ]; then - clang-format-8 -i -style=file $file + clang-format-16 -i -style=file $file git add $file fi done diff --git a/batched/KokkosBatched_Util.hpp b/batched/KokkosBatched_Util.hpp index fc14bd5a19..8a1cb0e01b 100644 --- a/batched/KokkosBatched_Util.hpp +++ b/batched/KokkosBatched_Util.hpp @@ -48,7 +48,7 @@ #define __KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__ 1 #define __KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__ 1 #include "mkl.h" -//#include "mkl_types.h" +// #include "mkl_types.h" #endif #endif @@ -71,12 +71,11 @@ struct is_vector : public std::false_type {}; template struct is_same_mag_type { - static const bool is_specialized = (Kokkos::ArithTraits::is_specialized && - Kokkos::ArithTraits::is_specialized); + static const bool is_specialized = + (Kokkos::ArithTraits::is_specialized && Kokkos::ArithTraits::is_specialized); static const bool is_mag_type_same = - std::is_same::mag_type, - typename Kokkos::ArithTraits::mag_type>::value; + std::is_same::mag_type, typename Kokkos::ArithTraits::mag_type>::value; static const bool value = is_specialized && is_mag_type_same; }; @@ -87,42 +86,36 @@ using std::min; // view manipulation template -using MemoryTraits = Kokkos::MemoryTraits; template -using UnmanagedViewType = Kokkos::View< - typename ViewType::data_type, typename ViewType::array_layout, - typename ViewType::device_type, - MemoryTraits >; +using UnmanagedViewType = + Kokkos::View >; template -using ConstViewType = Kokkos::View< - typename ViewType::const_data_type, typename ViewType::array_layout, - typename ViewType::device_type, typename ViewType::memory_traits>; +using ConstViewType = Kokkos::View; template using ConstUnmanagedViewType = ConstViewType >; template -using ScratchViewType = Kokkos::View< - typename ViewType::data_type, typename ViewType::array_layout, - typename 
ViewType::execution_space::scratch_memory_space, - MemoryTraits >; +using ScratchViewType = Kokkos::View >; // helper for vector type template -KOKKOS_INLINE_FUNCTION - typename std::enable_if::value, size_t>::type - adjustDimension(const size_t &m) { +KOKKOS_INLINE_FUNCTION typename std::enable_if::value, size_t>::type adjustDimension( + const size_t &m) { return m; } template -KOKKOS_INLINE_FUNCTION - typename std::enable_if::value, size_t>::type - adjustDimension(const size_t &m) { +KOKKOS_INLINE_FUNCTION typename std::enable_if::value, size_t>::type adjustDimension( + const size_t &m) { return (m / T::vector_length + (m % T::vector_length > 0)); } @@ -132,9 +125,7 @@ struct Flush { // flush a large host buffer Kokkos::View _buf; - Flush() : _buf("Flush::buf", BufSize / sizeof(double)) { - Kokkos::deep_copy(_buf, 1); - } + Flush() : _buf("Flush::buf", BufSize / sizeof(double)) { Kokkos::deep_copy(_buf, 1); } KOKKOS_INLINE_FUNCTION void init(value_type &update) { update = 0; } @@ -147,9 +138,7 @@ struct Flush { void run() { double sum = 0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, BufSize / sizeof(double)), *this, - sum); + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, BufSize / sizeof(double)), *this, sum); SpaceType().fence(); FILE *fp = fopen("/dev/null", "w"); fprintf(fp, "%f\n", sum); @@ -161,9 +150,7 @@ template struct Random; template -struct Random::value || - std::is_same::value, - T>::type> { +struct Random::value || std::is_same::value, T>::type> { Random(const unsigned int seed = 0) { srand(seed); } T value() { const auto val = (rand() / ((T)RAND_MAX) - 0.5) * 2.0; @@ -172,18 +159,16 @@ struct Random::value || }; template -struct Random >::value || - std::is_same >::value || - std::is_same >::value || - std::is_same >::value, - T>::type> { +struct Random< + T, typename std::enable_if< + std::is_same >::value || std::is_same >::value || + std::is_same >::value || std::is_same >::value, + T>::type> { Random(const unsigned int seed = 0) { 
srand(seed); } T value() { const auto rval = (rand() / ((double)RAND_MAX) - 0.5) * 2.0; const auto ival = (rand() / ((double)RAND_MAX) - 0.5) * 2.0; - return T(rval > 0 ? rval + 1.0e-3 : rval - 1.0e-3, - ival > 0 ? ival + 1.0e-3 : ival - 1.0e-3); + return T(rval > 0 ? rval + 1.0e-3 : rval - 1.0e-3, ival > 0 ? ival + 1.0e-3 : ival - 1.0e-3); } }; @@ -199,23 +184,18 @@ struct Timer { const double t = _clock.seconds(); std::string label = _label; label.resize(24); - std::cout << "KokkosKernels::Timer:: " << std::setw(26) << label - << std::setw(15) << std::scientific << t << " [sec] " - << std::endl; + std::cout << "KokkosKernels::Timer:: " << std::setw(26) << label << std::setw(15) << std::scientific << t + << " [sec] " << std::endl; } }; // Implicit vectorization template struct SIMD { - static_assert(std::is_same::value || std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same >::value || - std::is_same >::value || - std::is_same >::value || - std::is_same >::value || + static_assert(std::is_same::value || std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value || + std::is_same >::value || std::is_same >::value || + std::is_same >::value || std::is_same >::value || std::is_same::value || std::is_same::value, "KokkosKernels:: Invalid SIMD<> type."); @@ -225,10 +205,8 @@ struct SIMD { // Intel AVX instruction device (explicit vectorization) template struct AVX { - static_assert(std::is_same::value || - std::is_same::value || - std::is_same >::value || - std::is_same >::value, + static_assert(std::is_same::value || std::is_same::value || + std::is_same >::value || std::is_same >::value, "KokkosKernels:: Invalid AVX<> type."); using value_type = T; }; @@ -304,17 +282,15 @@ using KokkosBlas::Mode; struct Util { template - KOKKOS_INLINE_FUNCTION static void packColMajor( - ValueType *KOKKOS_RESTRICT A, const int m, const int n, - const ValueType *KOKKOS_RESTRICT B, const int 
bs0, const int bs1) { + KOKKOS_INLINE_FUNCTION static void packColMajor(ValueType *KOKKOS_RESTRICT A, const int m, const int n, + const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { for (int j = 0; j < n; ++j) for (int i = 0; i < m; ++i) A[i + j * m] = B[i * bs0 + j * bs1]; } template - KOKKOS_INLINE_FUNCTION static void packRowMajor( - ValueType *KOKKOS_RESTRICT A, const int m, const int n, - const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { + KOKKOS_INLINE_FUNCTION static void packRowMajor(ValueType *KOKKOS_RESTRICT A, const int m, const int n, + const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { for (int i = 0; i < m; ++i) for (int j = 0; j < n; ++j) A[i * n + j] = B[i * bs0 + j * bs1]; } @@ -366,8 +342,7 @@ struct Partition1x3 { ValueType *A0, *A1, *A2; KOKKOS_INLINE_FUNCTION - Partition1x3(const int arg_as1) - : as1(arg_as1), A0(NULL), A1(NULL), A2(NULL) {} + Partition1x3(const int arg_as1) : as1(arg_as1), A0(NULL), A1(NULL), A2(NULL) {} KOKKOS_INLINE_FUNCTION void partWithAL(const Partition1x2 &part, const int mA1) { @@ -403,9 +378,7 @@ struct Partition2x1 { } KOKKOS_INLINE_FUNCTION - void partWithAB(ValueType *A, const int mA, const int mAB) { - partWithAT(A, mA, mA - mAB); - } + void partWithAB(ValueType *A, const int mA, const int mAB) { partWithAT(A, mA, mA - mAB); } // A0 // A1 is merged into AT @@ -430,8 +403,7 @@ struct Partition3x1 { /* */ *A2; KOKKOS_INLINE_FUNCTION - Partition3x1(const int arg_as0) - : as0(arg_as0), A0(NULL), A1(NULL), A2(NULL) {} + Partition3x1(const int arg_as0) : as0(arg_as0), A0(NULL), A1(NULL), A2(NULL) {} KOKKOS_INLINE_FUNCTION void partWithAB(const Partition2x1 &part, const int mA1) { @@ -460,16 +432,10 @@ struct Partition2x2 { KOKKOS_INLINE_FUNCTION Partition2x2(const int arg_as0, const int arg_as1) - : as0(arg_as0), - as1(arg_as1), - ATL(NULL), - ATR(NULL), - ABL(NULL), - ABR(NULL) {} + : as0(arg_as0), as1(arg_as1), ATL(NULL), ATR(NULL), ABL(NULL), ABR(NULL) {} 
KOKKOS_INLINE_FUNCTION - void partWithATL(ValueType *A, const int /* mA */, const int /* nA */, - const int mATL, const int nATL) { + void partWithATL(ValueType *A, const int /* mA */, const int /* nA */, const int mATL, const int nATL) { ATL = A; ATR = ATL + nATL * as1; ABL = ATL + mATL * as0; @@ -477,8 +443,7 @@ struct Partition2x2 { } KOKKOS_INLINE_FUNCTION - void partWithABR(ValueType *A, const int mA, const int nA, const int mABR, - const int nABR) { + void partWithABR(ValueType *A, const int mA, const int nA, const int mABR, const int nABR) { partWithATL(A, mA, nA, mA - mABR, nA - nABR); } @@ -523,8 +488,7 @@ struct Partition3x3 { A22(NULL) {} KOKKOS_INLINE_FUNCTION - void partWithABR(const Partition2x2 &part, const int mA11, - const int nA11) { + void partWithABR(const Partition2x2 &part, const int mA11, const int nA11) { A00 = part.ATL; A01 = part.ATR; A02 = part.ATR + nA11 * as1; @@ -537,8 +501,7 @@ struct Partition3x3 { } KOKKOS_INLINE_FUNCTION - void partWithATL(const Partition2x2 &part, const int mA11, - const int nA11) { + void partWithATL(const Partition2x2 &part, const int mA11, const int nA11) { A00 = part.ATL; A01 = part.ATR - nA11 * as1; A02 = part.ATR; @@ -552,94 +515,74 @@ struct Partition3x3 { }; template -KOKKOS_INLINE_FUNCTION - typename std::enable_if::value, - void>::type - getIndices(const OrdinalType iTemp, const OrdinalType /*numRows*/, - const OrdinalType numMatrices, OrdinalType &iRow, - OrdinalType &iMatrix) { +KOKKOS_INLINE_FUNCTION typename std::enable_if::value, void>::type getIndices( + const OrdinalType iTemp, const OrdinalType /*numRows*/, const OrdinalType numMatrices, OrdinalType &iRow, + OrdinalType &iMatrix) { iRow = iTemp / numMatrices; iMatrix = iTemp % numMatrices; } template -KOKKOS_INLINE_FUNCTION - typename std::enable_if::value, - void>::type - getIndices(const OrdinalType iTemp, const OrdinalType numRows, - const OrdinalType /*numMatrices*/, OrdinalType &iRow, - OrdinalType &iMatrix) { +KOKKOS_INLINE_FUNCTION 
typename std::enable_if::value, void>::type getIndices( + const OrdinalType iTemp, const OrdinalType numRows, const OrdinalType /*numMatrices*/, OrdinalType &iRow, + OrdinalType &iMatrix) { iRow = iTemp % numRows; iMatrix = iTemp / numRows; } template -KOKKOS_INLINE_FUNCTION - typename std::enable_if::value, - void>::type - getIndices(const OrdinalType iTemp, const OrdinalType /*numRows*/, - const OrdinalType numMatrices, OrdinalType &iRow, - OrdinalType &iMatrix) { +KOKKOS_INLINE_FUNCTION typename std::enable_if::value, void>::type +getIndices(const OrdinalType iTemp, const OrdinalType /*numRows*/, const OrdinalType numMatrices, OrdinalType &iRow, + OrdinalType &iMatrix) { iRow = iTemp / numMatrices; iMatrix = iTemp % numMatrices; } template KOKKOS_INLINE_FUNCTION auto transpose_2d_view(ViewType v, const int *order) { - constexpr int rank = 2; - const int dim[] = {v.extent_int(1), v.extent_int(0)}; - using view_value_type = typename ViewType::value_type; - using execution_space_type = typename ViewType::execution_space; - using view_type = Kokkos::View; - Kokkos::LayoutStride stride = - Kokkos::LayoutStride::order_dimensions(rank, order, dim); + constexpr int rank = 2; + const int dim[] = {v.extent_int(1), v.extent_int(0)}; + using view_value_type = typename ViewType::value_type; + using execution_space_type = typename ViewType::execution_space; + using view_type = Kokkos::View; + Kokkos::LayoutStride stride = Kokkos::LayoutStride::order_dimensions(rank, order, dim); return view_type(v.data(), stride); } template -KOKKOS_INLINE_FUNCTION auto transpose_2d_view(ViewType v, - const BatchLayout::Left &) { +KOKKOS_INLINE_FUNCTION auto transpose_2d_view(ViewType v, const BatchLayout::Left &) { const int order[] = {0, 1}; // v is LayoutRight return transpose_2d_view(v, order); } template -KOKKOS_INLINE_FUNCTION auto transpose_2d_view(ViewType v, - const BatchLayout::Right &) { +KOKKOS_INLINE_FUNCTION auto transpose_2d_view(ViewType v, const BatchLayout::Right &) { const 
int order[] = {1, 0}; // v is LayoutLeft return transpose_2d_view(v, order); } ///// subview_wrapper overloads for handling 3-rank BatchLayout::Left views template -KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, - IdxType2 i2, IdxType3 i3, +KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, IdxType2 i2, IdxType3 i3, const BatchLayout::Left &) { return Kokkos::subview(v, i1, i2, i3); } template -KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, - IdxType2 i2, IdxType3 i3, - const BatchLayout::Left &layout_tag, - const Trans::NoTranspose) { +KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, IdxType2 i2, IdxType3 i3, + const BatchLayout::Left &layout_tag, const Trans::NoTranspose) { return subview_wrapper(v, i1, i2, i3, layout_tag); } template -KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, - Kokkos::ALL_t i2, Kokkos::ALL_t i3, - const BatchLayout::Left &layout_tag, - const Trans::Transpose) { +KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, Kokkos::ALL_t i2, Kokkos::ALL_t i3, + const BatchLayout::Left &layout_tag, const Trans::Transpose) { auto sv_nt = subview_wrapper(v, i1, i3, i2, layout_tag); return transpose_2d_view(sv_nt, layout_tag); } template -KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, - IdxType2 i2, IdxType3 i3, - const BatchLayout::Left &layout_tag, - const Trans::Transpose) { +KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, IdxType2 i2, IdxType3 i3, + const BatchLayout::Left &layout_tag, const Trans::Transpose) { auto sv_nt = subview_wrapper(v, i1, i3, i2, layout_tag); return sv_nt; @@ -647,29 +590,25 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, //// subview_wrapper overloads for handling 3-rank BatchLayout::Right views template -KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, - IdxType2 i2, IdxType3 i3, +KOKKOS_INLINE_FUNCTION auto 
subview_wrapper(ViewType v, IdxType1 i1, IdxType2 i2, IdxType3 i3, const BatchLayout::Right &) { return Kokkos::subview(v, i2, i3, i1); } template -KOKKOS_INLINE_FUNCTION auto subview_wrapper( - ViewType v, IdxType1 i1, IdxType2 i2, IdxType3 i3, - const BatchLayout::Right &layout_tag, const Trans::NoTranspose &) { +KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, IdxType2 i2, IdxType3 i3, + const BatchLayout::Right &layout_tag, const Trans::NoTranspose &) { return subview_wrapper(v, i1, i2, i3, layout_tag); } template -KOKKOS_INLINE_FUNCTION auto subview_wrapper( - ViewType v, IdxType1 i1, Kokkos::ALL_t i2, Kokkos::ALL_t i3, - const BatchLayout::Right &layout_tag, const Trans::Transpose &) { +KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, Kokkos::ALL_t i2, Kokkos::ALL_t i3, + const BatchLayout::Right &layout_tag, const Trans::Transpose &) { auto sv_nt = subview_wrapper(v, i1, i3, i2, layout_tag); return transpose_2d_view(sv_nt, layout_tag); } template -KOKKOS_INLINE_FUNCTION auto subview_wrapper( - ViewType v, IdxType1 i1, IdxType2 i2, IdxType3 i3, - const BatchLayout::Right &layout_tag, const Trans::Transpose &) { +KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, IdxType2 i2, IdxType3 i3, + const BatchLayout::Right &layout_tag, const Trans::Transpose &) { auto sv_nt = subview_wrapper(v, i1, i3, i2, layout_tag); return sv_nt; @@ -686,71 +625,48 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper( * otherwise, the last element of v. 
*/ template -KOKKOS_INLINE_FUNCTION ViewValueType -access_view_bounds_check(ViewType v, int m, int n, const BoundsCheck::Yes &) { - return v(KOKKOSKERNELS_MACRO_MIN(m, v.extent_int(0) - 1), - KOKKOSKERNELS_MACRO_MIN(n, v.extent_int(1) - 1)); +KOKKOS_INLINE_FUNCTION ViewValueType access_view_bounds_check(ViewType v, int m, int n, const BoundsCheck::Yes &) { + return v(KOKKOSKERNELS_MACRO_MIN(m, v.extent_int(0) - 1), KOKKOSKERNELS_MACRO_MIN(n, v.extent_int(1) - 1)); } template -KOKKOS_INLINE_FUNCTION ViewValueType -access_view_bounds_check(ViewType v, int m, int n, const BoundsCheck::No &) { +KOKKOS_INLINE_FUNCTION ViewValueType access_view_bounds_check(ViewType v, int m, int n, const BoundsCheck::No &) { return v(m, n); } template -KOKKOS_INLINE_FUNCTION ViewValueType fma_alpha(ViewValueType reg_c, - ScalarType alpha, - const AlphaTag::Yes &) { +KOKKOS_INLINE_FUNCTION ViewValueType fma_alpha(ViewValueType reg_c, ScalarType alpha, const AlphaTag::Yes &) { return reg_c * alpha; } template -KOKKOS_INLINE_FUNCTION ViewValueType fma_alpha(ViewValueType reg_c, - ScalarType /*alpha*/, - const AlphaTag::No &) { +KOKKOS_INLINE_FUNCTION ViewValueType fma_alpha(ViewValueType reg_c, ScalarType /*alpha*/, const AlphaTag::No &) { return reg_c; } -template -KOKKOS_INLINE_FUNCTION void fma_bounds_check(ViewType v, SizeType m, SizeType n, - ViewValueType reg_c, - ScalarType alpha, ScalarType beta, - const ArgAlphaFmaTag &alpha_tag, +template +KOKKOS_INLINE_FUNCTION void fma_bounds_check(ViewType v, SizeType m, SizeType n, ViewValueType reg_c, ScalarType alpha, + ScalarType beta, const ArgAlphaFmaTag &alpha_tag, const BoundsCheck::Yes &) { - if (m < v.extent_int(0) && n < v.extent_int(1)) - v(m, n) = fma_alpha(reg_c, alpha, alpha_tag) + v(m, n) * beta; + if (m < v.extent_int(0) && n < v.extent_int(1)) v(m, n) = fma_alpha(reg_c, alpha, alpha_tag) + v(m, n) * beta; } -template -KOKKOS_INLINE_FUNCTION void fma_bounds_check(ViewType v, SizeType m, SizeType n, - ViewValueType reg_c, - 
ScalarType alpha, ScalarType beta, - const ArgAlphaFmaTag &alpha_tag, +template +KOKKOS_INLINE_FUNCTION void fma_bounds_check(ViewType v, SizeType m, SizeType n, ViewValueType reg_c, ScalarType alpha, + ScalarType beta, const ArgAlphaFmaTag &alpha_tag, const BoundsCheck::No &) { v(m, n) = fma_alpha(reg_c, alpha, alpha_tag) + v(m, n) * beta; } -template -KOKKOS_INLINE_FUNCTION void fma_bounds_check(ViewType v, SizeType m, SizeType n, - ViewValueType reg_c, - ScalarType alpha, - const ArgAlphaFmaTag &alpha_tag, - const BoundsCheck::Yes &) { - if (m < v.extent_int(0) && n < v.extent_int(1)) - v(m, n) = fma_alpha(reg_c, alpha, alpha_tag); +template +KOKKOS_INLINE_FUNCTION void fma_bounds_check(ViewType v, SizeType m, SizeType n, ViewValueType reg_c, ScalarType alpha, + const ArgAlphaFmaTag &alpha_tag, const BoundsCheck::Yes &) { + if (m < v.extent_int(0) && n < v.extent_int(1)) v(m, n) = fma_alpha(reg_c, alpha, alpha_tag); } -template -KOKKOS_INLINE_FUNCTION void fma_bounds_check(ViewType v, SizeType m, SizeType n, - ViewValueType reg_c, - ScalarType alpha, - const ArgAlphaFmaTag &alpha_tag, - const BoundsCheck::No &) { +template +KOKKOS_INLINE_FUNCTION void fma_bounds_check(ViewType v, SizeType m, SizeType n, ViewValueType reg_c, ScalarType alpha, + const ArgAlphaFmaTag &alpha_tag, const BoundsCheck::No &) { v(m, n) = fma_alpha(reg_c, alpha, alpha_tag); } diff --git a/batched/dense/impl/KokkosBatched_AddRadial_Impl.hpp b/batched/dense/impl/KokkosBatched_AddRadial_Impl.hpp index 252c78d5c5..d89a82ae2c 100644 --- a/batched/dense/impl/KokkosBatched_AddRadial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_AddRadial_Impl.hpp @@ -28,11 +28,9 @@ namespace KokkosBatched { /// =========== template -KOKKOS_INLINE_FUNCTION int SerialAddRadial::invoke(const ScalarType tiny, - const AViewType &A) { - return SerialAddRadialInternal::invoke( - (A.extent(0) < A.extent(1) ? 
A.extent(0) : A.extent(1)), tiny, A.data(), - (A.stride_0() + A.stride_1())); +KOKKOS_INLINE_FUNCTION int SerialAddRadial::invoke(const ScalarType tiny, const AViewType &A) { + return SerialAddRadialInternal::invoke((A.extent(0) < A.extent(1) ? A.extent(0) : A.extent(1)), tiny, A.data(), + (A.stride_0() + A.stride_1())); } /// @@ -41,11 +39,10 @@ KOKKOS_INLINE_FUNCTION int SerialAddRadial::invoke(const ScalarType tiny, template template -KOKKOS_INLINE_FUNCTION int TeamAddRadial::invoke( - const MemberType &member, const ScalarType tiny, const AViewType &A) { - return TeamAddRadialInternal::invoke( - member, (A.extent(0) < A.extent(1) ? A.extent(0) : A.extent(1)), tiny, - A.data(), (A.stride_0() + A.stride_1())); +KOKKOS_INLINE_FUNCTION int TeamAddRadial::invoke(const MemberType &member, const ScalarType tiny, + const AViewType &A) { + return TeamAddRadialInternal::invoke(member, (A.extent(0) < A.extent(1) ? A.extent(0) : A.extent(1)), tiny, A.data(), + (A.stride_0() + A.stride_1())); } } // end namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_AddRadial_Internal.hpp b/batched/dense/impl/KokkosBatched_AddRadial_Internal.hpp index 24ecafe0a0..634879530e 100644 --- a/batched/dense/impl/KokkosBatched_AddRadial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_AddRadial_Internal.hpp @@ -28,8 +28,7 @@ namespace KokkosBatched { struct SerialAddRadialInternal { template KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType tiny, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as) { + /* */ ValueType *KOKKOS_RESTRICT A, const int as) { const auto abs_tiny = tiny > 0 ? 
tiny : -tiny; const auto minus_abs_tiny = -abs_tiny; @@ -52,10 +51,8 @@ struct SerialAddRadialInternal { /// ================== struct TeamAddRadialInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, const ScalarType tiny, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const ScalarType tiny, + /* */ ValueType *KOKKOS_RESTRICT A, const int as) { const auto abs_tiny = tiny > 0 ? tiny : -tiny; const auto minus_abs_tiny = -abs_tiny; diff --git a/batched/dense/impl/KokkosBatched_ApplyGivens_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_ApplyGivens_Serial_Internal.hpp index cf8a946e99..2d3d2af915 100644 --- a/batched/dense/impl/KokkosBatched_ApplyGivens_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyGivens_Serial_Internal.hpp @@ -30,10 +30,9 @@ namespace KokkosBatched { /// struct SerialApplyLeftGivensInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const Kokkos::pair G, const int n, - /* */ ValueType *a1t, const int a1ts, - /* */ ValueType *a2t, const int a2ts) { + KOKKOS_INLINE_FUNCTION static int invoke(const Kokkos::pair G, const int n, + /* */ ValueType *a1t, const int a1ts, + /* */ ValueType *a2t, const int a2ts) { typedef ValueType value_type; if (n == 0) return 0; // quick return if (G.first == value_type(1) && G.second == value_type(0)) return 0; @@ -59,10 +58,9 @@ struct SerialApplyLeftGivensInternal { struct SerialApplyRightGivensInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const Kokkos::pair G, const int m, - /* */ ValueType *a1, const int a1s, - /* */ ValueType *a2, const int a2s) { + KOKKOS_INLINE_FUNCTION static int invoke(const Kokkos::pair G, const int m, + /* */ ValueType *a1, const int a1s, + /* */ ValueType *a2, const int a2s) { typedef ValueType value_type; if (m == 0) return 0; // quick return if (G.first == value_type(1) && G.second == 
value_type(0)) return 0; @@ -88,12 +86,11 @@ struct SerialApplyRightGivensInternal { struct SerialApplyLeftRightGivensInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const Kokkos::pair &G12, const int &m, const int &n, - /* */ ValueType *KOKKOS_RESTRICT a1t, - /* */ ValueType *KOKKOS_RESTRICT a2t, - /* */ ValueType *KOKKOS_RESTRICT a1, - /* */ ValueType *KOKKOS_RESTRICT a2, const int &as0, const int &as1) { + KOKKOS_INLINE_FUNCTION static int invoke(const Kokkos::pair &G12, const int &m, const int &n, + /* */ ValueType *KOKKOS_RESTRICT a1t, + /* */ ValueType *KOKKOS_RESTRICT a2t, + /* */ ValueType *KOKKOS_RESTRICT a1, + /* */ ValueType *KOKKOS_RESTRICT a2, const int &as0, const int &as1) { typedef ValueType value_type; if (G12.first == value_type(1) && G12.second == value_type(0)) return 0; if (m == 0 && n == 0) return 0; // quick return @@ -124,15 +121,14 @@ struct SerialApplyLeftRightGivensInternal { } template - KOKKOS_INLINE_FUNCTION static int invoke( - const Kokkos::pair &G12, - const Kokkos::pair &G13, const int &m, const int &n, - /* */ ValueType *KOKKOS_RESTRICT a1t, - /* */ ValueType *KOKKOS_RESTRICT a2t, - /* */ ValueType *KOKKOS_RESTRICT a3t, - /* */ ValueType *KOKKOS_RESTRICT a1, - /* */ ValueType *KOKKOS_RESTRICT a2, - /* */ ValueType *KOKKOS_RESTRICT a3, const int &as0, const int &as1) { + KOKKOS_INLINE_FUNCTION static int invoke(const Kokkos::pair &G12, + const Kokkos::pair &G13, const int &m, const int &n, + /* */ ValueType *KOKKOS_RESTRICT a1t, + /* */ ValueType *KOKKOS_RESTRICT a2t, + /* */ ValueType *KOKKOS_RESTRICT a3t, + /* */ ValueType *KOKKOS_RESTRICT a1, + /* */ ValueType *KOKKOS_RESTRICT a2, + /* */ ValueType *KOKKOS_RESTRICT a3, const int &as0, const int &as1) { typedef ValueType value_type; if (m == 0 && n == 0) return 0; // quick return diff --git a/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Impl.hpp index be720bef2e..db85d96680 100644 
--- a/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Impl.hpp @@ -28,27 +28,21 @@ namespace KokkosBatched { /// =========== template <> -template -KOKKOS_INLINE_FUNCTION int SerialApplyHouseholder::invoke( - const uViewType &u2, const tauViewType &tau, const AViewType &A, - const wViewType &w) { - return SerialApplyLeftHouseholderInternal::invoke( - A.extent(0) - 1, A.extent(1), tau.data(), u2.data(), u2.stride(0), - A.data(), A.stride(1), A.data() + A.stride(0), A.stride(0), A.stride(1), - w.data()); +template +KOKKOS_INLINE_FUNCTION int SerialApplyHouseholder::invoke(const uViewType &u2, const tauViewType &tau, + const AViewType &A, const wViewType &w) { + return SerialApplyLeftHouseholderInternal::invoke(A.extent(0) - 1, A.extent(1), tau.data(), u2.data(), u2.stride(0), + A.data(), A.stride(1), A.data() + A.stride(0), A.stride(0), + A.stride(1), w.data()); } template <> -template -KOKKOS_INLINE_FUNCTION int SerialApplyHouseholder::invoke( - const uViewType &u2, const tauViewType &tau, const AViewType &A, - const wViewType &w) { - return SerialApplyRightHouseholderInternal::invoke( - A.extent(0), A.extent(1) - 1, tau.data(), u2.data(), u2.stride(0), - A.data(), A.stride(0), A.data() + A.stride(1), A.stride(0), A.stride(1), - w.data()); +template +KOKKOS_INLINE_FUNCTION int SerialApplyHouseholder::invoke(const uViewType &u2, const tauViewType &tau, + const AViewType &A, const wViewType &w) { + return SerialApplyRightHouseholderInternal::invoke(A.extent(0), A.extent(1) - 1, tau.data(), u2.data(), u2.stride(0), + A.data(), A.stride(0), A.data() + A.stride(1), A.stride(0), + A.stride(1), w.data()); } } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp index 611e9440b5..e129fef5a5 100644 --- 
a/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp @@ -30,12 +30,10 @@ namespace KokkosBatched { /// struct SerialApplyLeftHouseholderInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, - const ValueType* tau, + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, const ValueType* tau, /* */ ValueType* u2, const int u2s, /* */ ValueType* a1t, const int a1ts, - /* */ ValueType* A2, const int as0, - const int as1, + /* */ ValueType* A2, const int as0, const int as1, /* */ ValueType* w1t) { typedef ValueType value_type; @@ -55,9 +53,7 @@ struct SerialApplyLeftHouseholderInternal { // w1t /= tau for (int j = 0; j < n; ++j) { value_type tmp = a1t[j * a1ts]; - for (int i = 0; i < m; ++i) - tmp += Kokkos::ArithTraits::conj(u2[i * u2s]) * - A2[i * as0 + j * as1]; + for (int i = 0; i < m; ++i) tmp += Kokkos::ArithTraits::conj(u2[i * u2s]) * A2[i * as0 + j * as1]; w1t[j] = tmp * inv_tau; // /= (*tau); } @@ -74,12 +70,10 @@ struct SerialApplyLeftHouseholderInternal { struct SerialApplyRightHouseholderInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, - const ValueType* tau, + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, const ValueType* tau, /* */ ValueType* u2, const int u2s, /* */ ValueType* a1, const int a1s, - /* */ ValueType* A2, const int as0, - const int as1, + /* */ ValueType* A2, const int as0, const int as1, /* */ ValueType* w1) { typedef ValueType value_type; /// u2 n x 1 @@ -107,9 +101,7 @@ struct SerialApplyRightHouseholderInternal { // A2 -= w1 * u2' (ger with conjugate) for (int j = 0; j < n; ++j) - for (int i = 0; i < m; ++i) - A2[i * as0 + j * as1] -= - w1[i] * Kokkos::ArithTraits::conj(u2[j * u2s]); + for (int i = 0; i < m; ++i) A2[i * as0 + j * as1] -= w1[i] * Kokkos::ArithTraits::conj(u2[j * u2s]); return 0; } diff --git 
a/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Impl.hpp index d1dcc58d18..b322574ad0 100644 --- a/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Impl.hpp @@ -29,33 +29,23 @@ namespace KokkosBatched { template struct TeamVectorApplyHouseholder { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const uViewType &u2, - const tauViewType &tau, - const AViewType &A, - const wViewType &w) { - return TeamVectorApplyLeftHouseholderInternal::invoke( - member, A.extent(0) - 1, A.extent(1), tau.data(), u2.data(), - u2.stride(0), A.data(), A.stride(1), A.data() + A.stride(0), - A.stride(0), A.stride(1), w.data()); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const uViewType &u2, const tauViewType &tau, + const AViewType &A, const wViewType &w) { + return TeamVectorApplyLeftHouseholderInternal::invoke(member, A.extent(0) - 1, A.extent(1), tau.data(), u2.data(), + u2.stride(0), A.data(), A.stride(1), A.data() + A.stride(0), + A.stride(0), A.stride(1), w.data()); } }; template struct TeamVectorApplyHouseholder { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const uViewType &u2, - const tauViewType &tau, - const AViewType &A, - const wViewType &w) { - return TeamVectorApplyRightHouseholderInternal::invoke( - member, A.extent(0), A.extent(1) - 1, tau.data(), u2.data(), - u2.stride(0), A.data(), A.stride(0), A.data() + A.stride(1), - A.stride(0), A.stride(1), w.data()); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const uViewType &u2, const tauViewType &tau, + const AViewType &A, const wViewType &w) { + return TeamVectorApplyRightHouseholderInternal::invoke(member, A.extent(0), A.extent(1) - 1, tau.data(), u2.data(), + u2.stride(0), A.data(), A.stride(0), A.data() + 
A.stride(1), + A.stride(0), A.stride(1), w.data()); } }; diff --git a/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Internal.hpp index 2754818fbf..2474a10fe3 100644 --- a/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Internal.hpp @@ -30,13 +30,10 @@ namespace KokkosBatched { /// struct TeamVectorApplyLeftHouseholderInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, const int n, - const ValueType *tau, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, const ValueType *tau, /* */ ValueType *u2, const int u2s, /* */ ValueType *a1t, const int a1ts, - /* */ ValueType *A2, const int as0, - const int as1, + /* */ ValueType *A2, const int as0, const int as1, /* */ ValueType *w1t) { typedef ValueType value_type; @@ -59,8 +56,7 @@ struct TeamVectorApplyLeftHouseholderInternal { Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(member, m), [&](const int &i, value_type &val) { - val += Kokkos::ArithTraits::conj(u2[i * u2s]) * - A2[i * as0 + j * as1]; + val += Kokkos::ArithTraits::conj(u2[i * u2s]) * A2[i * as0 + j * as1]; }, tmp); Kokkos::single(Kokkos::PerThread(member), [&]() { @@ -70,26 +66,19 @@ struct TeamVectorApplyLeftHouseholderInternal { member.team_barrier(); // a1t -= w1t (axpy) - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), - [&](const int &j) { a1t[j * a1ts] -= w1t[j]; }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), [&](const int &j) { a1t[j * a1ts] -= w1t[j]; }); // A2 -= u2 w1t (ger) if (as0 <= as1) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, n), [&](const int &j) { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, m), [&](const int &i) { - A2[i * as0 + j * as1] -= u2[i * u2s] * w1t[j]; - }); - }); + 
Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &j) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, m), + [&](const int &i) { A2[i * as0 + j * as1] -= u2[i * u2s] * w1t[j]; }); + }); } else { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, n), [&](const int &j) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, m), [&](const int &i) { - A2[i * as0 + j * as1] -= u2[i * u2s] * w1t[j]; - }); - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), [&](const int &j) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), + [&](const int &i) { A2[i * as0 + j * as1] -= u2[i * u2s] * w1t[j]; }); + }); } return 0; @@ -98,13 +87,10 @@ struct TeamVectorApplyLeftHouseholderInternal { struct TeamVectorApplyRightHouseholderInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, const int n, - const ValueType *tau, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, const ValueType *tau, /* */ ValueType *u2, const int u2s, /* */ ValueType *a1, const int a1s, - /* */ ValueType *A2, const int as0, - const int as1, + /* */ ValueType *A2, const int as0, const int as1, /* */ ValueType *w1) { typedef ValueType value_type; /// u2 n x 1 @@ -125,10 +111,7 @@ struct TeamVectorApplyRightHouseholderInternal { value_type tmp(0); Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(member, n), - [&](const int &j, value_type &val) { - val += A2[i * as0 + j * as1] * u2[j * u2s]; - }, - tmp); + [&](const int &j, value_type &val) { val += A2[i * as0 + j * as1] * u2[j * u2s]; }, tmp); Kokkos::single(Kokkos::PerThread(member), [&]() { w1[i] = (tmp + a1[i * a1s]) * inv_tau; // \= (*tau); }); @@ -136,28 +119,21 @@ struct TeamVectorApplyRightHouseholderInternal { member.team_barrier(); // a1 -= w1 (axpy) - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), - [&](const int &i) { a1[i * a1s] -= w1[i]; }); + 
Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int &i) { a1[i * a1s] -= w1[i]; }); // A2 -= w1 * u2' (ger with conjugate) if (as0 <= as1) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, n), [&](const int &j) { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, m), [&](const int &i) { - A2[i * as0 + j * as1] -= - w1[i] * Kokkos::ArithTraits::conj(u2[j * u2s]); - }); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &j) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, m), [&](const int &i) { + A2[i * as0 + j * as1] -= w1[i] * Kokkos::ArithTraits::conj(u2[j * u2s]); + }); + }); } else { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, n), [&](const int &j) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, m), [&](const int &i) { - A2[i * as0 + j * as1] -= - w1[i] * Kokkos::ArithTraits::conj(u2[j * u2s]); - }); - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), [&](const int &j) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) { + A2[i * as0 + j * as1] -= w1[i] * Kokkos::ArithTraits::conj(u2[j * u2s]); + }); + }); } return 0; diff --git a/batched/dense/impl/KokkosBatched_ApplyPivot_Impl.hpp b/batched/dense/impl/KokkosBatched_ApplyPivot_Impl.hpp index afc518f43c..10455f65b6 100644 --- a/batched/dense/impl/KokkosBatched_ApplyPivot_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyPivot_Impl.hpp @@ -35,34 +35,26 @@ namespace KokkosBatched { template struct TeamVectorApplyPivot { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int piv, const AViewType &A) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int piv, const AViewType &A) { if (AViewType::rank == 1) { const int as0 = A.stride(0); - TeamVectorApplyPivotVectorForwardInternal::invoke(member, piv, A.data(), - as0); + TeamVectorApplyPivotVectorForwardInternal::invoke(member, piv, A.data(), as0); } else if 
(AViewType::rank == 2) { const int n = A.extent(1), as0 = A.stride(0), as1 = A.stride(1); - TeamVectorApplyPivotMatrixForwardInternal::invoke(member, n, piv, - A.data(), as0, as1); + TeamVectorApplyPivotMatrixForwardInternal::invoke(member, n, piv, A.data(), as0, as1); } return 0; } template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const PivViewType piv, - const AViewType &A) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const PivViewType piv, const AViewType &A) { if (AViewType::rank == 1) { const int plen = piv.extent(0), ps0 = piv.stride(0), as0 = A.stride(0); - TeamVectorApplyPivotVectorForwardInternal::invoke( - member, plen, piv.data(), ps0, A.data(), as0); + TeamVectorApplyPivotVectorForwardInternal::invoke(member, plen, piv.data(), ps0, A.data(), as0); } else if (AViewType::rank == 2) { // row permutation - const int plen = piv.extent(0), ps0 = piv.stride(0), n = A.extent(1), - as0 = A.stride(0), as1 = A.stride(1); - TeamVectorApplyPivotMatrixForwardInternal::invoke( - member, n, plen, piv.data(), ps0, A.data(), as0, as1); + const int plen = piv.extent(0), ps0 = piv.stride(0), n = A.extent(1), as0 = A.stride(0), as1 = A.stride(1); + TeamVectorApplyPivotMatrixForwardInternal::invoke(member, n, plen, piv.data(), ps0, A.data(), as0, as1); } return 0; } @@ -72,34 +64,26 @@ struct TeamVectorApplyPivot { template struct TeamVectorApplyPivot { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int piv, const AViewType &A) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int piv, const AViewType &A) { if (AViewType::rank == 1) { const int as0 = A.stride(0); - TeamVectorApplyPivotVectorForwardInternal::invoke(member, piv, A.data(), - as0); + TeamVectorApplyPivotVectorForwardInternal::invoke(member, piv, A.data(), as0); } else if (AViewType::rank == 2) { const int m = A.extent(0), as0 = A.stride(0), as1 = A.stride(1); - 
TeamVectorApplyPivotMatrixForwardInternal::invoke(member, m, piv, - A.data(), as1, as0); + TeamVectorApplyPivotMatrixForwardInternal::invoke(member, m, piv, A.data(), as1, as0); } return 0; } template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const PivViewType &piv, - const AViewType &A) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const PivViewType &piv, const AViewType &A) { if (AViewType::rank == 1) { const int plen = piv.extent(0), as0 = A.stride(0); - TeamVectorApplyPivotVectorForwardInternal ::invoke( - member, plen, piv.data(), A.data(), as0); + TeamVectorApplyPivotVectorForwardInternal ::invoke(member, plen, piv.data(), A.data(), as0); } else if (AViewType::rank == 2) { // column permutation - const int plen = piv.extent(0), ps = piv.stride(0), m = A.extent(0), - as0 = A.stride(0), as1 = A.stride(1); - TeamVectorApplyPivotMatrixForwardInternal ::invoke( - member, m, plen, piv.data(), ps, A.data(), as1, as0); + const int plen = piv.extent(0), ps = piv.stride(0), m = A.extent(0), as0 = A.stride(0), as1 = A.stride(1); + TeamVectorApplyPivotMatrixForwardInternal ::invoke(member, m, plen, piv.data(), ps, A.data(), as1, as0); } return 0; } @@ -113,34 +97,26 @@ struct TeamVectorApplyPivot { template struct TeamVectorApplyPivot { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int piv, const AViewType &A) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int piv, const AViewType &A) { if (AViewType::rank == 1) { const int as0 = A.stride(0); - TeamVectorApplyPivotVectorBackwardInternal::invoke(member, piv, A.data(), - as0); + TeamVectorApplyPivotVectorBackwardInternal::invoke(member, piv, A.data(), as0); } else if (AViewType::rank == 2) { const int n = A.extent(1), as0 = A.stride(0), as1 = A.stride(1); - TeamVectorApplyPivotMatrixBackwardInternal::invoke(member, n, piv, - A.data(), as0, as1); + 
TeamVectorApplyPivotMatrixBackwardInternal::invoke(member, n, piv, A.data(), as0, as1); } return 0; } template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const PivViewType piv, - const AViewType &A) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const PivViewType piv, const AViewType &A) { if (AViewType::rank == 1) { const int plen = piv.extent(0), ps0 = piv.stride(0), as0 = A.stride(0); - TeamVectorApplyPivotVectorBackwardInternal::invoke( - member, plen, piv.data(), ps0, A.data(), as0); + TeamVectorApplyPivotVectorBackwardInternal::invoke(member, plen, piv.data(), ps0, A.data(), as0); } else if (AViewType::rank == 2) { // row permutation - const int plen = piv.extent(0), ps0 = piv.stride(0), n = A.extent(1), - as0 = A.stride(0), as1 = A.stride(1); - TeamVectorApplyPivotMatrixBackwardInternal::invoke( - member, n, plen, piv.data(), ps0, A.data(), as0, as1); + const int plen = piv.extent(0), ps0 = piv.stride(0), n = A.extent(1), as0 = A.stride(0), as1 = A.stride(1); + TeamVectorApplyPivotMatrixBackwardInternal::invoke(member, n, plen, piv.data(), ps0, A.data(), as0, as1); } return 0; } @@ -150,34 +126,26 @@ struct TeamVectorApplyPivot { template struct TeamVectorApplyPivot { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int piv, const AViewType &A) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int piv, const AViewType &A) { if (AViewType::rank == 1) { const int as0 = A.stride(0); - TeamVectorApplyPivotVectorBackwardInternal::invoke(member, piv, A.data(), - as0); + TeamVectorApplyPivotVectorBackwardInternal::invoke(member, piv, A.data(), as0); } else if (AViewType::rank == 2) { const int m = A.extent(0), as0 = A.stride(0), as1 = A.stride(1); - TeamVectorApplyPivotMatrixBackwardInternal::invoke(member, m, piv, - A.data(), as1, as0); + TeamVectorApplyPivotMatrixBackwardInternal::invoke(member, m, piv, A.data(), as1, as0); } return 0; } template - 
KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const PivViewType &piv, - const AViewType &A) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const PivViewType &piv, const AViewType &A) { if (AViewType::rank == 1) { const int plen = piv.extent(0), as0 = A.stride(0); - TeamVectorApplyPivotVectorBackwardInternal ::invoke( - member, plen, piv.data(), A.data(), as0); + TeamVectorApplyPivotVectorBackwardInternal ::invoke(member, plen, piv.data(), A.data(), as0); } else if (AViewType::rank == 2) { // column permutation - const int plen = piv.extent(0), ps = piv.stride(0), m = A.extent(0), - as0 = A.stride(0), as1 = A.stride(1); - TeamVectorApplyPivotMatrixBackwardInternal ::invoke( - member, m, plen, piv.data(), ps, A.data(), as1, as0); + const int plen = piv.extent(0), ps = piv.stride(0), m = A.extent(0), as0 = A.stride(0), as1 = A.stride(1); + TeamVectorApplyPivotMatrixBackwardInternal ::invoke(member, m, plen, piv.data(), ps, A.data(), as1, as0); } return 0; } diff --git a/batched/dense/impl/KokkosBatched_ApplyPivot_Internal.hpp b/batched/dense/impl/KokkosBatched_ApplyPivot_Internal.hpp index 59548c3d26..a301382108 100644 --- a/batched/dense/impl/KokkosBatched_ApplyPivot_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyPivot_Internal.hpp @@ -31,10 +31,8 @@ namespace KokkosBatched { /// struct TeamVectorApplyPivotVectorForwardInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int piv, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int piv, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0) { if (piv != 0) { Kokkos::single(Kokkos::PerTeam(member), [&]() { const int idx_p = piv * as0; @@ -47,12 +45,9 @@ struct TeamVectorApplyPivotVectorForwardInternal { } template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int plen, - const IntType *KOKKOS_RESTRICT p, + 
KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int plen, const IntType *KOKKOS_RESTRICT p, const int ps0, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0) { + /* */ ValueType *KOKKOS_RESTRICT A, const int as0) { Kokkos::single(Kokkos::PerTeam(member), [&]() { for (int i = 0; i < plen; ++i) { const int piv = p[i * ps0]; @@ -71,30 +66,24 @@ struct TeamVectorApplyPivotVectorForwardInternal { /// Pivot a row struct TeamVectorApplyPivotMatrixForwardInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int n, const int piv, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int n, const int piv, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { if (piv != 0) { - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), - [&](const int &j) { - ValueType *KOKKOS_RESTRICT A_at_j = A + j * as1; - const int idx_p = piv * as0; - const ValueType tmp = A_at_j[0]; - A_at_j[0] = A_at_j[idx_p]; - A_at_j[idx_p] = tmp; - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), [&](const int &j) { + ValueType *KOKKOS_RESTRICT A_at_j = A + j * as1; + const int idx_p = piv * as0; + const ValueType tmp = A_at_j[0]; + A_at_j[0] = A_at_j[idx_p]; + A_at_j[idx_p] = tmp; + }); } return 0; } template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int n, const int plen, - const IntType *KOKKOS_RESTRICT p, - const int ps0, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int n, const int plen, + const IntType *KOKKOS_RESTRICT p, const int ps0, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), [&](const int &j) { ValueType *KOKKOS_RESTRICT A_at_j = A + j * as1; for (int i = 0; i < plen; ++i) { @@ 
-116,10 +105,8 @@ struct TeamVectorApplyPivotMatrixForwardInternal { /// struct TeamVectorApplyPivotVectorBackwardInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int piv, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int piv, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0) { if (piv != 0) { Kokkos::single(Kokkos::PerTeam(member), [&]() { const int idx_p = piv * as0; @@ -132,12 +119,9 @@ struct TeamVectorApplyPivotVectorBackwardInternal { } template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int plen, - const IntType *KOKKOS_RESTRICT p, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int plen, const IntType *KOKKOS_RESTRICT p, const int ps0, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0) { + /* */ ValueType *KOKKOS_RESTRICT A, const int as0) { Kokkos::single(Kokkos::PerTeam(member), [&]() { for (int i = (plen - 1); i >= 0; --i) { const int piv = p[i * ps0]; @@ -156,30 +140,24 @@ struct TeamVectorApplyPivotVectorBackwardInternal { /// Pivot a row struct TeamVectorApplyPivotMatrixBackwardInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int n, const int piv, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int n, const int piv, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { if (piv != 0) { - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), - [&](const int &j) { - ValueType *KOKKOS_RESTRICT A_at_j = A + j * as1; - const int idx_p = piv * as0; - const ValueType tmp = A_at_j[0]; - A_at_j[0] = A_at_j[idx_p]; - A_at_j[idx_p] = tmp; - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), [&](const int &j) { + ValueType *KOKKOS_RESTRICT A_at_j = A + j * as1; + const int idx_p = piv * as0; 
+ const ValueType tmp = A_at_j[0]; + A_at_j[0] = A_at_j[idx_p]; + A_at_j[idx_p] = tmp; + }); } return 0; } template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int n, const int plen, - const IntType *KOKKOS_RESTRICT p, - const int ps0, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int n, const int plen, + const IntType *KOKKOS_RESTRICT p, const int ps0, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), [&](const int &j) { ValueType *KOKKOS_RESTRICT A_at_j = A + j * as1; for (int i = (plen - 1); i >= 0; --i) { diff --git a/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Impl.hpp index 2a7519f2dc..ba9d85350f 100644 --- a/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Impl.hpp @@ -28,42 +28,30 @@ namespace KokkosBatched { /// =========== template <> -template -KOKKOS_INLINE_FUNCTION int -SerialApplyQ::invoke( - const AViewType &A, const tViewType &t, const BViewType &B, - const wViewType &w) { - return SerialApplyQ_LeftForwardInternal::invoke( - B.extent(0), B.extent(1), A.extent(1), A.data(), A.stride_0(), - A.stride_1(), t.data(), t.stride_0(), B.data(), B.stride_0(), - B.stride_1(), w.data()); +template +KOKKOS_INLINE_FUNCTION int SerialApplyQ::invoke( + const AViewType &A, const tViewType &t, const BViewType &B, const wViewType &w) { + return SerialApplyQ_LeftForwardInternal::invoke(B.extent(0), B.extent(1), A.extent(1), A.data(), A.stride_0(), + A.stride_1(), t.data(), t.stride_0(), B.data(), B.stride_0(), + B.stride_1(), w.data()); } template <> -template -KOKKOS_INLINE_FUNCTION int -SerialApplyQ::invoke( - const AViewType &A, const tViewType &t, const BViewType &B, - const wViewType &w) { - return 
SerialApplyQ_LeftBackwardInternal::invoke( - B.extent(0), B.extent(1), A.extent(1), A.data(), A.stride_0(), - A.stride_1(), t.data(), t.stride_0(), B.data(), B.stride_0(), - B.stride_1(), w.data()); +template +KOKKOS_INLINE_FUNCTION int SerialApplyQ::invoke( + const AViewType &A, const tViewType &t, const BViewType &B, const wViewType &w) { + return SerialApplyQ_LeftBackwardInternal::invoke(B.extent(0), B.extent(1), A.extent(1), A.data(), A.stride_0(), + A.stride_1(), t.data(), t.stride_0(), B.data(), B.stride_0(), + B.stride_1(), w.data()); } template <> -template -KOKKOS_INLINE_FUNCTION int -SerialApplyQ::invoke( - const AViewType &A, const tViewType &t, const BViewType &B, - const wViewType &w) { - return SerialApplyQ_RightForwardInternal::invoke( - B.extent(0), B.extent(1), A.extent(1), A.data(), A.stride_0(), - A.stride_1(), t.data(), t.stride_0(), B.data(), B.stride_0(), - B.stride_1(), w.data()); +template +KOKKOS_INLINE_FUNCTION int SerialApplyQ::invoke( + const AViewType &A, const tViewType &t, const BViewType &B, const wViewType &w) { + return SerialApplyQ_RightForwardInternal::invoke(B.extent(0), B.extent(1), A.extent(1), A.data(), A.stride_0(), + A.stride_1(), t.data(), t.stride_0(), B.data(), B.stride_0(), + B.stride_1(), w.data()); } } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Internal.hpp index e8d6905964..dbb11df747 100644 --- a/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Internal.hpp @@ -32,13 +32,10 @@ namespace KokkosBatched { struct SerialApplyQ_LeftForwardInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, - const int k, - /* */ ValueType *A, const int as0, - const int as1, + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, const int k, + /* */ ValueType *A, const int as0, const int as1, /* */ ValueType *t, const int 
ts, - /* */ ValueType *B, const int bs0, - const int bs1, + /* */ ValueType *B, const int bs0, const int bs1, /* */ ValueType *w) { typedef ValueType value_type; @@ -75,9 +72,8 @@ struct SerialApplyQ_LeftForwardInternal { const int m_A2 = m - m_A0 - 1; /// ----------------------------------------------------- // left apply householder to partitioned B1 and B2 - SerialApplyLeftHouseholderInternal::invoke(m_A2, n, tau, A_part3x3.A21, - as0, B_part3x1.A1, bs1, - B_part3x1.A2, bs0, bs1, w); + SerialApplyLeftHouseholderInternal::invoke(m_A2, n, tau, A_part3x3.A21, as0, B_part3x1.A1, bs1, B_part3x1.A2, bs0, + bs1, w); /// ----------------------------------------------------- A_part2x2.mergeToABR(A_part3x3); @@ -90,13 +86,10 @@ struct SerialApplyQ_LeftForwardInternal { struct SerialApplyQ_LeftBackwardInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, - const int k, - /* */ ValueType *A, const int as0, - const int as1, + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, const int k, + /* */ ValueType *A, const int as0, const int as1, /* */ ValueType *t, const int ts, - /* */ ValueType *B, const int bs0, - const int bs1, + /* */ ValueType *B, const int bs0, const int bs1, /* */ ValueType *w) { typedef ValueType value_type; @@ -133,9 +126,8 @@ struct SerialApplyQ_LeftBackwardInternal { const int m_A2 = m - m_A0 - 1; /// ----------------------------------------------------- // left apply householder to partitioned B1 and B2 - SerialApplyLeftHouseholderInternal::invoke(m_A2, n, tau, A_part3x3.A21, - as0, B_part3x1.A1, bs1, - B_part3x1.A2, bs0, bs1, w); + SerialApplyLeftHouseholderInternal::invoke(m_A2, n, tau, A_part3x3.A21, as0, B_part3x1.A1, bs1, B_part3x1.A2, bs0, + bs1, w); /// ----------------------------------------------------- A_part2x2.mergeToATL(A_part3x3); @@ -148,13 +140,10 @@ struct SerialApplyQ_LeftBackwardInternal { struct SerialApplyQ_RightForwardInternal { template - KOKKOS_INLINE_FUNCTION static int 
invoke(const int m, const int n, - const int k, - /* */ ValueType *A, const int as0, - const int as1, + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, const int k, + /* */ ValueType *A, const int as0, const int as1, /* */ ValueType *t, const int ts, - /* */ ValueType *B, const int bs0, - const int bs1, + /* */ ValueType *B, const int bs0, const int bs1, /* */ ValueType *w) { typedef ValueType value_type; @@ -191,9 +180,8 @@ struct SerialApplyQ_RightForwardInternal { const int n_B2 = n - n_A0 - 1; /// ----------------------------------------------------- // right apply householder to partitioned B1 and B2 - SerialApplyRightHouseholderInternal::invoke(m, n_B2, tau, A_part3x3.A21, - as0, B_part1x3.A1, bs0, - B_part1x3.A2, bs0, bs1, w); + SerialApplyRightHouseholderInternal::invoke(m, n_B2, tau, A_part3x3.A21, as0, B_part1x3.A1, bs0, B_part1x3.A2, + bs0, bs1, w); /// ----------------------------------------------------- A_part2x2.mergeToATL(A_part3x3); t_part2x1.mergeToAT(t_part3x1); diff --git a/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Impl.hpp index 7f3a695d75..d6abd61a78 100644 --- a/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Impl.hpp @@ -28,53 +28,35 @@ namespace KokkosBatched { /// =============== template -struct TeamVectorApplyQ { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const tViewType &t, - const BViewType &B, - const wViewType &w) { - return TeamVectorApplyQ_LeftForwardInternal::invoke( - member, B.extent(0), B.extent(1), A.extent(1), A.data(), A.stride_0(), - A.stride_1(), t.data(), t.stride_0(), B.data(), B.stride_0(), - B.stride_1(), w.data()); +struct TeamVectorApplyQ { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const tViewType &t, + const BViewType &B, const wViewType &w) { 
+ return TeamVectorApplyQ_LeftForwardInternal::invoke(member, B.extent(0), B.extent(1), A.extent(1), A.data(), + A.stride_0(), A.stride_1(), t.data(), t.stride_0(), B.data(), + B.stride_0(), B.stride_1(), w.data()); } }; template -struct TeamVectorApplyQ { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const tViewType &t, - const BViewType &B, - const wViewType &w) { - return TeamVectorApplyQ_LeftBackwardInternal::invoke( - member, B.extent(0), B.extent(1), A.extent(1), A.data(), A.stride_0(), - A.stride_1(), t.data(), t.stride_0(), B.data(), B.stride_0(), - B.stride_1(), w.data()); +struct TeamVectorApplyQ { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const tViewType &t, + const BViewType &B, const wViewType &w) { + return TeamVectorApplyQ_LeftBackwardInternal::invoke(member, B.extent(0), B.extent(1), A.extent(1), A.data(), + A.stride_0(), A.stride_1(), t.data(), t.stride_0(), B.data(), + B.stride_0(), B.stride_1(), w.data()); } }; template -struct TeamVectorApplyQ { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const tViewType &t, - const BViewType &B, - const wViewType &w) { - return TeamVectorApplyQ_RightForwardInternal::invoke( - member, B.extent(0), B.extent(1), A.extent(1), A.data(), A.stride_0(), - A.stride_1(), t.data(), t.stride_0(), B.data(), B.stride_0(), - B.stride_1(), w.data()); +struct TeamVectorApplyQ { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const tViewType &t, + const BViewType &B, const wViewType &w) { + return TeamVectorApplyQ_RightForwardInternal::invoke(member, B.extent(0), B.extent(1), A.extent(1), A.data(), + A.stride_0(), A.stride_1(), t.data(), t.stride_0(), B.data(), + B.stride_0(), B.stride_1(), w.data()); } }; diff --git a/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Internal.hpp 
b/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Internal.hpp index 233daa8978..8fc6c8a78a 100644 --- a/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Internal.hpp @@ -32,12 +32,11 @@ namespace KokkosBatched { struct TeamVectorApplyQ_LeftForwardInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const int n, const int k, - /* */ ValueType *A, const int as0, const int as1, - /* */ ValueType *t, const int ts, - /* */ ValueType *B, const int bs0, const int bs1, - /* */ ValueType *w) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, const int k, + /* */ ValueType *A, const int as0, const int as1, + /* */ ValueType *t, const int ts, + /* */ ValueType *B, const int bs0, const int bs1, + /* */ ValueType *w) { typedef ValueType value_type; /// Given a matrix A that includes a series of householder vectors, @@ -73,9 +72,8 @@ struct TeamVectorApplyQ_LeftForwardInternal { const int m_A2 = m - m_A0 - 1; /// ----------------------------------------------------- // left apply householder to partitioned B1 and B2 - TeamVectorApplyLeftHouseholderInternal::invoke( - member, m_A2, n, tau, A_part3x3.A21, as0, B_part3x1.A1, bs1, - B_part3x1.A2, bs0, bs1, w); + TeamVectorApplyLeftHouseholderInternal::invoke(member, m_A2, n, tau, A_part3x3.A21, as0, B_part3x1.A1, bs1, + B_part3x1.A2, bs0, bs1, w); member.team_barrier(); /// ----------------------------------------------------- A_part2x2.mergeToABR(A_part3x3); @@ -88,12 +86,11 @@ struct TeamVectorApplyQ_LeftForwardInternal { struct TeamVectorApplyQ_LeftBackwardInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const int n, const int k, - /* */ ValueType *A, const int as0, const int as1, - /* */ ValueType *t, const int ts, - /* */ ValueType *B, const int bs0, const int bs1, - /* */ ValueType *w) { + 
KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, const int k, + /* */ ValueType *A, const int as0, const int as1, + /* */ ValueType *t, const int ts, + /* */ ValueType *B, const int bs0, const int bs1, + /* */ ValueType *w) { typedef ValueType value_type; /// Given a matrix A that includes a series of householder vectors, @@ -129,9 +126,8 @@ struct TeamVectorApplyQ_LeftBackwardInternal { const int m_A2 = m - m_A0 - 1; /// ----------------------------------------------------- // left apply householder to partitioned B1 and B2 - TeamVectorApplyLeftHouseholderInternal::invoke( - member, m_A2, n, tau, A_part3x3.A21, as0, B_part3x1.A1, bs1, - B_part3x1.A2, bs0, bs1, w); + TeamVectorApplyLeftHouseholderInternal::invoke(member, m_A2, n, tau, A_part3x3.A21, as0, B_part3x1.A1, bs1, + B_part3x1.A2, bs0, bs1, w); member.team_barrier(); /// ----------------------------------------------------- A_part2x2.mergeToATL(A_part3x3); @@ -144,12 +140,11 @@ struct TeamVectorApplyQ_LeftBackwardInternal { struct TeamVectorApplyQ_RightForwardInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const int n, const int k, - /* */ ValueType *A, const int as0, const int as1, - /* */ ValueType *t, const int ts, - /* */ ValueType *B, const int bs0, const int bs1, - /* */ ValueType *w) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, const int k, + /* */ ValueType *A, const int as0, const int as1, + /* */ ValueType *t, const int ts, + /* */ ValueType *B, const int bs0, const int bs1, + /* */ ValueType *w) { typedef ValueType value_type; /// Given a matrix A that includes a series of householder vectors, @@ -185,9 +180,8 @@ struct TeamVectorApplyQ_RightForwardInternal { const int n_B2 = n - n_A0 - 1; /// ----------------------------------------------------- // right apply householder to partitioned B1 and B2 - 
TeamVectorApplyRightHouseholderInternal::invoke( - member, m, n_B2, tau, A_part3x3.A21, as0, B_part1x3.A1, bs0, - B_part1x3.A2, bs0, bs1, w); + TeamVectorApplyRightHouseholderInternal::invoke(member, m, n_B2, tau, A_part3x3.A21, as0, B_part1x3.A1, bs0, + B_part1x3.A2, bs0, bs1, w); member.team_barrier(); /// ----------------------------------------------------- A_part2x2.mergeToATL(A_part3x3); diff --git a/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp b/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp index da9d607241..6d65ebc294 100644 --- a/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp @@ -28,11 +28,9 @@ namespace KokkosBatched { /// ==================== struct SerialAxpyInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType alpha, - const ValueType* KOKKOS_RESTRICT X, + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType alpha, const ValueType* KOKKOS_RESTRICT X, const int xs0, - /* */ ValueType* KOKKOS_RESTRICT Y, - const int ys0) { + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif @@ -42,10 +40,9 @@ struct SerialAxpyInternal { } template - KOKKOS_INLINE_FUNCTION static int invoke( - const int m, const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, - const ValueType* KOKKOS_RESTRICT X, const int xs0, - /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, + const ValueType* KOKKOS_RESTRICT X, const int xs0, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif @@ -55,17 +52,14 @@ struct SerialAxpyInternal { } template - KOKKOS_INLINE_FUNCTION static int invoke( - const int m, const int n, const ScalarType* KOKKOS_RESTRICT alpha, - const int alphas0, const ValueType* KOKKOS_RESTRICT X, const int xs0, - 
const int xs1, - /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1) { + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, const ScalarType* KOKKOS_RESTRICT alpha, + const int alphas0, const ValueType* KOKKOS_RESTRICT X, const int xs0, + const int xs1, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1) { if (xs0 > xs1) - for (int i = 0; i < m; ++i) - invoke(n, alpha[i * alphas0], X + i * xs0, xs1, Y + i * ys0, ys1); + for (int i = 0; i < m; ++i) invoke(n, alpha[i * alphas0], X + i * xs0, xs1, Y + i * ys0, ys1); else - for (int j = 0; j < n; ++j) - invoke(m, alpha, alphas0, X + j * xs1, xs0, Y + j * ys1, ys0); + for (int j = 0; j < n; ++j) invoke(m, alpha, alphas0, X + j * xs1, xs0, Y + j * ys1, ys0); return 0; } @@ -76,50 +70,38 @@ struct SerialAxpyInternal { /// ==================== struct TeamAxpyInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, - const int m, const ScalarType alpha, - const ValueType* KOKKOS_RESTRICT X, - const int xs0, - /* */ ValueType* KOKKOS_RESTRICT Y, - const int ys0) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int& i) { - Y[i * ys0] += alpha * X[i * xs0]; - }); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const int m, const ScalarType alpha, + const ValueType* KOKKOS_RESTRICT X, const int xs0, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int& i) { Y[i * ys0] += alpha * X[i * xs0]; }); // member.team_barrier(); return 0; } template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const int m, - const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, - const ValueType* KOKKOS_RESTRICT X, const int xs0, - /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int& i) { - Y[i * ys0] += alpha[i * alphas0] * X[i * xs0]; - }); + 
KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const int m, + const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, + const ValueType* KOKKOS_RESTRICT X, const int xs0, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), + [&](const int& i) { Y[i * ys0] += alpha[i * alphas0] * X[i * xs0]; }); // member.team_barrier(); return 0; } template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const int m, const int n, - const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, - const ValueType* KOKKOS_RESTRICT X, const int xs0, const int xs1, - /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const int m, const int n, + const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, + const ValueType* KOKKOS_RESTRICT X, const int xs0, const int xs1, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1) { if (m > n) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, m), [&](const int& i) { - SerialAxpyInternal::invoke(n, alpha[i * alphas0], X + i * xs0, xs1, - Y + i * ys0, ys1); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int& i) { + SerialAxpyInternal::invoke(n, alpha[i * alphas0], X + i * xs0, xs1, Y + i * ys0, ys1); + }); } else { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, n), [&](const int& j) { - SerialAxpyInternal::invoke(m, alpha, alphas0, X + j * xs1, xs0, - Y + j * ys1, ys0); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int& j) { + SerialAxpyInternal::invoke(m, alpha, alphas0, X + j * xs1, xs0, Y + j * ys1, ys0); + }); } // member.team_barrier(); return 0; @@ -131,45 +113,35 @@ struct TeamAxpyInternal { /// ======================== struct TeamVectorAxpyInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, - const int m, const ScalarType 
alpha, - const ValueType* KOKKOS_RESTRICT X, - const int xs0, - /* */ ValueType* KOKKOS_RESTRICT Y, - const int ys0) { - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int& i) { - Y[i * ys0] += alpha * X[i * xs0]; - }); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const int m, const ScalarType alpha, + const ValueType* KOKKOS_RESTRICT X, const int xs0, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int& i) { Y[i * ys0] += alpha * X[i * xs0]; }); // member.team_barrier(); return 0; } template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const int m, - const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, - const ValueType* KOKKOS_RESTRICT X, const int xs0, - /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int& i) { - Y[i * ys0] += alpha[i * alphas0] * X[i * xs0]; - }); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const int m, + const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, + const ValueType* KOKKOS_RESTRICT X, const int xs0, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), + [&](const int& i) { Y[i * ys0] += alpha[i * alphas0] * X[i * xs0]; }); // member.team_barrier(); return 0; } - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const int m, const int n, - const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, - const ValueType* KOKKOS_RESTRICT X, const int xs0, const int xs1, - /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1) { - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, m * n), [&](const int& iTemp) { - int i, j; - getIndices(iTemp, n, m, j, i); - Y[i * ys0 + j * ys1] += alpha[i * alphas0] * X[i * xs0 + j * xs1]; - }); + template + KOKKOS_INLINE_FUNCTION static int 
invoke(const MemberType& member, const int m, const int n, + const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, + const ValueType* KOKKOS_RESTRICT X, const int xs0, const int xs1, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1) { + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, m * n), [&](const int& iTemp) { + int i, j; + getIndices(iTemp, n, m, j, i); + Y[i * ys0 + j * ys1] += alpha[i * alphas0] * X[i * xs0 + j * xs1]; + }); // member.team_barrier(); return 0; } @@ -180,22 +152,14 @@ struct TeamVectorAxpyInternal { /// =========== template -KOKKOS_INLINE_FUNCTION int SerialAxpy::invoke(const alphaViewType& alpha, - const XViewType& X, - const YViewType& Y) { +KOKKOS_INLINE_FUNCTION int SerialAxpy::invoke(const alphaViewType& alpha, const XViewType& X, const YViewType& Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::axpy: XViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::axpy: YViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::axpy: alphaViewType is not a Kokkos::View."); - static_assert(XViewType::rank == 2, - "KokkosBatched::axpy: XViewType must have rank 2."); - static_assert(YViewType::rank == 2, - "KokkosBatched::axpy: YViewType must have rank 2."); - static_assert(alphaViewType::rank == 1, - "KokkosBatched::axpy: alphaViewType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::axpy: XViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::axpy: YViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::axpy: alphaViewType is not a Kokkos::View."); + static_assert(XViewType::rank == 2, "KokkosBatched::axpy: XViewType must have rank 2."); + static_assert(YViewType::rank == 2, "KokkosBatched::axpy: YViewType must have rank 2."); + static_assert(alphaViewType::rank == 1, "KokkosBatched::axpy: 
alphaViewType must have rank 1."); // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { @@ -217,11 +181,10 @@ KOKKOS_INLINE_FUNCTION int SerialAxpy::invoke(const alphaViewType& alpha, // No need to check if X.extent(0)==1 in the serial case as we don't // parallelize the kernel anyway. - return SerialAxpyInternal::template invoke< - typename alphaViewType::non_const_value_type, - typename XViewType::non_const_value_type>( - X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), X.data(), - X.stride_0(), X.stride_1(), Y.data(), Y.stride_0(), Y.stride_1()); + return SerialAxpyInternal::template invoke( + X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), X.data(), X.stride_0(), X.stride_1(), Y.data(), + Y.stride_0(), Y.stride_1()); } /// @@ -230,22 +193,15 @@ KOKKOS_INLINE_FUNCTION int SerialAxpy::invoke(const alphaViewType& alpha, template template -KOKKOS_INLINE_FUNCTION int TeamAxpy::invoke( - const MemberType& member, const alphaViewType& alpha, const XViewType& X, - const YViewType& Y) { +KOKKOS_INLINE_FUNCTION int TeamAxpy::invoke(const MemberType& member, const alphaViewType& alpha, + const XViewType& X, const YViewType& Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::axpy: XViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::axpy: YViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::axpy: alphaViewType is not a Kokkos::View."); - static_assert(XViewType::rank == 2, - "KokkosBatched::axpy: XViewType must have rank 2."); - static_assert(YViewType::rank == 2, - "KokkosBatched::axpy: YViewType must have rank 2."); - static_assert(alphaViewType::rank == 1, - "KokkosBatched::axpy: alphaViewType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::axpy: XViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, 
"KokkosBatched::axpy: YViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::axpy: alphaViewType is not a Kokkos::View."); + static_assert(XViewType::rank == 2, "KokkosBatched::axpy: XViewType must have rank 2."); + static_assert(YViewType::rank == 2, "KokkosBatched::axpy: YViewType must have rank 2."); + static_assert(alphaViewType::rank == 1, "KokkosBatched::axpy: alphaViewType must have rank 1."); // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { @@ -265,18 +221,15 @@ KOKKOS_INLINE_FUNCTION int TeamAxpy::invoke( #endif if (X.extent(0) == 1) { - KokkosBlas::Experimental::axpy( - member, alpha.data()[0], Kokkos::subview(X, 0, Kokkos::ALL), - Kokkos::subview(Y, 0, Kokkos::ALL)); + KokkosBlas::Experimental::axpy(member, alpha.data()[0], Kokkos::subview(X, 0, Kokkos::ALL), + Kokkos::subview(Y, 0, Kokkos::ALL)); return 0; } - return TeamAxpyInternal::template invoke< - MemberType, typename alphaViewType::non_const_value_type, - typename XViewType::non_const_value_type>( - member, X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), - X.data(), X.stride_0(), X.stride_1(), Y.data(), Y.stride_0(), - Y.stride_1()); + return TeamAxpyInternal::template invoke( + member, X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), X.data(), X.stride_0(), X.stride_1(), Y.data(), + Y.stride_0(), Y.stride_1()); } /// @@ -285,22 +238,15 @@ KOKKOS_INLINE_FUNCTION int TeamAxpy::invoke( template template -KOKKOS_INLINE_FUNCTION int TeamVectorAxpy::invoke( - const MemberType& member, const alphaViewType& alpha, const XViewType& X, - const YViewType& Y) { +KOKKOS_INLINE_FUNCTION int TeamVectorAxpy::invoke(const MemberType& member, const alphaViewType& alpha, + const XViewType& X, const YViewType& Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::axpy: XViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - 
"KokkosBatched::axpy: YViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::axpy: alphaViewType is not a Kokkos::View."); - static_assert(XViewType::rank == 2, - "KokkosBatched::axpy: XViewType must have rank 2."); - static_assert(YViewType::rank == 2, - "KokkosBatched::axpy: YViewType must have rank 2."); - static_assert(alphaViewType::rank == 1, - "KokkosBatched::axpy: alphaViewType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::axpy: XViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::axpy: YViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::axpy: alphaViewType is not a Kokkos::View."); + static_assert(XViewType::rank == 2, "KokkosBatched::axpy: XViewType must have rank 2."); + static_assert(YViewType::rank == 2, "KokkosBatched::axpy: YViewType must have rank 2."); + static_assert(alphaViewType::rank == 1, "KokkosBatched::axpy: alphaViewType must have rank 1."); // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { @@ -320,19 +266,15 @@ KOKKOS_INLINE_FUNCTION int TeamVectorAxpy::invoke( #endif if (X.extent(0) == 1) { - KokkosBlas::Experimental::axpy( - member, alpha.data()[0], Kokkos::subview(X, 0, Kokkos::ALL), - Kokkos::subview(Y, 0, Kokkos::ALL)); + KokkosBlas::Experimental::axpy(member, alpha.data()[0], Kokkos::subview(X, 0, Kokkos::ALL), + Kokkos::subview(Y, 0, Kokkos::ALL)); return 0; } - return TeamVectorAxpyInternal::invoke< - MemberType, typename alphaViewType::non_const_value_type, - typename XViewType::non_const_value_type, - typename XViewType::array_layout>(member, X.extent(0), X.extent(1), - alpha.data(), alpha.stride_0(), - X.data(), X.stride_0(), X.stride_1(), - Y.data(), Y.stride_0(), Y.stride_1()); + return TeamVectorAxpyInternal::invoke( + member, X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), X.data(), X.stride_0(), X.stride_1(), Y.data(), + Y.stride_0(), Y.stride_1()); } } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_Copy_Impl.hpp b/batched/dense/impl/KokkosBatched_Copy_Impl.hpp index 0a8c9d456f..e11106cc24 100644 --- a/batched/dense/impl/KokkosBatched_Copy_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Copy_Impl.hpp @@ -29,33 +29,24 @@ namespace KokkosBatched { template <> template -KOKKOS_INLINE_FUNCTION int SerialCopy::invoke( - const AViewType &A, const BViewType &B) { - return SerialCopyInternal::invoke(A.extent(0), A.data(), A.stride_0(), - B.data(), B.stride_0()); +KOKKOS_INLINE_FUNCTION int SerialCopy::invoke(const AViewType &A, const BViewType &B) { + return SerialCopyInternal::invoke(A.extent(0), A.data(), A.stride_0(), B.data(), B.stride_0()); } template <> template -KOKKOS_INLINE_FUNCTION int SerialCopy::invoke( - const AViewType &A, const BViewType &B) { - return SerialCopyInternal::invoke(A.extent(0), A.data(), A.stride_0(), - B.data(), B.stride_0()); +KOKKOS_INLINE_FUNCTION int SerialCopy::invoke(const AViewType &A, const BViewType &B) { 
+ return SerialCopyInternal::invoke(A.extent(0), A.data(), A.stride_0(), B.data(), B.stride_0()); } template <> template -KOKKOS_INLINE_FUNCTION int SerialCopy::invoke( - const AViewType &A, const BViewType &B) { +KOKKOS_INLINE_FUNCTION int SerialCopy::invoke(const AViewType &A, const BViewType &B) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::copy: AViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::copy: BViewType is not a Kokkos::View."); - static_assert(AViewType::rank == 2, - "KokkosBatched::copy: AViewType must have rank 2."); - static_assert(BViewType::rank == 2, - "KokkosBatched::copy: BViewType must have rank 2."); + static_assert(Kokkos::is_view::value, "KokkosBatched::copy: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::copy: BViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, "KokkosBatched::copy: AViewType must have rank 2."); + static_assert(BViewType::rank == 2, "KokkosBatched::copy: BViewType must have rank 2."); // Check compatibility of dimensions at run time. 
if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { @@ -66,24 +57,18 @@ KOKKOS_INLINE_FUNCTION int SerialCopy::invoke( return 1; } #endif - return SerialCopyInternal::invoke(A.extent(0), A.extent(1), A.data(), - A.stride_0(), A.stride_1(), B.data(), + return SerialCopyInternal::invoke(A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1()); } template <> template -KOKKOS_INLINE_FUNCTION int SerialCopy::invoke( - const AViewType &A, const BViewType &B) { +KOKKOS_INLINE_FUNCTION int SerialCopy::invoke(const AViewType &A, const BViewType &B) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::copy: AViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::copy: BViewType is not a Kokkos::View."); - static_assert(AViewType::rank == 2, - "KokkosBatched::copy: AViewType must have rank 2."); - static_assert(BViewType::rank == 2, - "KokkosBatched::copy: BViewType must have rank 2."); + static_assert(Kokkos::is_view::value, "KokkosBatched::copy: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::copy: BViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, "KokkosBatched::copy: AViewType must have rank 2."); + static_assert(BViewType::rank == 2, "KokkosBatched::copy: BViewType must have rank 2."); // Check compatibility of dimensions at run time. 
if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { @@ -94,8 +79,7 @@ KOKKOS_INLINE_FUNCTION int SerialCopy::invoke( return 1; } #endif - return SerialCopyInternal::invoke(A.extent(1), A.extent(0), A.data(), - A.stride_1(), A.stride_0(), B.data(), + return SerialCopyInternal::invoke(A.extent(1), A.extent(0), A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1()); } @@ -106,40 +90,28 @@ KOKKOS_INLINE_FUNCTION int SerialCopy::invoke( template struct TeamCopy { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const BViewType &B) { - return TeamCopyInternal::invoke(member, A.extent(0), A.data(), A.stride_0(), - B.data(), B.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B) { + return TeamCopyInternal::invoke(member, A.extent(0), A.data(), A.stride_0(), B.data(), B.stride_0()); } }; template struct TeamCopy { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const BViewType &B) { - return TeamCopyInternal::invoke(member, A.extent(0), A.data(), A.stride_0(), - B.data(), B.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B) { + return TeamCopyInternal::invoke(member, A.extent(0), A.data(), A.stride_0(), B.data(), B.stride_0()); } }; template struct TeamCopy { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::copy: AViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::copy: BViewType is not a Kokkos::View."); - static_assert(AViewType::rank == 2, - "KokkosBatched::copy: AViewType must 
have rank 2."); - static_assert(BViewType::rank == 2, - "KokkosBatched::copy: BViewType must have rank 2."); + static_assert(Kokkos::is_view::value, "KokkosBatched::copy: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::copy: BViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, "KokkosBatched::copy: AViewType must have rank 2."); + static_assert(BViewType::rank == 2, "KokkosBatched::copy: BViewType must have rank 2."); // Check compatibility of dimensions at run time. if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { @@ -147,18 +119,15 @@ struct TeamCopy { "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " "%d, " "B: %d x %d\n", - (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), - (int)B.extent(1)); + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); return 1; } #endif if (A.extent(0) == 1) { - return TeamCopy::invoke( - member, Kokkos::subview(A, 0, Kokkos::ALL), - Kokkos::subview(B, 0, Kokkos::ALL)); + return TeamCopy::invoke(member, Kokkos::subview(A, 0, Kokkos::ALL), + Kokkos::subview(B, 0, Kokkos::ALL)); } - return TeamCopyInternal::invoke(member, A.extent(0), A.extent(1), A.data(), - A.stride_0(), A.stride_1(), B.data(), + return TeamCopyInternal::invoke(member, A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1()); } }; @@ -166,18 +135,12 @@ struct TeamCopy { template struct TeamCopy { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::copy: AViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::copy: BViewType is not a Kokkos::View."); - static_assert(AViewType::rank == 2, - 
"KokkosBatched::copy: AViewType must have rank 2."); - static_assert(BViewType::rank == 2, - "KokkosBatched::copy: BViewType must have rank 2."); + static_assert(Kokkos::is_view::value, "KokkosBatched::copy: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::copy: BViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, "KokkosBatched::copy: AViewType must have rank 2."); + static_assert(BViewType::rank == 2, "KokkosBatched::copy: BViewType must have rank 2."); // Check compatibility of dimensions at run time. if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { @@ -185,18 +148,15 @@ struct TeamCopy { "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " "%d, " "B: %d x %d\n", - (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), - (int)B.extent(1)); + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); return 1; } #endif if (A.extent(1) == 1) { - return TeamCopy::invoke( - member, Kokkos::subview(A, Kokkos::ALL, 0), - Kokkos::subview(B, Kokkos::ALL, 0)); + return TeamCopy::invoke(member, Kokkos::subview(A, Kokkos::ALL, 0), + Kokkos::subview(B, Kokkos::ALL, 0)); } - return TeamCopyInternal::invoke(member, A.extent(1), A.extent(0), A.data(), - A.stride_1(), A.stride_0(), B.data(), + return TeamCopyInternal::invoke(member, A.extent(1), A.extent(0), A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1()); } }; @@ -208,40 +168,28 @@ struct TeamCopy { template struct TeamVectorCopy { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const BViewType &B) { - return TeamVectorCopyInternal::invoke(member, A.extent(0), A.data(), - A.stride_0(), B.data(), B.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B) { + return TeamVectorCopyInternal::invoke(member, A.extent(0), A.data(), A.stride_0(), B.data(), B.stride_0()); } }; template 
struct TeamVectorCopy { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const BViewType &B) { - return TeamVectorCopyInternal::invoke(member, A.extent(0), A.data(), - A.stride_0(), B.data(), B.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B) { + return TeamVectorCopyInternal::invoke(member, A.extent(0), A.data(), A.stride_0(), B.data(), B.stride_0()); } }; template struct TeamVectorCopy { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::copy: AViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::copy: BViewType is not a Kokkos::View."); - static_assert(AViewType::rank == 2, - "KokkosBatched::copy: AViewType must have rank 2."); - static_assert(BViewType::rank == 2, - "KokkosBatched::copy: BViewType must have rank 2."); + static_assert(Kokkos::is_view::value, "KokkosBatched::copy: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::copy: BViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, "KokkosBatched::copy: AViewType must have rank 2."); + static_assert(BViewType::rank == 2, "KokkosBatched::copy: BViewType must have rank 2."); // Check compatibility of dimensions at run time. 
if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { @@ -249,18 +197,15 @@ struct TeamVectorCopy { "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " "%d, " "B: %d x %d\n", - (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), - (int)B.extent(1)); + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); return 1; } #endif if (A.extent(0) == 1) { - return TeamVectorCopy::invoke( - member, Kokkos::subview(A, 0, Kokkos::ALL), - Kokkos::subview(B, 0, Kokkos::ALL)); + return TeamVectorCopy::invoke(member, Kokkos::subview(A, 0, Kokkos::ALL), + Kokkos::subview(B, 0, Kokkos::ALL)); } - return TeamVectorCopyInternal::invoke(member, A.extent(0), A.extent(1), - A.data(), A.stride_0(), A.stride_1(), + return TeamVectorCopyInternal::invoke(member, A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1()); } }; @@ -268,18 +213,12 @@ struct TeamVectorCopy { template struct TeamVectorCopy { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::copy: AViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::copy: BViewType is not a Kokkos::View."); - static_assert(AViewType::rank == 2, - "KokkosBatched::copy: AViewType must have rank 2."); - static_assert(BViewType::rank == 2, - "KokkosBatched::copy: BViewType must have rank 2."); + static_assert(Kokkos::is_view::value, "KokkosBatched::copy: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::copy: BViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, "KokkosBatched::copy: AViewType must have rank 2."); + static_assert(BViewType::rank == 2, "KokkosBatched::copy: BViewType must 
have rank 2."); // Check compatibility of dimensions at run time. if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { @@ -287,18 +226,15 @@ struct TeamVectorCopy { "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " "%d, " "B: %d x %d\n", - (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), - (int)B.extent(1)); + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); return 1; } #endif if (A.extent(1) == 1) { - return TeamVectorCopy::invoke( - member, Kokkos::subview(A, Kokkos::ALL, 0), - Kokkos::subview(B, Kokkos::ALL, 0)); + return TeamVectorCopy::invoke(member, Kokkos::subview(A, Kokkos::ALL, 0), + Kokkos::subview(B, Kokkos::ALL, 0)); } - return TeamVectorCopyInternal::invoke(member, A.extent(1), A.extent(0), - A.data(), A.stride_1(), A.stride_0(), + return TeamVectorCopyInternal::invoke(member, A.extent(1), A.extent(0), A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1()); } }; diff --git a/batched/dense/impl/KokkosBatched_Copy_Internal.hpp b/batched/dense/impl/KokkosBatched_Copy_Internal.hpp index ca59e4f79c..004c62646a 100644 --- a/batched/dense/impl/KokkosBatched_Copy_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Copy_Internal.hpp @@ -28,9 +28,8 @@ namespace KokkosBatched { struct SerialCopyInternal { template - KOKKOS_FORCEINLINE_FUNCTION static int invoke( - const int m, const ValueType *KOKKOS_RESTRICT A, const int as0, - /* */ ValueType *KOKKOS_RESTRICT B, const int bs0) { + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const int m, const ValueType *KOKKOS_RESTRICT A, const int as0, + /* */ ValueType *KOKKOS_RESTRICT B, const int bs0) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif @@ -39,10 +38,9 @@ struct SerialCopyInternal { return 0; } template - KOKKOS_FORCEINLINE_FUNCTION static int invoke( - const int m, const int n, const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, - /* */ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { + 
KOKKOS_FORCEINLINE_FUNCTION static int invoke(const int m, const int n, const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /* */ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { if (as1 < as0) for (int i = 0; i < m; ++i) invoke(n, A + i * as0, as1, B + i * bs0, bs1); else @@ -56,30 +54,23 @@ struct SerialCopyInternal { /// ================== struct TeamCopyInternal { template - KOKKOS_FORCEINLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const ValueType *KOKKOS_RESTRICT A, - const int as0, - /* */ ValueType *KOKKOS_RESTRICT B, const int bs0) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), - [&](const int &i) { B[i * bs0] = A[i * as0]; }); + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const int m, + const ValueType *KOKKOS_RESTRICT A, const int as0, + /* */ ValueType *KOKKOS_RESTRICT B, const int bs0) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) { B[i * bs0] = A[i * as0]; }); // member.team_barrier(); return 0; } template - KOKKOS_FORCEINLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const int n, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - /* */ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + /* */ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { if (m >= n) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, m), [&](const int &i) { - SerialCopyInternal::invoke(n, A + i * as0, as1, B + i * bs0, bs1); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), + [&](const int &i) { SerialCopyInternal::invoke(n, A + i * as0, as1, B + i * bs0, bs1); }); } else { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, n), [&](const int &j) { - SerialCopyInternal::invoke(m, A + j 
* as1, as0, B + j * bs1, bs0); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), + [&](const int &j) { SerialCopyInternal::invoke(m, A + j * as1, as0, B + j * bs1, bs0); }); } // member.team_barrier(); return 0; @@ -91,36 +82,27 @@ struct TeamCopyInternal { /// ======================== struct TeamVectorCopyInternal { template - KOKKOS_FORCEINLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const ValueType *KOKKOS_RESTRICT A, - const int as0, - /* */ ValueType *KOKKOS_RESTRICT B, const int bs0) { - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), - [&](const int &i) { B[i * bs0] = A[i * as0]; }); + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const int m, + const ValueType *KOKKOS_RESTRICT A, const int as0, + /* */ ValueType *KOKKOS_RESTRICT B, const int bs0) { + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int &i) { B[i * bs0] = A[i * as0]; }); // member.team_barrier(); return 0; } template - KOKKOS_FORCEINLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const int n, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - /* */ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + /* */ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { if (as0 > as1) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, m), [&](const int &i) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), - [&](const int &j) { - B[i * bs0 + j * bs1] = A[i * as0 + j * as1]; - }); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), + [&](const int &j) { B[i * bs0 + j * bs1] = A[i * as0 + j * as1]; }); + }); } else { - Kokkos::parallel_for( - 
Kokkos::ThreadVectorRange(member, m), [&](const int &i) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), - [&](const int &j) { - B[i * bs0 + j * bs1] = A[i * as0 + j * as1]; - }); - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, m), [&](const int &i) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), + [&](const int &j) { B[i * bs0 + j * bs1] = A[i * as0 + j * as1]; }); + }); } // member.team_barrier(); return 0; diff --git a/batched/dense/impl/KokkosBatched_Dot_Internal.hpp b/batched/dense/impl/KokkosBatched_Dot_Internal.hpp index a0960c621b..48d1b1f1ac 100644 --- a/batched/dense/impl/KokkosBatched_Dot_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Dot_Internal.hpp @@ -31,10 +31,9 @@ struct SerialDotInternal { // i \in [0,m) // C = conj(A(:))*B(:) template - KOKKOS_FORCEINLINE_FUNCTION static int invoke( - const int m, const ValueType *KOKKOS_RESTRICT A, const int as0, - const ValueType *KOKKOS_RESTRICT B, const int bs0, - /* */ MagnitudeType *KOKKOS_RESTRICT C) { + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const int m, const ValueType *KOKKOS_RESTRICT A, const int as0, + const ValueType *KOKKOS_RESTRICT B, const int bs0, + /* */ MagnitudeType *KOKKOS_RESTRICT C) { using ats = Kokkos::ArithTraits; C[0] = ValueType(0); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) @@ -50,13 +49,11 @@ struct SerialDotInternal { // j \in [0,n), i \in [0,m) // C(j) = conj(A(:,j))*B(:,j) template - KOKKOS_INLINE_FUNCTION static int invoke( - const int m, const int n, const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, const ValueType *KOKKOS_RESTRICT B, - const int bs0, const int bs1, - /* */ MagnitudeType *KOKKOS_RESTRICT C, const int cs) { - for (int j = 0; j < n; ++j) - invoke(m, A + j * as1, as0, B + j * bs1, bs0, C + j * cs); + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, const ValueType *KOKKOS_RESTRICT B, const int bs0, + const int 
bs1, + /* */ MagnitudeType *KOKKOS_RESTRICT C, const int cs) { + for (int j = 0; j < n; ++j) invoke(m, A + j * as1, as0, B + j * bs1, bs0, C + j * cs); return 0; } }; @@ -69,10 +66,10 @@ struct SerialDotInternal { // C = conj(A(:))*B(:) struct TeamDotInternal { template - KOKKOS_FORCEINLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const ValueType *KOKKOS_RESTRICT A, - const int as0, const ValueType *KOKKOS_RESTRICT B, const int bs0, - /* */ MagnitudeType *KOKKOS_RESTRICT C) { + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const int m, + const ValueType *KOKKOS_RESTRICT A, const int as0, + const ValueType *KOKKOS_RESTRICT B, const int bs0, + /* */ MagnitudeType *KOKKOS_RESTRICT C) { using ats = Kokkos::ArithTraits; ValueType t(0); Kokkos::parallel_reduce( @@ -89,11 +86,10 @@ struct TeamDotInternal { // j \in [0,n), i \in [0,m) // C(j) = conj(A(:,j))*B(:,j) template - KOKKOS_FORCEINLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const int n, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1, - /* */ MagnitudeType *KOKKOS_RESTRICT C, const int cs) { + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1, + /* */ MagnitudeType *KOKKOS_RESTRICT C, const int cs) { using ats = Kokkos::ArithTraits; Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &j) { ValueType t(0); @@ -117,10 +113,10 @@ struct TeamDotInternal { // C = conj(A(:))*B(:) struct TeamVectorDotInternal { template - KOKKOS_FORCEINLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const ValueType *KOKKOS_RESTRICT A, - const int as0, const ValueType *KOKKOS_RESTRICT B, const int bs0, - /* */ MagnitudeType *KOKKOS_RESTRICT C) { + 
KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const int m, + const ValueType *KOKKOS_RESTRICT A, const int as0, + const ValueType *KOKKOS_RESTRICT B, const int bs0, + /* */ MagnitudeType *KOKKOS_RESTRICT C) { using ats = Kokkos::ArithTraits; ValueType t(0); Kokkos::parallel_reduce( @@ -137,11 +133,10 @@ struct TeamVectorDotInternal { // j \in [0,n), i \in [0,m) // C(j) = conj(A(:,j))*B(:,j) template - KOKKOS_FORCEINLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const int n, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1, - /* */ MagnitudeType *KOKKOS_RESTRICT C, const int cs) { + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1, + /* */ MagnitudeType *KOKKOS_RESTRICT C, const int cs) { using ats = Kokkos::ArithTraits; Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &j) { ValueType t(0); @@ -167,30 +162,21 @@ struct TeamVectorDotInternal { template <> struct SerialDot { template - KOKKOS_INLINE_FUNCTION static int invoke(const XViewType &X, - const YViewType &Y, - const NormViewType &dot) { + KOKKOS_INLINE_FUNCTION static int invoke(const XViewType &X, const YViewType &Y, const NormViewType &dot) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: XViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: YViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: NormViewType is not a Kokkos::View."); - static_assert(XViewType::rank == 2, - "KokkosBatched::dot: XViewType must have rank 2."); - static_assert(YViewType::rank == 2, - "KokkosBatched::dot: YViewType must have rank 2."); - 
static_assert(NormViewType::rank == 1, - "KokkosBatched::dot: NormViewType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: XViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: YViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: NormViewType is not a Kokkos::View."); + static_assert(XViewType::rank == 2, "KokkosBatched::dot: XViewType must have rank 2."); + static_assert(YViewType::rank == 2, "KokkosBatched::dot: YViewType must have rank 2."); + static_assert(NormViewType::rank == 1, "KokkosBatched::dot: NormViewType must have rank 1."); // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { Kokkos::printf( "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); return 1; } if (X.extent(1) != dot.extent(0)) { @@ -202,41 +188,31 @@ struct SerialDot { return 1; } #endif - return SerialDotInternal::template invoke< - typename XViewType::non_const_value_type, - typename NormViewType::non_const_value_type>( - X.extent(0), X.extent(1), X.data(), X.stride_0(), X.stride_1(), - Y.data(), Y.stride_0(), Y.stride_1(), dot.data(), dot.stride_0()); + return SerialDotInternal::template invoke( + X.extent(0), X.extent(1), X.data(), X.stride_0(), X.stride_1(), Y.data(), Y.stride_0(), Y.stride_1(), + dot.data(), dot.stride_0()); } }; template <> struct SerialDot { template - KOKKOS_INLINE_FUNCTION static int invoke(const XViewType &X, - const YViewType &Y, - const NormViewType &dot) { + KOKKOS_INLINE_FUNCTION static int invoke(const XViewType &X, const YViewType &Y, const NormViewType &dot) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: XViewType is not a Kokkos::View."); 
- static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: YViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: NormViewType is not a Kokkos::View."); - static_assert(XViewType::rank == 2, - "KokkosBatched::dot: XViewType must have rank 2."); - static_assert(YViewType::rank == 2, - "KokkosBatched::dot: YViewType must have rank 2."); - static_assert(NormViewType::rank == 1, - "KokkosBatched::dot: NormViewType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: XViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: YViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: NormViewType is not a Kokkos::View."); + static_assert(XViewType::rank == 2, "KokkosBatched::dot: XViewType must have rank 2."); + static_assert(YViewType::rank == 2, "KokkosBatched::dot: YViewType must have rank 2."); + static_assert(NormViewType::rank == 1, "KokkosBatched::dot: NormViewType must have rank 1."); // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { Kokkos::printf( "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); return 1; } if (X.extent(0) != dot.extent(0)) { @@ -247,11 +223,10 @@ struct SerialDot { return 1; } #endif - return SerialDotInternal::template invoke< - typename XViewType::non_const_value_type, - typename NormViewType::non_const_value_type>( - X.extent(1), X.extent(0), X.data(), X.stride_1(), X.stride_0(), - Y.data(), Y.stride_1(), Y.stride_0(), dot.data(), dot.stride_0()); + return SerialDotInternal::template invoke( + X.extent(1), X.extent(0), X.data(), X.stride_1(), X.stride_0(), Y.data(), Y.stride_1(), Y.stride_0(), + dot.data(), dot.stride_0()); } }; @@ -262,31 +237,22 @@ struct SerialDot { template struct TeamDot { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const XViewType &X, - const YViewType &Y, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const XViewType &X, const YViewType &Y, const NormViewType &dot) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: XViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: YViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: NormViewType is not a Kokkos::View."); - static_assert(XViewType::rank == 2, - "KokkosBatched::dot: XViewType must have rank 2."); - static_assert(YViewType::rank == 2, - "KokkosBatched::dot: YViewType must have rank 2."); - static_assert(NormViewType::rank == 1, - "KokkosBatched::dot: NormViewType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: XViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: YViewType is 
not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: NormViewType is not a Kokkos::View."); + static_assert(XViewType::rank == 2, "KokkosBatched::dot: XViewType must have rank 2."); + static_assert(YViewType::rank == 2, "KokkosBatched::dot: YViewType must have rank 2."); + static_assert(NormViewType::rank == 1, "KokkosBatched::dot: NormViewType must have rank 1."); // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { Kokkos::printf( "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); return 1; } if (X.extent(1) != dot.extent(0)) { @@ -300,48 +266,37 @@ struct TeamDot { #endif if (X.extent(1) == 1) { - dot(0) = KokkosBlas::Experimental::dot( - member, Kokkos::subview(X, Kokkos::ALL, 0), - Kokkos::subview(Y, Kokkos::ALL, 0)); + dot(0) = + KokkosBlas::Experimental::dot(member, Kokkos::subview(X, Kokkos::ALL, 0), Kokkos::subview(Y, Kokkos::ALL, 0)); return 0; } - return TeamDotInternal::template invoke< - MemberType, typename XViewType::non_const_value_type, - typename NormViewType::non_const_value_type>( - member, X.extent(0), X.extent(1), X.data(), X.stride_0(), X.stride_1(), - Y.data(), Y.stride_0(), Y.stride_1(), dot.data(), dot.stride_0()); + return TeamDotInternal::template invoke( + member, X.extent(0), X.extent(1), X.data(), X.stride_0(), X.stride_1(), Y.data(), Y.stride_0(), Y.stride_1(), + dot.data(), dot.stride_0()); } }; template struct TeamDot { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const XViewType &X, - const YViewType &Y, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const XViewType &X, const YViewType &Y, const NormViewType &dot) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - 
"KokkosBatched::dot: XViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: YViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: NormViewType is not a Kokkos::View."); - static_assert(XViewType::rank == 2, - "KokkosBatched::dot: XViewType must have rank 2."); - static_assert(YViewType::rank == 2, - "KokkosBatched::dot: YViewType must have rank 2."); - static_assert(NormViewType::rank == 1, - "KokkosBatched::dot: NormViewType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: XViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: YViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: NormViewType is not a Kokkos::View."); + static_assert(XViewType::rank == 2, "KokkosBatched::dot: XViewType must have rank 2."); + static_assert(YViewType::rank == 2, "KokkosBatched::dot: YViewType must have rank 2."); + static_assert(NormViewType::rank == 1, "KokkosBatched::dot: NormViewType must have rank 1."); // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { Kokkos::printf( "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); return 1; } if (X.extent(0) != dot.extent(0)) { @@ -354,17 +309,15 @@ struct TeamDot { #endif if (X.extent(0) == 1) { - dot(0) = KokkosBlas::Experimental::dot( - member, Kokkos::subview(X, 0, Kokkos::ALL), - Kokkos::subview(Y, 0, Kokkos::ALL)); + dot(0) = + KokkosBlas::Experimental::dot(member, Kokkos::subview(X, 0, Kokkos::ALL), Kokkos::subview(Y, 0, Kokkos::ALL)); return 0; } - return TeamDotInternal::template invoke< - MemberType, typename XViewType::non_const_value_type, - typename NormViewType::non_const_value_type>( - member, X.extent(1), X.extent(0), X.data(), X.stride_1(), X.stride_0(), - Y.data(), Y.stride_1(), Y.stride_0(), dot.data(), dot.stride_0()); + return TeamDotInternal::template invoke( + member, X.extent(1), X.extent(0), X.data(), X.stride_1(), X.stride_0(), Y.data(), Y.stride_1(), Y.stride_0(), + dot.data(), dot.stride_0()); } }; @@ -375,31 +328,22 @@ struct TeamDot { template struct TeamVectorDot { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const XViewType &X, - const YViewType &Y, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const XViewType &X, const YViewType &Y, const NormViewType &dot) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: XViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: YViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: NormViewType is not a Kokkos::View."); - static_assert(XViewType::rank == 2, - "KokkosBatched::dot: XViewType must have rank 2."); - static_assert(YViewType::rank == 2, - "KokkosBatched::dot: 
YViewType must have rank 2."); - static_assert(NormViewType::rank == 1, - "KokkosBatched::dot: NormViewType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: XViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: YViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: NormViewType is not a Kokkos::View."); + static_assert(XViewType::rank == 2, "KokkosBatched::dot: XViewType must have rank 2."); + static_assert(YViewType::rank == 2, "KokkosBatched::dot: YViewType must have rank 2."); + static_assert(NormViewType::rank == 1, "KokkosBatched::dot: NormViewType must have rank 1."); // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { Kokkos::printf( "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); return 1; } if (X.extent(1) != dot.extent(0)) { @@ -413,48 +357,37 @@ struct TeamVectorDot { #endif if (X.extent(1) == 1) { - dot(0) = KokkosBlas::Experimental::dot( - member, Kokkos::subview(X, Kokkos::ALL, 0), - Kokkos::subview(Y, Kokkos::ALL, 0)); + dot(0) = + KokkosBlas::Experimental::dot(member, Kokkos::subview(X, Kokkos::ALL, 0), Kokkos::subview(Y, Kokkos::ALL, 0)); return 0; } - return TeamVectorDotInternal::template invoke< - MemberType, typename XViewType::non_const_value_type, - typename NormViewType::non_const_value_type>( - member, X.extent(0), X.extent(1), X.data(), X.stride_0(), X.stride_1(), - Y.data(), Y.stride_0(), Y.stride_1(), dot.data(), dot.stride_0()); + return TeamVectorDotInternal::template invoke( + member, X.extent(0), X.extent(1), X.data(), X.stride_0(), X.stride_1(), Y.data(), Y.stride_0(), Y.stride_1(), + dot.data(), dot.stride_0()); } }; template struct TeamVectorDot { template - 
KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const XViewType &X, - const YViewType &Y, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const XViewType &X, const YViewType &Y, const NormViewType &dot) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: XViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: YViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: NormViewType is not a Kokkos::View."); - static_assert(XViewType::rank == 2, - "KokkosBatched::dot: XViewType must have rank 2."); - static_assert(YViewType::rank == 2, - "KokkosBatched::dot: YViewType must have rank 2."); - static_assert(NormViewType::rank == 1, - "KokkosBatched::dot: NormViewType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: XViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: YViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: NormViewType is not a Kokkos::View."); + static_assert(XViewType::rank == 2, "KokkosBatched::dot: XViewType must have rank 2."); + static_assert(YViewType::rank == 2, "KokkosBatched::dot: YViewType must have rank 2."); + static_assert(NormViewType::rank == 1, "KokkosBatched::dot: NormViewType must have rank 1."); // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { Kokkos::printf( "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); return 1; } if (X.extent(0) != dot.extent(0)) { @@ -467,17 +400,15 @@ struct TeamVectorDot { #endif if (X.extent(0) == 1) { - dot(0) = KokkosBlas::Experimental::dot( - member, Kokkos::subview(X, 0, Kokkos::ALL), - Kokkos::subview(Y, 0, Kokkos::ALL)); + dot(0) = + KokkosBlas::Experimental::dot(member, Kokkos::subview(X, 0, Kokkos::ALL), Kokkos::subview(Y, 0, Kokkos::ALL)); return 0; } - return TeamVectorDotInternal::template invoke< - MemberType, typename XViewType::non_const_value_type, - typename NormViewType::non_const_value_type>( - member, X.extent(1), X.extent(0), X.data(), X.stride_1(), X.stride_0(), - Y.data(), Y.stride_1(), Y.stride_0(), dot.data(), dot.stride_0()); + return TeamVectorDotInternal::template invoke( + member, X.extent(1), X.extent(0), X.data(), X.stride_1(), X.stride_0(), Y.data(), Y.stride_1(), Y.stride_0(), + dot.data(), dot.stride_0()); } }; diff --git a/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Impl.hpp index 49a7184e39..8ca3b09e59 100644 --- a/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Impl.hpp @@ -26,38 +26,28 @@ namespace KokkosBatched { /// /// Serial Impl /// =========== -template -KOKKOS_INLINE_FUNCTION int SerialEigendecomposition::invoke( - const AViewType &A, const EViewType &er, const EViewType &ei, - const UViewType &UL, const UViewType &UR, const WViewType &W) { +template +KOKKOS_INLINE_FUNCTION int SerialEigendecomposition::invoke(const AViewType &A, const EViewType &er, + const EViewType &ei, const UViewType &UL, + const UViewType &UR, 
const WViewType &W) { /// view checking const int m = A.extent(0); assert(m == int(A.extent(1)) && "Eigendecomposition: A is not square"); - assert(m == int(er.extent(0)) && - "Eigendecomposition: Length of er does not match to A's dimension"); - assert(m == int(ei.extent(0)) && - "Eigendecomposition: Length of ei does not match to A's dimension"); - assert(m == int(UL.extent(0)) && - "Eigendecomposition: Length of UL does not match to A's dimension"); - assert(m == int(UL.extent(1)) && - "Eigendecomposition: Width of UL does not match to A's dimension"); - assert(m == int(UR.extent(0)) && - "Eigendecomposition: Length of UR does not match to A's dimension"); - assert(m == int(UR.extent(1)) && - "Eigendecomposition: Width of UR does not match to A's dimension"); + assert(m == int(er.extent(0)) && "Eigendecomposition: Length of er does not match to A's dimension"); + assert(m == int(ei.extent(0)) && "Eigendecomposition: Length of ei does not match to A's dimension"); + assert(m == int(UL.extent(0)) && "Eigendecomposition: Length of UL does not match to A's dimension"); + assert(m == int(UL.extent(1)) && "Eigendecomposition: Width of UL does not match to A's dimension"); + assert(m == int(UR.extent(0)) && "Eigendecomposition: Length of UR does not match to A's dimension"); + assert(m == int(UR.extent(1)) && "Eigendecomposition: Width of UR does not match to A's dimension"); // assert(int(W.extent(0)) >= int(2*m*m+5*m) && "Eigendecomposition: workspace // size is too small"); - assert(int(W.stride(0)) == int(1) && - "Eigendecomposition: Provided workspace is not contiguous"); + assert(int(W.stride(0)) == int(1) && "Eigendecomposition: Provided workspace is not contiguous"); /// static assert A,er,ei,UL,UR,W has the same value_type /// static assert all views have the same memory space return m ? 
SerialEigendecompositionInternal ::invoke( - A.extent(0), A.data(), A.stride(0), A.stride(1), er.data(), - er.stride(0), ei.data(), ei.stride(0), UL.data(), UL.stride(0), - UL.stride(1), UR.data(), UR.stride(0), UR.stride(1), W.data(), - W.extent(0)) + A.extent(0), A.data(), A.stride(0), A.stride(1), er.data(), er.stride(0), ei.data(), ei.stride(0), + UL.data(), UL.stride(0), UL.stride(1), UR.data(), UR.stride(0), UR.stride(1), W.data(), W.extent(0)) : 0; } diff --git a/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Internal.hpp index c857de19c2..b1cfb6ef25 100644 --- a/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Internal.hpp @@ -61,11 +61,10 @@ struct SerialEigendecompositionInternal { /// [out]w, [in]wlen /// Workspace template - KOKKOS_INLINE_FUNCTION static int device_invoke( - const int m, RealType* A, const int as0, const int as1, RealType* er, - const int ers, RealType* ei, const int eis, RealType* UL, const int uls0, - const int uls1, RealType* UR, const int urs0, const int urs1, RealType* w, - const int wlen) { + KOKKOS_INLINE_FUNCTION static int device_invoke(const int m, RealType* A, const int as0, const int as1, RealType* er, + const int ers, RealType* ei, const int eis, RealType* UL, + const int uls0, const int uls1, RealType* UR, const int urs0, + const int urs1, RealType* w, const int wlen) { /// until debugging is done, comment out the code /// testing happens only for TPLs on host. 
static_assert(false, @@ -336,14 +335,10 @@ struct SerialEigendecompositionInternal { } template - inline static int host_invoke(const int m, RealType* A, const int as0, - const int as1, RealType* er, const int ers, - RealType* ei, const int eis, RealType* UL, - const int uls0, const int uls1, RealType* UR, - const int urs0, const int urs1, RealType* w, - const int wlen) { -#if defined(__KOKKOSBATCHED_ENABLE_LAPACKE__) || \ - defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) + inline static int host_invoke(const int m, RealType* A, const int as0, const int as1, RealType* er, const int ers, + RealType* ei, const int eis, RealType* UL, const int uls0, const int uls1, RealType* UR, + const int urs0, const int urs1, RealType* w, const int wlen) { +#if defined(__KOKKOSBATCHED_ENABLE_LAPACKE__) || defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) int matrix_layout(0), lda(0), uls(0), urs(0); if (as0 == 1) { assert(uls0 == 1 && "UL is not column major"); @@ -365,33 +360,29 @@ struct SerialEigendecompositionInternal { } assert(matrix_layout != 0 && "Either stride of A is not unit"); if (std::is_same::value) { - LAPACKE_sgeev(matrix_layout, 'V', 'V', m, (float*)A, lda, (float*)er, - (float*)ei, (float*)UL, uls, (float*)UR, urs); + LAPACKE_sgeev(matrix_layout, 'V', 'V', m, (float*)A, lda, (float*)er, (float*)ei, (float*)UL, uls, (float*)UR, + urs); } else if (std::is_same::value) { - LAPACKE_dgeev(matrix_layout, 'V', 'V', m, (double*)A, lda, (double*)er, - (double*)ei, (double*)UL, uls, (double*)UR, urs); + LAPACKE_dgeev(matrix_layout, 'V', 'V', m, (double*)A, lda, (double*)er, (double*)ei, (double*)UL, uls, + (double*)UR, urs); } else { // no complex is needed for this moment assert(false && "complex type is not supported"); } #else - device_invoke(m, A, as0, as1, er, ers, ei, eis, UL, uls0, uls1, UR, urs0, - urs1, w, wlen); + device_invoke(m, A, as0, as1, er, ers, ei, eis, UL, uls0, uls1, UR, urs0, urs1, w, wlen); #endif return 0; } template - KOKKOS_INLINE_FUNCTION static int invoke( 
- const int m, RealType* A, const int as0, const int as1, RealType* er, - const int ers, RealType* ei, const int eis, RealType* UL, const int uls0, - const int uls1, RealType* UR, const int urs0, const int urs1, RealType* w, - const int wlen) { - KOKKOS_IF_ON_HOST((host_invoke(m, A, as0, as1, er, ers, ei, eis, UL, uls0, - uls1, UR, urs0, urs1, w, wlen);)) - KOKKOS_IF_ON_DEVICE((device_invoke(m, A, as0, as1, er, ers, ei, eis, UL, - uls0, uls1, UR, urs0, urs1, w, wlen);)) + KOKKOS_INLINE_FUNCTION static int invoke(const int m, RealType* A, const int as0, const int as1, RealType* er, + const int ers, RealType* ei, const int eis, RealType* UL, const int uls0, + const int uls1, RealType* UR, const int urs0, const int urs1, RealType* w, + const int wlen) { + KOKKOS_IF_ON_HOST((host_invoke(m, A, as0, as1, er, ers, ei, eis, UL, uls0, uls1, UR, urs0, urs1, w, wlen);)) + KOKKOS_IF_ON_DEVICE((device_invoke(m, A, as0, as1, er, ers, ei, eis, UL, uls0, uls1, UR, urs0, urs1, w, wlen);)) return 0; } }; diff --git a/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Impl.hpp index a05ee11965..97f68d63de 100644 --- a/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Impl.hpp @@ -28,37 +28,28 @@ namespace KokkosBatched { /// ========= template -template -KOKKOS_INLINE_FUNCTION int TeamVectorEigendecomposition::invoke( - const MemberType &member, const AViewType &A, const EViewType &er, - const EViewType &ei, const UViewType &UL, const UViewType &UR, - const WViewType &W) { +template +KOKKOS_INLINE_FUNCTION int TeamVectorEigendecomposition::invoke(const MemberType &member, + const AViewType &A, const EViewType &er, + const EViewType &ei, const UViewType &UL, + const UViewType &UR, const WViewType &W) { /// view checking const int m = A.extent(0); assert(m == A.extent(1) && "Eigendecomposition: A is not 
square"); - assert(m == er.extent(0) && - "Eigendecomposition: Length of er does not match to A's dimension"); - assert(m == ei.extent(0) && - "Eigendecomposition: Length of ei does not match to A's dimension"); - assert(m == UL.extent(0) && - "Eigendecomposition: Length of UL does not match to A's dimension"); - assert(m == UL.extent(1) && - "Eigendecomposition: Width of UL does not match to A's dimension"); - assert(m == UR.extent(0) && - "Eigendecomposition: Length of UR does not match to A's dimension"); - assert(m == UR.extent(1) && - "Eigendecomposition: Width of UR does not match to A's dimension"); + assert(m == er.extent(0) && "Eigendecomposition: Length of er does not match to A's dimension"); + assert(m == ei.extent(0) && "Eigendecomposition: Length of ei does not match to A's dimension"); + assert(m == UL.extent(0) && "Eigendecomposition: Length of UL does not match to A's dimension"); + assert(m == UL.extent(1) && "Eigendecomposition: Width of UL does not match to A's dimension"); + assert(m == UR.extent(0) && "Eigendecomposition: Length of UR does not match to A's dimension"); + assert(m == UR.extent(1) && "Eigendecomposition: Width of UR does not match to A's dimension"); // assert(W.extent(0) >= (2*m*m+5*m) && "Eigendecomposition: workspace size is // too small"); - assert(W.stride(0) == 1 && - "Eigendecomposition: Provided workspace is not contiguous"); + assert(W.stride(0) == 1 && "Eigendecomposition: Provided workspace is not contiguous"); - return m ? TeamVectorEigendecompositionInternal ::invoke( - member, A.extent(0), A.data(), A.stride(0), A.stride(1), - er.data(), er.stride(0), ei.data(), ei.stride(0), UL.data(), - UL.stride(0), UL.stride(1), UR.data(), UR.stride(0), - UR.stride(1), W.data(), W.extent(0)) + return m ? 
TeamVectorEigendecompositionInternal ::invoke(member, A.extent(0), A.data(), A.stride(0), A.stride(1), + er.data(), er.stride(0), ei.data(), ei.stride(0), UL.data(), + UL.stride(0), UL.stride(1), UR.data(), UR.stride(0), + UR.stride(1), W.data(), W.extent(0)) : 0; } diff --git a/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Internal.hpp index 50324338ee..567bbd3ad5 100644 --- a/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Internal.hpp @@ -40,11 +40,11 @@ namespace KokkosBatched { struct TeamVectorEigendecompositionInternal { template - KOKKOS_INLINE_FUNCTION static int device_invoke( - const MemberType &member, const int m, RealType *A, const int as0, - const int as1, RealType *er, const int ers, RealType *ei, const int eis, - RealType *UL, const int uls0, const int uls1, RealType *UR, - const int urs0, const int urs1, RealType *w, const int wlen) { + KOKKOS_INLINE_FUNCTION static int device_invoke(const MemberType &member, const int m, RealType *A, const int as0, + const int as1, RealType *er, const int ers, RealType *ei, + const int eis, RealType *UL, const int uls0, const int uls1, + RealType *UR, const int urs0, const int urs1, RealType *w, + const int wlen) { /// not yet implemented return 0; } @@ -74,13 +74,11 @@ struct TeamVectorEigendecompositionInternal { /// [out]w, [in]wlen /// Workspace template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int m, RealType *A, const int as0, - const int as1, RealType *er, const int ers, RealType *ei, const int eis, - RealType *UL, const int uls0, const int uls1, RealType *UR, - const int urs0, const int urs1, RealType *w, const int wlen) { - static_assert(false, - "TeamVector eigendecomposition is not implemented yet."); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const 
int m, RealType *A, const int as0, + const int as1, RealType *er, const int ers, RealType *ei, const int eis, + RealType *UL, const int uls0, const int uls1, RealType *UR, const int urs0, + const int urs1, RealType *w, const int wlen) { + static_assert(false, "TeamVector eigendecomposition is not implemented yet."); /* // DO NOT USE // diff --git a/batched/dense/impl/KokkosBatched_Eigenvalue_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Eigenvalue_Serial_Internal.hpp index ae4cf10634..0ac8ed3859 100644 --- a/batched/dense/impl/KokkosBatched_Eigenvalue_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Eigenvalue_Serial_Internal.hpp @@ -61,11 +61,9 @@ struct SerialEigenvalueInternal { /// returns -1. template KOKKOS_INLINE_FUNCTION static int invoke(const int m, - /* */ RealType *H, const int hs0, - const int hs1, + /* */ RealType *H, const int hs0, const int hs1, /* */ RealType *er, const int ers, - /* */ RealType *ei, const int eis, - const bool restart = false, + /* */ RealType *ei, const int eis, const bool restart = false, const int user_max_iteration = -1) { typedef RealType real_type; typedef Kokkos::ArithTraits ats; @@ -94,8 +92,7 @@ struct SerialEigenvalueInternal { /// compute eigenvalues from the characteristic determinant equation bool is_complex; Kokkos::complex lambda1, lambda2; - SerialWilkinsonShiftInternal::invoke(H[0], H[hs1], H[hs0], H[hs], - &lambda1, &lambda2, &is_complex); + SerialWilkinsonShiftInternal::invoke(H[0], H[hs1], H[hs0], H[hs], &lambda1, &lambda2, &is_complex); er[0] = lambda1.real(); ei[0] = lambda1.imag(); er[1] = lambda2.real(); @@ -150,9 +147,8 @@ struct SerialEigenvalueInternal { bool is_complex; real_type *sub2x2 = H + (mend - 2) * hs; if (2 == mdiff) { - SerialWilkinsonShiftInternal::invoke( - sub2x2[0], sub2x2[hs1], sub2x2[hs0], sub2x2[hs], &lambda1, - &lambda2, &is_complex); + SerialWilkinsonShiftInternal::invoke(sub2x2[0], sub2x2[hs1], sub2x2[hs0], sub2x2[hs], &lambda1, + &lambda2, &is_complex); 
sub2x2[hs0] = zero; /// eigenvalues are from wilkinson shift @@ -161,13 +157,10 @@ struct SerialEigenvalueInternal { er[(mbeg + 1) * ers] = lambda2.real(); ei[(mbeg + 1) * eis] = lambda2.imag(); } else { - SerialWilkinsonShiftInternal::invoke( - sub2x2[0], sub2x2[hs1], sub2x2[hs0], sub2x2[hs], &lambda1, - &lambda2, &is_complex); + SerialWilkinsonShiftInternal::invoke(sub2x2[0], sub2x2[hs1], sub2x2[hs0], sub2x2[hs], &lambda1, + &lambda2, &is_complex); - SerialFrancisInternal::invoke(0, mdiff, mdiff, H + hs * mbeg, - hs0, hs1, lambda1, lambda2, - is_complex); + SerialFrancisInternal::invoke(0, mdiff, mdiff, H + hs * mbeg, hs0, hs1, lambda1, lambda2, is_complex); /* */ auto &val1 = *(sub2x2 + hs0); /* */ auto &val2 = *(sub2x2 - hs1); const auto abs_val1 = ats::abs(val1); @@ -217,18 +210,15 @@ struct SerialEigenvalueInternal { /// complex interface template - KOKKOS_INLINE_FUNCTION static int invoke( - const int m, - /* */ RealType *H, const int hs0, const int hs1, - /* */ Kokkos::complex *e, const int es, - const int max_iteration = 300, - const RealType user_tolerence = RealType(-1), - const bool restart = false) { + KOKKOS_INLINE_FUNCTION static int invoke(const int m, + /* */ RealType *H, const int hs0, const int hs1, + /* */ Kokkos::complex *e, const int es, + const int max_iteration = 300, const RealType user_tolerence = RealType(-1), + const bool restart = false) { RealType *er = (RealType *)e; RealType *ei = er + 1; const int two_es = 2 * es; - return invoke(m, H, hs0, hs1, er, two_es, ei, two_es, user_tolerence, - restart, max_iteration); + return invoke(m, H, hs0, hs1, er, two_es, ei, two_es, user_tolerence, restart, max_iteration); } }; diff --git a/batched/dense/impl/KokkosBatched_FindAmax_Internal.hpp b/batched/dense/impl/KokkosBatched_FindAmax_Internal.hpp index ffe911d688..42dc948014 100644 --- a/batched/dense/impl/KokkosBatched_FindAmax_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_FindAmax_Internal.hpp @@ -27,9 +27,7 @@ namespace KokkosBatched { 
/// ===================== struct SerialFindAmaxInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const int m, - const ValueType *KOKKOS_RESTRICT A, - const int as0, + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ValueType *KOKKOS_RESTRICT A, const int as0, /**/ IntType *KOKKOS_RESTRICT idx) { ValueType max_val(A[0]); IntType val_loc(0); @@ -50,14 +48,11 @@ struct SerialFindAmaxInternal { /// ======================== struct TeamVectorFindAmaxInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, - const ValueType *KOKKOS_RESTRICT A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const ValueType *KOKKOS_RESTRICT A, const int as0, /**/ IntType *KOKKOS_RESTRICT idx) { if (m > 0) { - using reducer_value_type = - typename Kokkos::MaxLoc::value_type; + using reducer_value_type = typename Kokkos::MaxLoc::value_type; reducer_value_type value{}; Kokkos::MaxLoc reducer_value(value); Kokkos::parallel_reduce( diff --git a/batched/dense/impl/KokkosBatched_Francis_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Francis_Serial_Internal.hpp index 21587f4481..e303cafd1f 100644 --- a/batched/dense/impl/KokkosBatched_Francis_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Francis_Serial_Internal.hpp @@ -32,12 +32,11 @@ namespace KokkosBatched { /// struct SerialFrancisInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const int mbeg, const int mend, const int morg, - /* */ ValueType *HH, const int hs0, const int hs1, - const Kokkos::complex lambda1, - const Kokkos::complex lambda2, const bool is_complex, - /* */ Kokkos::pair *GG, const bool request_schur) { + KOKKOS_INLINE_FUNCTION static int invoke(const int mbeg, const int mend, const int morg, + /* */ ValueType *HH, const int hs0, const int hs1, + const Kokkos::complex lambda1, + const Kokkos::complex lambda2, const bool is_complex, + /* */ Kokkos::pair *GG, const bool request_schur) { 
typedef ValueType value_type; const int hs = hs0 + hs1; @@ -73,25 +72,21 @@ struct SerialFrancisInternal { // this needs m>=3 // v = M e_1 = (H*H - 2 Re(lambda) H + |lambda|^2 I)e_1 value_type s, t; - const value_type h00 = H[0 * hs0 + 0 * hs1], h01 = H[0 * hs0 + 1 * hs1], - h10 = H[1 * hs0 + 0 * hs1], h11 = H[1 * hs0 + 1 * hs1], + const value_type h00 = H[0 * hs0 + 0 * hs1], h01 = H[0 * hs0 + 1 * hs1], h10 = H[1 * hs0 + 0 * hs1], + h11 = H[1 * hs0 + 1 * hs1], /* */ h21 = H[2 * hs0 + 1 * hs1]; if (is_complex) { s = 2 * lambda1.real(); t = lambda1.real() * lambda1.real() + lambda1.imag() * lambda1.imag(); } else { - const value_type val = H[(m - 1) * hs]; - const auto dist_lambda1 = - Kokkos::ArithTraits::abs(lambda1.real() - val); - const auto dist_lambda2 = - Kokkos::ArithTraits::abs(lambda2.real() - val); - const value_type lambda = - dist_lambda1 < dist_lambda2 ? lambda1.real() : lambda2.real(); - s = 2 * lambda; - t = lambda * lambda; + const value_type val = H[(m - 1) * hs]; + const auto dist_lambda1 = Kokkos::ArithTraits::abs(lambda1.real() - val); + const auto dist_lambda2 = Kokkos::ArithTraits::abs(lambda2.real() - val); + const value_type lambda = dist_lambda1 < dist_lambda2 ? lambda1.real() : lambda2.real(); + s = 2 * lambda; + t = lambda * lambda; } - v[0] = - h00 * h00 + h01 * h10 /* H^2 e_1 */ - s * h00 /* 2 Re(lambda) */ + t; + v[0] = h00 * h00 + h01 * h10 /* H^2 e_1 */ - s * h00 /* 2 Re(lambda) */ + t; v[1] = h10 * h00 + h11 * h10 /* */ - s * h10; v[2] = h21 * h10; } @@ -112,9 +107,8 @@ struct SerialFrancisInternal { const int mm = m < 4 ? m : 4, nn = m; value_type *Hs = H - mbeg_mult_hs0; - SerialApplyLeftRightGivensInternal ::invoke( - G[0], G[1], mm + mbeg, nn + mrst, H, H + hs0, H + 2 * hs0, Hs, - Hs + hs1, Hs + 2 * hs1, hs0, hs1); + SerialApplyLeftRightGivensInternal ::invoke(G[0], G[1], mm + mbeg, nn + mrst, H, H + hs0, H + 2 * hs0, Hs, + Hs + hs1, Hs + 2 * hs1, hs0, hs1); } /// 1. 
chase the bulge @@ -155,9 +149,8 @@ struct SerialFrancisInternal { value_type *a2 = a1 + hs1; value_type *a3 = a2 + hs1; - SerialApplyLeftRightGivensInternal ::invoke(G[0], G[1], mm + mbeg, - nn + mrst, a1t, a2t, a3t, a1, - a2, a3, hs0, hs1); + SerialApplyLeftRightGivensInternal ::invoke(G[0], G[1], mm + mbeg, nn + mrst, a1t, a2t, a3t, a1, a2, a3, hs0, + hs1); /// ----------------------------------------------------- H_part2x2.mergeToATL(H_part3x3); } @@ -181,8 +174,7 @@ struct SerialFrancisInternal { value_type *a2t = a1t + hs0; value_type *a1 = H_part3x3.A01 - mbeg_mult_hs0; value_type *a2 = a1 + hs1; - SerialApplyLeftRightGivensInternal ::invoke(G[0], mm + mbeg, nn + mrst, - a1t, a2t, a1, a2, hs0, hs1); + SerialApplyLeftRightGivensInternal ::invoke(G[0], mm + mbeg, nn + mrst, a1t, a2t, a1, a2, hs0, hs1); /// ----------------------------------------------------- H_part2x2.mergeToATL(H_part3x3); @@ -192,11 +184,10 @@ struct SerialFrancisInternal { } template - KOKKOS_FORCEINLINE_FUNCTION static int invoke( - const int mbeg, const int mend, const int morg, - /* */ ValueType *HH, const int hs0, const int hs1, - const Kokkos::complex lambda1, - const Kokkos::complex lambda2, const bool is_complex) { + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const int mbeg, const int mend, const int morg, + /* */ ValueType *HH, const int hs0, const int hs1, + const Kokkos::complex lambda1, + const Kokkos::complex lambda2, const bool is_complex) { return invoke(mbeg, mend, morg, HH, hs0, hs1, lambda1, lambda2, is_complex, (Kokkos::pair *)NULL, false); } diff --git a/batched/dense/impl/KokkosBatched_Gemm_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Gemm_Serial_Impl.hpp index 6b3cec25da..82d6b1641b 100644 --- a/batched/dense/impl/KokkosBatched_Gemm_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gemm_Serial_Impl.hpp @@ -36,44 +36,31 @@ namespace KokkosBatched { /// NT/NT /// -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && \ - 
defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemm::invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B, - const ScalarType beta, - const CViewType &C) { +template +KOKKOS_INLINE_FUNCTION int SerialGemm::invoke( + const ScalarType alpha, const AViewType &A, const BViewType &B, const ScalarType beta, const CViewType &C) { typedef typename CViewType::value_type vector_type; // typedef typename vector_type::value_type value_type; const int m = C.extent(0), n = C.extent(1), k = A.extent(1); static_assert(is_vector::value, "value type is not vector type"); - static_assert( - vector_type::vector_length == 4 || vector_type::vector_length == 8, - "AVX, AVX2 and AVX512 is supported"); - const MKL_COMPACT_PACK format = - vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? 
MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; // no error check int r_val = 0; if (A.stride_0() == 1 && B.stride_0() == 1 && C.stride_0() == 1) { - mkl_dgemm_compact(MKL_COL_MAJOR, MKL_NOTRANS, MKL_NOTRANS, m, n, k, alpha, - (const double *)A.data(), A.stride_1(), - (const double *)B.data(), B.stride_1(), beta, - (double *)C.data(), C.stride_1(), format, + mkl_dgemm_compact(MKL_COL_MAJOR, MKL_NOTRANS, MKL_NOTRANS, m, n, k, alpha, (const double *)A.data(), A.stride_1(), + (const double *)B.data(), B.stride_1(), beta, (double *)C.data(), C.stride_1(), format, (MKL_INT)vector_type::vector_length); } else if (A.stride_1() == 1 && B.stride_1() == 1 && C.stride_1() == 1) { - mkl_dgemm_compact(MKL_ROW_MAJOR, MKL_NOTRANS, MKL_NOTRANS, m, n, k, alpha, - (const double *)A.data(), A.stride_0(), - (const double *)B.data(), B.stride_0(), beta, - (double *)C.data(), C.stride_0(), format, + mkl_dgemm_compact(MKL_ROW_MAJOR, MKL_NOTRANS, MKL_NOTRANS, m, n, k, alpha, (const double *)A.data(), A.stride_0(), + (const double *)B.data(), B.stride_0(), beta, (double *)C.data(), C.stride_0(), format, (MKL_INT)vector_type::vector_length); } else { r_val = -1; @@ -83,80 +70,56 @@ SerialGemm -template -KOKKOS_INLINE_FUNCTION int -SerialGemm::invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B, - const ScalarType beta, - const CViewType &C) { +template +KOKKOS_INLINE_FUNCTION int SerialGemm::invoke( + const ScalarType alpha, const AViewType &A, const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return SerialGemmInternal::invoke( - C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), A.stride_0(), - A.stride_1(), B.data(), B.stride_0(), B.stride_1(), beta, C.data(), - C.stride_0(), C.stride_1()); + return SerialGemmInternal::invoke(C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), + A.stride_0(), A.stride_1(), B.data(), B.stride_0(), + B.stride_1(), beta, C.data(), C.stride_0(), C.stride_1()); } 
template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemm::invoke( - const ScalarType alpha, const AViewType &A, const BViewType &B, - const ScalarType beta, const CViewType &C) { +template +KOKKOS_INLINE_FUNCTION int SerialGemm::invoke( + const ScalarType alpha, const AViewType &A, const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return SerialGemmInternal::invoke( - C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), A.stride_0(), - A.stride_1(), B.data(), B.stride_0(), B.stride_1(), beta, C.data(), - C.stride_0(), C.stride_1()); + return SerialGemmInternal::invoke(C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), + A.stride_0(), A.stride_1(), B.data(), B.stride_0(), + B.stride_1(), beta, C.data(), C.stride_0(), C.stride_1()); } /// /// T/NT /// -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && \ - defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemm::invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B, - const ScalarType beta, - const CViewType &C) { +template +KOKKOS_INLINE_FUNCTION int SerialGemm::invoke( + const ScalarType alpha, const AViewType &A, const BViewType &B, const ScalarType beta, const CViewType &C) { typedef typename CViewType::value_type vector_type; // typedef typename vector_type::value_type value_type; const int m = C.extent(0), n = C.extent(1), k = A.extent(0); static_assert(is_vector::value, "value type is not vector type"); - static_assert( - vector_type::vector_length == 4 || vector_type::vector_length == 8, - "AVX, AVX2 and AVX512 is supported"); - const MKL_COMPACT_PACK format = - vector_type::vector_length == 8 ? 
MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; // no error check int r_val = 0; if (A.stride_0() == 1 && B.stride_0() == 1 && C.stride_0() == 1) { - mkl_dgemm_compact(MKL_COL_MAJOR, MKL_TRANS, MKL_NOTRANS, m, n, k, alpha, - (const double *)A.data(), A.stride_1(), - (const double *)B.data(), B.stride_1(), beta, - (double *)C.data(), C.stride_1(), format, + mkl_dgemm_compact(MKL_COL_MAJOR, MKL_TRANS, MKL_NOTRANS, m, n, k, alpha, (const double *)A.data(), A.stride_1(), + (const double *)B.data(), B.stride_1(), beta, (double *)C.data(), C.stride_1(), format, (MKL_INT)vector_type::vector_length); } else if (A.stride_1() == 1 && B.stride_1() == 1 && C.stride_1() == 1) { - mkl_dgemm_compact(MKL_ROW_MAJOR, MKL_TRANS, MKL_NOTRANS, m, n, k, alpha, - (const double *)A.data(), A.stride_0(), - (const double *)B.data(), B.stride_0(), beta, - (double *)C.data(), C.stride_0(), format, + mkl_dgemm_compact(MKL_ROW_MAJOR, MKL_TRANS, MKL_NOTRANS, m, n, k, alpha, (const double *)A.data(), A.stride_0(), + (const double *)B.data(), B.stride_0(), beta, (double *)C.data(), C.stride_0(), format, (MKL_INT)vector_type::vector_length); } else { r_val = -1; @@ -166,77 +129,56 @@ SerialGemm -template -KOKKOS_INLINE_FUNCTION int -SerialGemm::invoke( - const ScalarType alpha, const AViewType &A, const BViewType &B, - const ScalarType beta, const CViewType &C) { +template +KOKKOS_INLINE_FUNCTION int SerialGemm::invoke( + const ScalarType alpha, const AViewType &A, const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return SerialGemmInternal::invoke( - C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), - A.stride_0(), B.data(), B.stride_0(), B.stride_1(), beta, C.data(), - C.stride_0(), 
C.stride_1()); + return SerialGemmInternal::invoke(C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), + A.stride_1(), A.stride_0(), B.data(), B.stride_0(), + B.stride_1(), beta, C.data(), C.stride_0(), C.stride_1()); } template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemm::invoke( - const ScalarType alpha, const AViewType &A, const BViewType &B, - const ScalarType beta, const CViewType &C) { +template +KOKKOS_INLINE_FUNCTION int SerialGemm::invoke( + const ScalarType alpha, const AViewType &A, const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return SerialGemmInternal::invoke( - C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), - A.stride_0(), B.data(), B.stride_0(), B.stride_1(), beta, C.data(), - C.stride_0(), C.stride_1()); + return SerialGemmInternal::invoke(C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), + A.stride_1(), A.stride_0(), B.data(), B.stride_0(), + B.stride_1(), beta, C.data(), C.stride_0(), C.stride_1()); } /// /// NT/T /// -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && \ - defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemm::invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B, - const ScalarType beta, - const CViewType &C) { +template +KOKKOS_INLINE_FUNCTION int SerialGemm::invoke( + const ScalarType alpha, const AViewType &A, const BViewType &B, const ScalarType beta, const CViewType &C) { typedef typename CViewType::value_type vector_type; // typedef typename vector_type::value_type value_type; const int m = C.extent(0), n = C.extent(1), k = A.extent(1); static_assert(is_vector::value, "value type is not vector type"); - static_assert( - vector_type::vector_length == 4 || 
vector_type::vector_length == 8, - "AVX, AVX2 and AVX512 is supported"); - const MKL_COMPACT_PACK format = - vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; // no error check int r_val = 0; if (A.stride_0() == 1 && B.stride_0() == 1 && C.stride_0() == 1) { - mkl_dgemm_compact(MKL_COL_MAJOR, MKL_NOTRANS, MKL_TRANS, m, n, k, alpha, - (const double *)A.data(), A.stride_1(), - (const double *)B.data(), B.stride_1(), beta, - (double *)C.data(), C.stride_1(), format, + mkl_dgemm_compact(MKL_COL_MAJOR, MKL_NOTRANS, MKL_TRANS, m, n, k, alpha, (const double *)A.data(), A.stride_1(), + (const double *)B.data(), B.stride_1(), beta, (double *)C.data(), C.stride_1(), format, (MKL_INT)vector_type::vector_length); } else if (A.stride_1() == 1 && B.stride_1() == 1 && C.stride_1() == 1) { - mkl_dgemm_compact(MKL_ROW_MAJOR, MKL_NOTRANS, MKL_TRANS, m, n, k, alpha, - (const double *)A.data(), A.stride_0(), - (const double *)B.data(), B.stride_0(), beta, - (double *)C.data(), C.stride_0(), format, + mkl_dgemm_compact(MKL_ROW_MAJOR, MKL_NOTRANS, MKL_TRANS, m, n, k, alpha, (const double *)A.data(), A.stride_0(), + (const double *)B.data(), B.stride_0(), beta, (double *)C.data(), C.stride_0(), format, (MKL_INT)vector_type::vector_length); } else { r_val = -1; @@ -246,74 +188,56 @@ SerialGemm -template -KOKKOS_INLINE_FUNCTION int -SerialGemm::invoke( - const ScalarType alpha, const AViewType &A, const BViewType &B, - const ScalarType beta, const CViewType &C) { +template +KOKKOS_INLINE_FUNCTION int SerialGemm::invoke( + const ScalarType alpha, const AViewType &A, const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return SerialGemmInternal::invoke( - C.extent(0), 
C.extent(1), A.extent(1), alpha, A.data(), A.stride_0(), - A.stride_1(), B.data(), B.stride_1(), B.stride_0(), beta, C.data(), - C.stride_0(), C.stride_1()); + return SerialGemmInternal::invoke(C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), + A.stride_0(), A.stride_1(), B.data(), B.stride_1(), + B.stride_0(), beta, C.data(), C.stride_0(), C.stride_1()); } template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemm::invoke( - const ScalarType alpha, const AViewType &A, const BViewType &B, - const ScalarType beta, const CViewType &C) { +template +KOKKOS_INLINE_FUNCTION int SerialGemm::invoke( + const ScalarType alpha, const AViewType &A, const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return SerialGemmInternal::invoke( - C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), A.stride_0(), - A.stride_1(), B.data(), B.stride_1(), B.stride_0(), beta, C.data(), - C.stride_0(), C.stride_1()); + return SerialGemmInternal::invoke(C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), + A.stride_0(), A.stride_1(), B.data(), B.stride_1(), + B.stride_0(), beta, C.data(), C.stride_0(), C.stride_1()); } /// /// T/T /// -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && \ - defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemm::invoke( - const ScalarType alpha, const AViewType &A, const BViewType &B, - const ScalarType beta, const CViewType &C) { +template +KOKKOS_INLINE_FUNCTION int SerialGemm::invoke( + const ScalarType alpha, const AViewType &A, const BViewType &B, const ScalarType beta, const CViewType &C) { typedef typename CViewType::value_type vector_type; // typedef typename vector_type::value_type value_type; const int m = C.extent(0), n = 
C.extent(1), k = A.extent(0); static_assert(is_vector::value, "value type is not vector type"); - static_assert( - vector_type::vector_length == 4 || vector_type::vector_length == 8, - "AVX, AVX2 and AVX512 is supported"); - const MKL_COMPACT_PACK format = - vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; // no error check int r_val = 0; if (A.stride_0() == 1 && B.stride_0() == 1 && C.stride_0() == 1) { - mkl_dgemm_compact(MKL_COL_MAJOR, MKL_TRANS, MKL_TRANS, m, n, k, alpha, - (const double *)A.data(), A.stride_1(), - (const double *)B.data(), B.stride_1(), beta, - (double *)C.data(), C.stride_1(), format, + mkl_dgemm_compact(MKL_COL_MAJOR, MKL_TRANS, MKL_TRANS, m, n, k, alpha, (const double *)A.data(), A.stride_1(), + (const double *)B.data(), B.stride_1(), beta, (double *)C.data(), C.stride_1(), format, (MKL_INT)vector_type::vector_length); } else if (A.stride_1() == 1 && B.stride_1() == 1 && C.stride_1() == 1) { - mkl_dgemm_compact(MKL_ROW_MAJOR, MKL_TRANS, MKL_TRANS, m, n, k, alpha, - (const double *)A.data(), A.stride_0(), - (const double *)B.data(), B.stride_0(), beta, - (double *)C.data(), C.stride_0(), format, + mkl_dgemm_compact(MKL_ROW_MAJOR, MKL_TRANS, MKL_TRANS, m, n, k, alpha, (const double *)A.data(), A.stride_0(), + (const double *)B.data(), B.stride_0(), beta, (double *)C.data(), C.stride_0(), format, (MKL_INT)vector_type::vector_length); } else { r_val = -1; @@ -323,33 +247,25 @@ SerialGemm::invoke( #endif template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemm::invoke( - const ScalarType alpha, const AViewType &A, const BViewType &B, - const ScalarType beta, const CViewType &C) { +template +KOKKOS_INLINE_FUNCTION int SerialGemm::invoke( + const ScalarType alpha, const AViewType &A, const 
BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return SerialGemmInternal::invoke( - C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), - A.stride_0(), B.data(), B.stride_1(), B.stride_0(), beta, C.data(), - C.stride_0(), C.stride_1()); + return SerialGemmInternal::invoke(C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), + A.stride_1(), A.stride_0(), B.data(), B.stride_1(), + B.stride_0(), beta, C.data(), C.stride_0(), C.stride_1()); } template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemm::invoke( - const ScalarType alpha, const AViewType &A, const BViewType &B, - const ScalarType beta, const CViewType &C) { +template +KOKKOS_INLINE_FUNCTION int SerialGemm::invoke( + const ScalarType alpha, const AViewType &A, const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return SerialGemmInternal::invoke( - C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), - A.stride_0(), B.data(), B.stride_1(), B.stride_0(), beta, C.data(), - C.stride_0(), C.stride_1()); + return SerialGemmInternal::invoke(C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), + A.stride_1(), A.stride_0(), B.data(), B.stride_1(), + B.stride_0(), beta, C.data(), C.stride_0(), C.stride_1()); } } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp index 43197f1da3..eaa5b67ffa 100644 --- a/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp @@ -34,21 +34,18 @@ namespace KokkosBatched { template struct SerialGemmInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const int m, const int n, const int k, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - const ValueType *KOKKOS_RESTRICT B, 
const int bs0, const int bs1, - const ScalarType beta, - /**/ ValueType *KOKKOS_RESTRICT C, const int cs0, const int cs1); + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, const int k, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1, + const ScalarType beta, + /**/ ValueType *KOKKOS_RESTRICT C, const int cs0, const int cs1); }; template <> template KOKKOS_INLINE_FUNCTION int SerialGemmInternal::invoke( - const int m, const int n, const int k, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1, - const ScalarType beta, + const int m, const int n, const int k, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1, const ScalarType beta, /**/ ValueType *KOKKOS_RESTRICT C, const int cs0, const int cs1) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) @@ -65,8 +62,7 @@ KOKKOS_INLINE_FUNCTION int SerialGemmInternal::invoke( ValueType *KOKKOS_RESTRICT pC = C; for (int p = 0; p < k; ++p) { - const ValueType *KOKKOS_RESTRICT pA = A + p * as1, - *KOKKOS_RESTRICT pB = B + p * bs0; + const ValueType *KOKKOS_RESTRICT pA = A + p * as1, *KOKKOS_RESTRICT pB = B + p * bs0; for (int i = 0; i < m; ++i) { const ValueType tA(alpha * pA[i * as0]); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) @@ -82,10 +78,8 @@ KOKKOS_INLINE_FUNCTION int SerialGemmInternal::invoke( template <> template KOKKOS_INLINE_FUNCTION int SerialGemmInternal::invoke( - const int m, const int n, const int k, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1, - const ScalarType beta, + const int m, const int n, const int k, const ScalarType alpha, const ValueType 
*KOKKOS_RESTRICT A, const int as0, + const int as1, const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1, const ScalarType beta, /**/ ValueType *KOKKOS_RESTRICT C, const int cs0, const int cs1) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) @@ -105,17 +99,14 @@ KOKKOS_INLINE_FUNCTION int SerialGemmInternal::invoke( const ValueType alpha_value(alpha); InnerGemmFixC inner(as0, as1, bs0, bs1, cs0, cs1); - auto gemm = [&](const int ib, const int jb, const int pb, - const ValueType *KOKKOS_RESTRICT AA, + auto gemm = [&](const int ib, const int jb, const int pb, const ValueType *KOKKOS_RESTRICT AA, const ValueType *KOKKOS_RESTRICT BB, /**/ ValueType *KOKKOS_RESTRICT CC) { const int mb = mbAlgo, nb = nbAlgo; for (int i = 0; i < ib; i += mb) for (int j = 0; j < jb; j += nb) - inner.serial_invoke(alpha_value, AA + i * as0, BB + j * bs1, - (i + mb) > ib ? (ib - i) : mb, - (j + nb) > jb ? (jb - j) : nb, pb, - CC + i * cs0 + j * cs1); + inner.serial_invoke(alpha_value, AA + i * as0, BB + j * bs1, (i + mb) > ib ? (ib - i) : mb, + (j + nb) > jb ? 
(jb - j) : nb, pb, CC + i * cs0 + j * cs1); }; const bool is_small = true; //(m*n*k <= 64*64*64); diff --git a/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Impl.hpp index aedfb9f662..64e65d62d8 100644 --- a/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Impl.hpp @@ -40,19 +40,15 @@ namespace KokkosBatched { /// template -struct TeamVectorGemm { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, const CViewType &C) { +struct TeamVectorGemm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) return TeamVectorGemmInternal::invoke( - member, C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), - A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1(), beta, - C.data(), C.stride_0(), C.stride_1()); + member, C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_1(), B.data(), + B.stride_0(), B.stride_1(), beta, C.data(), C.stride_0(), C.stride_1()); } }; @@ -61,19 +57,15 @@ struct TeamVectorGemm -struct TeamVectorGemm { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, const CViewType &C) { +struct TeamVectorGemm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) return TeamVectorGemmInternal::invoke( - member, C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), - 
A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1(), beta, - C.data(), C.stride_0(), C.stride_1()); + member, C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), A.stride_0(), B.data(), + B.stride_0(), B.stride_1(), beta, C.data(), C.stride_0(), C.stride_1()); } }; @@ -82,19 +74,15 @@ struct TeamVectorGemm -struct TeamVectorGemm { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, const CViewType &C) { +struct TeamVectorGemm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) return TeamVectorGemmInternal::invoke( - member, C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), - A.stride_0(), A.stride_1(), B.data(), B.stride_1(), B.stride_0(), beta, - C.data(), C.stride_0(), C.stride_1()); + member, C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_1(), B.data(), + B.stride_1(), B.stride_0(), beta, C.data(), C.stride_0(), C.stride_1()); } }; @@ -103,19 +91,15 @@ struct TeamVectorGemm -struct TeamVectorGemm { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, const CViewType &C) { +struct TeamVectorGemm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) return TeamVectorGemmInternal::invoke( - member, C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), - A.stride_1(), A.stride_0(), B.data(), B.stride_1(), B.stride_0(), beta, - C.data(), C.stride_0(), C.stride_1()); + 
member, C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), A.stride_0(), B.data(), + B.stride_1(), B.stride_0(), beta, C.data(), C.stride_0(), C.stride_1()); } }; diff --git a/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp index 7e40ec4415..8ad7d570df 100644 --- a/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp @@ -31,21 +31,18 @@ namespace KokkosBatched { template struct TeamVectorGemmInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const int n, const int k, - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, const ValueType *KOKKOS_RESTRICT B, const int bs0, - const int bs1, const ScalarType beta, - /**/ ValueType *KOKKOS_RESTRICT C, const int cs0, const int cs1); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, const int k, + const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, const ValueType *KOKKOS_RESTRICT B, const int bs0, + const int bs1, const ScalarType beta, + /**/ ValueType *KOKKOS_RESTRICT C, const int cs0, const int cs1); }; template <> template -KOKKOS_INLINE_FUNCTION int -TeamVectorGemmInternal::invoke( - const MemberType &member, const int m, const int n, const int k, - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, const ValueType *KOKKOS_RESTRICT B, const int bs0, +KOKKOS_INLINE_FUNCTION int TeamVectorGemmInternal::invoke( + const MemberType &member, const int m, const int n, const int k, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1, const ScalarType beta, /**/ ValueType *KOKKOS_RESTRICT C, const int cs0, const int cs1) { // C = beta C 
+ alpha A B @@ -54,11 +51,9 @@ TeamVectorGemmInternal::invoke( const ScalarType one(1.0), zero(0.0); if (beta == zero) - KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, n, zero, C, cs0, - cs1); + KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, n, zero, C, cs0, cs1); else if (beta != one) - KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, beta, C, - cs0, cs1); + KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, beta, C, cs0, cs1); if (alpha != ScalarType(0.0)) { if (m <= 0 || n <= 0 || k <= 0) return 0; @@ -67,15 +62,13 @@ TeamVectorGemmInternal::invoke( Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) { const ValueType *KOKKOS_RESTRICT pA = A + i * as0; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), - [&](const int &j) { - const ValueType *KOKKOS_RESTRICT pB = B + j * bs1; - - ValueType c = ValueType(0); - for (int p = 0; p < k; ++p) - c += pA[p * as1] * pB[p * bs0]; - C[i * cs0 + j * cs1] += alpha * c; - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), [&](const int &j) { + const ValueType *KOKKOS_RESTRICT pB = B + j * bs1; + + ValueType c = ValueType(0); + for (int p = 0; p < k; ++p) c += pA[p * as1] * pB[p * bs0]; + C[i * cs0 + j * cs1] += alpha * c; + }); }); } return 0; @@ -83,11 +76,9 @@ TeamVectorGemmInternal::invoke( template <> template -KOKKOS_INLINE_FUNCTION int -TeamVectorGemmInternal::invoke( - const MemberType &member, const int m, const int n, const int k, - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, const ValueType *KOKKOS_RESTRICT B, const int bs0, +KOKKOS_INLINE_FUNCTION int TeamVectorGemmInternal::invoke( + const MemberType &member, const int m, const int n, const int k, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1, const ScalarType beta, /**/ ValueType *KOKKOS_RESTRICT C, const int 
cs0, const int cs1) { // C = beta C + alpha A B @@ -96,11 +87,9 @@ TeamVectorGemmInternal::invoke( const ScalarType one(1.0), zero(0.0); if (beta == zero) - KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, n, zero, C, cs0, - cs1); + KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, n, zero, C, cs0, cs1); else if (beta != one) - KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, beta, C, - cs0, cs1); + KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, beta, C, cs0, cs1); if (alpha != ScalarType(0.0)) { if (m <= 0 || n <= 0 || k <= 0) return 0; @@ -109,16 +98,13 @@ TeamVectorGemmInternal::invoke( Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) { const ValueType *KOKKOS_RESTRICT pA = A + i * as0; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, n), [&](const int &j) { - const ValueType *KOKKOS_RESTRICT pB = B + j * bs1; - - ValueType c = ValueType(0); - for (int p = 0; p < k; ++p) - c += Kokkos::ArithTraits::conj(pA[p * as1]) * - pB[p * bs0]; - C[i * cs0 + j * cs1] += alpha * c; - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), [&](const int &j) { + const ValueType *KOKKOS_RESTRICT pB = B + j * bs1; + + ValueType c = ValueType(0); + for (int p = 0; p < k; ++p) c += Kokkos::ArithTraits::conj(pA[p * as1]) * pB[p * bs0]; + C[i * cs0 + j * cs1] += alpha * c; + }); }); } return 0; diff --git a/batched/dense/impl/KokkosBatched_Gemm_Team_Impl.hpp b/batched/dense/impl/KokkosBatched_Gemm_Team_Impl.hpp index 647ffbdb26..0a9fb87b9e 100644 --- a/batched/dense/impl/KokkosBatched_Gemm_Team_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gemm_Team_Impl.hpp @@ -40,36 +40,28 @@ namespace KokkosBatched { /// template -struct TeamGemm { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, const CViewType &C) { +struct TeamGemm { + template + KOKKOS_INLINE_FUNCTION static 
int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return TeamGemmInternal::invoke( - member, C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), - A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1(), beta, - C.data(), C.stride_0(), C.stride_1()); + return TeamGemmInternal::invoke(member, C.extent(0), C.extent(1), A.extent(1), alpha, + A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_0(), + B.stride_1(), beta, C.data(), C.stride_0(), C.stride_1()); } }; template -struct TeamGemm { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, const CViewType &C) { +struct TeamGemm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return TeamGemmInternal::invoke( - member, C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), - A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1(), beta, - C.data(), C.stride_0(), C.stride_1()); + return TeamGemmInternal::invoke(member, C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), + A.stride_0(), A.stride_1(), B.data(), B.stride_0(), + B.stride_1(), beta, C.data(), C.stride_0(), C.stride_1()); } }; @@ -78,36 +70,28 @@ struct TeamGemm -struct TeamGemm { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, const CViewType &C) { +struct TeamGemm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType 
beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return TeamGemmInternal::invoke( - member, C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), - A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1(), beta, - C.data(), C.stride_0(), C.stride_1()); + return TeamGemmInternal::invoke(member, C.extent(0), C.extent(1), A.extent(0), alpha, + A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_0(), + B.stride_1(), beta, C.data(), C.stride_0(), C.stride_1()); } }; template -struct TeamGemm { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, const CViewType &C) { +struct TeamGemm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return TeamGemmInternal::invoke( - member, C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), - A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1(), beta, - C.data(), C.stride_0(), C.stride_1()); + return TeamGemmInternal::invoke(member, C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), + A.stride_1(), A.stride_0(), B.data(), B.stride_0(), + B.stride_1(), beta, C.data(), C.stride_0(), C.stride_1()); } }; @@ -116,36 +100,28 @@ struct TeamGemm -struct TeamGemm { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, const CViewType &C) { +struct TeamGemm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return 
TeamGemmInternal::invoke( - member, C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), - A.stride_0(), A.stride_1(), B.data(), B.stride_1(), B.stride_0(), beta, - C.data(), C.stride_0(), C.stride_1()); + return TeamGemmInternal::invoke(member, C.extent(0), C.extent(1), A.extent(1), alpha, + A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_1(), + B.stride_0(), beta, C.data(), C.stride_0(), C.stride_1()); } }; template -struct TeamGemm { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, const CViewType &C) { +struct TeamGemm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return TeamGemmInternal::invoke( - member, C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), - A.stride_0(), A.stride_1(), B.data(), B.stride_1(), B.stride_0(), beta, - C.data(), C.stride_0(), C.stride_1()); + return TeamGemmInternal::invoke(member, C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), + A.stride_0(), A.stride_1(), B.data(), B.stride_1(), + B.stride_0(), beta, C.data(), C.stride_0(), C.stride_1()); } }; @@ -154,36 +130,28 @@ struct TeamGemm -struct TeamGemm { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, const CViewType &C) { +struct TeamGemm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return TeamGemmInternal::invoke( - member, C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), - A.stride_1(), 
A.stride_0(), B.data(), B.stride_1(), B.stride_0(), beta, - C.data(), C.stride_0(), C.stride_1()); + return TeamGemmInternal::invoke(member, C.extent(0), C.extent(1), A.extent(0), alpha, + A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_1(), + B.stride_0(), beta, C.data(), C.stride_0(), C.stride_1()); } }; template -struct TeamGemm { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, const CViewType &C) { +struct TeamGemm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return TeamGemmInternal::invoke( - member, C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), - A.stride_1(), A.stride_0(), B.data(), B.stride_1(), B.stride_0(), beta, - C.data(), C.stride_0(), C.stride_1()); + return TeamGemmInternal::invoke(member, C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), + A.stride_1(), A.stride_0(), B.data(), B.stride_1(), + B.stride_0(), beta, C.data(), C.stride_0(), C.stride_1()); } }; diff --git a/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp b/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp index 988a4e5da2..1b77a25991 100644 --- a/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp @@ -34,20 +34,18 @@ namespace KokkosBatched { template struct TeamGemmInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const int n, const int k, - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, const ValueType *KOKKOS_RESTRICT B, const int bs0, - const int bs1, const ScalarType beta, - /**/ ValueType *KOKKOS_RESTRICT C, const int cs0, const int cs1); + 
KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, const int k, + const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, const ValueType *KOKKOS_RESTRICT B, const int bs0, + const int bs1, const ScalarType beta, + /**/ ValueType *KOKKOS_RESTRICT C, const int cs0, const int cs1); }; template <> template KOKKOS_INLINE_FUNCTION int TeamGemmInternal::invoke( - const MemberType &member, const int m, const int n, const int k, - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, const ValueType *KOKKOS_RESTRICT B, const int bs0, + const MemberType &member, const int m, const int n, const int k, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1, const ScalarType beta, /**/ ValueType *KOKKOS_RESTRICT C, const int cs0, const int cs1) { // C = beta C + alpha A B @@ -58,25 +56,22 @@ KOKKOS_INLINE_FUNCTION int TeamGemmInternal::invoke( if (beta == zero) KokkosBlas::Impl::TeamSetInternal::invoke(member, m, n, zero, C, cs0, cs1); else if (beta != one) - KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, beta, C, cs0, - cs1); + KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, beta, C, cs0, cs1); if (alpha != ScalarType(0.0)) { if (m <= 0 || n <= 0 || k <= 0) return 0; if (beta != one) member.team_barrier(); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, m * n), [&](const int &ij) { - // assume layout right for batched computation - const int i = ij / n, j = ij % n; - const ValueType *KOKKOS_RESTRICT pA = A + i * as0, - *KOKKOS_RESTRICT pB = B + j * bs1; - - ValueType c = ValueType(0); - for (int p = 0; p < k; ++p) c += pA[p * as1] * pB[p * bs0]; - C[i * cs0 + j * cs1] += alpha * c; - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, m * n), [&](const int &ij) { + // assume layout right for batched computation + const 
int i = ij / n, j = ij % n; + const ValueType *KOKKOS_RESTRICT pA = A + i * as0, *KOKKOS_RESTRICT pB = B + j * bs1; + + ValueType c = ValueType(0); + for (int p = 0; p < k; ++p) c += pA[p * as1] * pB[p * bs0]; + C[i * cs0 + j * cs1] += alpha * c; + }); } return 0; } @@ -84,9 +79,8 @@ KOKKOS_INLINE_FUNCTION int TeamGemmInternal::invoke( template <> template KOKKOS_INLINE_FUNCTION int TeamGemmInternal::invoke( - const MemberType &member, const int m, const int n, const int k, - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, const ValueType *KOKKOS_RESTRICT B, const int bs0, + const MemberType &member, const int m, const int n, const int k, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1, const ScalarType beta, /**/ ValueType *KOKKOS_RESTRICT C, const int cs0, const int cs1) { // C = beta C + alpha A B @@ -100,8 +94,7 @@ KOKKOS_INLINE_FUNCTION int TeamGemmInternal::invoke( if (beta == zero) KokkosBlas::Impl::TeamSetInternal::invoke(member, m, n, zero, C, cs0, cs1); else if (beta != one) - KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, beta, C, cs0, - cs1); + KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, beta, C, cs0, cs1); if (alpha != ScalarType(0.0)) { if (m <= 0 || n <= 0 || k <= 0) return 0; @@ -111,31 +104,27 @@ KOKKOS_INLINE_FUNCTION int TeamGemmInternal::invoke( /// /// GPU case: team size is large and blocksize (mb,nb) is small InnerGemmFixC inner(as0, as1, bs0, bs1, cs0, cs1); - auto gemm = [&](const int ib, const int jb, const int pb, - const ValueType *KOKKOS_RESTRICT AA, + auto gemm = [&](const int ib, const int jb, const int pb, const ValueType *KOKKOS_RESTRICT AA, const ValueType *KOKKOS_RESTRICT BB, /**/ ValueType *KOKKOS_RESTRICT CC) { // Made this non-const in order to WORKAROUND issue #349 - int mb = mbAlgo, mp = (ib % mb), mq = (ib / mb) + (mp > 0), nb = nbAlgo, - np = (jb % 
nb), nq = (jb / nb) + (np > 0); + int mb = mbAlgo, mp = (ib % mb), mq = (ib / mb) + (mp > 0), nb = nbAlgo, np = (jb % nb), + nq = (jb / nb) + (np > 0); // square tiling - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, mq * nq), [&](const int &ij) { - int i, j; - // note: the condition is constexpr - if (KokkosKernels::Impl::kk_is_gpu_exec_space< - typename MemberType::execution_space>()) { - i = ij % mq * mb; - j = ij / mq * nb; - } else { - i = ij / nq * mb; - j = ij % nq * nb; - } - inner.serial_invoke( - alpha, AA + i * as0, BB + j * bs1, (i + mb) > ib ? mp : mb, - (j + nb) > jb ? np : nb, pb, CC + i * cs0 + j * cs1); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, mq * nq), [&](const int &ij) { + int i, j; + // note: the condition is constexpr + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + i = ij % mq * mb; + j = ij / mq * nb; + } else { + i = ij / nq * mb; + j = ij % nq * nb; + } + inner.serial_invoke(alpha, AA + i * as0, BB + j * bs1, (i + mb) > ib ? mp : mb, (j + nb) > jb ? 
np : nb, pb, + CC + i * cs0 + j * cs1); + }); }; const bool is_small = true; //(m*n*k <= 64*64*64); diff --git a/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp index a0b948bb13..4f54bf7f31 100644 --- a/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp @@ -41,43 +41,30 @@ namespace KokkosBatched { template struct TeamVectorGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const xViewType &x, const ScalarType beta, const yViewType &y) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const xViewType &x, const ScalarType beta, const yViewType &y) { static_assert(AViewType::rank == 3, "Batched TeamVectorGemv requires rank-3 A matrix (use " "KokkosBlas::TeamVectorGemv for regular rank-2 matrix)"); if (A.extent(0) == 1) { - KokkosBlas::TeamVectorGemv< - MemberType, Trans::NoTranspose, - Algo::Gemv::Unblocked>::invoke(member, alpha, - Kokkos::subview(A, 0, Kokkos::ALL, - Kokkos::ALL), - Kokkos::subview(x, 0, Kokkos::ALL), - beta, - Kokkos::subview(y, 0, Kokkos::ALL)); + KokkosBlas::TeamVectorGemv::invoke( + member, alpha, Kokkos::subview(A, 0, Kokkos::ALL, Kokkos::ALL), Kokkos::subview(x, 0, Kokkos::ALL), beta, + Kokkos::subview(y, 0, Kokkos::ALL)); return 0; } return TeamVectorGemvInternal::template invoke< - MemberType, ScalarType, typename AViewType::array_layout, - typename AViewType::non_const_value_type>( - member, A.extent(0), A.extent(1), A.extent(2), alpha, A.data(), - A.stride_0(), A.stride_1(), A.stride_2(), x.data(), x.stride_0(), - x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1()); + MemberType, ScalarType, typename AViewType::array_layout, typename AViewType::non_const_value_type>( + member, A.extent(0), A.extent(1), A.extent(2), alpha, A.data(), 
A.stride_0(), A.stride_1(), A.stride_2(), + x.data(), x.stride_0(), x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1()); } }; template struct TeamVectorGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, - const ScalarType /*alpha*/, - const AViewType & /*A*/, - const xViewType & /*x*/, - const ScalarType /*beta*/, + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, const ScalarType /*alpha*/, + const AViewType & /*A*/, const xViewType & /*x*/, const ScalarType /*beta*/, const yViewType & /*y*/) { static_assert(AViewType::rank == 3, "Batched TeamVectorGemv requires rank-3 A matrix (use " @@ -94,32 +81,24 @@ struct TeamVectorGemv { template struct TeamVectorGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const xViewType &x, const ScalarType beta, const yViewType &y) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const xViewType &x, const ScalarType beta, const yViewType &y) { static_assert(AViewType::rank == 3, "Batched TeamVectorGemv requires rank-3 A matrix (use " "KokkosBlas::TeamVectorGemv for regular rank-2 matrix)"); return TeamVectorGemvInternal::template invoke< - MemberType, ScalarType, typename AViewType::array_layout, - typename AViewType::non_const_value_type>( - member, A.extent(0), A.extent(2), A.extent(1), alpha, A.data(), - A.stride_0(), A.stride_2(), A.stride_1(), x.data(), x.stride_0(), - x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1()); + MemberType, ScalarType, typename AViewType::array_layout, typename AViewType::non_const_value_type>( + member, A.extent(0), A.extent(2), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_2(), A.stride_1(), + x.data(), x.stride_0(), x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1()); } }; template struct TeamVectorGemv { - template - 
KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, - const ScalarType /*alpha*/, - const AViewType & /*A*/, - const xViewType & /*x*/, - const ScalarType /*beta*/, + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, const ScalarType /*alpha*/, + const AViewType & /*A*/, const xViewType & /*x*/, const ScalarType /*beta*/, const yViewType & /*y*/) { static_assert(AViewType::rank == 3, "Batched TeamVectorGemv requires rank-3 A matrix (use " diff --git a/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp index 0ffc60ec90..8d9676b223 100644 --- a/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp @@ -30,30 +30,24 @@ namespace KokkosBatched { /// ==================== template struct TeamVectorGemvInternal { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType & /*member*/, const int /*N*/, const int /*m*/, - const int /*n*/, const ScalarType /*alpha*/, - const ValueType *KOKKOS_RESTRICT /*A*/, const int /*as0*/, - const int /*as1*/, const int /*as2*/, - const ValueType *KOKKOS_RESTRICT /*x*/, const int /*xs0*/, - const int /*xs1*/, const ScalarType /*beta*/, - /**/ ValueType *KOKKOS_RESTRICT /*y*/, const int /*ys0*/, - const int /*ys1*/) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, const int /*N*/, const int /*m*/, + const int /*n*/, const ScalarType /*alpha*/, + const ValueType *KOKKOS_RESTRICT /*A*/, const int /*as0*/, const int /*as1*/, + const int /*as2*/, const ValueType *KOKKOS_RESTRICT /*x*/, const int /*xs0*/, + const int /*xs1*/, const ScalarType /*beta*/, + /**/ ValueType *KOKKOS_RESTRICT /*y*/, const int /*ys0*/, + const int /*ys1*/) { assert(false && "Error: encounter dummy impl"); return 0; } }; template <> -template -KOKKOS_INLINE_FUNCTION int -TeamVectorGemvInternal::invoke( - const 
MemberType &member, const int N, const int m, const int n, - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, const int as2, const ValueType *KOKKOS_RESTRICT X, +template +KOKKOS_INLINE_FUNCTION int TeamVectorGemvInternal::invoke( + const MemberType &member, const int N, const int m, const int n, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, const int as2, const ValueType *KOKKOS_RESTRICT X, const int xs0, const int xs1, const ScalarType beta, /**/ ValueType *KOKKOS_RESTRICT Y, const int ys0, const int ys1) { const ScalarType one(1.0), zero(0.0); @@ -64,37 +58,32 @@ TeamVectorGemvInternal::invoke( if (beta == zero) // TODO: KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, zero, y, // ys0); - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, N * m), - [&](const int &iTemp) { - int iRow, iMatrix; - getIndices(iTemp, m, N, iRow, iMatrix); - Y[ys0 * iMatrix + ys1 * iRow] = zero; - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, N * m), [&](const int &iTemp) { + int iRow, iMatrix; + getIndices(iTemp, m, N, iRow, iMatrix); + Y[ys0 * iMatrix + ys1 * iRow] = zero; + }); else if (beta != one) // TODO: KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, beta, // y, ys0); - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, N * m), - [&](const int &iTemp) { - int iRow, iMatrix; - getIndices(iTemp, m, N, iRow, iMatrix); - Y[ys0 * iMatrix + ys1 * iRow] *= beta; - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, N * m), [&](const int &iTemp) { + int iRow, iMatrix; + getIndices(iTemp, m, N, iRow, iMatrix); + Y[ys0 * iMatrix + ys1 * iRow] *= beta; + }); if (alpha != zero) { if (m <= 0 || n <= 0) return 0; if (beta != one) member.team_barrier(); - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, N * m), - [&](const int &iTemp) { - int iRow, iMatrix; - ValueType t(0); - getIndices(iTemp, m, N, iRow, iMatrix); - for (int i = 0; i < 
n; ++i) - t += A[as0 * iMatrix + as1 * iRow + as2 * i] * - X[xs0 * iMatrix + xs1 * i]; - Y[ys0 * iMatrix + ys1 * iRow] += alpha * t; - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, N * m), [&](const int &iTemp) { + int iRow, iMatrix; + ValueType t(0); + getIndices(iTemp, m, N, iRow, iMatrix); + for (int i = 0; i < n; ++i) t += A[as0 * iMatrix + as1 * iRow + as2 * i] * X[xs0 * iMatrix + xs1 * i]; + Y[ys0 * iMatrix + ys1 * iRow] += alpha * t; + }); } return 0; } diff --git a/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp b/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp index 48627aaf30..16f12529d4 100644 --- a/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp @@ -42,11 +42,9 @@ namespace KokkosBatched { template struct TeamGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const xViewType &x, const ScalarType beta, const yViewType &y) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const xViewType &x, const ScalarType beta, const yViewType &y) { if constexpr (Kokkos::is_dyn_rank_view::value) { assert(A.rank_dynamic() == 3 && "Batched TeamGemv requires rank-3 A matrix (use " @@ -58,34 +56,23 @@ struct TeamGemv { } if (A.extent(0) == 1) { - KokkosBlas::TeamGemv< - MemberType, Trans::NoTranspose, - Algo::Gemv::Unblocked>::invoke(member, alpha, - Kokkos::subview(A, 0, Kokkos::ALL, - Kokkos::ALL), - Kokkos::subview(x, 0, Kokkos::ALL), - beta, - Kokkos::subview(y, 0, Kokkos::ALL)); + KokkosBlas::TeamGemv::invoke( + member, alpha, Kokkos::subview(A, 0, Kokkos::ALL, Kokkos::ALL), Kokkos::subview(x, 0, Kokkos::ALL), beta, + Kokkos::subview(y, 0, Kokkos::ALL)); return 0; } return TeamGemvInternal::template invoke< - MemberType, ScalarType, typename AViewType::array_layout, - typename AViewType::non_const_value_type>( - member, 
A.extent(0), A.extent(1), A.extent(2), alpha, A.data(), - A.stride_0(), A.stride_1(), A.stride_2(), x.data(), x.stride_0(), - x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1()); + MemberType, ScalarType, typename AViewType::array_layout, typename AViewType::non_const_value_type>( + member, A.extent(0), A.extent(1), A.extent(2), alpha, A.data(), A.stride_0(), A.stride_1(), A.stride_2(), + x.data(), x.stride_0(), x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1()); } }; template struct TeamGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, - const ScalarType /*alpha*/, - const AViewType & /*A*/, - const xViewType & /*x*/, - const ScalarType /*beta*/, + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, const ScalarType /*alpha*/, + const AViewType & /*A*/, const xViewType & /*x*/, const ScalarType /*beta*/, const yViewType & /*y*/) { /* if constexpr (Kokkos::is_dyn_rank_view::value) { assert(A.rank_dynamic() == 3 && @@ -108,11 +95,9 @@ struct TeamGemv { template struct TeamGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const xViewType &x, const ScalarType beta, const yViewType &y) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const xViewType &x, const ScalarType beta, const yViewType &y) { if constexpr (Kokkos::is_dyn_rank_view::value) { assert(A.rank_dynamic() == 3 && "Batched TeamGemv requires rank-3 A matrix (use " @@ -123,31 +108,23 @@ struct TeamGemv { "KokkosBlas::TeamGemv for regular rank-2 matrix)"); } if (A.extent(0) == 1) { - KokkosBlas:: - TeamGemv::invoke( - member, alpha, Kokkos::subview(A, 0, Kokkos::ALL, Kokkos::ALL), - Kokkos::subview(x, 0, Kokkos::ALL), beta, - Kokkos::subview(y, 0, Kokkos::ALL)); + KokkosBlas::TeamGemv::invoke( + member, alpha, Kokkos::subview(A, 0, Kokkos::ALL, Kokkos::ALL), 
Kokkos::subview(x, 0, Kokkos::ALL), beta, + Kokkos::subview(y, 0, Kokkos::ALL)); return 0; } return TeamGemvInternal::template invoke< - MemberType, ScalarType, typename AViewType::array_layout, - typename AViewType::non_const_value_type>( - member, A.extent(0), A.extent(2), A.extent(1), alpha, A.data(), - A.stride_0(), A.stride_2(), A.stride_1(), x.data(), x.stride_0(), - x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1()); + MemberType, ScalarType, typename AViewType::array_layout, typename AViewType::non_const_value_type>( + member, A.extent(0), A.extent(2), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_2(), A.stride_1(), + x.data(), x.stride_0(), x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1()); } }; template struct TeamGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, - const ScalarType /*alpha*/, - const AViewType & /*A*/, - const xViewType & /*x*/, - const ScalarType /*beta*/, + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, const ScalarType /*alpha*/, + const AViewType & /*A*/, const xViewType & /*x*/, const ScalarType /*beta*/, const yViewType & /*y*/) { /* if constexpr (Kokkos::is_dyn_rank_view::value) { assert(A.rank_dynamic() == 3 && diff --git a/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp b/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp index 77629c678f..8f63e24b27 100644 --- a/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp @@ -20,9 +20,9 @@ #include "KokkosBatched_Util.hpp" -//#include "KokkosBlas1_set_impl.hpp" -//#include "KokkosBlas1_team_scal_impl.hpp" -//#include "KokkosBlas2_serial_gemv_inner_multiple_dot.hpp" +// #include "KokkosBlas1_set_impl.hpp" +// #include "KokkosBlas1_team_scal_impl.hpp" +// #include "KokkosBlas2_serial_gemv_inner_multiple_dot.hpp" namespace KokkosBatched { @@ -31,23 +31,19 @@ namespace KokkosBatched { /// ==================== 
template struct TeamGemvInternal { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int N, const int m, const int n, - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, const int as2, const ValueType *KOKKOS_RESTRICT x, - const int xs0, const int xs1, const ScalarType beta, - /**/ ValueType *KOKKOS_RESTRICT y, const int ys0, const int ys1); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int N, const int m, const int n, + const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, const int as2, const ValueType *KOKKOS_RESTRICT x, + const int xs0, const int xs1, const ScalarType beta, + /**/ ValueType *KOKKOS_RESTRICT y, const int ys0, const int ys1); }; template <> -template +template KOKKOS_INLINE_FUNCTION int TeamGemvInternal::invoke( - const MemberType &member, const int N, const int m, const int n, - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, const int as2, const ValueType *KOKKOS_RESTRICT X, + const MemberType &member, const int N, const int m, const int n, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, const int as2, const ValueType *KOKKOS_RESTRICT X, const int xs0, const int xs1, const ScalarType beta, /**/ ValueType *KOKKOS_RESTRICT Y, const int ys0, const int ys1) { const ScalarType one(1.0), zero(0.0); @@ -56,35 +52,30 @@ KOKKOS_INLINE_FUNCTION int TeamGemvInternal::invoke( // y_l (m), A_l(m x n), B_l(n) if (beta == zero) - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, N * m), - [&](const int &iTemp) { - int iRow, iMatrix; - getIndices(iTemp, m, N, iRow, iMatrix); - Y[ys0 * iMatrix + ys1 * iRow] = zero; - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, N * m), [&](const int &iTemp) { + int iRow, iMatrix; + getIndices(iTemp, m, N, iRow, iMatrix); + Y[ys0 * iMatrix + ys1 * iRow] = zero; + 
}); else if (beta != one) - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, N * m), - [&](const int &iTemp) { - int iRow, iMatrix; - getIndices(iTemp, m, N, iRow, iMatrix); - Y[ys0 * iMatrix + ys1 * iRow] *= beta; - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, N * m), [&](const int &iTemp) { + int iRow, iMatrix; + getIndices(iTemp, m, N, iRow, iMatrix); + Y[ys0 * iMatrix + ys1 * iRow] *= beta; + }); if (alpha != zero) { if (m <= 0 || n <= 0) return 0; if (beta != one) member.team_barrier(); - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, N * m), - [&](const int &iTemp) { - int iRow, iMatrix; - ValueType t(0); - getIndices(iTemp, m, N, iRow, iMatrix); - for (int i = 0; i < n; ++i) - t += A[as0 * iMatrix + as1 * iRow + as2 * i] * - X[xs0 * iMatrix + xs1 * i]; - Y[ys0 * iMatrix + ys1 * iRow] += alpha * t; - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, N * m), [&](const int &iTemp) { + int iRow, iMatrix; + ValueType t(0); + getIndices(iTemp, m, N, iRow, iMatrix); + for (int i = 0; i < n; ++i) t += A[as0 * iMatrix + as1 * iRow + as2 * i] * X[xs0 * iMatrix + xs1 * i]; + Y[ys0 * iMatrix + ys1 * iRow] += alpha * t; + }); } return 0; } diff --git a/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp b/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp index 4c9f54d037..ba18cbafd7 100644 --- a/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp @@ -26,40 +26,33 @@ namespace KokkosBatched { struct SerialStaticPivoting { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MatrixType1 A, const MatrixType2 PDAD, const VectorType1 Y, - const VectorType2 PDY, const VectorType2 D2, const VectorType2 tmp_v_1, - const VectorType2 tmp_v_2); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType1 A, const MatrixType2 PDAD, const VectorType1 Y, + const VectorType2 PDY, const VectorType2 D2, const VectorType2 tmp_v_1, + const VectorType2 tmp_v_2); }; template struct 
TeamStaticPivoting { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const MatrixType1 A, const MatrixType2 PDAD, - const VectorType1 Y, const VectorType2 PDY, const VectorType2 D2, - const VectorType2 tmp_v_1, const VectorType2 tmp_v_2); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const MatrixType1 A, const MatrixType2 PDAD, + const VectorType1 Y, const VectorType2 PDY, const VectorType2 D2, + const VectorType2 tmp_v_1, const VectorType2 tmp_v_2); }; template struct TeamVectorStaticPivoting { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const MatrixType1 A, const MatrixType2 PDAD, - const VectorType1 Y, const VectorType2 PDY, const VectorType2 D2, - const VectorType2 tmp_v_1, const VectorType2 tmp_v_2); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const MatrixType1 A, const MatrixType2 PDAD, + const VectorType1 Y, const VectorType2 PDY, const VectorType2 D2, + const VectorType2 tmp_v_1, const VectorType2 tmp_v_2); }; -template -KOKKOS_INLINE_FUNCTION int SerialStaticPivoting::invoke( - const MatrixType1 A, const MatrixType2 PDAD, const VectorType1 Y, - const VectorType2 PDY, const VectorType2 D2, const VectorType2 tmp_v_1, - const VectorType2 tmp_v_2) { +template +KOKKOS_INLINE_FUNCTION int SerialStaticPivoting::invoke(const MatrixType1 A, const MatrixType2 PDAD, + const VectorType1 Y, const VectorType2 PDY, + const VectorType2 D2, const VectorType2 tmp_v_1, + const VectorType2 tmp_v_2) { using value_type = typename MatrixType1::non_const_value_type; const size_t n = A.extent(0); @@ -139,15 +132,14 @@ KOKKOS_INLINE_FUNCTION int SerialStaticPivoting::invoke( } template -template -KOKKOS_INLINE_FUNCTION int TeamStaticPivoting::invoke( - const MemberType &member, const MatrixType1 A, const MatrixType2 PDAD, - const VectorType1 Y, const VectorType2 PDY, const VectorType2 D2, - const VectorType2 tmp_v_1, const VectorType2 
tmp_v_2) { - using value_type = typename MatrixType1::non_const_value_type; - using reducer_value_type = - typename Kokkos::MaxLoc::value_type; +template +KOKKOS_INLINE_FUNCTION int TeamStaticPivoting::invoke(const MemberType &member, const MatrixType1 A, + const MatrixType2 PDAD, const VectorType1 Y, + const VectorType2 PDY, const VectorType2 D2, + const VectorType2 tmp_v_1, + const VectorType2 tmp_v_2) { + using value_type = typename MatrixType1::non_const_value_type; + using reducer_value_type = typename Kokkos::MaxLoc::value_type; // This implementation follows the strategy of SerialStaticPivoting but uses // an extra level of parallelism. @@ -222,15 +214,14 @@ KOKKOS_INLINE_FUNCTION int TeamStaticPivoting::invoke( } template -template -KOKKOS_INLINE_FUNCTION int TeamVectorStaticPivoting::invoke( - const MemberType &member, const MatrixType1 A, const MatrixType2 PDAD, - const VectorType1 Y, const VectorType2 PDY, const VectorType2 D2, - const VectorType2 tmp_v_1, const VectorType2 tmp_v_2) { - using value_type = typename MatrixType1::non_const_value_type; - using reducer_value_type = - typename Kokkos::MaxLoc::value_type; +template +KOKKOS_INLINE_FUNCTION int TeamVectorStaticPivoting::invoke(const MemberType &member, const MatrixType1 A, + const MatrixType2 PDAD, const VectorType1 Y, + const VectorType2 PDY, const VectorType2 D2, + const VectorType2 tmp_v_1, + const VectorType2 tmp_v_2) { + using value_type = typename MatrixType1::non_const_value_type; + using reducer_value_type = typename Kokkos::MaxLoc::value_type; // This implementation follows the strategy of SerialStaticPivoting but uses // two extra levels of parallelism. 
@@ -265,8 +256,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorStaticPivoting::invoke( }); Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &i) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), - [&](const int &j) { A(i, j) *= D2(j); }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), [&](const int &j) { A(i, j) *= D2(j); }); }); Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &i) { @@ -283,8 +273,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorStaticPivoting::invoke( }, reducer_value); D1_i = 1. / value.val; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), - [&](const int &j) { A(i, j) *= D1_i; }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), [&](const int &j) { A(i, j) *= D1_i; }); Y(i) *= D1_i; }); @@ -318,18 +307,15 @@ KOKKOS_INLINE_FUNCTION int TeamVectorStaticPivoting::invoke( tmp_v_1(row_index) = Kokkos::ArithTraits::zero(); tmp_v_2(col_index) = Kokkos::ArithTraits::zero(); - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), [&](const int &j) { - PDAD(col_index, j) = A(row_index, j); - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), + [&](const int &j) { PDAD(col_index, j) = A(row_index, j); }); PDY(col_index) = Y(row_index); } return 0; } template -KOKKOS_INLINE_FUNCTION void SerialHadamard1D(const VectorType1 X, - const VectorType2 D, - const VectorType3 DX) { +KOKKOS_INLINE_FUNCTION void SerialHadamard1D(const VectorType1 X, const VectorType2 D, const VectorType3 DX) { const size_t n = X.extent(0); for (size_t i = 0; i < n; ++i) { @@ -337,28 +323,20 @@ KOKKOS_INLINE_FUNCTION void SerialHadamard1D(const VectorType1 X, } } -template -KOKKOS_INLINE_FUNCTION void TeamHadamard1D(const MemberType &member, - const VectorType1 X, - const VectorType2 D, +template +KOKKOS_INLINE_FUNCTION void TeamHadamard1D(const MemberType &member, const VectorType1 X, const VectorType2 D, const VectorType3 DX) { const size_t n = X.extent(0); - 
Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), - [&](const size_t &i) { DX(i) = D(i) * X(i); }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const size_t &i) { DX(i) = D(i) * X(i); }); } -template -KOKKOS_INLINE_FUNCTION void TeamVectorHadamard1D(const MemberType &member, - const VectorType1 X, - const VectorType2 D, +template +KOKKOS_INLINE_FUNCTION void TeamVectorHadamard1D(const MemberType &member, const VectorType1 X, const VectorType2 D, const VectorType3 DX) { const size_t n = X.extent(0); - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), - [&](const size_t &i) { DX(i) = D(i) * X(i); }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), [&](const size_t &i) { DX(i) = D(i) * X(i); }); } /// @@ -367,23 +345,15 @@ KOKKOS_INLINE_FUNCTION void TeamVectorHadamard1D(const MemberType &member, template <> struct SerialGesv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType A, - const XVectorType X, - const YVectorType Y, + KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType A, const XVectorType X, const YVectorType Y, const MatrixType tmp) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: XVectorType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: YVectorType is not a Kokkos::View."); - static_assert(MatrixType::rank == 2, - "KokkosBatched::gesv: MatrixType must have rank 2."); - static_assert(XVectorType::rank == 1, - "KokkosBatched::gesv: XVectorType must have rank 1."); - static_assert(YVectorType::rank == 1, - "KokkosBatched::gesv: YVectorType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: XVectorType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, 
"KokkosBatched::gesv: YVectorType is not a Kokkos::View."); + static_assert(MatrixType::rank == 2, "KokkosBatched::gesv: MatrixType must have rank 2."); + static_assert(XVectorType::rank == 1, "KokkosBatched::gesv: XVectorType must have rank 1."); + static_assert(YVectorType::rank == 1, "KokkosBatched::gesv: YVectorType must have rank 1."); // Check compatibility of dimensions at run time. @@ -392,18 +362,15 @@ struct SerialGesv { "KokkosBatched::gesv: dimensions of A and tmp do not match: A: " "%d x %d, tmp (note: its second dimension should be the second " "dimension of A + 4): %d x %d\n", - (int)A.extent(0), (int)A.extent(1), (int)tmp.extent(0), - (int)tmp.extent(1)); + (int)A.extent(0), (int)A.extent(1), (int)tmp.extent(0), (int)tmp.extent(1)); return 1; } - if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || - A.extent(0) != Y.extent(0)) { + if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || A.extent(0) != Y.extent(0)) { Kokkos::printf( "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " "%d x %d, X: %d, Y: %d\n", - (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), - (int)Y.extent(0)); + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); return 1; } #endif @@ -416,8 +383,7 @@ struct SerialGesv { auto tmp_v_1 = Kokkos::subview(tmp, Kokkos::ALL, n + 2); auto tmp_v_2 = Kokkos::subview(tmp, Kokkos::ALL, n + 3); - if (SerialStaticPivoting::invoke(A, PDAD, Y, PDY, D2, tmp_v_1, tmp_v_2) == - 1) { + if (SerialStaticPivoting::invoke(A, PDAD, Y, PDY, D2, tmp_v_1, tmp_v_2) == 1) { Kokkos::printf( "KokkosBatched::gesv: the currently implemented static pivoting " "failed.\n"); @@ -427,14 +393,12 @@ struct SerialGesv { int r_val = SerialLU::invoke(PDAD); if (r_val == 0) - r_val = - SerialTrsm::invoke(1.0, PDAD, PDY); + r_val = SerialTrsm::invoke( + 1.0, PDAD, PDY); if (r_val == 0) - r_val = - SerialTrsm::invoke(1.0, PDAD, PDY); + r_val = SerialTrsm::invoke( + 1.0, PDAD, PDY); if (r_val == 0) 
SerialHadamard1D(PDY, D2, X); return r_val; @@ -444,33 +408,23 @@ struct SerialGesv { template <> struct SerialGesv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType A, - const XVectorType X, - const YVectorType Y, + KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType A, const XVectorType X, const YVectorType Y, const MatrixType /*tmp*/) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: XVectorType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: YVectorType is not a Kokkos::View."); - static_assert(MatrixType::rank == 2, - "KokkosBatched::gesv: MatrixType must have rank 2."); - static_assert(XVectorType::rank == 1, - "KokkosBatched::gesv: XVectorType must have rank 1."); - static_assert(YVectorType::rank == 1, - "KokkosBatched::gesv: YVectorType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: XVectorType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: YVectorType is not a Kokkos::View."); + static_assert(MatrixType::rank == 2, "KokkosBatched::gesv: MatrixType must have rank 2."); + static_assert(XVectorType::rank == 1, "KokkosBatched::gesv: XVectorType must have rank 1."); + static_assert(YVectorType::rank == 1, "KokkosBatched::gesv: YVectorType must have rank 1."); // Check compatibility of dimensions at run time. 
- if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || - A.extent(0) != Y.extent(0)) { + if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || A.extent(0) != Y.extent(0)) { Kokkos::printf( "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " "%d x %d, X: %d, Y: %d\n", - (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), - (int)Y.extent(0)); + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); return 1; } #endif @@ -480,14 +434,12 @@ struct SerialGesv { if (r_val == 0) r_val = SerialCopy::invoke(Y, X); if (r_val == 0) - r_val = - SerialTrsm::invoke(1.0, A, X); + r_val = SerialTrsm::invoke( + 1.0, A, X); if (r_val == 0) - r_val = - SerialTrsm::invoke(1.0, A, X); + r_val = SerialTrsm::invoke( + 1.0, A, X); return r_val; } @@ -500,34 +452,25 @@ struct SerialGesv { template struct TeamGesv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const MatrixType A, - const VectorType X, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const MatrixType A, const VectorType X, const VectorType Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: VectorType is not a Kokkos::View."); - static_assert(MatrixType::rank == 2, - "KokkosBatched::gesv: MatrixType must have rank 2."); - static_assert(VectorType::rank == 1, - "KokkosBatched::gesv: VectorType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: VectorType is not a Kokkos::View."); + static_assert(MatrixType::rank == 2, "KokkosBatched::gesv: MatrixType must have rank 2."); + static_assert(VectorType::rank == 1, "KokkosBatched::gesv: VectorType must have rank 1."); // Check compatibility of dimensions at run time. 
- if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || - A.extent(0) != Y.extent(0)) { + if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || A.extent(0) != Y.extent(0)) { Kokkos::printf( "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " "%d x %d, X: %d, Y: %d\n", - (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), - (int)Y.extent(0)); + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); return 1; } #endif - using ScratchPadMatrixViewType = Kokkos::View< - typename MatrixType::non_const_value_type **, - typename MatrixType::execution_space::scratch_memory_space>; + using ScratchPadMatrixViewType = Kokkos::View; const int n = A.extent(0); @@ -538,8 +481,7 @@ struct TeamGesv { auto tmp_v_1 = Kokkos::subview(tmp, Kokkos::ALL, n + 2); auto tmp_v_2 = Kokkos::subview(tmp, Kokkos::ALL, n + 3); - if (TeamStaticPivoting::invoke(member, A, PDAD, Y, PDY, D2, - tmp_v_1, tmp_v_2) == 1) { + if (TeamStaticPivoting::invoke(member, A, PDAD, Y, PDY, D2, tmp_v_1, tmp_v_2) == 1) { Kokkos::printf( "KokkosBatched::gesv: the currently implemented static pivoting " "failed.\n"); @@ -547,22 +489,18 @@ struct TeamGesv { } member.team_barrier(); - int r_val = - TeamLU::invoke(member, PDAD); + int r_val = TeamLU::invoke(member, PDAD); member.team_barrier(); if (r_val == 0) { - r_val = TeamTrsm::invoke(member, 1.0, - PDAD, PDY); + r_val = TeamTrsm::invoke(member, 1.0, PDAD, PDY); member.team_barrier(); } if (r_val == 0) { - r_val = - TeamTrsm::invoke(member, 1.0, - PDAD, PDY); + r_val = TeamTrsm::invoke(member, 1.0, PDAD, PDY); member.team_barrier(); } @@ -578,28 +516,20 @@ struct TeamGesv { template struct TeamGesv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const MatrixType A, - const VectorType X, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const MatrixType A, const VectorType X, const VectorType Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - 
static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: VectorType is not a Kokkos::View."); - static_assert(MatrixType::rank == 2, - "KokkosBatched::gesv: MatrixType must have rank 2."); - static_assert(VectorType::rank == 1, - "KokkosBatched::gesv: VectorType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: VectorType is not a Kokkos::View."); + static_assert(MatrixType::rank == 2, "KokkosBatched::gesv: MatrixType must have rank 2."); + static_assert(VectorType::rank == 1, "KokkosBatched::gesv: VectorType must have rank 1."); // Check compatibility of dimensions at run time. - if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || - A.extent(0) != Y.extent(0)) { + if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || A.extent(0) != Y.extent(0)) { Kokkos::printf( "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " "%d x %d, X: %d, Y: %d\n", - (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), - (int)Y.extent(0)); + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); return 1; } #endif @@ -613,15 +543,14 @@ struct TeamGesv { } if (r_val == 0) { - TeamTrsm::invoke(member, 1.0, A, X); + TeamTrsm::invoke( + member, 1.0, A, X); member.team_barrier(); } if (r_val == 0) { - TeamTrsm::invoke(member, 1.0, A, - X); + TeamTrsm::invoke( + member, 1.0, A, X); member.team_barrier(); } @@ -636,34 +565,25 @@ struct TeamGesv { template struct TeamVectorGesv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const MatrixType A, - const VectorType X, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const MatrixType A, const VectorType X, const VectorType Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - 
"KokkosBatched::gesv: MatrixType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: VectorType is not a Kokkos::View."); - static_assert(MatrixType::rank == 2, - "KokkosBatched::gesv: MatrixType must have rank 2."); - static_assert(VectorType::rank == 1, - "KokkosBatched::gesv: VectorType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: VectorType is not a Kokkos::View."); + static_assert(MatrixType::rank == 2, "KokkosBatched::gesv: MatrixType must have rank 2."); + static_assert(VectorType::rank == 1, "KokkosBatched::gesv: VectorType must have rank 1."); // Check compatibility of dimensions at run time. - if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || - A.extent(0) != Y.extent(0)) { + if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || A.extent(0) != Y.extent(0)) { Kokkos::printf( "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " "%d x %d, X: %d, Y: %d\n", - (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), - (int)Y.extent(0)); + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); return 1; } #endif - using ScratchPadMatrixViewType = Kokkos::View< - typename MatrixType::non_const_value_type **, - typename MatrixType::execution_space::scratch_memory_space>; + using ScratchPadMatrixViewType = Kokkos::View; const int n = A.extent(0); @@ -674,8 +594,7 @@ struct TeamVectorGesv { auto tmp_v_1 = Kokkos::subview(tmp, Kokkos::ALL, n + 2); auto tmp_v_2 = Kokkos::subview(tmp, Kokkos::ALL, n + 3); - if (TeamVectorStaticPivoting::invoke( - member, A, PDAD, Y, PDY, D2, tmp_v_1, tmp_v_2) == 1) { + if (TeamVectorStaticPivoting::invoke(member, A, PDAD, Y, PDY, D2, tmp_v_1, tmp_v_2) == 1) { Kokkos::printf( "KokkosBatched::gesv: the currently implemented static pivoting " "failed.\n"); @@ -684,22 +603,18 @@ struct TeamVectorGesv { 
member.team_barrier(); - int r_val = - TeamLU::invoke(member, PDAD); + int r_val = TeamLU::invoke(member, PDAD); member.team_barrier(); if (r_val == 0) { - TeamVectorTrsm::invoke(member, 1.0, - PDAD, PDY); + TeamVectorTrsm::invoke(member, 1.0, PDAD, PDY); member.team_barrier(); } if (r_val == 0) { - TeamVectorTrsm::invoke(member, - 1.0, PDAD, - PDY); + TeamVectorTrsm::invoke(member, 1.0, PDAD, PDY); member.team_barrier(); } @@ -715,28 +630,20 @@ struct TeamVectorGesv { template struct TeamVectorGesv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const MatrixType A, - const VectorType X, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const MatrixType A, const VectorType X, const VectorType Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: VectorType is not a Kokkos::View."); - static_assert(MatrixType::rank == 2, - "KokkosBatched::gesv: MatrixType must have rank 2."); - static_assert(VectorType::rank == 1, - "KokkosBatched::gesv: VectorType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: VectorType is not a Kokkos::View."); + static_assert(MatrixType::rank == 2, "KokkosBatched::gesv: MatrixType must have rank 2."); + static_assert(VectorType::rank == 1, "KokkosBatched::gesv: VectorType must have rank 1."); // Check compatibility of dimensions at run time. 
- if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || - A.extent(0) != Y.extent(0)) { + if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || A.extent(0) != Y.extent(0)) { Kokkos::printf( "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " "%d x %d, X: %d, Y: %d\n", - (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), - (int)Y.extent(0)); + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); return 1; } #endif @@ -750,16 +657,14 @@ struct TeamVectorGesv { } if (r_val == 0) { - TeamVectorTrsm::invoke(member, 1.0, - A, X); + TeamVectorTrsm::invoke(member, 1.0, A, X); member.team_barrier(); } if (r_val == 0) { - TeamVectorTrsm::invoke(member, - 1.0, A, X); + TeamVectorTrsm::invoke(member, 1.0, A, X); member.team_barrier(); } diff --git a/batched/dense/impl/KokkosBatched_Givens_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Givens_Serial_Internal.hpp index 4d80c6a250..963862661b 100644 --- a/batched/dense/impl/KokkosBatched_Givens_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Givens_Serial_Internal.hpp @@ -30,10 +30,9 @@ namespace KokkosBatched { /// struct SerialGivensInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const ValueType chi1, const ValueType chi2, - /* */ Kokkos::pair* G, - /* */ ValueType* chi1_new) { + KOKKOS_INLINE_FUNCTION static int invoke(const ValueType chi1, const ValueType chi2, + /* */ Kokkos::pair* G, + /* */ ValueType* chi1_new) { typedef ValueType value_type; const value_type zero(0), one(1); /// compute G = [ gamma -sigma; @@ -58,9 +57,7 @@ struct SerialGivensInternal { cs = chi1 / r; sn = chi2 / r; - if (Kokkos::ArithTraits::abs(chi1) > - Kokkos::ArithTraits::abs(chi2) && - cs < zero) { + if (Kokkos::ArithTraits::abs(chi1) > Kokkos::ArithTraits::abs(chi2) && cs < zero) { cs = -cs; sn = -sn; r = -r; diff --git a/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp b/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp index 
90b89e4ad1..658acd6b60 100644 --- a/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp @@ -27,16 +27,12 @@ namespace KokkosBatched { /// ==================== struct SerialHadamardProductInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, - const ValueType* KOKKOS_RESTRICT X, - const int xs0, const int xs1, - const ValueType* KOKKOS_RESTRICT Y, - const int ys0, const int ys1, - /* */ ValueType* KOKKOS_RESTRICT V, - const int vs0, const int vs1) { + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, const ValueType* KOKKOS_RESTRICT X, const int xs0, + const int xs1, const ValueType* KOKKOS_RESTRICT Y, const int ys0, + const int ys1, + /* */ ValueType* KOKKOS_RESTRICT V, const int vs0, const int vs1) { for (int i = 0; i < m; ++i) - for (int j = 0; j < n; ++j) - V[i * vs0 + j * vs1] = X[i * xs0 + j * xs1] * Y[i * ys0 + j * ys1]; + for (int j = 0; j < n; ++j) V[i * vs0 + j * vs1] = X[i * xs0 + j * xs1] * Y[i * ys0 + j * ys1]; return 0; } @@ -47,17 +43,15 @@ struct SerialHadamardProductInternal { /// ==================== struct TeamHadamardProductInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const int m, const int n, - const ValueType* KOKKOS_RESTRICT X, const int xs0, const int xs1, - const ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1, - /* */ ValueType* KOKKOS_RESTRICT V, const int vs0, const int vs1) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, m * n), [&](const int& iTemp) { - int i, j; - getIndices(iTemp, n, m, j, i); - V[i * vs0 + j * vs1] = X[i * xs0 + j * xs1] * Y[i * ys0 + j * ys1]; - }); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const int m, const int n, + const ValueType* KOKKOS_RESTRICT X, const int xs0, const int xs1, + const ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1, + /* */ ValueType* KOKKOS_RESTRICT V, const int 
vs0, const int vs1) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, m * n), [&](const int& iTemp) { + int i, j; + getIndices(iTemp, n, m, j, i); + V[i * vs0 + j * vs1] = X[i * xs0 + j * xs1] * Y[i * ys0 + j * ys1]; + }); // member.team_barrier(); return 0; } @@ -68,17 +62,15 @@ struct TeamHadamardProductInternal { /// ======================== struct TeamVectorHadamardProductInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const int m, const int n, - const ValueType* KOKKOS_RESTRICT X, const int xs0, const int xs1, - const ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1, - /* */ ValueType* KOKKOS_RESTRICT V, const int vs0, const int vs1) { - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, m * n), [&](const int& iTemp) { - int i, j; - getIndices(iTemp, n, m, j, i); - V[i * vs0 + j * vs1] = X[i * xs0 + j * xs1] * Y[i * ys0 + j * ys1]; - }); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const int m, const int n, + const ValueType* KOKKOS_RESTRICT X, const int xs0, const int xs1, + const ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1, + /* */ ValueType* KOKKOS_RESTRICT V, const int vs0, const int vs1) { + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, m * n), [&](const int& iTemp) { + int i, j; + getIndices(iTemp, n, m, j, i); + V[i * vs0 + j * vs1] = X[i * xs0 + j * xs1] * Y[i * ys0 + j * ys1]; + }); // member.team_barrier(); return 0; } @@ -88,25 +80,14 @@ struct TeamVectorHadamardProductInternal { /// Serial Impl /// =========== template -KOKKOS_INLINE_FUNCTION int SerialHadamardProduct::invoke(const XViewType& X, - const YViewType& Y, - const VViewType& V) { +KOKKOS_INLINE_FUNCTION int SerialHadamardProduct::invoke(const XViewType& X, const YViewType& Y, const VViewType& V) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert( - Kokkos::is_view::value, - "KokkosBatched::HadamardProduct: XViewType is not a Kokkos::View."); - static_assert( - 
Kokkos::is_view::value, - "KokkosBatched::HadamardProduct: YViewType is not a Kokkos::View."); - static_assert( - Kokkos::is_view::value, - "KokkosBatched::HadamardProduct: VViewType is not a Kokkos::View."); - static_assert(XViewType::rank == 2, - "KokkosBatched::HadamardProduct: XViewType must have rank 2."); - static_assert(YViewType::rank == 2, - "KokkosBatched::HadamardProduct: YViewType must have rank 2."); - static_assert(VViewType::rank == 2, - "KokkosBatched::HadamardProduct: VViewType must have rank 2."); + static_assert(Kokkos::is_view::value, "KokkosBatched::HadamardProduct: XViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::HadamardProduct: YViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::HadamardProduct: VViewType is not a Kokkos::View."); + static_assert(XViewType::rank == 2, "KokkosBatched::HadamardProduct: XViewType must have rank 2."); + static_assert(YViewType::rank == 2, "KokkosBatched::HadamardProduct: YViewType must have rank 2."); + static_assert(VViewType::rank == 2, "KokkosBatched::HadamardProduct: VViewType must have rank 2."); // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { @@ -127,10 +108,9 @@ KOKKOS_INLINE_FUNCTION int SerialHadamardProduct::invoke(const XViewType& X, } #endif - return SerialHadamardProductInternal::template invoke< - typename XViewType::non_const_value_type>( - X.extent(0), X.extent(1), X.data(), X.stride_0(), X.stride_1(), Y.data(), - Y.stride_0(), Y.stride_1(), V.data(), V.stride_0(), V.stride_1()); + return SerialHadamardProductInternal::template invoke( + X.extent(0), X.extent(1), X.data(), X.stride_0(), X.stride_1(), Y.data(), Y.stride_0(), Y.stride_1(), V.data(), + V.stride_0(), V.stride_1()); } /// @@ -139,25 +119,15 @@ KOKKOS_INLINE_FUNCTION int SerialHadamardProduct::invoke(const XViewType& X, template template -KOKKOS_INLINE_FUNCTION int TeamHadamardProduct::invoke( - const MemberType& member, const XViewType& X, const YViewType& Y, - const VViewType& V) { +KOKKOS_INLINE_FUNCTION int TeamHadamardProduct::invoke(const MemberType& member, const XViewType& X, + const YViewType& Y, const VViewType& V) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert( - Kokkos::is_view::value, - "KokkosBatched::HadamardProduct: XViewType is not a Kokkos::View."); - static_assert( - Kokkos::is_view::value, - "KokkosBatched::HadamardProduct: YViewType is not a Kokkos::View."); - static_assert( - Kokkos::is_view::value, - "KokkosBatched::HadamardProduct: VViewType is not a Kokkos::View."); - static_assert(XViewType::rank == 2, - "KokkosBatched::HadamardProduct: XViewType must have rank 2."); - static_assert(YViewType::rank == 2, - "KokkosBatched::HadamardProduct: YViewType must have rank 2."); - static_assert(VViewType::rank == 2, - "KokkosBatched::HadamardProduct: VViewType must have rank 2."); + static_assert(Kokkos::is_view::value, "KokkosBatched::HadamardProduct: XViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::HadamardProduct: YViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, 
"KokkosBatched::HadamardProduct: VViewType is not a Kokkos::View."); + static_assert(XViewType::rank == 2, "KokkosBatched::HadamardProduct: XViewType must have rank 2."); + static_assert(YViewType::rank == 2, "KokkosBatched::HadamardProduct: YViewType must have rank 2."); + static_assert(VViewType::rank == 2, "KokkosBatched::HadamardProduct: VViewType must have rank 2."); // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { @@ -178,12 +148,10 @@ KOKKOS_INLINE_FUNCTION int TeamHadamardProduct::invoke( } #endif - return TeamHadamardProductInternal::template invoke< - MemberType, typename XViewType::non_const_value_type, - typename XViewType::array_layout>(member, X.extent(0), X.extent(1), - X.data(), X.stride_0(), X.stride_1(), - Y.data(), Y.stride_0(), Y.stride_1(), - V.data(), V.stride_0(), V.stride_1()); + return TeamHadamardProductInternal::template invoke( + member, X.extent(0), X.extent(1), X.data(), X.stride_0(), X.stride_1(), Y.data(), Y.stride_0(), Y.stride_1(), + V.data(), V.stride_0(), V.stride_1()); } /// @@ -192,25 +160,15 @@ KOKKOS_INLINE_FUNCTION int TeamHadamardProduct::invoke( template template -KOKKOS_INLINE_FUNCTION int TeamVectorHadamardProduct::invoke( - const MemberType& member, const XViewType& X, const YViewType& Y, - const VViewType& V) { +KOKKOS_INLINE_FUNCTION int TeamVectorHadamardProduct::invoke(const MemberType& member, const XViewType& X, + const YViewType& Y, const VViewType& V) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert( - Kokkos::is_view::value, - "KokkosBatched::HadamardProduct: XViewType is not a Kokkos::View."); - static_assert( - Kokkos::is_view::value, - "KokkosBatched::HadamardProduct: YViewType is not a Kokkos::View."); - static_assert( - Kokkos::is_view::value, - "KokkosBatched::HadamardProduct: VViewType is not a Kokkos::View."); - static_assert(XViewType::rank == 2, - "KokkosBatched::HadamardProduct: XViewType must have rank 2."); - 
static_assert(YViewType::rank == 2, - "KokkosBatched::HadamardProduct: YViewType must have rank 2."); - static_assert(VViewType::rank == 2, - "KokkosBatched::HadamardProduct: VViewType must have rank 2."); + static_assert(Kokkos::is_view::value, "KokkosBatched::HadamardProduct: XViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::HadamardProduct: YViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::HadamardProduct: VViewType is not a Kokkos::View."); + static_assert(XViewType::rank == 2, "KokkosBatched::HadamardProduct: XViewType must have rank 2."); + static_assert(YViewType::rank == 2, "KokkosBatched::HadamardProduct: YViewType must have rank 2."); + static_assert(VViewType::rank == 2, "KokkosBatched::HadamardProduct: VViewType must have rank 2."); // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { @@ -231,12 +189,10 @@ KOKKOS_INLINE_FUNCTION int TeamVectorHadamardProduct::invoke( } #endif - return TeamVectorHadamardProductInternal::invoke< - MemberType, typename XViewType::non_const_value_type, - typename XViewType::array_layout>(member, X.extent(0), X.extent(1), - X.data(), X.stride_0(), X.stride_1(), - Y.data(), Y.stride_0(), Y.stride_1(), - V.data(), V.stride_0(), V.stride_1()); + return TeamVectorHadamardProductInternal::invoke( + member, X.extent(0), X.extent(1), X.data(), X.stride_0(), X.stride_1(), Y.data(), Y.stride_0(), Y.stride_1(), + V.data(), V.stride_0(), V.stride_1()); } } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp index 023257c8ed..8db5d40a98 100644 --- a/batched/dense/impl/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp @@ -34,13 +34,10 @@ namespace KokkosBatched { struct 
SerialHessenbergFormQInternal { template KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int k, - /* */ ValueType* A, const int as0, - const int as1, + /* */ ValueType* A, const int as0, const int as1, /* */ ValueType* t, const int ts, - /* */ ValueType* Q, const int qs0, - const int qs1, - /* */ ValueType* w, - const bool is_Q_zero = false) { + /* */ ValueType* Q, const int qs0, const int qs1, + /* */ ValueType* w, const bool is_Q_zero = false) { typedef ValueType value_type; /// Given a matrix A that includes Hessenberg factorization @@ -52,14 +49,12 @@ struct SerialHessenbergFormQInternal { /// B is m x m // set identity if (is_Q_zero) - KokkosBlas::Impl::SerialSetInternal::invoke(m, value_type(1), Q, - qs0 + qs1); + KokkosBlas::Impl::SerialSetInternal::invoke(m, value_type(1), Q, qs0 + qs1); else SerialSetIdentityInternal::invoke(m, Q, qs0, qs1); - return SerialApplyQ_LeftNoTransForwardInternal ::invoke( - m - 1, m - 1, k - 1, A + as0, as0, as1, t, ts, Q + qs0 + qs1, qs1, qs0, - w); + return SerialApplyQ_LeftNoTransForwardInternal ::invoke(m - 1, m - 1, k - 1, A + as0, as0, as1, t, ts, + Q + qs0 + qs1, qs1, qs0, w); } }; diff --git a/batched/dense/impl/KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp index 3d2b75e64d..3815a9e18e 100644 --- a/batched/dense/impl/KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp @@ -32,10 +32,9 @@ namespace KokkosBatched { /// struct SerialHessenbergQR_WithShiftInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const int mbeg, const int mend, const int morg, - /* */ ValueType *HH, const int hs0, const int hs1, const ValueType shift, - /* */ Kokkos::pair *GG, const bool request_schur) { + KOKKOS_INLINE_FUNCTION static int invoke(const int mbeg, const int mend, const int morg, + /* */ ValueType *HH, const int hs0, const int hs1, const 
ValueType shift, + /* */ Kokkos::pair *GG, const bool request_schur) { typedef ValueType value_type; // typedef Kokkos::ArithTraits ats; @@ -79,13 +78,11 @@ struct SerialHessenbergQR_WithShiftInternal { // apply G' from left G.second = -G.second; // transpose G const int nn = m; - SerialApplyLeftGivensInternal::invoke(G, nn + (morg - mend), h11, hs1, - h21, hs1); + SerialApplyLeftGivensInternal::invoke(G, nn + (morg - mend), h11, hs1, h21, hs1); // apply (G')' from right const int mm = m < 3 ? m : 3; - SerialApplyRightGivensInternal::invoke(G, mm + mbeg, h11 - mbeg_mult_hs0, - hs0, h12 - mbeg_mult_hs0, hs0); + SerialApplyRightGivensInternal::invoke(G, mm + mbeg, h11 - mbeg_mult_hs0, hs0, h12 - mbeg_mult_hs0, hs0); } /// 1. chase the bulge @@ -112,13 +109,11 @@ struct SerialHessenbergQR_WithShiftInternal { G.second = -G.second; // transpose G const int nn = m - m_htl; - SerialApplyLeftGivensInternal::invoke( - G, nn + (morg - mend), H_part3x3.A11, hs1, H_part3x3.A21, hs1); + SerialApplyLeftGivensInternal::invoke(G, nn + (morg - mend), H_part3x3.A11, hs1, H_part3x3.A21, hs1); const int mtmp = m_htl + 3, mm = mtmp < m ? 
mtmp : m; - SerialApplyRightGivensInternal::invoke( - G, mm + mbeg, H_part3x3.A01 - mbeg_mult_hs0, hs0, - H_part3x3.A02 - mbeg_mult_hs0, hs0); + SerialApplyRightGivensInternal::invoke(G, mm + mbeg, H_part3x3.A01 - mbeg_mult_hs0, hs0, + H_part3x3.A02 - mbeg_mult_hs0, hs0); /// ----------------------------------------------------- H_part2x2.mergeToATL(H_part3x3); } @@ -126,13 +121,10 @@ struct SerialHessenbergQR_WithShiftInternal { } template - KOKKOS_FORCEINLINE_FUNCTION static int invoke(const int mbeg, const int mend, - const int morg, - /* */ ValueType *HH, - const int hs0, const int hs1, + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const int mbeg, const int mend, const int morg, + /* */ ValueType *HH, const int hs0, const int hs1, const ValueType shift) { - return invoke(mbeg, mend, morg, HH, hs0, hs1, shift, - (Kokkos::pair *)NULL, false); + return invoke(mbeg, mend, morg, HH, hs0, hs1, shift, (Kokkos::pair *)NULL, false); } }; diff --git a/batched/dense/impl/KokkosBatched_Hessenberg_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Hessenberg_Serial_Internal.hpp index f12115e4de..44c5b44373 100644 --- a/batched/dense/impl/KokkosBatched_Hessenberg_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Hessenberg_Serial_Internal.hpp @@ -34,8 +34,7 @@ struct SerialHessenbergInternal { template KOKKOS_INLINE_FUNCTION static int invoke(const int m, // m = NumRows(A) const int n, // n = NumCols(A) - /* */ ValueType *A, const int as0, - const int as1, + /* */ ValueType *A, const int as0, const int as1, /* */ ValueType *t, const int ts, /* */ ValueType *w) { typedef ValueType value_type; @@ -76,25 +75,22 @@ struct SerialHessenbergInternal { // perform householder transformation const int m_A22_b = m_A22 - 1; - SerialLeftHouseholderInternal::invoke(m_A22_b, A21_part2x1.AT, - A21_part2x1.AB, as0, tau); + SerialLeftHouseholderInternal::invoke(m_A22_b, A21_part2x1.AT, A21_part2x1.AB, as0, tau); // partition A22 into 2x1 A22_part2x1.partWithAT(A_part3x3.A22, 
m_A22, 1); // left apply householder to partitioned A22 - SerialApplyLeftHouseholderInternal::invoke( - m_A22_b, n_A22, tau, A21_part2x1.AB, as0, A22_part2x1.AT, as1, - A22_part2x1.AB, as0, as1, w); + SerialApplyLeftHouseholderInternal::invoke(m_A22_b, n_A22, tau, A21_part2x1.AB, as0, A22_part2x1.AT, as1, + A22_part2x1.AB, as0, as1, w); // partition A*2 column into 1x2 A2_part1x2.partWithAL(A_part3x3.A02, n_A22, 1); // right apply householder to A*2 colums const int n_A22_r = n_A22 - 1; - SerialApplyRightHouseholderInternal::invoke( - m, n_A22_r, tau, A21_part2x1.AB, as0, A2_part1x2.AL, as0, - A2_part1x2.AR, as0, as1, w); + SerialApplyRightHouseholderInternal::invoke(m, n_A22_r, tau, A21_part2x1.AB, as0, A2_part1x2.AL, as0, + A2_part1x2.AR, as0, as1, w); } /// ----------------------------------------------------- A_part2x2.mergeToATL(A_part3x3); diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Armpl_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Armpl_Impl.hpp index 971fb36081..7e814646a2 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Armpl_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Armpl_Impl.hpp @@ -67,9 +67,8 @@ namespace Impl { /// ScalarType, AViewType, BViewType, CViewType> /// (handle, alpha, A, B, beta, C).invoke(); // clang-format on -template +template class BatchedArmplGemm { private: HandleType *const __handle; @@ -107,26 +106,21 @@ class BatchedArmplGemm { for (int ib = 0; ib < __nbatch; ++ib) { for (int i = 0; i < __ninter; ++i) { auto svA = - subview_wrapper(__A, ib * __ninter + i, Kokkos::ALL(), - Kokkos::ALL(), __batch_layout_tag, __no_trans_tag); + subview_wrapper(__A, ib * __ninter + i, Kokkos::ALL(), Kokkos::ALL(), __batch_layout_tag, __no_trans_tag); auto svB = - subview_wrapper(__B, ib * __ninter + i, Kokkos::ALL(), - Kokkos::ALL(), __batch_layout_tag, __no_trans_tag); + subview_wrapper(__B, ib * __ninter + i, Kokkos::ALL(), Kokkos::ALL(), __batch_layout_tag, __no_trans_tag); auto svC = 
- subview_wrapper(__C, ib * __ninter + i, Kokkos::ALL(), - Kokkos::ALL(), __batch_layout_tag, __no_trans_tag); + subview_wrapper(__C, ib * __ninter + i, Kokkos::ALL(), Kokkos::ALL(), __batch_layout_tag, __no_trans_tag); - auto info = armpl_dge_interleave( - __ninter, i, __Am, __An, svA.data(), svA.stride(0), svA.stride(1), - &__Adp[__Abstrd * ib], __Aistrd, __Ajstrd); + auto info = armpl_dge_interleave(__ninter, i, __Am, __An, svA.data(), svA.stride(0), svA.stride(1), + &__Adp[__Abstrd * ib], __Aistrd, __Ajstrd); if (info != ARMPL_STATUS_SUCCESS) { std::ostringstream os; os << "armpl_dge_interleave(A) returned:" << info << std::endl; KokkosKernels::Impl::throw_runtime_exception(os.str()); } - info = armpl_dge_interleave(__ninter, i, __Bm, __Bn, svB.data(), - svB.stride(0), svB.stride(1), + info = armpl_dge_interleave(__ninter, i, __Bm, __Bn, svB.data(), svB.stride(0), svB.stride(1), &__Bdp[__Bbstrd * ib], __Bistrd, __Bjstrd); if (info != ARMPL_STATUS_SUCCESS) { std::ostringstream os; @@ -134,8 +128,7 @@ class BatchedArmplGemm { KokkosKernels::Impl::throw_runtime_exception(os.str()); } - info = armpl_dge_interleave(__ninter, i, __Cm, __Cn, svC.data(), - svC.stride(0), svC.stride(1), + info = armpl_dge_interleave(__ninter, i, __Cm, __Cn, svC.data(), svC.stride(0), svC.stride(1), &__Cdp[__Cbstrd * ib], __Cistrd, __Cjstrd); if (info != ARMPL_STATUS_SUCCESS) { std::ostringstream os; @@ -152,12 +145,10 @@ class BatchedArmplGemm { for (int ib = 0; ib < __nbatch; ++ib) { for (int i = 0; i < __ninter; ++i) { auto svC = - subview_wrapper(__C, ib * __ninter + i, Kokkos::ALL(), - Kokkos::ALL(), __batch_layout_tag, __no_trans_tag); + subview_wrapper(__C, ib * __ninter + i, Kokkos::ALL(), Kokkos::ALL(), __batch_layout_tag, __no_trans_tag); - auto info = armpl_dge_deinterleave( - __ninter, i, __Cm, __Cn, svC.data(), svC.stride(0), svC.stride(1), - &__Cdp[__Cbstrd * ib], __Cistrd, __Cjstrd); + auto info = armpl_dge_deinterleave(__ninter, i, __Cm, __Cn, svC.data(), svC.stride(0), 
svC.stride(1), + &__Cdp[__Cbstrd * ib], __Cistrd, __Cjstrd); if (info != ARMPL_STATUS_SUCCESS) { std::ostringstream os; os << "armpl_dge_deinterleave returned:" << info << std::endl; @@ -170,11 +161,10 @@ class BatchedArmplGemm { template std::enable_if_t::value, void> __run(T &) { - auto info = armpl_dgemm_interleave_batch( - __ninter, __nbatch, __transa, __transb, __Cm, __Cn, - std::is_same::value ? __An : __Am, - __alpha, __Adp, __Abstrd, __Aistrd, __Ajstrd, __Bdp, __Bbstrd, __Bistrd, - __Bjstrd, __beta, __Cdp, __Cbstrd, __Cistrd, __Cjstrd); + auto info = armpl_dgemm_interleave_batch(__ninter, __nbatch, __transa, __transb, __Cm, __Cn, + std::is_same::value ? __An : __Am, __alpha, + __Adp, __Abstrd, __Aistrd, __Ajstrd, __Bdp, __Bbstrd, __Bistrd, __Bjstrd, + __beta, __Cdp, __Cbstrd, __Cistrd, __Cjstrd); if (info != ARMPL_STATUS_SUCCESS) { std::ostringstream os; os << "armpl_dgemm_interleave_batch returned :" << info << std::endl; @@ -193,8 +183,7 @@ class BatchedArmplGemm { std::enable_if_t::value, void> __run(T &) {} public: - BatchedArmplGemm(HandleType *const handle, ScalarType alpha, AViewType A, - BViewType B, ScalarType beta, CViewType C) + BatchedArmplGemm(HandleType *const handle, ScalarType alpha, AViewType A, BViewType B, ScalarType beta, CViewType C) : __handle(handle), __A(A), __B(B), __C(C), __alpha(alpha), __beta(beta) { __ninter = __handle->get_tpl_params()[0]; @@ -234,15 +223,11 @@ class BatchedArmplGemm { int invoke() { if (__handle->enableDebug) { - std::cerr << "__nbatch:" << std::to_string(__nbatch) - << ", __ninter:" << std::to_string(__ninter) - << ", __Am:" << std::to_string(__Am) - << ", __An:" << std::to_string(__An) << std::endl; + std::cerr << "__nbatch:" << std::to_string(__nbatch) << ", __ninter:" << std::to_string(__ninter) + << ", __Am:" << std::to_string(__Am) << ", __An:" << std::to_string(__An) << std::endl; } - if (!std::is_same::value || - !std::is_same::value || - !std::is_same::value || + if (!std::is_same::value || 
!std::is_same::value || !std::is_same::value || !std::is_same::value) { std::ostringstream os; os << "KokkosBatched::Impl::BatchedArmplGemm only supports 'double' " @@ -254,8 +239,7 @@ class BatchedArmplGemm { if (__nbatch != 0) { if (__ninter == 0 || __nbatch % __ninter) { std::ostringstream os; - os << "batch size must be evenly divisible by ninter. __nbatch: " - << std::to_string(__nbatch) + os << "batch size must be evenly divisible by ninter. __nbatch: " << std::to_string(__nbatch) << ", __ninter: " << std::to_string(__ninter) << std::endl; KokkosKernels::Impl::throw_runtime_exception(os.str()); } diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp index 50d662b281..6888de725d 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp @@ -126,15 +126,13 @@ using TagFromLayout = typename TagFromLayoutHelper::tag; /// ScalarType, AViewType, BViewType, CViewType /// ArgBoundsCheck, tile_m, tile_n, tile_k>(alpha, A, B, beta, C).invoke(); // clang-format on -template +template class BatchedDblBufGemm { private: using AlphaMulTag = - std::conditional_t::value, - AlphaTag::No, AlphaTag::Yes>; + std::conditional_t::value, AlphaTag::No, AlphaTag::Yes>; HandleType *const __handle; AViewType __A; @@ -153,20 +151,12 @@ class BatchedDblBufGemm { using layout_type = typename CViewType::array_layout; using device_type = typename CViewType::device_type; using execution_space_type = typename device_type::execution_space; - using scratch_space_type = - typename execution_space_type::scratch_memory_space; - using view_type_2d_scratch = - Kokkos::View; + using scratch_space_type = typename execution_space_type::scratch_memory_space; + using view_type_2d_scratch = Kokkos::View; public: - BatchedDblBufGemm(HandleType *const handle, ScalarType alpha, AViewType A, - BViewType B, ScalarType beta, CViewType C) 
- : __handle(handle), - __A(A), - __B(B), - __C(C), - __alpha(alpha), - __beta(beta) {} + BatchedDblBufGemm(HandleType *const handle, ScalarType alpha, AViewType A, BViewType B, ScalarType beta, CViewType C) + : __handle(handle), __A(A), __B(B), __C(C), __alpha(alpha), __beta(beta) {} int invoke() { __run(); @@ -175,8 +165,7 @@ class BatchedDblBufGemm { private: void __run() { - using policy_type = - Kokkos::TeamPolicy, execution_space_type>; + using policy_type = Kokkos::TeamPolicy, execution_space_type>; using member_type = typename policy_type::member_type; // Compile-time expressions required for functor-level register allocations: @@ -190,7 +179,7 @@ class BatchedDblBufGemm { constexpr int reg_n = TILE_N / TILE_K + 2 * !!(TILE_N % TILE_K); constexpr int stride_m = TILE_K; constexpr int stride_n = TILE_N / reg_n; - using functor_type = Functor; + using functor_type = Functor; functor_type functor(*this, __A, __B, __C); @@ -211,43 +200,35 @@ class BatchedDblBufGemm { int vector_len = stride_n; const int max_team_size = - policy_type(league_size, Kokkos::AUTO, vector_len) - .team_size_max(functor, Kokkos::ParallelForTag()); + policy_type(league_size, Kokkos::AUTO, vector_len).team_size_max(functor, Kokkos::ParallelForTag()); if (team_size > max_team_size) { std::ostringstream os; - os << "KokkosBatched::BatchedGemm with kernelAlgoType = " - << std::to_string(__handle->get_kernel_algo_type()) - << " does not support team_size > " << std::to_string(max_team_size) - << "." << std::endl + os << "KokkosBatched::BatchedGemm with kernelAlgoType = " << std::to_string(__handle->get_kernel_algo_type()) + << " does not support team_size > " << std::to_string(max_team_size) << "." << std::endl << " The tile dimensions must be adjusted." 
<< std::endl; KokkosKernels::Impl::throw_runtime_exception(os.str()); } - const int max_vector_len = - policy_type(league_size, team_size, Kokkos::AUTO).vector_length_max(); + const int max_vector_len = policy_type(league_size, team_size, Kokkos::AUTO).vector_length_max(); if (vector_len > max_vector_len) { std::ostringstream os; - os << "KokkosBatched::BatchedGemm with kernelAlgoType = " - << std::to_string(__handle->get_kernel_algo_type()) - << " does not support vector_len > " << std::to_string(max_vector_len) - << "." << std::endl + os << "KokkosBatched::BatchedGemm with kernelAlgoType = " << std::to_string(__handle->get_kernel_algo_type()) + << " does not support vector_len > " << std::to_string(max_vector_len) << "." << std::endl << " The tile dimensions must be adjusted." << std::endl; KokkosKernels::Impl::throw_runtime_exception(os.str()); } if (__handle->enableDebug) { - std::cout << "max_team_size:" << max_team_size - << " team_size:" << team_size << std::endl - << "max_vector_len:" << max_vector_len - << " vector_len:" << vector_len << std::endl + std::cout << "max_team_size:" << max_team_size << " team_size:" << team_size << std::endl + << "max_vector_len:" << max_vector_len << " vector_len:" << vector_len << std::endl << "TILE_M:" << TILE_M << std::endl << "TILE_N:" << TILE_N << std::endl << "TILE_K:" << TILE_K << std::endl; } // TODO: Use statically allocated shmem - int shmem_size = view_type_2d_scratch::shmem_size(TILE_M, TILE_K) + - view_type_2d_scratch::shmem_size(TILE_K, TILE_N); + int shmem_size = + view_type_2d_scratch::shmem_size(TILE_M, TILE_K) + view_type_2d_scratch::shmem_size(TILE_K, TILE_N); // Each member solves a portion of TILE_K in parallel with other members policy_type team_policy(league_size, team_size, vector_len); @@ -278,8 +259,7 @@ class BatchedDblBufGemm { // below. If those are used, we get an invalid memory error from cuda. 
I // suspect this is due the values not being copied to device and then // runtime resolution of the host address &__ei. - Functor(BatchedDblBufGemm &ei, AViewType A, BViewType B, CViewType C) - : __ei(ei), __A(A), __B(B), __C(C) { + Functor(BatchedDblBufGemm &ei, AViewType A, BViewType B, CViewType C) : __ei(ei), __A(A), __B(B), __C(C) { if (std::is_same::value) { ei.__c_batch_size = ei.__C.extent_int(0); ei.__c_m = ei.__C.extent_int(1); @@ -310,24 +290,17 @@ class BatchedDblBufGemm { } KOKKOS_INLINE_FUNCTION - void __mul(view_value_type a, view_value_type b, view_value_type &c, - const AlphaTag::No &) const { - c += a * b; - } + void __mul(view_value_type a, view_value_type b, view_value_type &c, const AlphaTag::No &) const { c += a * b; } KOKKOS_INLINE_FUNCTION - void __mul(view_value_type a, view_value_type b, view_value_type &c, - const AlphaTag::Yes &) const { + void __mul(view_value_type a, view_value_type b, view_value_type &c, const AlphaTag::Yes &) const { c += a * b * __alpha; } KOKKOS_INLINE_FUNCTION - void __rshmem_and_mul(const int &thread_id, const int &vlane_id, - const unsigned &nk, view_value_type reg_a[REG_M], - view_value_type reg_b[REG_N], - view_value_type reg_c[REG_M][REG_N], - view_type_2d_scratch &svA_scr, - view_type_2d_scratch &svB_scr) const { + void __rshmem_and_mul(const int &thread_id, const int &vlane_id, const unsigned &nk, view_value_type reg_a[REG_M], + view_value_type reg_b[REG_N], view_value_type reg_c[REG_M][REG_N], + view_type_2d_scratch &svA_scr, view_type_2d_scratch &svB_scr) const { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL @@ -335,14 +308,12 @@ class BatchedDblBufGemm { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int m = 0; m < REG_M; ++m) - reg_a[m] = svA_scr(thread_id + m * STRIDE_M, k); + for (int m = 0; m < REG_M; ++m) reg_a[m] = svA_scr(thread_id + m * STRIDE_M, k); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) 
#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int n = 0; n < REG_N; ++n) - reg_b[n] = svB_scr(k, vlane_id + n * STRIDE_N); + for (int n = 0; n < REG_N; ++n) reg_b[n] = svB_scr(k, vlane_id + n * STRIDE_N); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -351,18 +322,15 @@ class BatchedDblBufGemm { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int n = 0; n < REG_N; ++n) - __mul(reg_a[m], reg_b[n], reg_c[m][n], __ei.__alpha_mul_tag); + for (int n = 0; n < REG_N; ++n) __mul(reg_a[m], reg_b[n], reg_c[m][n], __ei.__alpha_mul_tag); } } } KOKKOS_INLINE_FUNCTION - void __rshmem_and_mul_ll(const int &thread_id, const int &vlane_id, - const unsigned &nk, view_value_type reg_a[REG_M], - view_value_type reg_b[REG_N], - view_value_type reg_c[REG_M][REG_N], - view_type_2d_scratch &svA_scr, + void __rshmem_and_mul_ll(const int &thread_id, const int &vlane_id, const unsigned &nk, + view_value_type reg_a[REG_M], view_value_type reg_b[REG_N], + view_value_type reg_c[REG_M][REG_N], view_type_2d_scratch &svA_scr, view_type_2d_scratch &svB_scr) const { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -371,14 +339,12 @@ class BatchedDblBufGemm { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int m = 0; m < REG_M; ++m) - reg_a[m] = svA_scr(k, vlane_id + m * STRIDE_M); + for (int m = 0; m < REG_M; ++m) reg_a[m] = svA_scr(k, vlane_id + m * STRIDE_M); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int n = 0; n < REG_N; ++n) - reg_b[n] = svB_scr(thread_id + n * STRIDE_N, k); + for (int n = 0; n < REG_N; ++n) reg_b[n] = svB_scr(thread_id + n * STRIDE_N, k); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -387,8 +353,7 @@ class BatchedDblBufGemm { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int n = 0; n < REG_N; ++n) - 
__mul(reg_a[m], reg_b[n], reg_c[m][n], __ei.__alpha_mul_tag); + for (int n = 0; n < REG_N; ++n) __mul(reg_a[m], reg_b[n], reg_c[m][n], __ei.__alpha_mul_tag); } } } @@ -401,8 +366,7 @@ class BatchedDblBufGemm { view_value_type prefetch_reg_a[REG_M] = {0}, prefetch_reg_b[REG_N] = {0}; // Allocate registers used for FMAs - view_value_type reg_a[REG_M] = {0}, reg_b[REG_N] = {0}, - reg_c[REG_M][REG_N] = {{0}}; + view_value_type reg_a[REG_M] = {0}, reg_b[REG_N] = {0}, reg_c[REG_M][REG_N] = {{0}}; // TODO: look at local loads and stores via nvprof // TODO: look at GPU trace in nvprof to find out how many registers are // used. @@ -417,147 +381,124 @@ class BatchedDblBufGemm { int kk; // Fetch entire 2-rank sub-matrix - auto svA = subview_wrapper(__A, batch_idx, Kokkos::ALL(), Kokkos::ALL(), - __ei.__batch_layout_tag, __ei.__transA_tag); - auto svB = subview_wrapper(__B, batch_idx, Kokkos::ALL(), Kokkos::ALL(), - __ei.__batch_layout_tag, __ei.__transB_tag); - auto svC = subview_wrapper(__C, batch_idx, Kokkos::ALL(), Kokkos::ALL(), - __ei.__batch_layout_tag); + auto svA = + subview_wrapper(__A, batch_idx, Kokkos::ALL(), Kokkos::ALL(), __ei.__batch_layout_tag, __ei.__transA_tag); + auto svB = + subview_wrapper(__B, batch_idx, Kokkos::ALL(), Kokkos::ALL(), __ei.__batch_layout_tag, __ei.__transB_tag); + auto svC = subview_wrapper(__C, batch_idx, Kokkos::ALL(), Kokkos::ALL(), __ei.__batch_layout_tag); // Allocate scratch memory buffers used for prefetching view_type_2d_scratch svA_scr(member.team_scratch(0), TILE_M, TILE_K); view_type_2d_scratch svB_scr(member.team_scratch(0), TILE_K, TILE_N); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, STRIDE_M), - [&](const int &thread_id) { - int m_offset = thread_id + start_m; + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, STRIDE_M), [&](const int &thread_id) { + int m_offset = thread_id + start_m; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, 0, STRIDE_N), - [&](const int &vlane_id) { - int 
n_offset = vlane_id + start_n; + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, 0, STRIDE_N), [&](const int &vlane_id) { + int n_offset = vlane_id + start_n; // Here we populate scratch memory with one or more "k" tiles for // every thread of the team! #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_N * STRIDE_N; i += STRIDE_N) - svB_scr(thread_id, vlane_id + i) = - access_view_bounds_check( - svB, thread_id, n_offset + i, - __ei.__bounds_check_tag); + for (int i = 0; i < REG_N * STRIDE_N; i += STRIDE_N) + svB_scr(thread_id, vlane_id + i) = + access_view_bounds_check(svB, thread_id, n_offset + i, __ei.__bounds_check_tag); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_M * STRIDE_M; i += STRIDE_M) - svA_scr(thread_id + i, vlane_id) = - access_view_bounds_check( - svA, m_offset + i, vlane_id, - __ei.__bounds_check_tag); + for (int i = 0; i < REG_M * STRIDE_M; i += STRIDE_M) + svA_scr(thread_id + i, vlane_id) = + access_view_bounds_check(svA, m_offset + i, vlane_id, __ei.__bounds_check_tag); - // Wait for A, B to reside in scratch memory - member.team_barrier(); + // Wait for A, B to reside in scratch memory + member.team_barrier(); - // Each thread calculates a single dot product in chunks of - // size TILE_K - for (kk = 0; kk < __k - TILE_K; kk += TILE_K) { - int k_tile_offset = kk + TILE_K; + // Each thread calculates a single dot product in chunks of + // size TILE_K + for (kk = 0; kk < __k - TILE_K; kk += TILE_K) { + int k_tile_offset = kk + TILE_K; // Get this threads next TILE_K entries from global memory // Each thread has its own copy of prefetch_reg_b. 
#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_N; ++i) - prefetch_reg_b[i] = - access_view_bounds_check( - svB, thread_id + k_tile_offset, - n_offset + i * STRIDE_N, __ei.__bounds_check_tag); + for (int i = 0; i < REG_N; ++i) + prefetch_reg_b[i] = access_view_bounds_check( + svB, thread_id + k_tile_offset, n_offset + i * STRIDE_N, __ei.__bounds_check_tag); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_M; ++i) - prefetch_reg_a[i] = - access_view_bounds_check( - svA, m_offset + i * STRIDE_M, - vlane_id + k_tile_offset, - __ei.__bounds_check_tag); + for (int i = 0; i < REG_M; ++i) + prefetch_reg_a[i] = access_view_bounds_check( + svA, m_offset + i * STRIDE_M, vlane_id + k_tile_offset, __ei.__bounds_check_tag); - __rshmem_and_mul(thread_id, vlane_id, TILE_K, reg_a, reg_b, - reg_c, svA_scr, svB_scr); + __rshmem_and_mul(thread_id, vlane_id, TILE_K, reg_a, reg_b, reg_c, svA_scr, svB_scr); - // Wait for: - // 1. prefetch_regs to be populated - // 2. for shmem to no longer be read from - member.team_barrier(); + // Wait for: + // 1. prefetch_regs to be populated + // 2. for shmem to no longer be read from + member.team_barrier(); // populate shmem from prefetch registers. Each thread has its own // copy of prefetch_reg_b. #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_N; ++i) - svB_scr(thread_id, vlane_id + i * STRIDE_N) = - prefetch_reg_b[i]; + for (int i = 0; i < REG_N; ++i) svB_scr(thread_id, vlane_id + i * STRIDE_N) = prefetch_reg_b[i]; // populate shmem from prefetch registers. Each thread has its own // copy of prefetch_reg_a. 
#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_M; ++i) - svA_scr(thread_id + i * STRIDE_M, vlane_id) = - prefetch_reg_a[i]; + for (int i = 0; i < REG_M; ++i) svA_scr(thread_id + i * STRIDE_M, vlane_id) = prefetch_reg_a[i]; - // Wait for shmem stores to land before performing next - // TILE_K multiply - member.team_barrier(); - } // end n_tile_k_tiles loop + // Wait for shmem stores to land before performing next + // TILE_K multiply + member.team_barrier(); + } // end n_tile_k_tiles loop - // Multiply last tile, may be a partial tile - __rshmem_and_mul(thread_id, vlane_id, __k - kk, reg_a, reg_b, - reg_c, svA_scr, svB_scr); + // Multiply last tile, may be a partial tile + __rshmem_and_mul(thread_id, vlane_id, __k - kk, reg_a, reg_b, reg_c, svA_scr, svB_scr); - // store results back to global memory - if (__beta == 0.0F) { + // store results back to global memory + if (__beta == 0.0F) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int m = 0; m < REG_M; ++m) { - int cm = m_offset + m * STRIDE_M; + for (int m = 0; m < REG_M; ++m) { + int cm = m_offset + m * STRIDE_M; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int n = 0; n < REG_N; ++n) { - int cn = n_offset + n * STRIDE_N; - fma_bounds_check(svC, cm, cn, reg_c[m][n], __alpha, - __ei.__alpha_fma_tag, - __ei.__bounds_check_tag); - } - } - } else { + for (int n = 0; n < REG_N; ++n) { + int cn = n_offset + n * STRIDE_N; + fma_bounds_check(svC, cm, cn, reg_c[m][n], __alpha, __ei.__alpha_fma_tag, __ei.__bounds_check_tag); + } + } + } else { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int m = 0; m < REG_M; ++m) { - int cm = m_offset + m * STRIDE_M; + for (int m = 0; m < REG_M; ++m) { + int cm = m_offset + m * STRIDE_M; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // 
KOKKOS_ENABLE_PRAGMA_UNROLL - for (int n = 0; n < REG_N; ++n) { - int cn = n_offset + n * STRIDE_N; - fma_bounds_check(svC, cm, cn, reg_c[m][n], __alpha, - __beta, __ei.__alpha_fma_tag, - __ei.__bounds_check_tag); - } - } - } - }); - }); + for (int n = 0; n < REG_N; ++n) { + int cn = n_offset + n * STRIDE_N; + fma_bounds_check(svC, cm, cn, reg_c[m][n], __alpha, __beta, __ei.__alpha_fma_tag, + __ei.__bounds_check_tag); + } + } + } + }); + }); } KOKKOS_INLINE_FUNCTION @@ -568,8 +509,7 @@ class BatchedDblBufGemm { view_value_type prefetch_reg_a[REG_M] = {0}, prefetch_reg_b[REG_N] = {0}; // Allocate registers used for FMAs - view_value_type reg_a[REG_M] = {0}, reg_b[REG_N] = {0}, - reg_c[REG_M][REG_N] = {{0}}; + view_value_type reg_a[REG_M] = {0}, reg_b[REG_N] = {0}, reg_c[REG_M][REG_N] = {{0}}; // TODO: look at local loads and stores via nvprof // TODO: look at GPU trace in nvprof to find out how many registers are // used. @@ -584,149 +524,126 @@ class BatchedDblBufGemm { int kk; // Fetch entire 2-rank sub-matrix - auto svA = subview_wrapper(__A, batch_idx, Kokkos::ALL(), Kokkos::ALL(), - __ei.__batch_layout_tag, __ei.__transA_tag); - auto svB = subview_wrapper(__B, batch_idx, Kokkos::ALL(), Kokkos::ALL(), - __ei.__batch_layout_tag, __ei.__transB_tag); - auto svC = subview_wrapper(__C, batch_idx, Kokkos::ALL(), Kokkos::ALL(), - __ei.__batch_layout_tag); + auto svA = + subview_wrapper(__A, batch_idx, Kokkos::ALL(), Kokkos::ALL(), __ei.__batch_layout_tag, __ei.__transA_tag); + auto svB = + subview_wrapper(__B, batch_idx, Kokkos::ALL(), Kokkos::ALL(), __ei.__batch_layout_tag, __ei.__transB_tag); + auto svC = subview_wrapper(__C, batch_idx, Kokkos::ALL(), Kokkos::ALL(), __ei.__batch_layout_tag); // Allocate scratch memory buffers used for prefetching view_type_2d_scratch svA_scr(member.team_scratch(0), TILE_K, TILE_M); view_type_2d_scratch svB_scr(member.team_scratch(0), TILE_N, TILE_K); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, STRIDE_N), - [&](const 
int &thread_id) { - int n_offset = thread_id + start_n; + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, STRIDE_N), [&](const int &thread_id) { + int n_offset = thread_id + start_n; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, 0, STRIDE_M), - [&](const int &vlane_id) { - int m_offset = vlane_id + start_m; + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, 0, STRIDE_M), [&](const int &vlane_id) { + int m_offset = vlane_id + start_m; // Here we populate scratch memory with one or more "k" tiles for // every thread of the team! #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_N * STRIDE_N; i += STRIDE_N) - svB_scr(thread_id + i, vlane_id) = - access_view_bounds_check( - svB, vlane_id, n_offset + i, - __ei.__bounds_check_tag); + for (int i = 0; i < REG_N * STRIDE_N; i += STRIDE_N) + svB_scr(thread_id + i, vlane_id) = + access_view_bounds_check(svB, vlane_id, n_offset + i, __ei.__bounds_check_tag); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_M * STRIDE_M; i += STRIDE_M) - svA_scr(thread_id, vlane_id + i) = - access_view_bounds_check( - svA, m_offset + i, thread_id, - __ei.__bounds_check_tag); + for (int i = 0; i < REG_M * STRIDE_M; i += STRIDE_M) + svA_scr(thread_id, vlane_id + i) = + access_view_bounds_check(svA, m_offset + i, thread_id, __ei.__bounds_check_tag); - // Wait for A, B to reside in scratch memory - member.team_barrier(); + // Wait for A, B to reside in scratch memory + member.team_barrier(); - // Each thread calculates a single dot product in chunks of - // size TILE_K - for (kk = 0; kk < __k - TILE_K; kk += TILE_K) { - int k_tile_offset = kk + TILE_K; + // Each thread calculates a single dot product in chunks of + // size TILE_K + for (kk = 0; kk < __k - TILE_K; kk += TILE_K) { + int k_tile_offset = kk + TILE_K; // Get this threads next TILE_K entries from global memory // Each 
thread has its own copy of prefetch_reg_b. #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_N; ++i) - prefetch_reg_b[i] = - access_view_bounds_check( - svB, vlane_id + k_tile_offset, - n_offset + i * STRIDE_N, __ei.__bounds_check_tag); + for (int i = 0; i < REG_N; ++i) + prefetch_reg_b[i] = access_view_bounds_check( + svB, vlane_id + k_tile_offset, n_offset + i * STRIDE_N, __ei.__bounds_check_tag); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_M; ++i) - prefetch_reg_a[i] = - access_view_bounds_check( - svA, m_offset + i * STRIDE_M, - thread_id + k_tile_offset, - __ei.__bounds_check_tag); + for (int i = 0; i < REG_M; ++i) + prefetch_reg_a[i] = access_view_bounds_check( + svA, m_offset + i * STRIDE_M, thread_id + k_tile_offset, __ei.__bounds_check_tag); - __rshmem_and_mul_ll(thread_id, vlane_id, TILE_K, reg_a, - reg_b, reg_c, svA_scr, svB_scr); + __rshmem_and_mul_ll(thread_id, vlane_id, TILE_K, reg_a, reg_b, reg_c, svA_scr, svB_scr); - // Wait for: - // 1. prefetch_regs to be populated - // 2. for shmem to no longer be read from - member.team_barrier(); + // Wait for: + // 1. prefetch_regs to be populated + // 2. for shmem to no longer be read from + member.team_barrier(); // populate shmem from prefetch registers. Each thread has its own // copy of prefetch_reg_b. #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_N; ++i) - svB_scr(thread_id + i * STRIDE_N, vlane_id) = - prefetch_reg_b[i]; + for (int i = 0; i < REG_N; ++i) svB_scr(thread_id + i * STRIDE_N, vlane_id) = prefetch_reg_b[i]; // populate shmem from prefetch registers. Each thread has its own // copy of prefetch_reg_a. 
#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_M; ++i) - svA_scr(thread_id, vlane_id + i * STRIDE_M) = - prefetch_reg_a[i]; + for (int i = 0; i < REG_M; ++i) svA_scr(thread_id, vlane_id + i * STRIDE_M) = prefetch_reg_a[i]; - // Wait for shmem stores to land before performing next - // TILE_K multiply - member.team_barrier(); - } // end n_tile_k_tiles loop + // Wait for shmem stores to land before performing next + // TILE_K multiply + member.team_barrier(); + } // end n_tile_k_tiles loop - // Multiply last tile, may be a partial tile - __rshmem_and_mul_ll(thread_id, vlane_id, __k - kk, reg_a, - reg_b, reg_c, svA_scr, svB_scr); + // Multiply last tile, may be a partial tile + __rshmem_and_mul_ll(thread_id, vlane_id, __k - kk, reg_a, reg_b, reg_c, svA_scr, svB_scr); - // store results back to global memory - if (__beta == 0.0F) { + // store results back to global memory + if (__beta == 0.0F) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int n = 0; n < REG_N; ++n) { - int cn = n_offset + n * STRIDE_N; + for (int n = 0; n < REG_N; ++n) { + int cn = n_offset + n * STRIDE_N; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int m = 0; m < REG_M; ++m) { - int cm = m_offset + m * STRIDE_M; - fma_bounds_check(svC, cm, cn, reg_c[m][n], __alpha, - __ei.__alpha_fma_tag, - __ei.__bounds_check_tag); - } - } - } else { + for (int m = 0; m < REG_M; ++m) { + int cm = m_offset + m * STRIDE_M; + fma_bounds_check(svC, cm, cn, reg_c[m][n], __alpha, __ei.__alpha_fma_tag, __ei.__bounds_check_tag); + } + } + } else { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int n = 0; n < REG_N; ++n) { - int cn = n_offset + n * STRIDE_N; + for (int n = 0; n < REG_N; ++n) { + int cn = n_offset + n * STRIDE_N; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll 
#endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int m = 0; m < REG_M; ++m) { - int cm = m_offset + m * STRIDE_M; - fma_bounds_check(svC, cm, cn, reg_c[m][n], __alpha, - __beta, __ei.__alpha_fma_tag, - __ei.__bounds_check_tag); - } - } - } - }); - }); + for (int m = 0; m < REG_M; ++m) { + int cm = m_offset + m * STRIDE_M; + fma_bounds_check(svC, cm, cn, reg_c[m][n], __alpha, __beta, __ei.__alpha_fma_tag, + __ei.__bounds_check_tag); + } + } + } + }); + }); } }; }; diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp index 464ea6d04a..6216aeb099 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp @@ -62,30 +62,21 @@ constexpr KOKKOS_INLINE_FUNCTION size_t kk_gemm_dbl_buf_alpha_in_fma_thresh() { #endif // __CUDAACC_RDC__ } -template -int BatchedGemmImpl(BatchedGemmHandleType *const handle, const ScalarType alpha, - const AViewType &A, const BViewType &B, +template +int BatchedGemmImpl(BatchedGemmHandleType *const handle, const ScalarType alpha, const AViewType &A, const BViewType &B, const ScalarType beta, const CViewType &C) { int ret = 0; size_t c_m, c_n; using ViewValueType = typename CViewType::value_type; // Check for valid input views - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "BViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "CViewType must be a Kokkos::View."); - static_assert( - std::is_same::value || - std::is_same::value, - "ArgTransA must be either Trans::Transpose or Trans::NoTranspose."); - static_assert( - std::is_same::value || - std::is_same::value, - "ArgTransB must be either Trans::Transpose or Trans::NoTranspose."); + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "BViewType must be a Kokkos::View."); + 
static_assert(Kokkos::is_view::value, "CViewType must be a Kokkos::View."); + static_assert(std::is_same::value || std::is_same::value, + "ArgTransA must be either Trans::Transpose or Trans::NoTranspose."); + static_assert(std::is_same::value || std::is_same::value, + "ArgTransB must be either Trans::Transpose or Trans::NoTranspose."); if constexpr (is_vector::value) { // Check ranks of view with underlying SIMD value types // For SIMD views, we can have either 3-rank or 4-ranks inputs. @@ -100,31 +91,27 @@ int BatchedGemmImpl(BatchedGemmHandleType *const handle, const ScalarType alpha, default: std::ostringstream os; os << "KokkosBatched::BatchedGemm does not support kernelAlgoType = " - << std::to_string(handle->get_kernel_algo_type()) - << " with SIMD views." << std::endl; + << std::to_string(handle->get_kernel_algo_type()) << " with SIMD views." << std::endl; KokkosKernels::Impl::throw_runtime_exception(os.str()); break; } } else { // Check ranks of views with underlying scalar value types - static_assert(static_cast(AViewType::rank) == 3, - "AViewType must have rank 3."); - static_assert(static_cast(BViewType::rank) == 3, - "BViewType must have rank 3."); - static_assert(static_cast(CViewType::rank) == 3, - "CViewType must have rank 3."); + static_assert(static_cast(AViewType::rank) == 3, "AViewType must have rank 3."); + static_assert(static_cast(BViewType::rank) == 3, "BViewType must have rank 3."); + static_assert(static_cast(CViewType::rank) == 3, "CViewType must have rank 3."); } // Check for valid data access patterns // Skip checking a_layout == b_layout == c_layout // Skip checking for LayoutStride using c_layout = typename CViewType::array_layout; - static_assert(!(std::is_same::value && - !std::is_same::value), - "LayoutLeft views require BatchLayout::Right"); - static_assert(!(std::is_same::value && - !std::is_same::value), - "LayoutRight views require BatchLayout::Left"); + static_assert( + !(std::is_same::value && !std::is_same::value), + 
"LayoutLeft views require BatchLayout::Right"); + static_assert( + !(std::is_same::value && !std::is_same::value), + "LayoutRight views require BatchLayout::Left"); if constexpr (std::is_same::value) { // c_b = C.extent(0); @@ -141,17 +128,13 @@ int BatchedGemmImpl(BatchedGemmHandleType *const handle, const ScalarType alpha, using layout_type = typename CViewType::array_layout; using exec_space = typename CViewType::execution_space; constexpr bool is_vector = KokkosBatched::is_vector::value; - constexpr bool on_gpu = - KokkosKernels::Impl::kk_is_gpu_exec_space(); - constexpr bool on_x86_64 = KokkosKernels::Impl::kk_is_x86_64_mem_space< - typename exec_space::memory_space>(); - constexpr bool on_a64fx = KokkosKernels::Impl::kk_is_a64fx_mem_space< - typename exec_space::memory_space>(); - bool out_of_range = false; + constexpr bool on_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); + constexpr bool on_x86_64 = KokkosKernels::Impl::kk_is_x86_64_mem_space(); + constexpr bool on_a64fx = KokkosKernels::Impl::kk_is_a64fx_mem_space(); + bool out_of_range = false; if (handle->enableDebug) { - std::cout << "view_scalar_type:" << typeid(view_scalar_type).name() - << std::endl + std::cout << "view_scalar_type:" << typeid(view_scalar_type).name() << std::endl << "execution_space:" << typeid(exec_space).name() << std::endl << std::endl << "is_vector:" << is_vector << std::endl @@ -166,79 +149,58 @@ int BatchedGemmImpl(BatchedGemmHandleType *const handle, const ScalarType alpha, if (c_m != c_n) { std::ostringstream os; os << "KokkosBatched::BatchedGemm does not support kernelAlgoType = " - << std::to_string(handle->get_kernel_algo_type()) << " when c_m(" - << std::to_string(c_m) << ") != c_n(" << std::to_string(c_n) << ")" - << std::endl; + << std::to_string(handle->get_kernel_algo_type()) << " when c_m(" << std::to_string(c_m) << ") != c_n(" + << std::to_string(c_n) << ")" << std::endl; KokkosKernels::Impl::throw_runtime_exception(os.str()); } // Select optimal 
resultsPerThread param for BatchedSerialGemm using bsgResultsPerThread = - std::conditional_t; + std::conditional_t; // Select optimal mode param for SerialGemm. using bsgModeType = typename std::conditional< - is_vector, - typename std::conditional::type, + is_vector, typename std::conditional::type, typename std::conditional< on_gpu, Algo::Gemm::Unblocked, - typename std::conditional::type>::type>:: - type; + typename std::conditional::type>::type>::type; if (handle->enableDebug) { - std::cout << "bsgResultsPerThread: " - << typeid(bsgResultsPerThread).name() << std::endl + std::cout << "bsgResultsPerThread: " << typeid(bsgResultsPerThread).name() << std::endl << "bsgModeType: " << typeid(bsgModeType).name() << std::endl; } if constexpr (on_gpu) { - if (((std::is_same::value) - ? (c_m >= 16) - : (c_m >= 24 && c_m <= 32) || c_m >= 40)) { - handle->teamSz = handle->vecLen = 8; - constexpr int tile_m = Impl::kk_gemm_dbl_buf_tile_m(); - constexpr int tile_n = Impl::kk_gemm_dbl_buf_tile_n(); - constexpr int tile_k = Impl::kk_gemm_dbl_buf_tile_k(); - constexpr size_t alpha_in_fma_thresh = - Impl::kk_gemm_dbl_buf_alpha_in_fma_thresh(); + if (((std::is_same::value) ? 
(c_m >= 16) + : (c_m >= 24 && c_m <= 32) || c_m >= 40)) { + handle->teamSz = handle->vecLen = 8; + constexpr int tile_m = Impl::kk_gemm_dbl_buf_tile_m(); + constexpr int tile_n = Impl::kk_gemm_dbl_buf_tile_n(); + constexpr int tile_k = Impl::kk_gemm_dbl_buf_tile_k(); + constexpr size_t alpha_in_fma_thresh = Impl::kk_gemm_dbl_buf_alpha_in_fma_thresh(); if (c_m % 32 == 0) { // No bounds checking if (c_m >= alpha_in_fma_thresh) { // apply alpha in fma - ret = Impl::BatchedDblBufGemm( - handle, alpha, A, B, beta, C) + ret = Impl::BatchedDblBufGemm(handle, alpha, A, B, beta, C) .invoke(); } else { // apply alpha in mul - ret = Impl::BatchedDblBufGemm( - handle, alpha, A, B, beta, C) + ret = Impl::BatchedDblBufGemm(handle, alpha, A, B, beta, C) .invoke(); } } else { // bounds checking if (c_m >= alpha_in_fma_thresh) { // apply alpha in fma - ret = Impl::BatchedDblBufGemm( - handle, alpha, A, B, beta, C) + ret = Impl::BatchedDblBufGemm(handle, alpha, A, B, beta, C) .invoke(); } else { // apply alpha in mul - ret = Impl::BatchedDblBufGemm( - handle, alpha, A, B, beta, C) + ret = Impl::BatchedDblBufGemm(handle, alpha, A, B, beta, C) .invoke(); } } @@ -247,10 +209,8 @@ int BatchedGemmImpl(BatchedGemmHandleType *const handle, const ScalarType alpha, } } if (!on_gpu || out_of_range) { - ret = Impl::BatchedSerialGemm(alpha, A, B, beta, C) + ret = Impl::BatchedSerialGemm(alpha, A, B, beta, C) .invoke(); } break; @@ -261,10 +221,8 @@ int BatchedGemmImpl(BatchedGemmHandleType *const handle, const ScalarType alpha, ////////////// TPL ALGOS ////////////// #if defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) && ARMPL_BUILD >= 1058 case BaseTplAlgos::ARMPL: - ret = Impl::BatchedArmplGemm(handle, alpha, A, B, - beta, C) + ret = Impl::BatchedArmplGemm(handle, alpha, A, B, beta, C) .invoke(); break; #endif // KOKKOSKERNELS_ENABLE_TPL_ARMPL @@ -276,23 +234,17 @@ int BatchedGemmImpl(BatchedGemmHandleType *const handle, const ScalarType alpha, ////////////// KokkosBatched ALGOS ////////////// case 
BaseKokkosBatchedAlgos::KK_SERIAL: - ret = - Impl::BatchedSerialGemm( - alpha, A, B, beta, C) - .invoke(); + ret = Impl::BatchedSerialGemm(alpha, A, B, beta, C) + .invoke(); break; // case GemmKokkosBatchedAlgos::KK_SERIALSIMD: case GemmKokkosBatchedAlgos::KK_SERIAL_RANK0: - ret = - Impl::BatchedSerialGemm( - alpha, A, B, beta, C) - .invoke(); + ret = Impl::BatchedSerialGemm(alpha, A, B, beta, C) + .invoke(); break; // case GemmKokkosBatchedAlgos::KK_SERIAL_SHMEM: @@ -308,11 +260,9 @@ int BatchedGemmImpl(BatchedGemmHandleType *const handle, const ScalarType alpha, // performance. // TODO: Add auto-selection of tile size based on inputs and device type - ret = Impl::BatchedDblBufGemm( - handle, alpha, A, B, beta, C) + ret = Impl::BatchedDblBufGemm(handle, alpha, A, B, + beta, C) .invoke(); break; diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp index 5ff581bb64..8da3c7acd1 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp @@ -76,9 +76,8 @@ namespace Impl { /// ArgResultsPerThread, ScalarType, AViewType, /// BViewType, CViewType>(alpha, A, B, beta, C).invoke(); // clang-format on -template +template class BatchedSerialGemm { private: AViewType A; @@ -92,10 +91,8 @@ class BatchedSerialGemm { void run() { using execution_space = typename CViewType::device_type::execution_space; - using policy_type = - Kokkos::RangePolicy; - Kokkos::parallel_for("BatchedSerialGemm", policy_type(0, batch_size), - *this); + using policy_type = Kokkos::RangePolicy; + Kokkos::parallel_for("BatchedSerialGemm", policy_type(0, batch_size), *this); } public: @@ -117,8 +114,7 @@ class BatchedSerialGemm { batch_size *= divisor; run(); - } else if (std::is_same::value) { + } else if (std::is_same::value) { if (std::is_same::value) batch_size = C.extent(0); else @@ -132,8 +128,7 @@ class BatchedSerialGemm { 
return 0; } - BatchedSerialGemm(ScalarType _alpha, AViewType _A, BViewType _B, - ScalarType _beta, CViewType _C) + BatchedSerialGemm(ScalarType _alpha, AViewType _A, BViewType _B, ScalarType _beta, CViewType _C) : A(_A), B(_B), C(_C), alpha(_alpha), beta(_beta) {} KOKKOS_INLINE_FUNCTION @@ -149,34 +144,26 @@ class BatchedSerialGemm { // Due to taking 1-rank subviews out, we must handle transpose here. // Use overloads of subview_wrapper to handle transpose at compile time. - auto svA_row = subview_wrapper(A, batch_idx, row_idx, Kokkos::ALL(), - batch_layout_tag, transA_tag); - auto svB_col = subview_wrapper(B, batch_idx, Kokkos::ALL(), col_idx, - batch_layout_tag, transB_tag); - auto svC_ele = - subview_wrapper(C, batch_idx, row_idx, col_idx, batch_layout_tag); + auto svA_row = subview_wrapper(A, batch_idx, row_idx, Kokkos::ALL(), batch_layout_tag, transA_tag); + auto svB_col = subview_wrapper(B, batch_idx, Kokkos::ALL(), col_idx, batch_layout_tag, transB_tag); + auto svC_ele = subview_wrapper(C, batch_idx, row_idx, col_idx, batch_layout_tag); // Kokkos::subview(scalar, ALL) or Kokkos::subview(ALL, scalar) always // returns a column vector. Since the subviews above handle the // matrix transpositions, here we must perform the GEMM on: // row_vec x col_vec, which is svA_row' x svB_col to compute the element // of C. 
- KokkosBatched::SerialGemm::invoke(alpha, svA_row, svB_col, beta, - svC_ele); + KokkosBatched::SerialGemm::invoke(alpha, svA_row, svB_col, beta, + svC_ele); } KOKKOS_INLINE_FUNCTION void operator()(const ResultsPerThread::Rank2 &, const int &i) const { - auto svA = - subview_wrapper(A, i, Kokkos::ALL(), Kokkos::ALL(), batch_layout_tag); - auto svB = - subview_wrapper(B, i, Kokkos::ALL(), Kokkos::ALL(), batch_layout_tag); - auto svC = - subview_wrapper(C, i, Kokkos::ALL(), Kokkos::ALL(), batch_layout_tag); + auto svA = subview_wrapper(A, i, Kokkos::ALL(), Kokkos::ALL(), batch_layout_tag); + auto svB = subview_wrapper(B, i, Kokkos::ALL(), Kokkos::ALL(), batch_layout_tag); + auto svC = subview_wrapper(C, i, Kokkos::ALL(), Kokkos::ALL(), batch_layout_tag); - KokkosBatched::SerialGemm::invoke( - alpha, svA, svB, beta, svC); + KokkosBatched::SerialGemm::invoke(alpha, svA, svB, beta, svC); } }; } // namespace Impl diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Spec.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Spec.hpp index 6ec792172b..6f06694f09 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Spec.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Spec.hpp @@ -29,17 +29,15 @@ namespace KokkosBatched { namespace Impl { // Specialization struct which defines whether a specialization exists // This struct is currently never specialized. 
-template +template struct batched_gemm_tpl_spec_avail { enum : bool { value = false }; }; // Specialization struct which defines whether a specialization exists -template +template struct batched_gemm_eti_spec_avail { enum : bool { value = false }; }; @@ -47,71 +45,55 @@ struct batched_gemm_eti_spec_avail { } // namespace KokkosBatched // ETI specalization macros, consumed by generated *_eti_spec_avail.hpp files -#define KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_INNER(ARG_TRANS_A, ARG_TRANS_B, \ - ARG_BATCH_LAYOUT, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct batched_gemm_eti_spec_avail< \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, BatchedGemmHandle, SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct batched_gemm_eti_spec_avail, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutRight, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutRight, \ + EXEC_SPACE, MEM_SPACE) #else -#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) +#define 
KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) #endif #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutLeft, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutLeft, \ + EXEC_SPACE, MEM_SPACE) #else -#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) +#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) #endif ///////////////// BatchLayout::Left Permutations ///////////////// -#define KOKKOSBATCHED_GEMM_NT_NT_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ - Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Left, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_NT_NT_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER(Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Left, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) -#define KOKKOSBATCHED_GEMM_NT_T_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ - Trans::NoTranspose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_NT_T_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + 
KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER(Trans::NoTranspose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) -#define KOKKOSBATCHED_GEMM_T_NT_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ - Trans::Transpose, Trans::NoTranspose, BatchLayout::Left, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_T_NT_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER(Trans::Transpose, Trans::NoTranspose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) -#define KOKKOSBATCHED_GEMM_T_T_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ - Trans::Transpose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_T_T_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER(Trans::Transpose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) // Include the BLL ETI specalizations #include @@ -120,29 +102,21 @@ struct batched_gemm_eti_spec_avail { #include ///////////////// BatchLayout::Right Permutations ///////////////// -#define KOKKOSBATCHED_GEMM_NT_NT_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ - Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_NT_NT_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER(Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) -#define KOKKOSBATCHED_GEMM_NT_T_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ - Trans::NoTranspose, Trans::Transpose, BatchLayout::Right, 
SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_NT_T_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER(Trans::NoTranspose, Trans::Transpose, BatchLayout::Right, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) -#define KOKKOSBATCHED_GEMM_T_NT_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ - Trans::Transpose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_T_NT_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER(Trans::Transpose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) -#define KOKKOSBATCHED_GEMM_T_T_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ - Trans::Transpose, Trans::Transpose, BatchLayout::Right, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_T_T_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER(Trans::Transpose, Trans::Transpose, BatchLayout::Right, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) // Include the BLR ETI specalizations #include @@ -152,19 +126,15 @@ struct batched_gemm_eti_spec_avail { namespace KokkosBatched { namespace Impl { -template ::value, - bool eti_spec_avail = batched_gemm_eti_spec_avail< - ArgTransA, ArgTransB, ArgBatchSzDim, BatchedGemmHandleType, - ScalarType, AViewType, BViewType, CViewType>::value> +template ::value, + bool eti_spec_avail = batched_gemm_eti_spec_avail::value> struct BatchedGemmSpec { - static int run(BatchedGemmHandleType *const handle, const ScalarType alpha, - const AViewType &A, const BViewType &B, const ScalarType beta, - const CViewType &C) + static int run(BatchedGemmHandleType *const handle, const ScalarType alpha, const AViewType &A, const BViewType &B, + const ScalarType 
beta, const CViewType &C) #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY { #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION @@ -172,23 +142,20 @@ struct BatchedGemmSpec { printf( "KokkosBatched::BatchedGemm<> ETI specialization for < %s, %s, %s, " "%s, %s, %s, %s, %s >\n", - typeid(ArgTransA).name(), typeid(ArgTransB).name(), - typeid(ArgBatchSzDim).name(), typeid(BatchedGemmHandleType).name(), - typeid(ScalarType).name(), typeid(AViewType).name(), + typeid(ArgTransA).name(), typeid(ArgTransB).name(), typeid(ArgBatchSzDim).name(), + typeid(BatchedGemmHandleType).name(), typeid(ScalarType).name(), typeid(AViewType).name(), typeid(BViewType).name(), typeid(CViewType).name()); #else printf( "KokkosBatched::BatchedGemm<> non-ETI specialization for < %s, %s, " "%s, %s, %s, %s, %s, %s >\n", - typeid(ArgTransA).name(), typeid(ArgTransB).name(), - typeid(ArgBatchSzDim).name(), typeid(BatchedGemmHandleType).name(), - typeid(ScalarType).name(), typeid(AViewType).name(), + typeid(ArgTransA).name(), typeid(ArgTransB).name(), typeid(ArgBatchSzDim).name(), + typeid(BatchedGemmHandleType).name(), typeid(ScalarType).name(), typeid(AViewType).name(), typeid(BViewType).name(), typeid(CViewType).name()); #endif // KOKKOSKERNELS_IMPL_COMPILE_LIBRARY #endif // KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION - return KokkosBatched::Impl::BatchedGemmImpl< - ArgTransA, ArgTransB, ArgBatchSzDim, BatchedGemmHandleType, ScalarType, - AViewType, BViewType, CViewType>(handle, alpha, A, B, beta, C); + return KokkosBatched::Impl::BatchedGemmImpl(handle, alpha, A, B, beta, C); } #else ; @@ -199,92 +166,68 @@ struct BatchedGemmSpec { } // namespace KokkosBatched // ETI instantiation macros, consumed by *.cpp.in files -#define KOKKOSBATCHED_GEMM_ETI_SPEC_INST_INNER(ARG_TRANS_A, ARG_TRANS_B, \ - ARG_BATCH_LAYOUT, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct BatchedGemmSpec< \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, BatchedGemmHandle, SCALAR, \ - 
Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - false, true>; +#define KOKKOSBATCHED_GEMM_ETI_SPEC_INST_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + template struct BatchedGemmSpec, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true>; #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_ETI_SPEC_INST_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutRight, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_ETI_SPEC_INST_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutRight, \ + EXEC_SPACE, MEM_SPACE) #else -#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) +#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) #endif #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_ETI_SPEC_INST_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutLeft, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_ETI_SPEC_INST_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutLeft, \ + EXEC_SPACE, MEM_SPACE) #else -#define 
KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) +#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) #endif ///////////////// BatchLayout::Left Permutations ///////////////// -#define KOKKOSBATCHED_GEMM_NT_NT_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ - Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Left, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_NT_NT_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER(Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Left, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) -#define KOKKOSBATCHED_GEMM_NT_T_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ - Trans::NoTranspose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_NT_T_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER(Trans::NoTranspose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) -#define KOKKOSBATCHED_GEMM_T_NT_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ - Trans::Transpose, Trans::NoTranspose, BatchLayout::Left, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_T_NT_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER(Trans::Transpose, Trans::NoTranspose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) -#define KOKKOSBATCHED_GEMM_T_T_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ - Trans::Transpose, Trans::Transpose, 
BatchLayout::Left, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_T_T_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER(Trans::Transpose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) ///////////////// BatchLayout::Right Permutations ///////////////// -#define KOKKOSBATCHED_GEMM_NT_NT_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ - Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_NT_NT_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER(Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) -#define KOKKOSBATCHED_GEMM_NT_T_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ - Trans::NoTranspose, Trans::Transpose, BatchLayout::Right, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_NT_T_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER(Trans::NoTranspose, Trans::Transpose, BatchLayout::Right, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) -#define KOKKOSBATCHED_GEMM_T_NT_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ - Trans::Transpose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_T_NT_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER(Trans::Transpose, Trans::NoTranspose, BatchLayout::Right, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) -#define KOKKOSBATCHED_GEMM_T_T_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ - Trans::Transpose, 
Trans::Transpose, BatchLayout::Right, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_T_T_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER(Trans::Transpose, Trans::Transpose, BatchLayout::Right, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) #endif // __KOKKOSBATCHED_HOSTLEVEL_GEMM_SPEC_HPP__ diff --git a/batched/dense/impl/KokkosBatched_Householder_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Householder_Serial_Impl.hpp index 4a3e26685b..c8f5c7a20e 100644 --- a/batched/dense/impl/KokkosBatched_Householder_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Householder_Serial_Impl.hpp @@ -29,11 +29,9 @@ namespace KokkosBatched { template <> template -KOKKOS_INLINE_FUNCTION int SerialHouseholder::invoke( - const aViewType &a, const tauViewType &tau) { - return SerialLeftHouseholderInternal::invoke(a.extent(0) - 1, a.data(), - a.data() + a.stride(0), - a.stride(0), tau.data()); +KOKKOS_INLINE_FUNCTION int SerialHouseholder::invoke(const aViewType &a, const tauViewType &tau) { + return SerialLeftHouseholderInternal::invoke(a.extent(0) - 1, a.data(), a.data() + a.stride(0), a.stride(0), + tau.data()); } } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_Householder_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Householder_Serial_Internal.hpp index 05654a2f37..0257ff4d9b 100644 --- a/batched/dense/impl/KokkosBatched_Householder_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Householder_Serial_Internal.hpp @@ -61,8 +61,7 @@ struct SerialLeftHouseholderInternal { const mag_type norm_chi1 = Kokkos::ArithTraits::abs(*chi1); /// compute 2 norm of x using norm_chi1 and norm_x2 - const mag_type norm_x = Kokkos::ArithTraits::sqrt( - norm_x2_square + norm_chi1 * norm_chi1); + const mag_type norm_x = Kokkos::ArithTraits::sqrt(norm_x2_square + norm_chi1 * norm_chi1); /// compute alpha const mag_type alpha = (*chi1 < 0 ? 
one : minus_one) * norm_x; @@ -76,9 +75,8 @@ struct SerialLeftHouseholderInternal { // SerialScaleInternal::invoke(m_x2, inv_chi1_minus_alpha, x2, x2s); /// compute tau - const mag_type chi1_minus_alpha_square = - chi1_minus_alpha * chi1_minus_alpha; - *tau = half + half * (norm_x2_square / chi1_minus_alpha_square); + const mag_type chi1_minus_alpha_square = chi1_minus_alpha * chi1_minus_alpha; + *tau = half + half * (norm_x2_square / chi1_minus_alpha_square); /// overwrite chi1 with alpha *chi1 = alpha; diff --git a/batched/dense/impl/KokkosBatched_Householder_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_Householder_TeamVector_Impl.hpp index 955e1a72b8..bc55a646bc 100644 --- a/batched/dense/impl/KokkosBatched_Householder_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Householder_TeamVector_Impl.hpp @@ -29,11 +29,10 @@ namespace KokkosBatched { template template -KOKKOS_INLINE_FUNCTION int TeamVectorHouseholder::invoke( - const MemberType &member, const aViewType &a, const tauViewType &tau) { - return TeamVectorLeftHouseholderInternal::invoke( - member, a.extent(0) - 1, a.data(), a.data() + a.stride(0), a.stride(0), - tau.data()); +KOKKOS_INLINE_FUNCTION int TeamVectorHouseholder::invoke(const MemberType &member, const aViewType &a, + const tauViewType &tau) { + return TeamVectorLeftHouseholderInternal::invoke(member, a.extent(0) - 1, a.data(), a.data() + a.stride(0), + a.stride(0), tau.data()); } } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_Householder_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_Householder_TeamVector_Internal.hpp index 64fe24fa31..1074dc4280 100644 --- a/batched/dense/impl/KokkosBatched_Householder_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Householder_TeamVector_Internal.hpp @@ -30,8 +30,7 @@ namespace KokkosBatched { /// struct TeamVectorLeftHouseholderInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m_x2, + 
KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m_x2, /* */ ValueType *chi1, /* */ ValueType *x2, const int x2s, /* */ ValueType *tau) { @@ -67,8 +66,7 @@ struct TeamVectorLeftHouseholderInternal { const mag_type norm_chi1 = Kokkos::ArithTraits::abs(*chi1); /// compute 2 norm of x using norm_chi1 and norm_x2 - const mag_type norm_x = Kokkos::ArithTraits::sqrt( - norm_x2_square + norm_chi1 * norm_chi1); + const mag_type norm_x = Kokkos::ArithTraits::sqrt(norm_x2_square + norm_chi1 * norm_chi1); /// compute alpha const mag_type alpha = (*chi1 < 0 ? one : minus_one) * norm_x; @@ -76,9 +74,8 @@ struct TeamVectorLeftHouseholderInternal { /// overwrite x2 with u2 const value_type chi1_minus_alpha = *chi1 - alpha; const value_type inv_chi1_minus_alpha = one / chi1_minus_alpha; - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, m_x2), - [&](const int &i) { x2[i * x2s] *= inv_chi1_minus_alpha; }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m_x2), + [&](const int &i) { x2[i * x2s] *= inv_chi1_minus_alpha; }); member.team_barrier(); // later consider to use the following @@ -86,9 +83,8 @@ struct TeamVectorLeftHouseholderInternal { /// compute tau Kokkos::single(Kokkos::PerTeam(member), [&]() { - const mag_type chi1_minus_alpha_square = - chi1_minus_alpha * chi1_minus_alpha; - *tau = half + half * (norm_x2_square / chi1_minus_alpha_square); + const mag_type chi1_minus_alpha_square = chi1_minus_alpha * chi1_minus_alpha; + *tau = half + half * (norm_x2_square / chi1_minus_alpha_square); /// overwrite chi1 with alpha *chi1 = alpha; diff --git a/batched/dense/impl/KokkosBatched_InnerGemmFixA_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_InnerGemmFixA_Serial_Impl.hpp index d59f9e0c0b..eb576f1dff 100644 --- a/batched/dense/impl/KokkosBatched_InnerGemmFixA_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_InnerGemmFixA_Serial_Impl.hpp @@ -29,31 +29,26 @@ namespace KokkosBatched { template <> template -KOKKOS_INLINE_FUNCTION 
int InnerGemmFixA<5, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1], - a_04 = A[0 * _as0 + 4 * _as1], a_10 = A[1 * _as0 + 0 * _as1], - a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], - a_13 = A[1 * _as0 + 3 * _as1], a_14 = A[1 * _as0 + 4 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], - a_22 = A[2 * _as0 + 2 * _as1], a_23 = A[2 * _as0 + 3 * _as1], - a_24 = A[2 * _as0 + 4 * _as1], a_30 = A[3 * _as0 + 0 * _as1], - a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1], - a_33 = A[3 * _as0 + 3 * _as1], a_34 = A[3 * _as0 + 4 * _as1], - a_40 = A[4 * _as0 + 0 * _as1], a_41 = A[4 * _as0 + 1 * _as1], - a_42 = A[4 * _as0 + 2 * _as1], a_43 = A[4 * _as0 + 3 * _as1], + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_03 = A[0 * _as0 + 3 * _as1], a_04 = A[0 * _as0 + 4 * _as1], a_10 = A[1 * _as0 + 0 * _as1], + a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], + a_14 = A[1 * _as0 + 4 * _as1], a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], + a_22 = A[2 * _as0 + 2 * _as1], a_23 = A[2 * _as0 + 3 * _as1], a_24 = A[2 * _as0 + 4 * _as1], + a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1], + a_33 = A[3 * _as0 + 3 * _as1], a_34 = A[3 * _as0 + 4 * _as1], a_40 = A[4 * _as0 + 0 * _as1], + a_41 = A[4 * _as0 + 1 * _as1], a_42 = A[4 * _as0 + 2 * _as1], a_43 = A[4 * _as0 + 3 * _as1], a_44 = A[4 
* _as0 + 4 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p, b_2p, c_2p, b_3p, c_3p, b_4p, c_4p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, - ib4 = 4 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 * _cs0, - ic3 = 3 * _cs0, ic4 = 4 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, ib4 = 4 * _bs0, ic0 = 0 * _cs0, + ic1 = 1 * _cs0, ic2 = 2 * _cs0, ic3 = 3 * _cs0, ic4 = 4 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -100,29 +95,25 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1], - a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], - a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], - a_22 = A[2 * _as0 + 2 * _as1], a_23 = A[2 * _as0 + 3 * _as1], - a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], - a_32 = A[3 * _as0 + 2 * _as1], a_33 = A[3 * _as0 + 3 * _as1], - a_40 = A[4 * _as0 + 0 * _as1], a_41 = A[4 * _as0 + 1 * _as1], + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_03 = A[0 * _as0 + 3 * _as1], a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], + a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], a_20 = A[2 * _as0 + 0 * _as1], + a_21 = A[2 * _as0 + 1 * _as1], a_22 = A[2 * _as0 + 
2 * _as1], a_23 = A[2 * _as0 + 3 * _as1], + a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1], + a_33 = A[3 * _as0 + 3 * _as1], a_40 = A[4 * _as0 + 0 * _as1], a_41 = A[4 * _as0 + 1 * _as1], a_42 = A[4 * _as0 + 2 * _as1], a_43 = A[4 * _as0 + 3 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p, b_2p, c_2p, b_3p, c_3p, /**/ c_4p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, - ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 * _cs0, ic3 = 3 * _cs0, - ic4 = 4 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, + ic2 = 2 * _cs0, ic3 = 3 * _cs0, ic4 = 4 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -163,27 +154,24 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_10 = A[1 * _as0 + 0 * _as1], - a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], - a_22 = A[2 * _as0 + 2 * _as1], a_30 = A[3 * _as0 + 0 * _as1], - a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1], - a_40 = A[4 * _as0 + 0 * _as1], a_41 = A[4 * _as0 + 1 * _as1], - a_42 = A[4 * _as0 + 2 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], 
+ a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], a_22 = A[2 * _as0 + 2 * _as1], + a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1], + a_40 = A[4 * _as0 + 0 * _as1], a_41 = A[4 * _as0 + 1 * _as1], a_42 = A[4 * _as0 + 2 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p, b_2p, c_2p, /**/ c_3p, /**/ c_4p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ic0 = 0 * _cs0, - ic1 = 1 * _cs0, ic2 = 2 * _cs0, ic3 = 3 * _cs0, ic4 = 4 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 * _cs0, + ic3 = 3 * _cs0, ic4 = 4 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -217,25 +205,24 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 3>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], - a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], - a_40 = A[4 * _as0 + 0 * _as1], a_41 = A[4 * _as0 + 1 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_10 = A[1 * _as0 + 0 * _as1], + a_11 = A[1 * _as0 + 1 * _as1], a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], + a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], a_40 = A[4 * _as0 + 0 * _as1], + a_41 = A[4 * _as0 + 1 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p, /**/ c_2p, /**/ c_3p, /**/ c_4p; - const 
int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, - ic2 = 2 * _cs0, ic3 = 3 * _cs0, ic4 = 4 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 * _cs0, ic3 = 3 * _cs0, + ic4 = 4 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -263,15 +250,14 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 2>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_10 = A[1 * _as0 + 0 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_30 = A[3 * _as0 + 0 * _as1], - a_40 = A[4 * _as0 + 0 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_10 = A[1 * _as0 + 0 * _as1], a_20 = A[2 * _as0 + 0 * _as1], + a_30 = A[3 * _as0 + 0 * _as1], a_40 = A[4 * _as0 + 0 * _as1]; ValueType b_0p, c_0p, /**/ c_1p, @@ -279,8 +265,7 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 1>::serial_invoke( /**/ c_3p, /**/ c_4p; - const int ib0 = 0 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 * _cs0, - ic3 = 3 * _cs0, ic4 = 4 * _cs0; + const int ib0 = 0 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 * _cs0, ic3 = 3 * _cs0, ic4 = 4 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -303,28 +288,24 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 
5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1], - a_04 = A[0 * _as0 + 4 * _as1], a_10 = A[1 * _as0 + 0 * _as1], - a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], - a_13 = A[1 * _as0 + 3 * _as1], a_14 = A[1 * _as0 + 4 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], - a_22 = A[2 * _as0 + 2 * _as1], a_23 = A[2 * _as0 + 3 * _as1], - a_24 = A[2 * _as0 + 4 * _as1], a_30 = A[3 * _as0 + 0 * _as1], - a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1], + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_03 = A[0 * _as0 + 3 * _as1], a_04 = A[0 * _as0 + 4 * _as1], a_10 = A[1 * _as0 + 0 * _as1], + a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], + a_14 = A[1 * _as0 + 4 * _as1], a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], + a_22 = A[2 * _as0 + 2 * _as1], a_23 = A[2 * _as0 + 3 * _as1], a_24 = A[2 * _as0 + 4 * _as1], + a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1], a_33 = A[3 * _as0 + 3 * _as1], a_34 = A[3 * _as0 + 4 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p, b_2p, c_2p, b_3p, c_3p, b_4p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, - ib4 = 4 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 * _cs0, - ic3 = 3 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, ib4 = 4 * _bs0, ic0 = 0 * _cs0, + ic1 = 1 * _cs0, ic2 = 2 * _cs0, ic3 = 3 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -365,25 +346,22 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION 
int InnerGemmFixA<3, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1], - a_04 = A[0 * _as0 + 4 * _as1], a_10 = A[1 * _as0 + 0 * _as1], - a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], - a_13 = A[1 * _as0 + 3 * _as1], a_14 = A[1 * _as0 + 4 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], - a_22 = A[2 * _as0 + 2 * _as1], a_23 = A[2 * _as0 + 3 * _as1], - a_24 = A[2 * _as0 + 4 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_03 = A[0 * _as0 + 3 * _as1], a_04 = A[0 * _as0 + 4 * _as1], a_10 = A[1 * _as0 + 0 * _as1], + a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], + a_14 = A[1 * _as0 + 4 * _as1], a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], + a_22 = A[2 * _as0 + 2 * _as1], a_23 = A[2 * _as0 + 3 * _as1], a_24 = A[2 * _as0 + 4 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p, b_2p, c_2p, b_3p, b_4p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, - ib4 = 4 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, ib4 = 4 * _bs0, ic0 = 0 * _cs0, + ic1 = 1 * _cs0, ic2 = 2 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -417,22 +395,21 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 5>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 5>::serial_invoke( - const 
ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1], - a_04 = A[0 * _as0 + 4 * _as1], a_10 = A[1 * _as0 + 0 * _as1], - a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], - a_13 = A[1 * _as0 + 3 * _as1], a_14 = A[1 * _as0 + 4 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_03 = A[0 * _as0 + 3 * _as1], a_04 = A[0 * _as0 + 4 * _as1], a_10 = A[1 * _as0 + 0 * _as1], + a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], + a_14 = A[1 * _as0 + 4 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p, b_2p, b_3p, b_4p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, - ib4 = 4 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, ib4 = 4 * _bs0, ic0 = 0 * _cs0, + ic1 = 1 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -460,20 +437,18 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 5>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<1, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<1, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = 
A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1], - a_04 = A[0 * _as0 + 4 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_03 = A[0 * _as0 + 3 * _as1], a_04 = A[0 * _as0 + 4 * _as1]; ValueType b_0p, c_0p, b_1p, b_2p, b_3p, b_4p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, - ib4 = 4 * _bs0, ic0 = 0 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, ib4 = 4 * _bs0, ic0 = 0 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -496,10 +471,11 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<1, 5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int n, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0 || n <= 0 || k <= 0) return 0; switch (m * 10 + k) { @@ -548,12 +524,10 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 5>::serial_invoke( InnerGemmFixA<2, 2> inner(_as0, _as1, _bs0, _bs1, _cs0, _cs1); for (int i = 0; i < m; i += 2) for (int p = 0; p < k; p += 2) - inner.serial_invoke(alpha, A + i * _as0 + p * _as1, B + p * _bs0, - (i + 2 > m ? 1 : 2), n, (p + 2 > k ? 1 : 2), - C + i * _cs0); + inner.serial_invoke(alpha, A + i * _as0 + p * _as1, B + p * _bs0, (i + 2 > m ? 1 : 2), n, + (p + 2 > k ? 
1 : 2), C + i * _cs0); } else { - Kokkos::abort( - "InnerGemmFixA<5,5>::serial_invoke, assert failure (m<5 && n<5)"); + Kokkos::abort("InnerGemmFixA<5,5>::serial_invoke, assert failure (m<5 && n<5)"); } break; } @@ -568,25 +542,23 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1], - a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], - a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], - a_22 = A[2 * _as0 + 2 * _as1], a_23 = A[2 * _as0 + 3 * _as1], - a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], - a_32 = A[3 * _as0 + 2 * _as1], a_33 = A[3 * _as0 + 3 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_03 = A[0 * _as0 + 3 * _as1], a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], + a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], a_20 = A[2 * _as0 + 0 * _as1], + a_21 = A[2 * _as0 + 1 * _as1], a_22 = A[2 * _as0 + 2 * _as1], a_23 = A[2 * _as0 + 3 * _as1], + a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1], + a_33 = A[3 * _as0 + 3 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p, b_2p, c_2p, b_3p, c_3p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, - ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 
* _cs0, ic3 = 3 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, + ic2 = 2 * _cs0, ic3 = 3 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -622,24 +594,22 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_10 = A[1 * _as0 + 0 * _as1], - a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], - a_22 = A[2 * _as0 + 2 * _as1], a_30 = A[3 * _as0 + 0 * _as1], - a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], + a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], a_22 = A[2 * _as0 + 2 * _as1], + a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p, b_2p, c_2p, /**/ c_3p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ic0 = 0 * _cs0, - ic1 = 1 * _cs0, ic2 = 2 * _cs0, ic3 = 3 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 * _cs0, + ic3 = 3 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -670,23 +640,21 @@ KOKKOS_INLINE_FUNCTION int 
InnerGemmFixA<4, 3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_10 = A[1 * _as0 + 0 * _as1], + a_11 = A[1 * _as0 + 1 * _as1], a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p, /**/ c_2p, /**/ c_3p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, - ic2 = 2 * _cs0, ic3 = 3 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 * _cs0, ic3 = 3 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -712,22 +680,21 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_10 = A[1 * _as0 + 0 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_30 = A[3 
* _as0 + 0 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_10 = A[1 * _as0 + 0 * _as1], a_20 = A[2 * _as0 + 0 * _as1], + a_30 = A[3 * _as0 + 0 * _as1]; ValueType b_0p, c_0p, /**/ c_1p, /**/ c_2p, /**/ c_3p; - const int ib0 = 0 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 * _cs0, - ic3 = 3 * _cs0; + const int ib0 = 0 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 * _cs0, ic3 = 3 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -748,23 +715,21 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1], - a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], - a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], - a_22 = A[2 * _as0 + 2 * _as1], a_23 = A[2 * _as0 + 3 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_03 = A[0 * _as0 + 3 * _as1], a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], + a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], a_20 = A[2 * _as0 + 0 * _as1], + a_21 = A[2 * _as0 + 1 * _as1], a_22 = A[2 * _as0 + 2 * _as1], a_23 = A[2 * _as0 + 3 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p, b_2p, c_2p, b_3p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, - ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 * _cs0; + 
const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, + ic2 = 2 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -795,21 +760,19 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1], - a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_03 = A[0 * _as0 + 3 * _as1], a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p, b_2p, b_3p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, - ic0 = 0 * _cs0, ic1 = 1 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -835,19 +798,18 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<1, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<1, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType 
*KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_03 = A[0 * _as0 + 3 * _as1]; ValueType b_0p, c_0p, b_1p, b_2p, b_3p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, - ic0 = 0 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, ic0 = 0 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -868,10 +830,11 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<1, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int n, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0 || n <= 0 || k <= 0) return 0; switch (m * 10 + k) { @@ -915,12 +878,10 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 4>::serial_invoke( InnerGemmFixA<2, 2> inner(_as0, _as1, _bs0, _bs1, _cs0, _cs1); for (int i = 0; i < m; i += 2) for (int p = 0; p < k; p += 2) - inner.serial_invoke(alpha, A + i * _as0 + p * _as1, B + p * _bs0, - (i + 2 > m ? 1 : 2), n, (p + 2 > k ? 1 : 2), - C + i * _cs0); + inner.serial_invoke(alpha, A + i * _as0 + p * _as1, B + p * _bs0, (i + 2 > m ? 1 : 2), n, + (p + 2 > k ? 
1 : 2), C + i * _cs0); } else { - Kokkos::abort( - "InnerGemmFixA<4,4>::serial_invoke, assert failure (m<4 && n<4)"); + Kokkos::abort("InnerGemmFixA<4,4>::serial_invoke, assert failure (m<4 && n<4)"); } break; } @@ -935,22 +896,19 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_10 = A[1 * _as0 + 0 * _as1], - a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], - a_22 = A[2 * _as0 + 2 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], + a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], a_22 = A[2 * _as0 + 2 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p, b_2p, c_2p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ic0 = 0 * _cs0, - ic1 = 1 * _cs0, ic2 = 2 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -977,21 +935,19 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType 
*KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_10 = A[1 * _as0 + 0 * _as1], + a_11 = A[1 * _as0 + 1 * _as1], a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p, /**/ c_2p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, - ic2 = 2 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -1014,14 +970,13 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_10 = A[1 * _as0 + 0 * _as1], - a_20 = A[2 * _as0 + 0 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_10 = A[1 * _as0 + 0 * _as1], a_20 = A[2 * _as0 + 0 * _as1]; ValueType b_0p, c_0p, /**/ c_1p, @@ -1046,20 +1001,18 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 3>::serial_invoke( - const ScalarType alpha, const ValueType 
*KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_10 = A[1 * _as0 + 0 * _as1], - a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p, b_2p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ic0 = 0 * _cs0, - ic1 = 1 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -1081,14 +1034,13 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 3>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<1, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<1, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1]; ValueType b_0p, c_0p, b_1p, b_2p; @@ -1111,10 +1063,11 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<1, 3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int 
InnerGemmFixA<3, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int n, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0 || n <= 0 || k <= 0) return 0; switch (m * 10 + k) { @@ -1148,12 +1101,10 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 3>::serial_invoke( InnerGemmFixA<2, 2> inner(_as0, _as1, _bs0, _bs1, _cs0, _cs1); for (int i = 0; i < m; i += 2) for (int p = 0; p < k; p += 2) - inner.serial_invoke(alpha, A + i * _as0 + p * _as1, B + p * _bs0, - (i + 2 > m ? 1 : 2), n, (p + 2 > k ? 1 : 2), - C + i * _cs0); + inner.serial_invoke(alpha, A + i * _as0 + p * _as1, B + p * _bs0, (i + 2 > m ? 1 : 2), n, + (p + 2 > k ? 1 : 2), C + i * _cs0); } else { - Kokkos::abort( - "InnerGemmFixA<3,3>::serial_invoke, assert failure (m<3 && n<3)"); + Kokkos::abort("InnerGemmFixA<3,3>::serial_invoke, assert failure (m<3 && n<3)"); } break; } @@ -1168,14 +1119,14 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_10 = A[1 * _as0 + 0 * _as1], 
+ a_11 = A[1 * _as0 + 1 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p; @@ -1199,10 +1150,10 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_10 = A[1 * _as0 + 0 * _as1]; @@ -1227,10 +1178,10 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<1, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<1, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1]; @@ -1254,10 +1205,11 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<1, 2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int n, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0 || n <= 0 || k <= 0) return 0; switch (m * 10 + k) 
{ @@ -1282,8 +1234,7 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 2>::serial_invoke( break; } default: { - Kokkos::abort( - "InnerGemmFixA<2,2>::serial_invoke, assert failure (m<2 && n<2)"); + Kokkos::abort("InnerGemmFixA<2,2>::serial_invoke, assert failure (m<2 && n<2)"); break; } } @@ -1297,10 +1248,10 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<1, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<1, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; const ValueType a_00 = A[0 * _as0 + 0 * _as1]; diff --git a/batched/dense/impl/KokkosBatched_InnerGemmFixB_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_InnerGemmFixB_Serial_Impl.hpp index a725bf5b45..6912c285a6 100644 --- a/batched/dense/impl/KokkosBatched_InnerGemmFixB_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_InnerGemmFixB_Serial_Impl.hpp @@ -29,31 +29,26 @@ namespace KokkosBatched { template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_02 = B[0 * _bs0 + 2 * _bs1], b_03 = B[0 * _bs0 + 3 * _bs1], - b_04 = B[0 * _bs0 + 4 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], - b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * 
_bs1], - b_13 = B[1 * _bs0 + 3 * _bs1], b_14 = B[1 * _bs0 + 4 * _bs1], - b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], - b_22 = B[2 * _bs0 + 2 * _bs1], b_23 = B[2 * _bs0 + 3 * _bs1], - b_24 = B[2 * _bs0 + 4 * _bs1], b_30 = B[3 * _bs0 + 0 * _bs1], - b_31 = B[3 * _bs0 + 1 * _bs1], b_32 = B[3 * _bs0 + 2 * _bs1], - b_33 = B[3 * _bs0 + 3 * _bs1], b_34 = B[3 * _bs0 + 4 * _bs1], - b_40 = B[4 * _bs0 + 0 * _bs1], b_41 = B[4 * _bs0 + 1 * _bs1], - b_42 = B[4 * _bs0 + 2 * _bs1], b_43 = B[4 * _bs0 + 3 * _bs1], + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_02 = B[0 * _bs0 + 2 * _bs1], + b_03 = B[0 * _bs0 + 3 * _bs1], b_04 = B[0 * _bs0 + 4 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], + b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1], b_13 = B[1 * _bs0 + 3 * _bs1], + b_14 = B[1 * _bs0 + 4 * _bs1], b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], + b_22 = B[2 * _bs0 + 2 * _bs1], b_23 = B[2 * _bs0 + 3 * _bs1], b_24 = B[2 * _bs0 + 4 * _bs1], + b_30 = B[3 * _bs0 + 0 * _bs1], b_31 = B[3 * _bs0 + 1 * _bs1], b_32 = B[3 * _bs0 + 2 * _bs1], + b_33 = B[3 * _bs0 + 3 * _bs1], b_34 = B[3 * _bs0 + 4 * _bs1], b_40 = B[4 * _bs0 + 0 * _bs1], + b_41 = B[4 * _bs0 + 1 * _bs1], b_42 = B[4 * _bs0 + 2 * _bs1], b_43 = B[4 * _bs0 + 3 * _bs1], b_44 = B[4 * _bs0 + 4 * _bs1]; ValueType a_p0, a_p1, a_p2, a_p3, a_p4, c_p0, c_p1, c_p2, c_p3, c_p4; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, - ja4 = 4 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, jc2 = 2 * _cs1, - jc3 = 3 * _cs1, jc4 = 4 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, ja4 = 4 * _as1, jc0 = 0 * _cs1, + jc1 = 1 * _cs1, jc2 = 2 * _cs1, jc3 = 3 * _cs1, jc4 = 4 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -100,28 +95,24 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 4>::serial_invoke( - const ScalarType alpha, 
const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_02 = B[0 * _bs0 + 2 * _bs1], b_03 = B[0 * _bs0 + 3 * _bs1], - b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1], - b_12 = B[1 * _bs0 + 2 * _bs1], b_13 = B[1 * _bs0 + 3 * _bs1], - b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], - b_22 = B[2 * _bs0 + 2 * _bs1], b_23 = B[2 * _bs0 + 3 * _bs1], - b_30 = B[3 * _bs0 + 0 * _bs1], b_31 = B[3 * _bs0 + 1 * _bs1], - b_32 = B[3 * _bs0 + 2 * _bs1], b_33 = B[3 * _bs0 + 3 * _bs1], - b_40 = B[4 * _bs0 + 0 * _bs1], b_41 = B[4 * _bs0 + 1 * _bs1], + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_02 = B[0 * _bs0 + 2 * _bs1], + b_03 = B[0 * _bs0 + 3 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1], + b_12 = B[1 * _bs0 + 2 * _bs1], b_13 = B[1 * _bs0 + 3 * _bs1], b_20 = B[2 * _bs0 + 0 * _bs1], + b_21 = B[2 * _bs0 + 1 * _bs1], b_22 = B[2 * _bs0 + 2 * _bs1], b_23 = B[2 * _bs0 + 3 * _bs1], + b_30 = B[3 * _bs0 + 0 * _bs1], b_31 = B[3 * _bs0 + 1 * _bs1], b_32 = B[3 * _bs0 + 2 * _bs1], + b_33 = B[3 * _bs0 + 3 * _bs1], b_40 = B[4 * _bs0 + 0 * _bs1], b_41 = B[4 * _bs0 + 1 * _bs1], b_42 = B[4 * _bs0 + 2 * _bs1], b_43 = B[4 * _bs0 + 3 * _bs1]; ValueType a_p0, a_p1, a_p2, a_p3, a_p4, c_p0, c_p1, c_p2, c_p3; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, - ja4 = 4 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, jc2 = 2 * _cs1, - jc3 = 3 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, ja4 = 4 * _as1, jc0 = 0 * _cs1, + jc1 = 1 * _cs1, jc2 = 2 * _cs1, jc3 = 3 * _cs1; for (int p = 
0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -162,25 +153,22 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_02 = B[0 * _bs0 + 2 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], - b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1], - b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], - b_22 = B[2 * _bs0 + 2 * _bs1], b_30 = B[3 * _bs0 + 0 * _bs1], - b_31 = B[3 * _bs0 + 1 * _bs1], b_32 = B[3 * _bs0 + 2 * _bs1], - b_40 = B[4 * _bs0 + 0 * _bs1], b_41 = B[4 * _bs0 + 1 * _bs1], - b_42 = B[4 * _bs0 + 2 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_02 = B[0 * _bs0 + 2 * _bs1], + b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1], + b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], b_22 = B[2 * _bs0 + 2 * _bs1], + b_30 = B[3 * _bs0 + 0 * _bs1], b_31 = B[3 * _bs0 + 1 * _bs1], b_32 = B[3 * _bs0 + 2 * _bs1], + b_40 = B[4 * _bs0 + 0 * _bs1], b_41 = B[4 * _bs0 + 1 * _bs1], b_42 = B[4 * _bs0 + 2 * _bs1]; ValueType a_p0, a_p1, a_p2, a_p3, a_p4, c_p0, c_p1, c_p2; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, - ja4 = 4 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, jc2 = 2 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, ja4 = 4 * _as1, jc0 = 0 * _cs1, + jc1 = 1 * _cs1, jc2 = 2 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -214,22 
+202,21 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 3>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1], - b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], - b_30 = B[3 * _bs0 + 0 * _bs1], b_31 = B[3 * _bs0 + 1 * _bs1], - b_40 = B[4 * _bs0 + 0 * _bs1], b_41 = B[4 * _bs0 + 1 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], + b_11 = B[1 * _bs0 + 1 * _bs1], b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], + b_30 = B[3 * _bs0 + 0 * _bs1], b_31 = B[3 * _bs0 + 1 * _bs1], b_40 = B[4 * _bs0 + 0 * _bs1], + b_41 = B[4 * _bs0 + 1 * _bs1]; ValueType a_p0, a_p1, a_p2, a_p3, a_p4, c_p0, c_p1; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, - ja4 = 4 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, ja4 = 4 * _as1, jc0 = 0 * _cs1, + jc1 = 1 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -257,20 +244,18 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 2>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 1>::serial_invoke(const ScalarType alpha, + const 
ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], - b_20 = B[2 * _bs0 + 0 * _bs1], b_30 = B[3 * _bs0 + 0 * _bs1], - b_40 = B[4 * _bs0 + 0 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], b_20 = B[2 * _bs0 + 0 * _bs1], + b_30 = B[3 * _bs0 + 0 * _bs1], b_40 = B[4 * _bs0 + 0 * _bs1]; ValueType a_p0, a_p1, a_p2, a_p3, a_p4, c_p0; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, - ja4 = 4 * _as1, jc0 = 0 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, ja4 = 4 * _as1, jc0 = 0 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -293,28 +278,24 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_02 = B[0 * _bs0 + 2 * _bs1], b_03 = B[0 * _bs0 + 3 * _bs1], - b_04 = B[0 * _bs0 + 4 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], - b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1], - b_13 = B[1 * _bs0 + 3 * _bs1], b_14 = B[1 * _bs0 + 4 * _bs1], - b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], - b_22 = B[2 * _bs0 + 2 * _bs1], b_23 = B[2 * _bs0 + 3 * _bs1], - b_24 = B[2 * _bs0 + 4 * _bs1], b_30 = B[3 * _bs0 + 0 * _bs1], - b_31 = B[3 * _bs0 + 1 * _bs1], b_32 = B[3 * _bs0 + 2 * _bs1], + const ValueType b_00 = B[0 * 
_bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_02 = B[0 * _bs0 + 2 * _bs1], + b_03 = B[0 * _bs0 + 3 * _bs1], b_04 = B[0 * _bs0 + 4 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], + b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1], b_13 = B[1 * _bs0 + 3 * _bs1], + b_14 = B[1 * _bs0 + 4 * _bs1], b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], + b_22 = B[2 * _bs0 + 2 * _bs1], b_23 = B[2 * _bs0 + 3 * _bs1], b_24 = B[2 * _bs0 + 4 * _bs1], + b_30 = B[3 * _bs0 + 0 * _bs1], b_31 = B[3 * _bs0 + 1 * _bs1], b_32 = B[3 * _bs0 + 2 * _bs1], b_33 = B[3 * _bs0 + 3 * _bs1], b_34 = B[3 * _bs0 + 4 * _bs1]; ValueType a_p0, a_p1, a_p2, a_p3, c_p0, c_p1, c_p2, c_p3, c_p4; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, - jc0 = 0 * _cs1, jc1 = 1 * _cs1, jc2 = 2 * _cs1, jc3 = 3 * _cs1, - jc4 = 4 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, + jc2 = 2 * _cs1, jc3 = 3 * _cs1, jc4 = 4 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -355,25 +336,22 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_02 = B[0 * _bs0 + 2 * _bs1], b_03 = B[0 * _bs0 + 3 * _bs1], - b_04 = B[0 * _bs0 + 4 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], - b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1], - b_13 = B[1 * _bs0 + 3 * _bs1], b_14 = B[1 * _bs0 + 4 * _bs1], - b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * 
_bs0 + 1 * _bs1], - b_22 = B[2 * _bs0 + 2 * _bs1], b_23 = B[2 * _bs0 + 3 * _bs1], - b_24 = B[2 * _bs0 + 4 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_02 = B[0 * _bs0 + 2 * _bs1], + b_03 = B[0 * _bs0 + 3 * _bs1], b_04 = B[0 * _bs0 + 4 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], + b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1], b_13 = B[1 * _bs0 + 3 * _bs1], + b_14 = B[1 * _bs0 + 4 * _bs1], b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], + b_22 = B[2 * _bs0 + 2 * _bs1], b_23 = B[2 * _bs0 + 3 * _bs1], b_24 = B[2 * _bs0 + 4 * _bs1]; ValueType a_p0, a_p1, a_p2, c_p0, c_p1, c_p2, c_p3, c_p4; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, jc0 = 0 * _cs1, - jc1 = 1 * _cs1, jc2 = 2 * _cs1, jc3 = 3 * _cs1, jc4 = 4 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, jc2 = 2 * _cs1, + jc3 = 3 * _cs1, jc4 = 4 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -407,22 +385,21 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 5>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_02 = B[0 * _bs0 + 2 * _bs1], b_03 = B[0 * _bs0 + 3 * _bs1], - b_04 = B[0 * _bs0 + 4 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], - b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1], - b_13 = B[1 * _bs0 + 3 * _bs1], b_14 = B[1 * _bs0 + 4 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_02 = B[0 * _bs0 
+ 2 * _bs1], + b_03 = B[0 * _bs0 + 3 * _bs1], b_04 = B[0 * _bs0 + 4 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], + b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1], b_13 = B[1 * _bs0 + 3 * _bs1], + b_14 = B[1 * _bs0 + 4 * _bs1]; ValueType a_p0, a_p1, c_p0, c_p1, c_p2, c_p3, c_p4; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, - jc2 = 2 * _cs1, jc3 = 3 * _cs1, jc4 = 4 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, jc2 = 2 * _cs1, jc3 = 3 * _cs1, + jc4 = 4 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -450,20 +427,18 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 5>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<1, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<1, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_02 = B[0 * _bs0 + 2 * _bs1], b_03 = B[0 * _bs0 + 3 * _bs1], - b_04 = B[0 * _bs0 + 4 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_02 = B[0 * _bs0 + 2 * _bs1], + b_03 = B[0 * _bs0 + 3 * _bs1], b_04 = B[0 * _bs0 + 4 * _bs1]; ValueType a_p0, c_p0, c_p1, c_p2, c_p3, c_p4; - const int ja0 = 0 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, jc2 = 2 * _cs1, - jc3 = 3 * _cs1, jc4 = 4 * _cs1; + const int ja0 = 0 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, jc2 = 2 * _cs1, jc3 = 3 * _cs1, jc4 = 4 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -486,10 +461,11 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<1, 5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 5>::serial_invoke( 
- const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int n, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0 || n <= 0 || k <= 0) return 0; switch (k * 10 + n) { @@ -544,25 +520,23 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_02 = B[0 * _bs0 + 2 * _bs1], b_03 = B[0 * _bs0 + 3 * _bs1], - b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1], - b_12 = B[1 * _bs0 + 2 * _bs1], b_13 = B[1 * _bs0 + 3 * _bs1], - b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], - b_22 = B[2 * _bs0 + 2 * _bs1], b_23 = B[2 * _bs0 + 3 * _bs1], - b_30 = B[3 * _bs0 + 0 * _bs1], b_31 = B[3 * _bs0 + 1 * _bs1], - b_32 = B[3 * _bs0 + 2 * _bs1], b_33 = B[3 * _bs0 + 3 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_02 = B[0 * _bs0 + 2 * _bs1], + b_03 = B[0 * _bs0 + 3 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1], + b_12 = B[1 * _bs0 + 2 * _bs1], b_13 = B[1 * _bs0 + 3 * _bs1], b_20 = B[2 * _bs0 + 0 * _bs1], + b_21 = B[2 * _bs0 + 1 * _bs1], b_22 = B[2 * _bs0 + 2 * _bs1], b_23 = B[2 * _bs0 + 3 * _bs1], + b_30 = B[3 * _bs0 + 0 * 
_bs1], b_31 = B[3 * _bs0 + 1 * _bs1], b_32 = B[3 * _bs0 + 2 * _bs1], + b_33 = B[3 * _bs0 + 3 * _bs1]; ValueType a_p0, a_p1, a_p2, a_p3, c_p0, c_p1, c_p2, c_p3; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, - jc0 = 0 * _cs1, jc1 = 1 * _cs1, jc2 = 2 * _cs1, jc3 = 3 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, + jc2 = 2 * _cs1, jc3 = 3 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -598,23 +572,21 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_02 = B[0 * _bs0 + 2 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], - b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1], - b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], - b_22 = B[2 * _bs0 + 2 * _bs1], b_30 = B[3 * _bs0 + 0 * _bs1], - b_31 = B[3 * _bs0 + 1 * _bs1], b_32 = B[3 * _bs0 + 2 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_02 = B[0 * _bs0 + 2 * _bs1], + b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1], + b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], b_22 = B[2 * _bs0 + 2 * _bs1], + b_30 = B[3 * _bs0 + 0 * _bs1], b_31 = B[3 * _bs0 + 1 * _bs1], b_32 = B[3 * _bs0 + 2 * _bs1]; ValueType a_p0, a_p1, a_p2, a_p3, c_p0, c_p1, c_p2; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, - jc0 = 0 * 
_cs1, jc1 = 1 * _cs1, jc2 = 2 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, + jc2 = 2 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -645,21 +617,19 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1], - b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], + b_11 = B[1 * _bs0 + 1 * _bs1], b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], b_30 = B[3 * _bs0 + 0 * _bs1], b_31 = B[3 * _bs0 + 1 * _bs1]; ValueType a_p0, a_p1, a_p2, a_p3, c_p0, c_p1; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, - jc0 = 0 * _cs1, jc1 = 1 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -685,19 +655,18 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 1>::serial_invoke(const ScalarType alpha, + const 
ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], - b_20 = B[2 * _bs0 + 0 * _bs1], b_30 = B[3 * _bs0 + 0 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], b_20 = B[2 * _bs0 + 0 * _bs1], + b_30 = B[3 * _bs0 + 0 * _bs1]; ValueType a_p0, a_p1, a_p2, a_p3, c_p0; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, - jc0 = 0 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, jc0 = 0 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -718,23 +687,21 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_02 = B[0 * _bs0 + 2 * _bs1], b_03 = B[0 * _bs0 + 3 * _bs1], - b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1], - b_12 = B[1 * _bs0 + 2 * _bs1], b_13 = B[1 * _bs0 + 3 * _bs1], - b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], - b_22 = B[2 * _bs0 + 2 * _bs1], b_23 = B[2 * _bs0 + 3 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_02 = B[0 * _bs0 + 2 * _bs1], + b_03 = B[0 * _bs0 + 3 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1], + b_12 = B[1 * _bs0 + 2 * _bs1], b_13 = B[1 * _bs0 + 3 * _bs1], b_20 = B[2 * _bs0 + 0 * _bs1], + b_21 = B[2 * _bs0 + 1 * 
_bs1], b_22 = B[2 * _bs0 + 2 * _bs1], b_23 = B[2 * _bs0 + 3 * _bs1]; ValueType a_p0, a_p1, a_p2, c_p0, c_p1, c_p2, c_p3; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, jc0 = 0 * _cs1, - jc1 = 1 * _cs1, jc2 = 2 * _cs1, jc3 = 3 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, jc2 = 2 * _cs1, + jc3 = 3 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -765,21 +732,19 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_02 = B[0 * _bs0 + 2 * _bs1], b_03 = B[0 * _bs0 + 3 * _bs1], - b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1], + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_02 = B[0 * _bs0 + 2 * _bs1], + b_03 = B[0 * _bs0 + 3 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1], b_13 = B[1 * _bs0 + 3 * _bs1]; ValueType a_p0, a_p1, c_p0, c_p1, c_p2, c_p3; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, - jc2 = 2 * _cs1, jc3 = 3 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, jc2 = 2 * _cs1, jc3 = 3 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -805,19 +770,18 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<1, 4>::serial_invoke( - const ScalarType alpha, const ValueType 
*KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<1, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_02 = B[0 * _bs0 + 2 * _bs1], b_03 = B[0 * _bs0 + 3 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_02 = B[0 * _bs0 + 2 * _bs1], + b_03 = B[0 * _bs0 + 3 * _bs1]; ValueType a_p0, c_p0, c_p1, c_p2, c_p3; - const int ja0 = 0 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, jc2 = 2 * _cs1, - jc3 = 3 * _cs1; + const int ja0 = 0 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, jc2 = 2 * _cs1, jc3 = 3 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -838,10 +802,11 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<1, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int n, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0 || n <= 0 || k <= 0) return 0; switch (k * 10 + n) { @@ -886,22 +851,19 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 3>::serial_invoke(const ScalarType alpha, + const ValueType 
*KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_02 = B[0 * _bs0 + 2 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], - b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1], - b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], - b_22 = B[2 * _bs0 + 2 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_02 = B[0 * _bs0 + 2 * _bs1], + b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1], + b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], b_22 = B[2 * _bs0 + 2 * _bs1]; ValueType a_p0, a_p1, a_p2, c_p0, c_p1, c_p2; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, jc0 = 0 * _cs1, - jc1 = 1 * _cs1, jc2 = 2 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, jc2 = 2 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -928,20 +890,18 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1], - b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], + b_11 = B[1 * _bs0 + 1 * _bs1], b_20 = B[2 * _bs0 + 0 * _bs1], 
b_21 = B[2 * _bs0 + 1 * _bs1]; ValueType a_p0, a_p1, a_p2, c_p0, c_p1; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, jc0 = 0 * _cs1, - jc1 = 1 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -964,14 +924,13 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], - b_20 = B[2 * _bs0 + 0 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], b_20 = B[2 * _bs0 + 0 * _bs1]; ValueType a_p0, a_p1, a_p2, c_p0; @@ -994,20 +953,18 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_02 = B[0 * _bs0 + 2 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], - b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_02 = B[0 * _bs0 + 2 
* _bs1], + b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1]; ValueType a_p0, a_p1, a_p2, c_p0, c_p1; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, jc0 = 0 * _cs1, - jc1 = 1 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -1029,14 +986,14 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 3>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<1, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<1, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], + b_11 = B[1 * _bs0 + 1 * _bs1]; ValueType a_p0, a_p1, a_p2, c_p0; @@ -1059,10 +1016,11 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<1, 3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int n, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0 || n <= 0 || k <= 0) return 0; switch (k * 10 + n) { @@ -1097,14 +1055,14 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 
3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], + b_11 = B[1 * _bs0 + 1 * _bs1]; ValueType a_p0, a_p1, c_p0, c_p1; @@ -1128,10 +1086,10 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1]; @@ -1155,10 +1113,10 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<1, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<1, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m 
<= 0) return 0; const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1]; @@ -1182,10 +1140,11 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<1, 2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int n, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0 || n <= 0 || k <= 0) return 0; switch (k * 10 + n) { @@ -1210,10 +1169,10 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<1, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<1, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; const ValueType b_00 = B[0 * _bs0 + 0 * _bs1]; @@ -1239,10 +1198,11 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<1, 1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<0, 0>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<0, 0>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int n, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0 || n <= 0 || k <= 0) return 0; if (k 
== n) { @@ -1276,10 +1236,10 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<0, 0>::serial_invoke( } else { for (int i = 0; i < m; ++i) { const ValueType *KOKKOS_RESTRICT iA = A + i * _as0; - /**/ ValueType *KOKKOS_RESTRICT iC = C + i * _cs0; + /**/ ValueType *KOKKOS_RESTRICT iC = C + i * _cs0; for (int j = 0; j < n; ++j) { const ValueType *KOKKOS_RESTRICT jB = B + j * _bs1; - /**/ ValueType tC = 0; + /**/ ValueType tC = 0; for (int p = 0; p < k; ++p) tC += iA[p * _as1] * jB[p * _bs0]; pC[i * _cs0] += alpha * tC; } diff --git a/batched/dense/impl/KokkosBatched_InnerGemmFixC_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_InnerGemmFixC_Serial_Impl.hpp index 8bdf4fee4f..9ad08549cb 100644 --- a/batched/dense/impl/KokkosBatched_InnerGemmFixC_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_InnerGemmFixC_Serial_Impl.hpp @@ -29,22 +29,19 @@ namespace KokkosBatched { template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, - c_00 = 0, c_01 = 0, c_02 = 0, c_03 = 0, c_04 = 0, a_1p, b_p1, c_10 = 0, - c_11 = 0, c_12 = 0, c_13 = 0, c_14 = 0, a_2p, b_p2, c_20 = 0, c_21 = 0, - c_22 = 0, c_23 = 0, c_24 = 0, a_3p, b_p3, c_30 = 0, c_31 = 0, c_32 = 0, - c_33 = 0, c_34 = 0, a_4p, b_p4, c_40 = 0, c_41 = 0, c_42 = 0, c_43 = 0, - c_44 = 0; + ValueType a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, c_03 = 0, c_04 = 0, a_1p, b_p1, c_10 = 0, c_11 = 0, c_12 = 0, + c_13 = 0, c_14 = 0, a_2p, b_p2, c_20 = 0, c_21 = 0, c_22 = 0, c_23 = 0, c_24 = 0, a_3p, b_p3, + c_30 = 0, c_31 = 0, c_32 = 0, c_33 = 0, c_34 = 0, a_4p, b_p4, c_40 = 0, c_41 = 0, c_42 = 0, + c_43 = 
0, c_44 = 0; - const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, - i4 = 4 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1, - j3 = 3 * _bs1, j4 = 4 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, i4 = 4 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, + j2 = 2 * _bs1, j3 = 3 * _bs1, j4 = 4 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -119,21 +116,18 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, c_03 = 0, a_1p, b_p1, - c_10 = 0, c_11 = 0, c_12 = 0, c_13 = 0, a_2p, b_p2, - c_20 = 0, c_21 = 0, c_22 = 0, c_23 = 0, a_3p, b_p3, - c_30 = 0, c_31 = 0, c_32 = 0, c_33 = 0, a_4p, c_40 = 0, - c_41 = 0, c_42 = 0, c_43 = 0; + ValueType a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, c_03 = 0, a_1p, b_p1, c_10 = 0, c_11 = 0, c_12 = 0, c_13 = 0, + a_2p, b_p2, c_20 = 0, c_21 = 0, c_22 = 0, c_23 = 0, a_3p, b_p3, c_30 = 0, c_31 = 0, c_32 = 0, + c_33 = 0, a_4p, c_40 = 0, c_41 = 0, c_42 = 0, c_43 = 0; - const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, - i4 = 4 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1, - j3 = 3 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, i4 = 4 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, + j2 = 2 * _bs1, j3 = 3 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -197,19 +191,17 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int 
InnerGemmFixC<5, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, a_1p, b_p1, c_10 = 0, - c_11 = 0, c_12 = 0, a_2p, b_p2, c_20 = 0, c_21 = 0, - c_22 = 0, a_3p, c_30 = 0, c_31 = 0, c_32 = 0, a_4p, - c_40 = 0, c_41 = 0, c_42 = 0; + ValueType a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, a_1p, b_p1, c_10 = 0, c_11 = 0, c_12 = 0, a_2p, b_p2, c_20 = 0, + c_21 = 0, c_22 = 0, a_3p, c_30 = 0, c_31 = 0, c_32 = 0, a_4p, c_40 = 0, c_41 = 0, c_42 = 0; - const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, - i4 = 4 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, i4 = 4 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, + j2 = 2 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -262,18 +254,16 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = 0, c_01 = 0, a_1p, b_p1, c_10 = 0, c_11 = 0, - a_2p, c_20 = 0, c_21 = 0, a_3p, c_30 = 0, c_31 = 0, - a_4p, c_40 = 0, c_41 = 0; + ValueType a_0p, b_p0, c_00 = 0, c_01 = 0, a_1p, b_p1, c_10 = 0, c_11 = 0, a_2p, c_20 = 0, c_21 = 0, 
a_3p, c_30 = 0, + c_31 = 0, a_4p, c_40 = 0, c_41 = 0; - const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, - i4 = 4 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, i4 = 4 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -315,17 +305,15 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = 0, a_1p, c_10 = 0, a_2p, c_20 = 0, a_3p, - c_30 = 0, a_4p, c_40 = 0; + ValueType a_0p, b_p0, c_00 = 0, a_1p, c_10 = 0, a_2p, c_20 = 0, a_3p, c_30 = 0, a_4p, c_40 = 0; - const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, - i4 = 4 * _as0, j0 = 0 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, i4 = 4 * _as0, j0 = 0 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -356,35 +344,32 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, c_03 = 0, c_04 = 0, 
a_1p, - b_p1, c_10 = 0, c_11 = 0, c_12 = 0, c_13 = 0, c_14 = 0, - a_2p, b_p2, c_20 = 0, c_21 = 0, c_22 = 0, c_23 = 0, - c_24 = 0, a_3p, b_p3, c_30 = 0, c_31 = 0, c_32 = 0, - c_33 = 0, c_34 = 0, + ValueType a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, c_03 = 0, c_04 = 0, a_1p, b_p1, c_10 = 0, c_11 = 0, c_12 = 0, + c_13 = 0, c_14 = 0, a_2p, b_p2, c_20 = 0, c_21 = 0, c_22 = 0, c_23 = 0, c_24 = 0, a_3p, b_p3, + c_30 = 0, c_31 = 0, c_32 = 0, c_33 = 0, c_34 = 0, /**/ b_p4; - const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, - j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1, j3 = 3 * _bs1, - j4 = 4 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1, + j3 = 3 * _bs1, j4 = 4 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif for (int p = 0; p < k; ++p) { - a_0p = A[i0 + p * _as1]; - b_p0 = B[p * _bs0 + j0]; - a_1p = A[i1 + p * _as1]; - b_p1 = B[p * _bs0 + j1]; - a_2p = A[i2 + p * _as1]; - b_p2 = B[p * _bs0 + j2]; - a_3p = A[i3 + p * _as1]; - b_p3 = B[p * _bs0 + j3]; + a_0p = A[i0 + p * _as1]; + b_p0 = B[p * _bs0 + j0]; + a_1p = A[i1 + p * _as1]; + b_p1 = B[p * _bs0 + j1]; + a_2p = A[i2 + p * _as1]; + b_p2 = B[p * _bs0 + j2]; + a_3p = A[i3 + p * _as1]; + b_p3 = B[p * _bs0 + j3]; /**/ b_p4 = B[p * _bs0 + j4]; c_00 += a_0p * b_p0; @@ -435,32 +420,30 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, c_03 = 0, c_04 = 0, a_1p, - b_p1, c_10 = 
0, c_11 = 0, c_12 = 0, c_13 = 0, c_14 = 0, - a_2p, b_p2, c_20 = 0, c_21 = 0, c_22 = 0, c_23 = 0, - c_24 = 0, + ValueType a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, c_03 = 0, c_04 = 0, a_1p, b_p1, c_10 = 0, c_11 = 0, c_12 = 0, + c_13 = 0, c_14 = 0, a_2p, b_p2, c_20 = 0, c_21 = 0, c_22 = 0, c_23 = 0, c_24 = 0, /**/ b_p3, /**/ b_p4; - const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, j0 = 0 * _bs1, - j1 = 1 * _bs1, j2 = 2 * _bs1, j3 = 3 * _bs1, j4 = 4 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1, j3 = 3 * _bs1, + j4 = 4 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif for (int p = 0; p < k; ++p) { - a_0p = A[i0 + p * _as1]; - b_p0 = B[p * _bs0 + j0]; - a_1p = A[i1 + p * _as1]; - b_p1 = B[p * _bs0 + j1]; - a_2p = A[i2 + p * _as1]; - b_p2 = B[p * _bs0 + j2]; + a_0p = A[i0 + p * _as1]; + b_p0 = B[p * _bs0 + j0]; + a_1p = A[i1 + p * _as1]; + b_p1 = B[p * _bs0 + j1]; + a_2p = A[i2 + p * _as1]; + b_p2 = B[p * _bs0 + j2]; /**/ b_p3 = B[p * _bs0 + j3]; /**/ b_p4 = B[p * _bs0 + j4]; @@ -502,29 +485,28 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, c_03 = 0, c_04 = 0, a_1p, - b_p1, c_10 = 0, c_11 = 0, c_12 = 0, c_13 = 0, c_14 = 0, + ValueType a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, c_03 = 0, c_04 = 0, a_1p, b_p1, c_10 = 0, c_11 = 0, c_12 = 0, + c_13 = 0, c_14 = 0, /**/ b_p2, /**/ b_p3, /**/ b_p4; - const int i0 = 0 * _as0, i1 = 1 * _as0, j0 = 0 * 
_bs1, j1 = 1 * _bs1, - j2 = 2 * _bs1, j3 = 3 * _bs1, j4 = 4 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1, j3 = 3 * _bs1, j4 = 4 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif for (int p = 0; p < k; ++p) { - a_0p = A[i0 + p * _as1]; - b_p0 = B[p * _bs0 + j0]; - a_1p = A[i1 + p * _as1]; - b_p1 = B[p * _bs0 + j1]; + a_0p = A[i0 + p * _as1]; + b_p0 = B[p * _bs0 + j0]; + a_1p = A[i1 + p * _as1]; + b_p1 = B[p * _bs0 + j1]; /**/ b_p2 = B[p * _bs0 + j2]; /**/ b_p3 = B[p * _bs0 + j3]; /**/ b_p4 = B[p * _bs0 + j4]; @@ -557,10 +539,10 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; ValueType a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, c_03 = 0, c_04 = 0, @@ -569,15 +551,14 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 5>::serial_invoke( /**/ b_p3, /**/ b_p4; - const int i0 = 0 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1, - j3 = 3 * _bs1, j4 = 4 * _bs1; + const int i0 = 0 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1, j3 = 3 * _bs1, j4 = 4 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif for (int p = 0; p < k; ++p) { - a_0p = A[i0 + p * _as1]; - b_p0 = B[p * _bs0 + j0]; + a_0p = A[i0 + p * _as1]; + b_p0 = B[p * _bs0 + j0]; /**/ b_p1 = B[p * _bs0 + j1]; /**/ b_p2 = B[p * _bs0 + j2]; /**/ b_p3 = B[p * _bs0 + j3]; @@ -604,22 +585,19 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 4>::serial_invoke( - const 
ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, - c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), - c_03 = ValueType(0), a_1p, b_p1, c_10 = ValueType(0), c_11 = ValueType(0), - c_12 = ValueType(0), c_13 = ValueType(0), a_2p, b_p2, c_20 = ValueType(0), - c_21 = ValueType(0), c_22 = ValueType(0), c_23 = ValueType(0), a_3p, b_p3, - c_30 = ValueType(0), c_31 = ValueType(0), c_32 = ValueType(0), - c_33 = ValueType(0); + ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), c_03 = ValueType(0), a_1p, b_p1, + c_10 = ValueType(0), c_11 = ValueType(0), c_12 = ValueType(0), c_13 = ValueType(0), a_2p, b_p2, + c_20 = ValueType(0), c_21 = ValueType(0), c_22 = ValueType(0), c_23 = ValueType(0), a_3p, b_p3, + c_30 = ValueType(0), c_31 = ValueType(0), c_32 = ValueType(0), c_33 = ValueType(0); - const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, - j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1, j3 = 3 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1, + j3 = 3 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -674,20 +652,17 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, 
const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, - c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), a_1p, b_p1, - c_10 = ValueType(0), c_11 = ValueType(0), c_12 = ValueType(0), a_2p, b_p2, - c_20 = ValueType(0), c_21 = ValueType(0), c_22 = ValueType(0), a_3p, - c_30 = ValueType(0), c_31 = ValueType(0), c_32 = ValueType(0); + ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), a_1p, b_p1, c_10 = ValueType(0), + c_11 = ValueType(0), c_12 = ValueType(0), a_2p, b_p2, c_20 = ValueType(0), c_21 = ValueType(0), + c_22 = ValueType(0), a_3p, c_30 = ValueType(0), c_31 = ValueType(0), c_32 = ValueType(0); - const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, - j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -733,19 +708,16 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), a_1p, b_p1, - c_10 = ValueType(0), c_11 = ValueType(0), a_2p, - c_20 = ValueType(0), c_21 = ValueType(0), a_3p, - c_30 = ValueType(0), c_31 = ValueType(0); + ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), a_1p, b_p1, c_10 = ValueType(0), c_11 = ValueType(0), + a_2p, c_20 = ValueType(0), c_21 = ValueType(0), a_3p, c_30 = ValueType(0), c_31 = ValueType(0); - const 
int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, - j0 = 0 * _bs1, j1 = 1 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -782,17 +754,16 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = ValueType(0), a_1p, c_10 = ValueType(0), a_2p, - c_20 = ValueType(0), a_3p, c_30 = ValueType(0); + ValueType a_0p, b_p0, c_00 = ValueType(0), a_1p, c_10 = ValueType(0), a_2p, c_20 = ValueType(0), a_3p, + c_30 = ValueType(0); - const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, - j0 = 0 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, j0 = 0 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -820,32 +791,29 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, - c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), - c_03 = ValueType(0), a_1p, b_p1, c_10 = 
ValueType(0), c_11 = ValueType(0), - c_12 = ValueType(0), c_13 = ValueType(0), a_2p, b_p2, c_20 = ValueType(0), - c_21 = ValueType(0), c_22 = ValueType(0), c_23 = ValueType(0), - /**/ b_p3; + ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), c_03 = ValueType(0), a_1p, b_p1, + c_10 = ValueType(0), c_11 = ValueType(0), c_12 = ValueType(0), c_13 = ValueType(0), a_2p, b_p2, + c_20 = ValueType(0), c_21 = ValueType(0), c_22 = ValueType(0), c_23 = ValueType(0), + /**/ b_p3; - const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, j0 = 0 * _bs1, - j1 = 1 * _bs1, j2 = 2 * _bs1, j3 = 3 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1, j3 = 3 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif for (int p = 0; p < k; ++p) { - a_0p = A[i0 + p * _as1]; - b_p0 = B[p * _bs0 + j0]; - a_1p = A[i1 + p * _as1]; - b_p1 = B[p * _bs0 + j1]; - a_2p = A[i2 + p * _as1]; - b_p2 = B[p * _bs0 + j2]; + a_0p = A[i0 + p * _as1]; + b_p0 = B[p * _bs0 + j0]; + a_1p = A[i1 + p * _as1]; + b_p1 = B[p * _bs0 + j1]; + a_2p = A[i2 + p * _as1]; + b_p2 = B[p * _bs0 + j2]; /**/ b_p3 = B[p * _bs0 + j3]; c_00 += a_0p * b_p0; @@ -880,30 +848,27 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), - c_02 = ValueType(0), c_03 = ValueType(0), a_1p, b_p1, - c_10 = ValueType(0), c_11 = ValueType(0), - c_12 = ValueType(0), c_13 = ValueType(0), + ValueType a_0p, 
b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), c_03 = ValueType(0), a_1p, b_p1, + c_10 = ValueType(0), c_11 = ValueType(0), c_12 = ValueType(0), c_13 = ValueType(0), /**/ b_p2, /**/ b_p3; - const int i0 = 0 * _as0, i1 = 1 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, - j2 = 2 * _bs1, j3 = 3 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1, j3 = 3 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif for (int p = 0; p < k; ++p) { - a_0p = A[i0 + p * _as1]; - b_p0 = B[p * _bs0 + j0]; - a_1p = A[i1 + p * _as1]; - b_p1 = B[p * _bs0 + j1]; + a_0p = A[i0 + p * _as1]; + b_p0 = B[p * _bs0 + j0]; + a_1p = A[i1 + p * _as1]; + b_p1 = B[p * _bs0 + j1]; /**/ b_p2 = B[p * _bs0 + j2]; /**/ b_p3 = B[p * _bs0 + j3]; @@ -931,27 +896,25 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), - c_02 = ValueType(0), c_03 = ValueType(0), + ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), c_03 = ValueType(0), /**/ b_p1, /**/ b_p2, /**/ b_p3; - const int i0 = 0 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1, - j3 = 3 * _bs1; + const int i0 = 0 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1, j3 = 3 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif for (int p = 0; p < k; ++p) { - a_0p = A[i0 + p * _as1]; - b_p0 = B[p * _bs0 + j0]; + a_0p = A[i0 + p * _as1]; + b_p0 = B[p * _bs0 + j0]; /**/ b_p1 = B[p * _bs0 + 
j1]; /**/ b_p2 = B[p * _bs0 + j2]; /**/ b_p3 = B[p * _bs0 + j3]; @@ -976,19 +939,17 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, - c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), a_1p, b_p1, - c_10 = ValueType(0), c_11 = ValueType(0), c_12 = ValueType(0), a_2p, b_p2, - c_20 = ValueType(0), c_21 = ValueType(0), c_22 = ValueType(0); + ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), a_1p, b_p1, c_10 = ValueType(0), + c_11 = ValueType(0), c_12 = ValueType(0), a_2p, b_p2, c_20 = ValueType(0), c_21 = ValueType(0), + c_22 = ValueType(0); - const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, j0 = 0 * _bs1, - j1 = 1 * _bs1, j2 = 2 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -1027,18 +988,16 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), 
a_1p, b_p1, - c_10 = ValueType(0), c_11 = ValueType(0), a_2p, - c_20 = ValueType(0), c_21 = ValueType(0); + ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), a_1p, b_p1, c_10 = ValueType(0), c_11 = ValueType(0), + a_2p, c_20 = ValueType(0), c_21 = ValueType(0); - const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, j0 = 0 * _bs1, - j1 = 1 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -1070,14 +1029,13 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = ValueType(0), a_1p, c_10 = ValueType(0), a_2p, - c_20 = ValueType(0); + ValueType a_0p, b_p0, c_00 = ValueType(0), a_1p, c_10 = ValueType(0), a_2p, c_20 = ValueType(0); const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, j0 = 0 * _bs1; @@ -1104,28 +1062,26 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), - c_02 = ValueType(0), a_1p, 
b_p1, c_10 = ValueType(0), + ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), a_1p, b_p1, c_10 = ValueType(0), c_11 = ValueType(0), c_12 = ValueType(0), /**/ b_p2; - const int i0 = 0 * _as0, i1 = 1 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, - j2 = 2 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif for (int p = 0; p < k; ++p) { - a_0p = A[i0 + p * _as1]; - b_p0 = B[p * _bs0 + j0]; - a_1p = A[i1 + p * _as1]; - b_p1 = B[p * _bs0 + j1]; + a_0p = A[i0 + p * _as1]; + b_p0 = B[p * _bs0 + j0]; + a_1p = A[i1 + p * _as1]; + b_p1 = B[p * _bs0 + j1]; /**/ b_p2 = B[p * _bs0 + j2]; c_00 += a_0p * b_p0; @@ -1147,14 +1103,13 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 3>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), - c_02 = ValueType(0), + ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), /**/ b_p1, /**/ b_p2; @@ -1164,8 +1119,8 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 3>::serial_invoke( #pragma unroll #endif for (int p = 0; p < k; ++p) { - a_0p = A[i0 + p * _as1]; - b_p0 = B[p * _bs0 + j0]; + a_0p = A[i0 + p * _as1]; + b_p0 = B[p * _bs0 + j0]; /**/ b_p1 = B[p * _bs0 + j1]; /**/ b_p2 = B[p * _bs0 + j2]; @@ -1187,14 +1142,13 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 2>::serial_invoke( - const ScalarType alpha, 
const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), a_1p, b_p1, - c_10 = ValueType(0), c_11 = ValueType(0); + ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), a_1p, b_p1, c_10 = ValueType(0), c_11 = ValueType(0); const int i0 = 0 * _as0, i1 = 1 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1; @@ -1223,10 +1177,10 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; ValueType a_0p, b_p0, c_00 = ValueType(0), a_1p, c_10 = ValueType(0); @@ -1253,10 +1207,10 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), @@ -1286,10 +1240,10 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 
2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; ValueType a_0p, b_p0, c_00 = ValueType(0); @@ -1311,10 +1265,11 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<0, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<0, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0 || k <= 0) return 0; switch (m) { @@ -1353,14 +1308,13 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<0, 1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int n, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0 || n <= 0 || k <= 0) return 0; - if (!(m <= 5 && n <= 5)) - Kokkos::abort( - "InnerGemmFixC<5,5>::serial_invoke, assert failure (m<=5 && n<=5)"); + if (!(m <= 5 && n <= 5)) Kokkos::abort("InnerGemmFixC<5,5>::serial_invoke, assert failure (m<=5 && 
n<=5)"); switch (m * 10 + n) { case 55: { @@ -1419,14 +1373,13 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int n, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0 || n <= 0 || k <= 0) return 0; - if (!(m <= 4 && n <= 4)) - Kokkos::abort( - "InnerGemmFixC<4,4>::serial_invoke, assert failure (m<=4 && n<=4)"); + if (!(m <= 4 && n <= 4)) Kokkos::abort("InnerGemmFixC<4,4>::serial_invoke, assert failure (m<=4 && n<=4)"); switch (m * 10 + n) { case 44: { @@ -1475,14 +1428,13 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int n, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0 || n <= 0 || k <= 0) return 0; - if (!(m <= 3 && n <= 3)) - Kokkos::abort( - "InnerGemmFixC<3,3>::serial_invoke, assert failure (m<=3 && n<=3)"); + if (!(m <= 3 && n <= 3)) Kokkos::abort("InnerGemmFixC<3,3>::serial_invoke, assert failure (m<=3 && n<=3)"); switch (m * 10 + n) { case 33: { @@ -1521,14 +1473,13 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 2>::serial_invoke( 
- const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int n, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0 || n <= 0 || k <= 0) return 0; - if (!(m <= 2 && n <= 2)) - Kokkos::abort( - "InnerGemmFixC<2,2>::serial_invoke, assert failure (m<=2 && n<=2)"); + if (!(m <= 2 && n <= 2)) Kokkos::abort("InnerGemmFixC<2,2>::serial_invoke, assert failure (m<=2 && n<=2)"); switch (m * 10 + n) { case 22: { @@ -1557,14 +1508,13 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int n, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0 || n <= 0 || k <= 0) return 0; - if (!(m <= 1 && n <= 1)) - Kokkos::abort( - "InnerGemmFixC<1,1>::serial_invoke, assert failure (m<=1 && n<=1)"); + if (!(m <= 1 && n <= 1)) Kokkos::abort("InnerGemmFixC<1,1>::serial_invoke, assert failure (m<=1 && n<=1)"); return serial_invoke(alpha, A, B, k, C); ; diff --git a/batched/dense/impl/KokkosBatched_InnerGemmFixC_Team_Impl.hpp b/batched/dense/impl/KokkosBatched_InnerGemmFixC_Team_Impl.hpp index 116545f653..a3d6dece58 100644 --- a/batched/dense/impl/KokkosBatched_InnerGemmFixC_Team_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_InnerGemmFixC_Team_Impl.hpp @@ -25,43 +25,38 @@ namespace KokkosBatched { template template 
-KOKKOS_INLINE_FUNCTION int InnerGemmFixC::team_invoke( - const MemberType &member, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const ValueType *KOKKOS_RESTRICT B, - const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, mb * nb), [&](const int &ij) { - const int i = ij / nb, j = ij % nb; +KOKKOS_INLINE_FUNCTION int InnerGemmFixC::team_invoke(const MemberType &member, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, mb * nb), [&](const int &ij) { + const int i = ij / nb, j = ij % nb; - const ValueType *KOKKOS_RESTRICT pA = A + i * _as0, - *KOKKOS_RESTRICT pB = B + j * _bs1; + const ValueType *KOKKOS_RESTRICT pA = A + i * _as0, *KOKKOS_RESTRICT pB = B + j * _bs1; - ValueType c = 0; - for (int p = 0; p < k; ++p) c += pA[p * _as1] * pB[p * _bs0]; - C[i * _cs0 + j * _cs1] += alpha * c; - }); + ValueType c = 0; + for (int p = 0; p < k; ++p) c += pA[p * _as1] * pB[p * _bs0]; + C[i * _cs0 + j * _cs1] += alpha * c; + }); return 0; } template template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC::team_invoke( - const MemberType &member, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const ValueType *KOKKOS_RESTRICT B, - const int m, const int n, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, m * n), [&](const int &ij) { - const int i = ij / n, j = ij % n; +KOKKOS_INLINE_FUNCTION int InnerGemmFixC::team_invoke(const MemberType &member, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int n, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, m * n), [&](const int &ij) { + const int i = ij / n, j = ij % n; - const ValueType 
*KOKKOS_RESTRICT pA = A + i * _as0, - *KOKKOS_RESTRICT pB = B + j * _bs1; + const ValueType *KOKKOS_RESTRICT pA = A + i * _as0, *KOKKOS_RESTRICT pB = B + j * _bs1; - ValueType c = 0; - for (int p = 0; p < k; ++p) c += pA[p * _as1] * pB[p * _bs0]; - C[i * _cs0 + j * _cs1] += alpha * c; - }); + ValueType c = 0; + for (int p = 0; p < k; ++p) c += pA[p * _as1] * pB[p * _bs0]; + C[i * _cs0 + j * _cs1] += alpha * c; + }); return 0; } } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_InnerLU_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_InnerLU_Serial_Impl.hpp index 3089d068bb..0d74598b24 100644 --- a/batched/dense/impl/KokkosBatched_InnerLU_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_InnerLU_Serial_Impl.hpp @@ -29,21 +29,16 @@ namespace KokkosBatched { template <> template -KOKKOS_INLINE_FUNCTION int InnerLU<5>::serial_invoke( - ValueType *KOKKOS_RESTRICT A) { +KOKKOS_INLINE_FUNCTION int InnerLU<5>::serial_invoke(ValueType *KOKKOS_RESTRICT A) { // load - ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1], - a_04 = A[0 * _as0 + 4 * _as1], a_10 = A[1 * _as0 + 0 * _as1], - a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], - a_13 = A[1 * _as0 + 3 * _as1], a_14 = A[1 * _as0 + 4 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], - a_22 = A[2 * _as0 + 2 * _as1], a_23 = A[2 * _as0 + 3 * _as1], - a_24 = A[2 * _as0 + 4 * _as1], a_30 = A[3 * _as0 + 0 * _as1], - a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1], - a_33 = A[3 * _as0 + 3 * _as1], a_34 = A[3 * _as0 + 4 * _as1], - a_40 = A[4 * _as0 + 0 * _as1], a_41 = A[4 * _as0 + 1 * _as1], - a_42 = A[4 * _as0 + 2 * _as1], a_43 = A[4 * _as0 + 3 * _as1], + ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_03 = A[0 * _as0 + 3 * _as1], a_04 = A[0 * _as0 + 4 * _as1], a_10 = A[1 * _as0 + 0 * _as1], + a_11 = A[1 * _as0 
+ 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], + a_14 = A[1 * _as0 + 4 * _as1], a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], + a_22 = A[2 * _as0 + 2 * _as1], a_23 = A[2 * _as0 + 3 * _as1], a_24 = A[2 * _as0 + 4 * _as1], + a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1], + a_33 = A[3 * _as0 + 3 * _as1], a_34 = A[3 * _as0 + 4 * _as1], a_40 = A[4 * _as0 + 0 * _as1], + a_41 = A[4 * _as0 + 1 * _as1], a_42 = A[4 * _as0 + 2 * _as1], a_43 = A[4 * _as0 + 3 * _as1], a_44 = A[4 * _as0 + 4 * _as1]; // 0 iteration @@ -121,17 +116,14 @@ KOKKOS_INLINE_FUNCTION int InnerLU<5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerLU<4>::serial_invoke( - ValueType *KOKKOS_RESTRICT A) { +KOKKOS_INLINE_FUNCTION int InnerLU<4>::serial_invoke(ValueType *KOKKOS_RESTRICT A) { // load - ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1], - a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], - a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], - a_22 = A[2 * _as0 + 2 * _as1], a_23 = A[2 * _as0 + 3 * _as1], - a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], - a_32 = A[3 * _as0 + 2 * _as1], a_33 = A[3 * _as0 + 3 * _as1]; + ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_03 = A[0 * _as0 + 3 * _as1], a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], + a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], a_20 = A[2 * _as0 + 0 * _as1], + a_21 = A[2 * _as0 + 1 * _as1], a_22 = A[2 * _as0 + 2 * _as1], a_23 = A[2 * _as0 + 3 * _as1], + a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1], + a_33 = A[3 * _as0 + 3 * _as1]; // 0 iteration a_10 /= a_00; @@ -178,14 +170,11 @@ KOKKOS_INLINE_FUNCTION int 
InnerLU<4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerLU<3>::serial_invoke( - ValueType *KOKKOS_RESTRICT A) { +KOKKOS_INLINE_FUNCTION int InnerLU<3>::serial_invoke(ValueType *KOKKOS_RESTRICT A) { // load - ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_10 = A[1 * _as0 + 0 * _as1], - a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], - a_22 = A[2 * _as0 + 2 * _as1]; + ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], + a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], a_22 = A[2 * _as0 + 2 * _as1]; // 0 iteration a_10 /= a_00; @@ -212,11 +201,10 @@ KOKKOS_INLINE_FUNCTION int InnerLU<3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerLU<2>::serial_invoke( - ValueType *KOKKOS_RESTRICT A) { +KOKKOS_INLINE_FUNCTION int InnerLU<2>::serial_invoke(ValueType *KOKKOS_RESTRICT A) { // load - ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1]; + ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_10 = A[1 * _as0 + 0 * _as1], + a_11 = A[1 * _as0 + 1 * _as1]; // 0 iteration a_10 /= a_00; @@ -231,15 +219,13 @@ KOKKOS_INLINE_FUNCTION int InnerLU<2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerLU<1>::serial_invoke( - ValueType *KOKKOS_RESTRICT /* A */) { +KOKKOS_INLINE_FUNCTION int InnerLU<1>::serial_invoke(ValueType *KOKKOS_RESTRICT /* A */) { return 0; } template <> template -KOKKOS_INLINE_FUNCTION int InnerLU<5>::serial_invoke( - const int m, ValueType *KOKKOS_RESTRICT A) { +KOKKOS_INLINE_FUNCTION int InnerLU<5>::serial_invoke(const int m, ValueType *KOKKOS_RESTRICT A) { if (m > 5) 
Kokkos::abort("InnerLU<5>::serial_invoke, assert failure (m<=5)"); if (m <= 0) return 0; @@ -275,8 +261,7 @@ KOKKOS_INLINE_FUNCTION int InnerLU<5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerLU<4>::serial_invoke( - const int m, ValueType *KOKKOS_RESTRICT A) { +KOKKOS_INLINE_FUNCTION int InnerLU<4>::serial_invoke(const int m, ValueType *KOKKOS_RESTRICT A) { if (m > 4) Kokkos::abort("InnerLU<4>::serial_invoke, assert failure (m<=4)"); if (m <= 0) return 0; @@ -307,8 +292,7 @@ KOKKOS_INLINE_FUNCTION int InnerLU<4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerLU<3>::serial_invoke( - const int m, ValueType *KOKKOS_RESTRICT A) { +KOKKOS_INLINE_FUNCTION int InnerLU<3>::serial_invoke(const int m, ValueType *KOKKOS_RESTRICT A) { if (m > 3) Kokkos::abort("InnerLU<3>::serial_invoke, assert failure (m<=3)"); if (m <= 0) return 0; @@ -334,8 +318,7 @@ KOKKOS_INLINE_FUNCTION int InnerLU<3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerLU<2>::serial_invoke( - const int m, ValueType *KOKKOS_RESTRICT A) { +KOKKOS_INLINE_FUNCTION int InnerLU<2>::serial_invoke(const int m, ValueType *KOKKOS_RESTRICT A) { if (m > 2) Kokkos::abort("InnerLU<2>::serial_invoke, assert failure (m<=2)"); if (m <= 0) return 0; @@ -356,8 +339,7 @@ KOKKOS_INLINE_FUNCTION int InnerLU<2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerLU<1>::serial_invoke( - const int m, ValueType *KOKKOS_RESTRICT A) { +KOKKOS_INLINE_FUNCTION int InnerLU<1>::serial_invoke(const int m, ValueType *KOKKOS_RESTRICT A) { if (m > 1) Kokkos::abort("InnerLU<1>::serial_invoke, assert failure (m<=1)"); if (m <= 0) return 0; diff --git a/batched/dense/impl/KokkosBatched_InnerTrsm_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_InnerTrsm_Serial_Impl.hpp index 539980a705..04825ac61c 100644 --- a/batched/dense/impl/KokkosBatched_InnerTrsm_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_InnerTrsm_Serial_Impl.hpp @@ -30,19 +30,16 @@ 
namespace KokkosBatched { template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<5>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<5>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; - const ValueType a_10 = A[1 * _as0 + 0 * _as1], a_20 = A[2 * _as0 + 0 * _as1], - a_21 = A[2 * _as0 + 1 * _as1], a_30 = A[3 * _as0 + 0 * _as1], - a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1], - a_40 = A[4 * _as0 + 0 * _as1], a_41 = A[4 * _as0 + 1 * _as1], - a_42 = A[4 * _as0 + 2 * _as1], a_43 = A[4 * _as0 + 3 * _as1]; + const ValueType a_10 = A[1 * _as0 + 0 * _as1], a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], + a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1], + a_40 = A[4 * _as0 + 0 * _as1], a_41 = A[4 * _as0 + 1 * _as1], a_42 = A[4 * _as0 + 2 * _as1], + a_43 = A[4 * _as0 + 3 * _as1]; - auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, - ValueType &b_2p, ValueType &b_3p, ValueType &b_4p) { + auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, ValueType &b_2p, ValueType &b_3p, ValueType &b_4p) { // load b_0p = B[0 * _bs0 + p * _bs1]; b_1p = B[1 * _bs0 + p * _bs1]; @@ -87,17 +84,14 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<4>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<4>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; - const ValueType a_10 = A[1 * _as0 + 0 * _as1], a_20 = A[2 * _as0 + 0 * _as1], - a_21 = A[2 * _as0 + 1 * _as1], a_30 = A[3 * _as0 + 0 * _as1], - a_31 = A[3 * _as0 + 
1 * _as1], a_32 = A[3 * _as0 + 2 * _as1]; + const ValueType a_10 = A[1 * _as0 + 0 * _as1], a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], + a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1]; - auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, - ValueType &b_2p, ValueType &b_3p) { + auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, ValueType &b_2p, ValueType &b_3p) { // load b_0p = B[0 * _bs0 + p * _bs1]; b_1p = B[1 * _bs0 + p * _bs1]; @@ -134,16 +128,13 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<3>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<3>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; - const ValueType a_10 = A[1 * _as0 + 0 * _as1], a_20 = A[2 * _as0 + 0 * _as1], - a_21 = A[2 * _as0 + 1 * _as1]; + const ValueType a_10 = A[1 * _as0 + 0 * _as1], a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1]; - auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, - ValueType &b_2p) { + auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, ValueType &b_2p) { // load b_0p = B[0 * _bs0 + p * _bs1]; b_1p = B[1 * _bs0 + p * _bs1]; @@ -173,9 +164,8 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<2>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<2>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; const ValueType a_10 = A[1 * _as0 + 0 * _as1]; @@ -205,9 +195,9 @@ KOKKOS_INLINE_FUNCTION 
int InnerTrsmLeftLowerUnitDiag<2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<1>::serial_invoke( - const ValueType *KOKKOS_RESTRICT /* A */, const int /* n */, - /**/ ValueType *KOKKOS_RESTRICT /* B */) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<1>::serial_invoke(const ValueType *KOKKOS_RESTRICT /* A */, + const int /* n */, + /**/ ValueType *KOKKOS_RESTRICT /* B */) { return 0; } @@ -218,12 +208,10 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<5>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { - if (m > 5) - Kokkos::abort( - "InnerTrsmLeftLowerUnitDiag<5>::serial_invoke, assert failure (m<=5)"); +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<5>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int m, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { + if (m > 5) Kokkos::abort("InnerTrsmLeftLowerUnitDiag<5>::serial_invoke, assert failure (m<=5)"); if (m <= 0 || n <= 0) return 0; switch (m) { case 5: { @@ -256,12 +244,10 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<5>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<4>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { - if (m > 4) - Kokkos::abort( - "InnerTrsmLeftLowerUnitDiag<4>::serial_invoke, assert failure (m<=4)"); +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<4>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int m, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { + if (m > 4) Kokkos::abort("InnerTrsmLeftLowerUnitDiag<4>::serial_invoke, assert failure (m<=4)"); if (m <= 0 || n <= 0) return 0; switch (m) { case 4: { @@ -289,12 +275,10 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<4>::serial_invoke( } 
template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<3>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { - if (m > 3) - Kokkos::abort( - "InnerTrsmLeftLowerUnitDiag<3>::serial_invoke, assert failure (m<=3)"); +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<3>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int m, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { + if (m > 3) Kokkos::abort("InnerTrsmLeftLowerUnitDiag<3>::serial_invoke, assert failure (m<=3)"); if (m <= 0 || n <= 0) return 0; switch (m) { case 3: { @@ -317,12 +301,10 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<3>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<2>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { - if (m > 2) - Kokkos::abort( - "InnerTrsmLeftLowerUnitDiag<2>::serial_invoke, assert failure (m<=2)"); +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<2>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int m, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { + if (m > 2) Kokkos::abort("InnerTrsmLeftLowerUnitDiag<2>::serial_invoke, assert failure (m<=2)"); if (m <= 0 || n <= 0) return 0; switch (m) { case 2: { @@ -340,12 +322,10 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<2>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<1>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { - if (m > 1) - Kokkos::abort( - "InnerTrsmLeftLowerUnitDiag<1>::serial_invoke, assert failure (m<=1)"); +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<1>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int m, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { + if (m > 1) 
Kokkos::abort("InnerTrsmLeftLowerUnitDiag<1>::serial_invoke, assert failure (m<=1)"); if (m <= 0 || n <= 0) return 0; switch (m) { case 1: { @@ -364,16 +344,15 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<5>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<5>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; - const ValueType a_10 = A[1 * _as0 + 0 * _as1], a_20 = A[2 * _as0 + 0 * _as1], - a_21 = A[2 * _as0 + 1 * _as1], a_30 = A[3 * _as0 + 0 * _as1], - a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1], - a_40 = A[4 * _as0 + 0 * _as1], a_41 = A[4 * _as0 + 1 * _as1], - a_42 = A[4 * _as0 + 2 * _as1], a_43 = A[4 * _as0 + 3 * _as1]; + const ValueType a_10 = A[1 * _as0 + 0 * _as1], a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], + a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1], + a_40 = A[4 * _as0 + 0 * _as1], a_41 = A[4 * _as0 + 1 * _as1], a_42 = A[4 * _as0 + 2 * _as1], + a_43 = A[4 * _as0 + 3 * _as1]; // const ValueType // a_00 = A[0*_as0+0*_as1], @@ -382,19 +361,13 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<5>::serial_invoke( // a_33 = A[3*_as0+3*_as1], // a_44 = A[4*_as0+4*_as1]; - const ValueType inv_a_00 = - static_cast(1.0) / A[0 * _as0 + 0 * _as1], - inv_a_11 = - static_cast(1.0) / A[1 * _as0 + 1 * _as1], - inv_a_22 = - static_cast(1.0) / A[2 * _as0 + 2 * _as1], - inv_a_33 = - static_cast(1.0) / A[3 * _as0 + 3 * _as1], - inv_a_44 = - static_cast(1.0) / A[4 * _as0 + 4 * _as1]; - - auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, - ValueType &b_2p, ValueType &b_3p, ValueType &b_4p) { + const ValueType inv_a_00 = static_cast(1.0) / A[0 * _as0 + 0 * _as1], + inv_a_11 = 
static_cast(1.0) / A[1 * _as0 + 1 * _as1], + inv_a_22 = static_cast(1.0) / A[2 * _as0 + 2 * _as1], + inv_a_33 = static_cast(1.0) / A[3 * _as0 + 3 * _as1], + inv_a_44 = static_cast(1.0) / A[4 * _as0 + 4 * _as1]; + + auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, ValueType &b_2p, ValueType &b_3p, ValueType &b_4p) { // load b_0p = B[0 * _bs0 + p * _bs1]; b_1p = B[1 * _bs0 + p * _bs1]; @@ -448,14 +421,13 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<4>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<4>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; - const ValueType a_10 = A[1 * _as0 + 0 * _as1], a_20 = A[2 * _as0 + 0 * _as1], - a_21 = A[2 * _as0 + 1 * _as1], a_30 = A[3 * _as0 + 0 * _as1], - a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1]; + const ValueType a_10 = A[1 * _as0 + 0 * _as1], a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], + a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1]; // const ValueType // a_00 = A[0*_as0+0*_as1], @@ -463,17 +435,12 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<4>::serial_invoke( // a_22 = A[2*_as0+2*_as1], // a_33 = A[3*_as0+3*_as1]; - const ValueType inv_a_00 = - static_cast(1.0) / A[0 * _as0 + 0 * _as1], - inv_a_11 = - static_cast(1.0) / A[1 * _as0 + 1 * _as1], - inv_a_22 = - static_cast(1.0) / A[2 * _as0 + 2 * _as1], - inv_a_33 = - static_cast(1.0) / A[3 * _as0 + 3 * _as1]; - - auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, - ValueType &b_2p, ValueType &b_3p) { + const ValueType inv_a_00 = static_cast(1.0) / A[0 * _as0 + 0 * _as1], + inv_a_11 = static_cast(1.0) / A[1 * _as0 + 1 * _as1], + inv_a_22 = 
static_cast(1.0) / A[2 * _as0 + 2 * _as1], + inv_a_33 = static_cast(1.0) / A[3 * _as0 + 3 * _as1]; + + auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, ValueType &b_2p, ValueType &b_3p) { // load b_0p = B[0 * _bs0 + p * _bs1]; b_1p = B[1 * _bs0 + p * _bs1]; @@ -518,28 +485,23 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<3>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<3>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; - const ValueType a_10 = A[1 * _as0 + 0 * _as1], a_20 = A[2 * _as0 + 0 * _as1], - a_21 = A[2 * _as0 + 1 * _as1]; + const ValueType a_10 = A[1 * _as0 + 0 * _as1], a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1]; // const ValueType // a_00 = A[0*_as0+0*_as1], // a_11 = A[1*_as0+1*_as1], // a_22 = A[2*_as0+2*_as1]; - const ValueType inv_a_00 = - static_cast(1.0) / A[0 * _as0 + 0 * _as1], - inv_a_11 = - static_cast(1.0) / A[1 * _as0 + 1 * _as1], - inv_a_22 = - static_cast(1.0) / A[2 * _as0 + 2 * _as1]; + const ValueType inv_a_00 = static_cast(1.0) / A[0 * _as0 + 0 * _as1], + inv_a_11 = static_cast(1.0) / A[1 * _as0 + 1 * _as1], + inv_a_22 = static_cast(1.0) / A[2 * _as0 + 2 * _as1]; - auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, - ValueType &b_2p) { + auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, ValueType &b_2p) { // load b_0p = B[0 * _bs0 + p * _bs1]; b_1p = B[1 * _bs0 + p * _bs1]; @@ -576,9 +538,9 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<2>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int 
InnerTrsmLeftLowerNonUnitDiag<2>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; const ValueType a_10 = A[1 * _as0 + 0 * _as1]; @@ -587,10 +549,8 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<2>::serial_invoke( // a_00 = A[0*_as0+0*_as1], // a_11 = A[1*_as0+1*_as1]; - const ValueType inv_a_00 = - static_cast(1.0) / A[0 * _as0 + 0 * _as1], - inv_a_11 = - static_cast(1.0) / A[1 * _as0 + 1 * _as1]; + const ValueType inv_a_00 = static_cast(1.0) / A[0 * _as0 + 0 * _as1], + inv_a_11 = static_cast(1.0) / A[1 * _as0 + 1 * _as1]; auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p) { // load @@ -622,16 +582,15 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<1>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<1>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; // const ValueType // a_00 = A[0*_as0+0*_as1]; - const ValueType inv_a_00 = - static_cast(1.0) / A[0 * _as0 + 0 * _as1]; + const ValueType inv_a_00 = static_cast(1.0) / A[0 * _as0 + 0 * _as1]; auto trsv = [&](const int p, ValueType & /* b_0p */) { B[0 * _bs0 + p * _bs1] *= inv_a_00; /* b_0p /= a_00;*/ @@ -655,9 +614,9 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<5>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<5>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int m, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (m > 5) Kokkos::abort( 
"InnerTrsmLeftLowerNonUnitDiag<5>::serial_invoke, assert failure " @@ -694,9 +653,9 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<5>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<4>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<4>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int m, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (m > 4) Kokkos::abort( "InnerTrsmLeftLowerNonUnitDiag<4>::serial_invoke, assert failure " @@ -728,9 +687,9 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<4>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<3>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<3>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int m, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (m > 3) Kokkos::abort( "InnerTrsmLeftLowerNonUnitDiag<3>::serial_invoke, assert failure " @@ -757,9 +716,9 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<3>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<2>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<2>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int m, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (m > 2) Kokkos::abort( "InnerTrsmLeftLowerNonUnitDiag<2>::serial_invoke, assert failure " @@ -781,9 +740,9 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<2>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<1>::serial_invoke( - const ValueType 
*KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<1>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int m, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (m > 1) Kokkos::abort( "InnerTrsmLeftLowerNonUnitDiag<1>::serial_invoke, assert failure " @@ -806,21 +765,17 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<5>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<5>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; - const ValueType a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], - a_03 = A[0 * _as0 + 3 * _as1], a_04 = A[0 * _as0 + 4 * _as1], - /**/ a_12 = A[1 * _as0 + 2 * _as1], - a_13 = A[1 * _as0 + 3 * _as1], a_14 = A[1 * _as0 + 4 * _as1], - /**/ a_23 = A[2 * _as0 + 3 * _as1], - a_24 = A[2 * _as0 + 4 * _as1], + const ValueType a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1], + a_04 = A[0 * _as0 + 4 * _as1], + /**/ a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], a_14 = A[1 * _as0 + 4 * _as1], + /**/ a_23 = A[2 * _as0 + 3 * _as1], a_24 = A[2 * _as0 + 4 * _as1], /**/ a_34 = A[3 * _as0 + 4 * _as1]; - auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, - ValueType &b_2p, ValueType &b_3p, ValueType &b_4p) { + auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, ValueType &b_2p, ValueType &b_3p, ValueType &b_4p) { // load b_0p = B[0 * _bs0 + p * _bs1]; b_1p = B[1 * _bs0 + p * _bs1]; @@ -866,19 +821,15 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<4>::serial_invoke( - const 
ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<4>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; - const ValueType a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], - a_03 = A[0 * _as0 + 3 * _as1], - /**/ a_12 = A[1 * _as0 + 2 * _as1], - a_13 = A[1 * _as0 + 3 * _as1], + const ValueType a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1], + /**/ a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], /**/ a_23 = A[2 * _as0 + 3 * _as1]; - auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, - ValueType &b_2p, ValueType &b_3p) { + auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, ValueType &b_2p, ValueType &b_3p) { // load b_0p = B[0 * _bs0 + p * _bs1]; b_1p = B[1 * _bs0 + p * _bs1]; @@ -916,16 +867,14 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<3>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<3>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; const ValueType a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], /**/ a_12 = A[1 * _as0 + 2 * _as1]; - auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, - ValueType &b_2p) { + auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, ValueType &b_2p) { // load b_0p = B[0 * _bs0 + p * _bs1]; b_1p = B[1 * _bs0 + p * _bs1]; @@ -956,9 +905,8 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<2>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - 
/**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<2>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; const ValueType a_01 = A[0 * _as0 + 1 * _as1]; @@ -988,9 +936,9 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<1>::serial_invoke( - const ValueType *KOKKOS_RESTRICT /* A */, const int /* n */, - /**/ ValueType *KOKKOS_RESTRICT /* B */) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<1>::serial_invoke(const ValueType *KOKKOS_RESTRICT /* A */, + const int /* n */, + /**/ ValueType *KOKKOS_RESTRICT /* B */) { return 0; } @@ -1001,12 +949,10 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<5>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { - if (m > 5) - Kokkos::abort( - "InnerTrsmLeftUpperUnitDiag<5>::serial_invoke, assert failure (m<=5)"); +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<5>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int m, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { + if (m > 5) Kokkos::abort("InnerTrsmLeftUpperUnitDiag<5>::serial_invoke, assert failure (m<=5)"); if (m <= 0 || n <= 0) return 0; switch (m) { case 5: { @@ -1039,12 +985,10 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<5>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<4>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { - if (m > 4) - Kokkos::abort( - "InnerTrsmLeftUpperUnitDiag<4>::serial_invoke, assert failure (m<=4)"); +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<4>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const 
int m, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { + if (m > 4) Kokkos::abort("InnerTrsmLeftUpperUnitDiag<4>::serial_invoke, assert failure (m<=4)"); if (m <= 0 || n <= 0) return 0; switch (m) { case 4: { @@ -1072,12 +1016,10 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<4>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<3>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { - if (m > 3) - Kokkos::abort( - "InnerTrsmLeftUpperUnitDiag<3>::serial_invoke, assert failure (m<=3)"); +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<3>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int m, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { + if (m > 3) Kokkos::abort("InnerTrsmLeftUpperUnitDiag<3>::serial_invoke, assert failure (m<=3)"); if (m <= 0 || n <= 0) return 0; switch (m) { case 3: { @@ -1100,12 +1042,10 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<3>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<2>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { - if (m > 2) - Kokkos::abort( - "InnerTrsmLeftUpperUnitDiag<2>::serial_invoke, assert failure (m<=2)"); +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<2>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int m, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { + if (m > 2) Kokkos::abort("InnerTrsmLeftUpperUnitDiag<2>::serial_invoke, assert failure (m<=2)"); if (m <= 0 || n <= 0) return 0; switch (m) { case 2: { @@ -1123,12 +1063,10 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<2>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<1>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { - if (m > 1) - 
Kokkos::abort( - "InnerTrsmLeftUpperUnitDiag<1>::serial_invoke, assert failure (m<=1)"); +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<1>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int m, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { + if (m > 1) Kokkos::abort("InnerTrsmLeftUpperUnitDiag<1>::serial_invoke, assert failure (m<=1)"); if (m <= 0 || n <= 0) return 0; switch (m) { case 1: { @@ -1147,17 +1085,15 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<5>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<5>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; - const ValueType a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], - a_03 = A[0 * _as0 + 3 * _as1], a_04 = A[0 * _as0 + 4 * _as1], - /**/ a_12 = A[1 * _as0 + 2 * _as1], - a_13 = A[1 * _as0 + 3 * _as1], a_14 = A[1 * _as0 + 4 * _as1], - /**/ a_23 = A[2 * _as0 + 3 * _as1], - a_24 = A[2 * _as0 + 4 * _as1], + const ValueType a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1], + a_04 = A[0 * _as0 + 4 * _as1], + /**/ a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], a_14 = A[1 * _as0 + 4 * _as1], + /**/ a_23 = A[2 * _as0 + 3 * _as1], a_24 = A[2 * _as0 + 4 * _as1], /**/ a_34 = A[3 * _as0 + 4 * _as1]; // const ValueType @@ -1167,19 +1103,13 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<5>::serial_invoke( // a_33 = A[3*_as0+3*_as1], // a_44 = A[4*_as0+4*_as1]; - const ValueType inv_a_00 = - static_cast(1.0) / A[0 * _as0 + 0 * _as1], - inv_a_11 = - static_cast(1.0) / A[1 * _as0 + 1 * _as1], - inv_a_22 = - static_cast(1.0) / A[2 * _as0 + 2 * _as1], - inv_a_33 = - static_cast(1.0) / A[3 * _as0 + 3 * _as1], - inv_a_44 = - 
static_cast(1.0) / A[4 * _as0 + 4 * _as1]; - - auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, - ValueType &b_2p, ValueType &b_3p, ValueType &b_4p) { + const ValueType inv_a_00 = static_cast(1.0) / A[0 * _as0 + 0 * _as1], + inv_a_11 = static_cast(1.0) / A[1 * _as0 + 1 * _as1], + inv_a_22 = static_cast(1.0) / A[2 * _as0 + 2 * _as1], + inv_a_33 = static_cast(1.0) / A[3 * _as0 + 3 * _as1], + inv_a_44 = static_cast(1.0) / A[4 * _as0 + 4 * _as1]; + + auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, ValueType &b_2p, ValueType &b_3p, ValueType &b_4p) { // load b_0p = B[0 * _bs0 + p * _bs1]; b_1p = B[1 * _bs0 + p * _bs1]; @@ -1233,15 +1163,13 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<4>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<4>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; - const ValueType a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], - a_03 = A[0 * _as0 + 3 * _as1], - /**/ a_12 = A[1 * _as0 + 2 * _as1], - a_13 = A[1 * _as0 + 3 * _as1], + const ValueType a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1], + /**/ a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], /**/ a_23 = A[2 * _as0 + 3 * _as1]; // const ValueType @@ -1250,17 +1178,12 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<4>::serial_invoke( // a_22 = A[2*_as0+2*_as1], // a_33 = A[3*_as0+3*_as1]; - const ValueType inv_a_00 = - static_cast(1.0) / A[0 * _as0 + 0 * _as1], - inv_a_11 = - static_cast(1.0) / A[1 * _as0 + 1 * _as1], - inv_a_22 = - static_cast(1.0) / A[2 * _as0 + 2 * _as1], - inv_a_33 = - static_cast(1.0) / A[3 * _as0 + 3 * _as1]; - - auto trsv = [&](const int p, ValueType &b_0p, 
ValueType &b_1p, - ValueType &b_2p, ValueType &b_3p) { + const ValueType inv_a_00 = static_cast(1.0) / A[0 * _as0 + 0 * _as1], + inv_a_11 = static_cast(1.0) / A[1 * _as0 + 1 * _as1], + inv_a_22 = static_cast(1.0) / A[2 * _as0 + 2 * _as1], + inv_a_33 = static_cast(1.0) / A[3 * _as0 + 3 * _as1]; + + auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, ValueType &b_2p, ValueType &b_3p) { // load b_0p = B[0 * _bs0 + p * _bs1]; b_1p = B[1 * _bs0 + p * _bs1]; @@ -1305,9 +1228,9 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<3>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<3>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; const ValueType a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], @@ -1318,15 +1241,11 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<3>::serial_invoke( // a_11 = A[1*_as0+1*_as1], // a_22 = A[2*_as0+2*_as1]; - const ValueType inv_a_00 = - static_cast(1.0) / A[0 * _as0 + 0 * _as1], - inv_a_11 = - static_cast(1.0) / A[1 * _as0 + 1 * _as1], - inv_a_22 = - static_cast(1.0) / A[2 * _as0 + 2 * _as1]; + const ValueType inv_a_00 = static_cast(1.0) / A[0 * _as0 + 0 * _as1], + inv_a_11 = static_cast(1.0) / A[1 * _as0 + 1 * _as1], + inv_a_22 = static_cast(1.0) / A[2 * _as0 + 2 * _as1]; - auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, - ValueType &b_2p) { + auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, ValueType &b_2p) { // load b_0p = B[0 * _bs0 + p * _bs1]; b_1p = B[1 * _bs0 + p * _bs1]; @@ -1363,9 +1282,9 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<2>::serial_invoke( - const 
ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<2>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; const ValueType a_01 = A[0 * _as0 + 1 * _as1]; @@ -1374,10 +1293,8 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<2>::serial_invoke( // a_00 = A[0*_as0+0*_as1], // a_11 = A[1*_as0+1*_as1]; - const ValueType inv_a_00 = - static_cast(1.0) / A[0 * _as0 + 0 * _as1], - inv_a_11 = - static_cast(1.0) / A[1 * _as0 + 1 * _as1]; + const ValueType inv_a_00 = static_cast(1.0) / A[0 * _as0 + 0 * _as1], + inv_a_11 = static_cast(1.0) / A[1 * _as0 + 1 * _as1]; auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p) { // load @@ -1409,16 +1326,15 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<1>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<1>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; // const ValueType // a_00 = A[0*_as0+0*_as1]; - const ValueType inv_a_00 = - static_cast(1.0) / A[0 * _as0 + 0 * _as1]; + const ValueType inv_a_00 = static_cast(1.0) / A[0 * _as0 + 0 * _as1]; auto trsv = [&](const int p, ValueType & /* b_0p */) { // 0 iteration @@ -1443,9 +1359,9 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<5>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<5>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int m, const int n, + /**/ ValueType 
*KOKKOS_RESTRICT B) { if (m > 5) Kokkos::abort( "InnerTrsmLeftUpperNonUnitDiag<5>::serial_invoke, assert failure " @@ -1482,9 +1398,9 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<5>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<4>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<4>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int m, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (m > 4) Kokkos::abort( "InnerTrsmLeftUpperNonUnitDiag<4>::serial_invoke, assert failure " @@ -1516,9 +1432,9 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<4>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<3>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<3>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int m, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (m > 3) Kokkos::abort( "InnerTrsmLeftUpperNonUnitDiag<3>::serial_invoke, assert failure " @@ -1545,9 +1461,9 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<3>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<2>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<2>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int m, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (m > 2) Kokkos::abort( "InnerTrsmLeftUpperNonUnitDiag<2>::serial_invoke, assert failure " @@ -1569,9 +1485,9 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<2>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int 
InnerTrsmLeftUpperNonUnitDiag<1>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<1>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int m, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (m > 1) Kokkos::abort( "InnerTrsmLeftUpperNonUnitDiag<1>::serial_invoke, assert failure " diff --git a/batched/dense/impl/KokkosBatched_InverseLU_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_InverseLU_Serial_Impl.hpp index 070a620531..215c62e9f2 100644 --- a/batched/dense/impl/KokkosBatched_InverseLU_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_InverseLU_Serial_Impl.hpp @@ -32,49 +32,42 @@ namespace KokkosBatched { /// InverseLU no piv /// -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && \ - defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template <> template -KOKKOS_INLINE_FUNCTION int SerialInverseLU::invoke( - const AViewType &A, const WViewType &W) { +KOKKOS_INLINE_FUNCTION int SerialInverseLU::invoke(const AViewType &A, + const WViewType &W) { typedef typename AViewType::value_type vector_type; // typedef typename vector_type::value_type value_type; const int m = A.extent(0), n = A.extent(1); static_assert(is_vector::value, "value type is not vector type"); - static_assert( - vector_type::vector_length == 4 || vector_type::vector_length == 8, - "AVX, AVX2 and AVX512 is supported"); + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); static_assert(AViewType::rank == 2, "A should have two dimensions"); static_assert(WViewType::rank == 1, "W should have one dimension"); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "A and W should be on the 
same memory space"); - static_assert(!std::is_same::value, + static_assert(!std::is_same::value, "W should be an contiguous 1D array"); assert(A.extent(0) * A.extent(1) * sizeof(typename AViewType::value_type) <= W.span() * sizeof(typename WViewType::value_type)); assert(m == n); - const MKL_COMPACT_PACK format = - vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; int r_val = 0; if (A.stride(0) == 1) { - mkl_dgetrinp_compact( - MKL_COL_MAJOR, n, (double *)A.data(), A.stride(1), (double *)W.data(), - (MKL_INT)(n * n * vector_type::vector_length), (MKL_INT *)&r_val, - format, (MKL_INT)vector_type::vector_length); + mkl_dgetrinp_compact(MKL_COL_MAJOR, n, (double *)A.data(), A.stride(1), (double *)W.data(), + (MKL_INT)(n * n * vector_type::vector_length), (MKL_INT *)&r_val, format, + (MKL_INT)vector_type::vector_length); } else if (A.stride(1) == 1) { - mkl_dgetrinp_compact( - MKL_ROW_MAJOR, n, (double *)A.data(), A.stride(0), (double *)W.data(), - (MKL_INT)(n * n * vector_type::vector_length), (MKL_INT *)&r_val, - format, (MKL_INT)vector_type::vector_length); + mkl_dgetrinp_compact(MKL_ROW_MAJOR, n, (double *)A.data(), A.stride(0), (double *)W.data(), + (MKL_INT)(n * n * vector_type::vector_length), (MKL_INT *)&r_val, format, + (MKL_INT)vector_type::vector_length); } else { r_val = -1; } diff --git a/batched/dense/impl/KokkosBatched_LU_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_LU_Serial_Impl.hpp index 2fa372aa7c..e2acd012cb 100644 --- a/batched/dense/impl/KokkosBatched_LU_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_LU_Serial_Impl.hpp @@ -31,35 +31,28 @@ namespace KokkosBatched { /// SerialLU no piv /// -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && \ - defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ 
defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template <> template KOKKOS_INLINE_FUNCTION int SerialLU::invoke( - const AViewType &A, - const typename MagnitudeScalarType< - typename AViewType::non_const_value_type>::type tiny) { + const AViewType &A, const typename MagnitudeScalarType::type tiny) { typedef typename AViewType::value_type vector_type; // typedef typename vector_type::value_type value_type; const int m = A.extent(0), n = A.extent(1); static_assert(is_vector::value, "value type is not vector type"); - static_assert( - vector_type::vector_length == 4 || vector_type::vector_length == 8, - "AVX, AVX2 and AVX512 is supported"); - const MKL_COMPACT_PACK format = - vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; int r_val = 0; if (A.stride_0() == 1) { - mkl_dgetrfnp_compact(MKL_COL_MAJOR, m, n, (double *)A.data(), A.stride_1(), - (MKL_INT *)&r_val, format, + mkl_dgetrfnp_compact(MKL_COL_MAJOR, m, n, (double *)A.data(), A.stride_1(), (MKL_INT *)&r_val, format, (MKL_INT)vector_type::vector_length); } else if (A.stride_1() == 1) { - mkl_dgetrfnp_compact(MKL_ROW_MAJOR, m, n, (double *)A.data(), A.stride_0(), - (MKL_INT *)&r_val, format, + mkl_dgetrfnp_compact(MKL_ROW_MAJOR, m, n, (double *)A.data(), A.stride_0(), (MKL_INT *)&r_val, format, (MKL_INT)vector_type::vector_length); } else { r_val = -1; @@ -71,21 +64,17 @@ KOKKOS_INLINE_FUNCTION int SerialLU::invoke( template <> template KOKKOS_INLINE_FUNCTION int SerialLU::invoke( - const AViewType &A, - const typename MagnitudeScalarType< - typename AViewType::non_const_value_type>::type tiny) { - return SerialLU_Internal::invoke( - A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), tiny); + const AViewType &A, const typename 
MagnitudeScalarType::type tiny) { + return SerialLU_Internal::invoke(A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), + tiny); } template <> template KOKKOS_INLINE_FUNCTION int SerialLU::invoke( - const AViewType &A, - const typename MagnitudeScalarType< - typename AViewType::non_const_value_type>::type tiny) { - return SerialLU_Internal::invoke( - A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), tiny); + const AViewType &A, const typename MagnitudeScalarType::type tiny) { + return SerialLU_Internal::invoke(A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), + tiny); } } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_LU_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_LU_Serial_Internal.hpp index e6b34d8f1b..6555a16d93 100644 --- a/batched/dense/impl/KokkosBatched_LU_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_LU_Serial_Internal.hpp @@ -33,16 +33,15 @@ namespace KokkosBatched { template struct SerialLU_Internal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const int m, const int n, ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, const typename MagnitudeScalarType::type tiny); + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, const typename MagnitudeScalarType::type tiny); }; template <> template KOKKOS_INLINE_FUNCTION int SerialLU_Internal::invoke( - const int m, const int n, ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, const typename MagnitudeScalarType::type tiny) { + const int m, const int n, ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + const typename MagnitudeScalarType::type tiny) { const int k = (m < n ? 
m : n); if (k <= 0) return 0; @@ -55,14 +54,12 @@ KOKKOS_INLINE_FUNCTION int SerialLU_Internal::invoke( const ValueType *KOKKOS_RESTRICT a12t = A + (p)*as0 + (p + 1) * as1; - ValueType *KOKKOS_RESTRICT a21 = A + (p + 1) * as0 + (p)*as1, - *KOKKOS_RESTRICT A22 = - A + (p + 1) * as0 + (p + 1) * as1; + ValueType *KOKKOS_RESTRICT a21 = A + (p + 1) * as0 + (p)*as1, + *KOKKOS_RESTRICT A22 = A + (p + 1) * as0 + (p + 1) * as1; if (tiny != 0) { ValueType &alpha11_reference = A[p * as0 + p * as1]; - const auto alpha11_real = - Kokkos::ArithTraits::real(alpha11_reference); + const auto alpha11_real = Kokkos::ArithTraits::real(alpha11_reference); alpha11_reference += minus_abs_tiny * ValueType(alpha11_real < 0); alpha11_reference += abs_tiny * ValueType(alpha11_real >= 0); } @@ -76,8 +73,7 @@ KOKKOS_INLINE_FUNCTION int SerialLU_Internal::invoke( #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif - for (int j = 0; j < jend; ++j) - A22[i * as0 + j * as1] -= a21[i * as0] * a12t[j * as1]; + for (int j = 0; j < jend; ++j) A22[i * as0 + j * as1] -= a21[i * as0] * a12t[j * as1]; } } return 0; @@ -86,8 +82,7 @@ KOKKOS_INLINE_FUNCTION int SerialLU_Internal::invoke( template <> template KOKKOS_INLINE_FUNCTION int SerialLU_Internal::invoke( - const int m, const int n, ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, + const int m, const int n, ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, const typename MagnitudeScalarType::type /*tiny*/) { constexpr int mbAlgo = Algo::LU::Blocked::mb(); const typename MagnitudeScalarType::type one(1.0), minus_one(-1.0); @@ -100,8 +95,7 @@ KOKKOS_INLINE_FUNCTION int SerialLU_Internal::invoke( InnerTrsmLeftLowerUnitDiag trsm_llu(as0, as1, as0, as1); InnerTrsmLeftLowerNonUnitDiag trsm_run(as1, as0, as1, as0); - auto lu_factorize = [&](const int ib, const int jb, - ValueType *KOKKOS_RESTRICT AA) { + auto lu_factorize = [&](const int ib, const int jb, ValueType *KOKKOS_RESTRICT AA) { const int mb = mbAlgo; const int kb 
= ib < jb ? ib : jb; for (int p = 0; p < kb; p += mb) { @@ -121,9 +115,8 @@ KOKKOS_INLINE_FUNCTION int SerialLU_Internal::invoke( trsm_run.serial_invoke(Ap, pb, m_abr, Ap + mb * as0); // gemm update - SerialGemmInternal::invoke( - m_abr, n_abr, pb, minus_one, Ap + mb * as0, as0, as1, Ap + mb * as1, - as0, as1, one, Ap + mb * as0 + mb * as1, as0, as1); + SerialGemmInternal::invoke(m_abr, n_abr, pb, minus_one, Ap + mb * as0, as0, as1, + Ap + mb * as1, as0, as1, one, Ap + mb * as0 + mb * as1, as0, as1); } }; diff --git a/batched/dense/impl/KokkosBatched_LU_Team_Impl.hpp b/batched/dense/impl/KokkosBatched_LU_Team_Impl.hpp index 3f28c063b8..9ed5e244d2 100644 --- a/batched/dense/impl/KokkosBatched_LU_Team_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_LU_Team_Impl.hpp @@ -36,11 +36,9 @@ struct TeamLU { template KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, const AViewType &A, - const typename MagnitudeScalarType< - typename AViewType::non_const_value_type>::type tiny = 0) { - return TeamLU_Internal::invoke( - member, A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), - tiny); + const typename MagnitudeScalarType::type tiny = 0) { + return TeamLU_Internal::invoke(member, A.extent(0), A.extent(1), A.data(), A.stride_0(), + A.stride_1(), tiny); } }; @@ -49,11 +47,9 @@ struct TeamLU { template KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, const AViewType &A, - const typename MagnitudeScalarType< - typename AViewType::non_const_value_type>::type tiny = 0) { - return TeamLU_Internal::invoke( - member, A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), - tiny); + const typename MagnitudeScalarType::type tiny = 0) { + return TeamLU_Internal::invoke(member, A.extent(0), A.extent(1), A.data(), A.stride_0(), + A.stride_1(), tiny); } }; diff --git a/batched/dense/impl/KokkosBatched_LU_Team_Internal.hpp b/batched/dense/impl/KokkosBatched_LU_Team_Internal.hpp index cbc811de5e..dacfb02ed4 100644 --- 
a/batched/dense/impl/KokkosBatched_LU_Team_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_LU_Team_Internal.hpp @@ -35,17 +35,15 @@ namespace KokkosBatched { template struct TeamLU_Internal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const int n, - ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - const typename MagnitudeScalarType::type tiny); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, + ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + const typename MagnitudeScalarType::type tiny); }; template <> template KOKKOS_INLINE_FUNCTION int TeamLU_Internal::invoke( - const MemberType &member, const int m, const int n, - ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + const MemberType &member, const int m, const int n, ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, const typename MagnitudeScalarType::type tiny) { const int k = (m < n ? m : n); if (k <= 0) return 0; @@ -60,15 +58,13 @@ KOKKOS_INLINE_FUNCTION int TeamLU_Internal::invoke( const ValueType *KOKKOS_RESTRICT a12t = A + (p)*as0 + (p + 1) * as1; - ValueType *KOKKOS_RESTRICT a21 = A + (p + 1) * as0 + (p)*as1, - *KOKKOS_RESTRICT A22 = - A + (p + 1) * as0 + (p + 1) * as1; + ValueType *KOKKOS_RESTRICT a21 = A + (p + 1) * as0 + (p)*as1, + *KOKKOS_RESTRICT A22 = A + (p + 1) * as0 + (p + 1) * as1; if (tiny != 0) { if (member.team_rank() == 0) { ValueType &alpha11_reference = A[p * as0 + p * as1]; - const auto alpha11_real = - Kokkos::ArithTraits::real(alpha11_reference); + const auto alpha11_real = Kokkos::ArithTraits::real(alpha11_reference); alpha11_reference += minus_abs_tiny * ValueType(alpha11_real < 0); alpha11_reference += abs_tiny * ValueType(alpha11_real >= 0); } @@ -76,19 +72,17 @@ KOKKOS_INLINE_FUNCTION int TeamLU_Internal::invoke( member.team_barrier(); const ValueType alpha11 = A[p * as0 + p * as1]; - 
Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, iend), - [&](const int &i) { - // a21[i*as0] *= inv_alpha11; - a21[i * as0] /= alpha11; - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, iend), [&](const int &i) { + // a21[i*as0] *= inv_alpha11; + a21[i * as0] /= alpha11; + }); member.team_barrier(); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, iend * jend), [&](const int &ij) { - // assume layout right for batched computation - const int i = ij / jend, j = ij % jend; - A22[i * as0 + j * as1] -= a21[i * as0] * a12t[j * as1]; - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, iend * jend), [&](const int &ij) { + // assume layout right for batched computation + const int i = ij / jend, j = ij % jend; + A22[i * as0 + j * as1] -= a21[i * as0] * a12t[j * as1]; + }); } return 0; } @@ -96,8 +90,7 @@ KOKKOS_INLINE_FUNCTION int TeamLU_Internal::invoke( template <> template KOKKOS_INLINE_FUNCTION int TeamLU_Internal::invoke( - const MemberType &member, const int m, const int n, - ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + const MemberType &member, const int m, const int n, ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, const typename MagnitudeScalarType::type /*tiny*/) { constexpr int mbAlgo = Algo::LU::Blocked::mb(); @@ -110,15 +103,11 @@ KOKKOS_INLINE_FUNCTION int TeamLU_Internal::invoke( InnerTrsmLeftLowerUnitDiag trsm_llu(as0, as1, as0, as1); InnerTrsmLeftLowerNonUnitDiag trsm_run(as1, as0, as1, as0); - auto lu_factorize = [&](const int ib, const int jb, - ValueType *KOKKOS_RESTRICT AA) { + auto lu_factorize = [&](const int ib, const int jb, ValueType *KOKKOS_RESTRICT AA) { const int tsize = member.team_size(); // Made this non-const in order to WORKAROUND issue #349 int mb = mbAlgo; - int nb = ((jb - mb) + (ib - mb)) > 0 - ? ((jb - mb) + (ib - mb)) / tsize + - (((jb - mb) + (ib - mb)) % tsize > 0) - : 1; + int nb = ((jb - mb) + (ib - mb)) > 0 ? 
((jb - mb) + (ib - mb)) / tsize + (((jb - mb) + (ib - mb)) % tsize > 0) : 1; const int kb = ib < jb ? ib : jb; for (int p = 0; p < kb; p += mb) { @@ -133,29 +122,24 @@ KOKKOS_INLINE_FUNCTION int TeamLU_Internal::invoke( member.team_barrier(); // Made this non-const in order to WORKAROUND issue #349 - int m_abr = ib - p - mb, n_abr = jb - p - mb, mp_abr = m_abr % nb, - np_abr = n_abr % nb, mq_abr = (m_abr / nb) + (mp_abr > 0), - nq_abr = (n_abr / nb) + (np_abr > 0); + int m_abr = ib - p - mb, n_abr = jb - p - mb, mp_abr = m_abr % nb, np_abr = n_abr % nb, + mq_abr = (m_abr / nb) + (mp_abr > 0), nq_abr = (n_abr / nb) + (np_abr > 0); // trsm update - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, mq_abr + nq_abr), - [&](const int &ij) { - if (ij < nq_abr) { - const int j = (ij)*nb, qb = (j + nb) > n_abr ? np_abr : nb; - trsm_llu.serial_invoke(Ap, pb, qb, Ap + (j + mb) * as1); - } else { - const int i = (ij - nq_abr) * nb, - qb = (i + nb) > m_abr ? mp_abr : nb; - trsm_run.serial_invoke(Ap, pb, qb, Ap + (i + mb) * as0); - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, mq_abr + nq_abr), [&](const int &ij) { + if (ij < nq_abr) { + const int j = (ij)*nb, qb = (j + nb) > n_abr ? np_abr : nb; + trsm_llu.serial_invoke(Ap, pb, qb, Ap + (j + mb) * as1); + } else { + const int i = (ij - nq_abr) * nb, qb = (i + nb) > m_abr ? 
mp_abr : nb; + trsm_run.serial_invoke(Ap, pb, qb, Ap + (i + mb) * as0); + } + }); member.team_barrier(); // gemm update - TeamGemmInternal::invoke( - member, m_abr, n_abr, pb, minus_one, Ap + mb * as0, as0, as1, - Ap + mb * as1, as0, as1, one, Ap + mb * as0 + mb * as1, as0, as1); + TeamGemmInternal::invoke(member, m_abr, n_abr, pb, minus_one, Ap + mb * as0, as0, as1, + Ap + mb * as1, as0, as1, one, Ap + mb * as0 + mb * as1, as0, as1); } }; diff --git a/batched/dense/impl/KokkosBatched_LeftEigenvectorFromSchur_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_LeftEigenvectorFromSchur_Serial_Internal.hpp index ea87217a37..c266d65c54 100644 --- a/batched/dense/impl/KokkosBatched_LeftEigenvectorFromSchur_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_LeftEigenvectorFromSchur_Serial_Internal.hpp @@ -45,12 +45,9 @@ struct SerialLeftEigenvectorFromSchurInternal { /// contiguous workspace that can hold complex array (m) template KOKKOS_INLINE_FUNCTION static int invoke(const int m, - /* */ ValueType *S, const int ss0, - const int ss1, - /* */ ValueType *V, const int vs0, - const int vs1, - /* */ ValueType *w, - const int *blks) { + /* */ ValueType *S, const int ss0, const int ss1, + /* */ ValueType *V, const int vs0, const int vs1, + /* */ ValueType *w, const int *blks) { typedef ValueType value_type; typedef Kokkos::ArithTraits ats; // typedef typename ats::mag_type mag_type; @@ -77,8 +74,7 @@ struct SerialLeftEigenvectorFromSchurInternal { for (; m_stl < (m - 1);) { /// part 2x2 into 3x3 const int mA11 = blks[m_stl]; - assert(((mA11 == 1) || (mA11 == 2)) && - "LeftEigenvectorFromSchur: blk is not 1x1 nor 2x2"); + assert(((mA11 == 1) || (mA11 == 2)) && "LeftEigenvectorFromSchur: blk is not 1x1 nor 2x2"); S_part3x3.partWithABR(S_part2x2, mA11, mA11); V_part3x1.partWithAB(V_part2x1, mA11); @@ -90,23 +86,19 @@ struct SerialLeftEigenvectorFromSchurInternal { /// initialize a left hand side b[m_stl] = one; - for (int j = 0; j < (m - m_stl_plus_mA11); ++j) - 
b[j + m_stl_plus_mA11] = -S_part3x3.A12[j * ss1]; + for (int j = 0; j < (m - m_stl_plus_mA11); ++j) b[j + m_stl_plus_mA11] = -S_part3x3.A12[j * ss1]; /// perform shifted trsv (transposed) - SerialShiftedTrsvInternalLower::invoke( - m - m_stl_plus_mA11, lambda, S_part3x3.A22, ss1, ss0, - b + m_stl_plus_mA11, 1, blks + m_stl_plus_mA11); + SerialShiftedTrsvInternalLower::invoke(m - m_stl_plus_mA11, lambda, S_part3x3.A22, ss1, ss0, + b + m_stl_plus_mA11, 1, blks + m_stl_plus_mA11); /// copy back to V (row wise copy) for (int j = 0; j < m_stl; ++j) V_part3x1.A1[j * vs1] = zero; for (int j = m_stl; j < m; ++j) V_part3x1.A1[j * vs1] = b[j]; } else { /// complex eigen pair - const value_type alpha11 = S_part3x3.A11[0], - alpha12 = S_part3x3.A11[ss1], - alpha21 = S_part3x3.A11[ss0], - beta = ats::sqrt(-alpha12 * alpha21); + const value_type alpha11 = S_part3x3.A11[0], alpha12 = S_part3x3.A11[ss1], alpha21 = S_part3x3.A11[ss0], + beta = ats::sqrt(-alpha12 * alpha21); const complex_type lambda(alpha11, beta); complex_type *bc = (complex_type *)(b); @@ -118,13 +110,11 @@ struct SerialLeftEigenvectorFromSchurInternal { const value_type *S_A12_a = S_part3x3.A12; const value_type *S_A12_b = S_part3x3.A12 + ss0; for (int j = 0; j < (m - m_stl_plus_mA11); ++j) - bc[j + m_stl_plus_mA11] = complex_type(-S_A12_a[j * ss1] * beta, - S_A12_b[j * ss1] * alpha12); + bc[j + m_stl_plus_mA11] = complex_type(-S_A12_a[j * ss1] * beta, S_A12_b[j * ss1] * alpha12); /// perform shifted trsv - SerialShiftedTrsvInternalLower::invoke( - m - m_stl_plus_mA11, lambda, S_part3x3.A22, ss1, ss0, - bc + m_stl_plus_mA11, 1, blks + m_stl_plus_mA11); + SerialShiftedTrsvInternalLower::invoke(m - m_stl_plus_mA11, lambda, S_part3x3.A22, ss1, ss0, + bc + m_stl_plus_mA11, 1, blks + m_stl_plus_mA11); /// copy back to V value_type *V_A1_r = V_part3x1.A1; diff --git a/batched/dense/impl/KokkosBatched_Normalize_Internal.hpp b/batched/dense/impl/KokkosBatched_Normalize_Internal.hpp index 42adf8eeba..af6832940b 100644 
--- a/batched/dense/impl/KokkosBatched_Normalize_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Normalize_Internal.hpp @@ -28,8 +28,7 @@ namespace KokkosBatched { struct SerialNormalizeInternal { template KOKKOS_INLINE_FUNCTION static int invoke(const int m, - /* */ ValueType *KOKKOS_RESTRICT v, - const int vs) { + /* */ ValueType *KOKKOS_RESTRICT v, const int vs) { typedef ValueType value_type; typedef Kokkos::ArithTraits ats; typedef typename ats::mag_type mag_type; @@ -53,10 +52,8 @@ struct SerialNormalizeInternal { template KOKKOS_INLINE_FUNCTION static int invoke(const int m, - /* */ RealType *KOKKOS_RESTRICT vr, - const int vrs, - /* */ RealType *KOKKOS_RESTRICT vi, - const int vis) { + /* */ RealType *KOKKOS_RESTRICT vr, const int vrs, + /* */ RealType *KOKKOS_RESTRICT vi, const int vis) { typedef RealType real_type; typedef Kokkos::ArithTraits ats; typedef typename ats::mag_type mag_type; diff --git a/batched/dense/impl/KokkosBatched_Pttrf_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Pttrf_Serial_Impl.hpp index b0ea39fa3f..b96c47e642 100644 --- a/batched/dense/impl/KokkosBatched_Pttrf_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Pttrf_Serial_Impl.hpp @@ -24,17 +24,13 @@ namespace KokkosBatched { template -KOKKOS_INLINE_FUNCTION static int checkPttrfInput( - [[maybe_unused]] const DViewType &d, [[maybe_unused]] const EViewType &e) { - static_assert(Kokkos::is_view::value, - "KokkosBatched::pttrf: DViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::pttrf: EViewType is not a Kokkos::View."); +KOKKOS_INLINE_FUNCTION static int checkPttrfInput([[maybe_unused]] const DViewType &d, + [[maybe_unused]] const EViewType &e) { + static_assert(Kokkos::is_view::value, "KokkosBatched::pttrf: DViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::pttrf: EViewType is not a Kokkos::View."); - static_assert(DViewType::rank == 1, - "KokkosBatched::pttrf: DViewType must have rank 
1."); - static_assert(EViewType::rank == 1, - "KokkosBatched::pttrf: EViewType must have rank 1."); + static_assert(DViewType::rank == 1, "KokkosBatched::pttrf: DViewType must have rank 1."); + static_assert(EViewType::rank == 1, "KokkosBatched::pttrf: EViewType must have rank 1."); #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) const int nd = d.extent(0); @@ -55,8 +51,7 @@ KOKKOS_INLINE_FUNCTION static int checkPttrfInput( template <> struct SerialPttrf { template - KOKKOS_INLINE_FUNCTION static int invoke(const DViewType &d, - const EViewType &e) { + KOKKOS_INLINE_FUNCTION static int invoke(const DViewType &d, const EViewType &e) { // Quick return if possible if (d.extent(0) == 0) return 0; if (d.extent(0) == 1) return (d(0) < 0 ? 1 : 0); @@ -64,8 +59,8 @@ struct SerialPttrf { auto info = checkPttrfInput(d, e); if (info) return info; - return SerialPttrfInternal::invoke( - d.extent(0), d.data(), d.stride(0), e.data(), e.stride(0)); + return SerialPttrfInternal::invoke(d.extent(0), d.data(), d.stride(0), e.data(), + e.stride(0)); } }; } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_Pttrf_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Pttrf_Serial_Internal.hpp index 5b4d3fb182..438ec43320 100644 --- a/batched/dense/impl/KokkosBatched_Pttrf_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Pttrf_Serial_Internal.hpp @@ -25,16 +25,12 @@ namespace KokkosBatched { template struct SerialPttrfInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const int n, - ValueType *KOKKOS_RESTRICT d, - const int ds0, - ValueType *KOKKOS_RESTRICT e, - const int es0); + KOKKOS_INLINE_FUNCTION static int invoke(const int n, ValueType *KOKKOS_RESTRICT d, const int ds0, + ValueType *KOKKOS_RESTRICT e, const int es0); template - KOKKOS_INLINE_FUNCTION static int invoke( - const int n, ValueType *KOKKOS_RESTRICT d, const int ds0, - Kokkos::complex *KOKKOS_RESTRICT e, const int es0); + KOKKOS_INLINE_FUNCTION static int invoke(const int n, ValueType 
*KOKKOS_RESTRICT d, const int ds0, + Kokkos::complex *KOKKOS_RESTRICT e, const int es0); }; /// @@ -44,8 +40,7 @@ struct SerialPttrfInternal { template <> template KOKKOS_INLINE_FUNCTION int SerialPttrfInternal::invoke( - const int n, ValueType *KOKKOS_RESTRICT d, const int ds0, - ValueType *KOKKOS_RESTRICT e, const int es0) { + const int n, ValueType *KOKKOS_RESTRICT d, const int ds0, ValueType *KOKKOS_RESTRICT e, const int es0) { int info = 0; auto update = [&](const int i) { @@ -54,9 +49,7 @@ KOKKOS_INLINE_FUNCTION int SerialPttrfInternal::invoke( d[(i + 1) * ds0] -= e[i * es0] * ei_tmp; }; - auto check_positive_definitiveness = [&](const int i) { - return (d[i] <= 0.0) ? (i + 1) : 0; - }; + auto check_positive_definitiveness = [&](const int i) { return (d[i] <= 0.0) ? (i + 1) : 0; }; // Compute the L*D*L' (or U'*D*U) factorization of A. const int i4 = (n - 1) % 4; @@ -127,8 +120,8 @@ KOKKOS_INLINE_FUNCTION int SerialPttrfInternal::invoke( template <> template KOKKOS_INLINE_FUNCTION int SerialPttrfInternal::invoke( - const int n, ValueType *KOKKOS_RESTRICT d, const int ds0, - Kokkos::complex *KOKKOS_RESTRICT e, const int es0) { + const int n, ValueType *KOKKOS_RESTRICT d, const int ds0, Kokkos::complex *KOKKOS_RESTRICT e, + const int es0) { int info = 0; auto update = [&](const int i) { @@ -140,9 +133,7 @@ KOKKOS_INLINE_FUNCTION int SerialPttrfInternal::invoke( d[(i + 1) * ds0] = d[(i + 1) * ds0] - f_tmp * eir_tmp - g_tmp * eii_tmp; }; - auto check_positive_definitiveness = [&](const int i) { - return (d[i] <= 0.0) ? (i + 1) : 0; - }; + auto check_positive_definitiveness = [&](const int i) { return (d[i] <= 0.0) ? (i + 1) : 0; }; // Compute the L*D*L' (or U'*D*U) factorization of A. 
const int i4 = (n - 1) % 4; diff --git a/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp index ac97a3f772..7c717c2eed 100644 --- a/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp @@ -34,13 +34,10 @@ namespace KokkosBatched { struct SerialQR_FormQ_Internal { template KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int k, - /* */ ValueType* A, const int as0, - const int as1, + /* */ ValueType* A, const int as0, const int as1, /* */ ValueType* t, const int ts, - /* */ ValueType* Q, const int qs0, - const int qs1, - /* */ ValueType* w, - const bool is_Q_zero = false) { + /* */ ValueType* Q, const int qs0, const int qs1, + /* */ ValueType* w, const bool is_Q_zero = false) { typedef ValueType value_type; /// Given a matrix A that includes QR factorization @@ -57,8 +54,7 @@ struct SerialQR_FormQ_Internal { else SerialSetIdentityInternal::invoke(m, Q, qs0, qs1); - return SerialApplyQ_LeftNoTransForwardInternal ::invoke( - m, m, k, A, as0, as1, t, ts, Q, qs0, qs1, w); + return SerialApplyQ_LeftNoTransForwardInternal ::invoke(m, m, k, A, as0, as1, t, ts, Q, qs0, qs1, w); } }; diff --git a/batched/dense/impl/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp index 66b63f23f6..af7f458898 100644 --- a/batched/dense/impl/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp @@ -33,12 +33,11 @@ namespace KokkosBatched { /// struct TeamVectorQR_FormQ_Internal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const int n, const int k, - /* */ ValueType *A, const int as0, const int as1, - /* */ ValueType *t, const int ts, - /* */ ValueType *Q, const int qs0, const int qs1, - /* */ ValueType *w, const bool is_Q_zero = false) { + 
KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, const int k, + /* */ ValueType *A, const int as0, const int as1, + /* */ ValueType *t, const int ts, + /* */ ValueType *Q, const int qs0, const int qs1, + /* */ ValueType *w, const bool is_Q_zero = false) { typedef ValueType value_type; /// Given a matrix A that includes QR factorization @@ -51,14 +50,12 @@ struct TeamVectorQR_FormQ_Internal { // set identity if (is_Q_zero) - KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, value_type(1), - Q, qs0 + qs1); + KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, value_type(1), Q, qs0 + qs1); else TeamVectorSetIdentityInternal::invoke(member, m, n, Q, qs0, qs1); member.team_barrier(); - return TeamVectorApplyQ_LeftForwardInternal ::invoke( - member, m, n, k, A, as0, as1, t, ts, Q, qs0, qs1, w); + return TeamVectorApplyQ_LeftForwardInternal ::invoke(member, m, n, k, A, as0, as1, t, ts, Q, qs0, qs1, w); } }; diff --git a/batched/dense/impl/KokkosBatched_QR_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_QR_Serial_Impl.hpp index 5eac699f56..1083e6af2a 100644 --- a/batched/dense/impl/KokkosBatched_QR_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_QR_Serial_Impl.hpp @@ -29,10 +29,9 @@ namespace KokkosBatched { template <> template -KOKKOS_INLINE_FUNCTION int SerialQR::invoke( - const AViewType &A, const tViewType &t, const wViewType &w) { - return SerialQR_Internal::invoke(A.extent(0), A.extent(1), A.data(), - A.stride_0(), A.stride_1(), t.data(), +KOKKOS_INLINE_FUNCTION int SerialQR::invoke(const AViewType &A, const tViewType &t, + const wViewType &w) { + return SerialQR_Internal::invoke(A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), t.data(), t.stride_0(), w.data()); } diff --git a/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp index 729604f6c3..95ca1c4340 100644 --- 
a/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp @@ -34,8 +34,7 @@ struct SerialQR_Internal { template KOKKOS_INLINE_FUNCTION static int invoke(const int m, // m = NumRows(A) const int n, // n = NumCols(A) - /* */ ValueType *A, const int as0, - const int as1, + /* */ ValueType *A, const int as0, const int as1, /* */ ValueType *t, const int ts, /* */ ValueType *w) { typedef ValueType value_type; @@ -66,13 +65,11 @@ struct SerialQR_Internal { /// ----------------------------------------------------- // perform householder transformation - SerialLeftHouseholderInternal::invoke(m_A22, A_part3x3.A11, A_part3x3.A21, - as0, tau); + SerialLeftHouseholderInternal::invoke(m_A22, A_part3x3.A11, A_part3x3.A21, as0, tau); // left apply householder to A22 - SerialApplyLeftHouseholderInternal::invoke( - m_A22, n_A22, tau, A_part3x3.A21, as0, A_part3x3.A12, as1, - A_part3x3.A22, as0, as1, w); + SerialApplyLeftHouseholderInternal::invoke(m_A22, n_A22, tau, A_part3x3.A21, as0, A_part3x3.A12, as1, + A_part3x3.A22, as0, as1, w); /// ----------------------------------------------------- A_part2x2.mergeToATL(A_part3x3); t_part2x1.mergeToAT(t_part3x1); diff --git a/batched/dense/impl/KokkosBatched_QR_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_QR_TeamVector_Impl.hpp index 78d6e226a8..2497e5adf5 100644 --- a/batched/dense/impl/KokkosBatched_QR_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_QR_TeamVector_Impl.hpp @@ -30,12 +30,9 @@ namespace KokkosBatched { template struct TeamVectorQR { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const tViewType &t, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const tViewType &t, const wViewType &w) { - return TeamVectorQR_Internal::invoke(member, A.extent(0), A.extent(1), - A.data(), A.stride_0(), A.stride_1(), + return TeamVectorQR_Internal::invoke(member, 
A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), t.data(), t.stride_0(), w.data()); } }; diff --git a/batched/dense/impl/KokkosBatched_QR_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_QR_TeamVector_Internal.hpp index 312feba997..e3dde67986 100644 --- a/batched/dense/impl/KokkosBatched_QR_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_QR_TeamVector_Internal.hpp @@ -35,8 +35,7 @@ struct TeamVectorQR_Internal { KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, // m = NumRows(A) const int n, // n = NumCols(A) - /* */ ValueType *A, const int as0, - const int as1, + /* */ ValueType *A, const int as0, const int as1, /* */ ValueType *t, const int ts, /* */ ValueType *w) { typedef ValueType value_type; @@ -67,14 +66,12 @@ struct TeamVectorQR_Internal { /// ----------------------------------------------------- // perform householder transformation - TeamVectorLeftHouseholderInternal::invoke(member, m_A22, A_part3x3.A11, - A_part3x3.A21, as0, tau); + TeamVectorLeftHouseholderInternal::invoke(member, m_A22, A_part3x3.A11, A_part3x3.A21, as0, tau); member.team_barrier(); // left apply householder to A22 - TeamVectorApplyLeftHouseholderInternal::invoke( - member, m_A22, n_A22, tau, A_part3x3.A21, as0, A_part3x3.A12, as1, - A_part3x3.A22, as0, as1, w); + TeamVectorApplyLeftHouseholderInternal::invoke(member, m_A22, n_A22, tau, A_part3x3.A21, as0, A_part3x3.A12, as1, + A_part3x3.A22, as0, as1, w); member.team_barrier(); /// ----------------------------------------------------- A_part2x2.mergeToATL(A_part3x3); diff --git a/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Impl.hpp index 4f293f12cf..ed9ccd8cce 100644 --- a/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Impl.hpp @@ -29,17 +29,13 @@ namespace KokkosBatched 
{ template struct TeamVectorQR_WithColumnPivoting { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const tViewType &t, - const pViewType &p, - const wViewType &w, + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const tViewType &t, + const pViewType &p, const wViewType &w, /* */ int &matrix_rank) { - return TeamVectorQR_WithColumnPivotingInternal::invoke( - member, A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), - t.data(), t.stride_0(), p.data(), p.stride_0(), w.data(), matrix_rank); + return TeamVectorQR_WithColumnPivotingInternal::invoke(member, A.extent(0), A.extent(1), A.data(), A.stride_0(), + A.stride_1(), t.data(), t.stride_0(), p.data(), p.stride_0(), + w.data(), matrix_rank); } }; diff --git a/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp index 26efb70c77..280bfa434b 100644 --- a/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp @@ -37,10 +37,9 @@ namespace KokkosBatched { /// struct TeamVectorUpdateColumnNormsInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int n, const ValueType *KOKKOS_RESTRICT a, - const int as0, - /* */ ValueType *KOKKOS_RESTRICT norm, const int ns0) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int n, const ValueType *KOKKOS_RESTRICT a, + const int as0, + /* */ ValueType *KOKKOS_RESTRICT norm, const int ns0) { using ats = Kokkos::ArithTraits; Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), [&](const int &j) { const int idx_a = j * as0, idx_n = j * ns0; @@ -55,8 +54,7 @@ struct TeamVectorQR_WithColumnPivotingInternal { KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, // 
m = NumRows(A) const int n, // n = NumCols(A) - /* */ ValueType *A, const int as0, - const int as1, + /* */ ValueType *A, const int as0, const int as1, /* */ ValueType *t, const int ts0, /* */ IntType *p, const int ps0, /* */ ValueType *w, @@ -98,8 +96,7 @@ struct TeamVectorQR_WithColumnPivotingInternal { norm_part1x2.partWithAL(norm, n, 0); // compute initial column norms (replaced by dot product) - TeamVectorDotInternal::invoke(member, m, n, A, as0, as1, A, as0, as1, norm, - 1); + TeamVectorDotInternal::invoke(member, m, n, A, as0, as1, A, as0, as1, norm, 1); member.team_barrier(); const bool finish_when_rank_found = (matrix_rank == -1); @@ -124,33 +121,27 @@ struct TeamVectorQR_WithColumnPivotingInternal { /// ----------------------------------------------------- // find max location - TeamVectorFindAmaxInternal::invoke(member, n_AR, norm_part1x2.AR, 1, - pividx); + TeamVectorFindAmaxInternal::invoke(member, n_AR, norm_part1x2.AR, 1, pividx); member.team_barrier(); // apply pivot - TeamVectorApplyPivotVectorForwardInternal::invoke(member, *pividx, - norm_part1x2.AR, 1); - TeamVectorApplyPivotMatrixForwardInternal::invoke( - member, m, *pividx, A_part2x2.ATR, as1, as0); + TeamVectorApplyPivotVectorForwardInternal::invoke(member, *pividx, norm_part1x2.AR, 1); + TeamVectorApplyPivotMatrixForwardInternal::invoke(member, m, *pividx, A_part2x2.ATR, as1, as0); member.team_barrier(); // perform householder transformation - TeamVectorLeftHouseholderInternal::invoke(member, m_A22, A_part3x3.A11, - A_part3x3.A21, as0, tau); + TeamVectorLeftHouseholderInternal::invoke(member, m_A22, A_part3x3.A11, A_part3x3.A21, as0, tau); member.team_barrier(); // left apply householder to A22 - TeamVectorApplyLeftHouseholderInternal::invoke( - member, m_A22, n_A22, tau, A_part3x3.A21, as0, A_part3x3.A12, as1, - A_part3x3.A22, as0, as1, w); + TeamVectorApplyLeftHouseholderInternal::invoke(member, m_A22, n_A22, tau, A_part3x3.A21, as0, A_part3x3.A12, as1, + A_part3x3.A22, as0, as1, w); 
member.team_barrier(); // break condition if (matrix_rank == min_mn) { if (m_atl == 0) max_diag = ats::abs(A[0]); - const value_type val_diag = ats::abs(A_part3x3.A11[0]), - threshold(10 * max_diag * ats::epsilon()); + const value_type val_diag = ats::abs(A_part3x3.A11[0]), threshold(10 * max_diag * ats::epsilon()); if (val_diag < threshold) { matrix_rank = m_atl; if (finish_when_rank_found) break; @@ -158,8 +149,7 @@ struct TeamVectorQR_WithColumnPivotingInternal { } // norm update - TeamVectorUpdateColumnNormsInternal::invoke(member, n_A22, A_part3x3.A12, - as1, norm_part1x3.A2, 1); + TeamVectorUpdateColumnNormsInternal::invoke(member, n_A22, A_part3x3.A12, as1, norm_part1x3.A2, 1); member.team_barrier(); /// ----------------------------------------------------- A_part2x2.mergeToATL(A_part3x3); diff --git a/batched/dense/impl/KokkosBatched_RightEigenvectorFromSchur_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_RightEigenvectorFromSchur_Serial_Internal.hpp index 4716506064..029875f810 100644 --- a/batched/dense/impl/KokkosBatched_RightEigenvectorFromSchur_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_RightEigenvectorFromSchur_Serial_Internal.hpp @@ -45,12 +45,9 @@ struct SerialRightEigenvectorFromSchurInternal { /// contiguous workspace that can hold complex array (m) template KOKKOS_INLINE_FUNCTION static int invoke(const int m, - /* */ ValueType *S, const int ss0, - const int ss1, - /* */ ValueType *V, const int vs0, - const int vs1, - /* */ ValueType *w, - const int *blks) { + /* */ ValueType *S, const int ss0, const int ss1, + /* */ ValueType *V, const int vs0, const int vs1, + /* */ ValueType *w, const int *blks) { typedef ValueType value_type; typedef Kokkos::ArithTraits ats; // typedef typename ats::mag_type mag_type; @@ -78,8 +75,7 @@ struct SerialRightEigenvectorFromSchurInternal { for (; m_stl > 0;) { /// part 2x2 into 3x3 const int mA11 = blks[m_stl - 1]; - assert(((mA11 == 1) || (mA11 == 2)) && - "RightEigenvectorFromSchur: blk is 
not 1x1 nor 2x2"); + assert(((mA11 == 1) || (mA11 == 2)) && "RightEigenvectorFromSchur: blk is not 1x1 nor 2x2"); S_part3x3.partWithATL(S_part2x2, mA11, mA11); V_part1x3.partWithAL(V_part1x2, mA11); @@ -90,23 +86,19 @@ struct SerialRightEigenvectorFromSchurInternal { const value_type lambda = *S_part3x3.A11; /// initialize a right eigen vector - for (int i = 0; i < m_stl_minus_mA11; ++i) - b[i] = -S_part3x3.A01[i * ss0]; + for (int i = 0; i < m_stl_minus_mA11; ++i) b[i] = -S_part3x3.A01[i * ss0]; b[m_stl - 1] = one; /// perform shifted trsv - SerialShiftedTrsvInternalUpper::invoke( - m_stl_minus_mA11, lambda, S_part3x3.A00, ss0, ss1, w, 1, blks); + SerialShiftedTrsvInternalUpper::invoke(m_stl_minus_mA11, lambda, S_part3x3.A00, ss0, ss1, w, 1, blks); /// copy back to V for (int i = 0; i < m_stl; ++i) V_part1x3.A1[i * vs0] = w[i]; for (int i = m_stl; i < m; ++i) V_part1x3.A1[i * vs0] = zero; } else { /// complex eigen pair - const value_type alpha11 = S_part3x3.A11[0], - alpha12 = S_part3x3.A11[ss1], - alpha21 = S_part3x3.A11[ss0], - beta = ats::sqrt(-alpha12 * alpha21); + const value_type alpha11 = S_part3x3.A11[0], alpha12 = S_part3x3.A11[ss1], alpha21 = S_part3x3.A11[ss0], + beta = ats::sqrt(-alpha12 * alpha21); const complex_type lambda(alpha11, beta); complex_type *bc = (complex_type *)(b); @@ -115,14 +107,12 @@ struct SerialRightEigenvectorFromSchurInternal { const value_type *S_A01_a = S_part3x3.A01; const value_type *S_A01_b = S_part3x3.A01 + ss1; for (int i = 0; i < m_stl_minus_mA11; ++i) - bc[i] = complex_type(-S_A01_a[i * ss0] * beta, - S_A01_b[i * ss0] * alpha21); + bc[i] = complex_type(-S_A01_a[i * ss0] * beta, S_A01_b[i * ss0] * alpha21); bc[m_stl - 2] = complex_type(beta, zero); bc[m_stl - 1] = complex_type(zero, -alpha21); /// perform shifted trsv - SerialShiftedTrsvInternalUpper::invoke( - m_stl_minus_mA11, lambda, S_part3x3.A00, ss0, ss1, bc, 1, blks); + SerialShiftedTrsvInternalUpper::invoke(m_stl_minus_mA11, lambda, S_part3x3.A00, ss0, ss1, bc, 1, 
blks); /// copy back to V value_type *V_A1_r = V_part1x3.A1; diff --git a/batched/dense/impl/KokkosBatched_SVD_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_SVD_Serial_Impl.hpp index a2c345f4fb..e0c25c2ce7 100644 --- a/batched/dense/impl/KokkosBatched_SVD_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_SVD_Serial_Impl.hpp @@ -22,50 +22,36 @@ namespace KokkosBatched { // Version which computes the full factorization -template -KOKKOS_INLINE_FUNCTION int SerialSVD::invoke( - SVD_USV_Tag, const AViewType &A, const UViewType &U, const SViewType &sigma, - const VViewType &Vt, const WViewType &work, - typename AViewType::const_value_type tol) { - static_assert(Kokkos::is_view_v && AViewType::rank == 2, - "SVD: A must be a rank-2 view"); - static_assert(Kokkos::is_view_v && UViewType::rank == 2, - "SVD: U must be a rank-2 view"); - static_assert(Kokkos::is_view_v && SViewType::rank == 1, - "SVD: s must be a rank-1 view"); - static_assert(Kokkos::is_view_v && VViewType::rank == 2, - "SVD: V must be a rank-2 view"); - static_assert(Kokkos::is_view_v && WViewType::rank == 1, - "SVD: W must be a rank-1 view"); - static_assert( - !std::is_same_v, - "SVD: W must be contiguous (not LayoutStride)"); +template +KOKKOS_INLINE_FUNCTION int SerialSVD::invoke(SVD_USV_Tag, const AViewType &A, const UViewType &U, + const SViewType &sigma, const VViewType &Vt, const WViewType &work, + typename AViewType::const_value_type tol) { + static_assert(Kokkos::is_view_v && AViewType::rank == 2, "SVD: A must be a rank-2 view"); + static_assert(Kokkos::is_view_v && UViewType::rank == 2, "SVD: U must be a rank-2 view"); + static_assert(Kokkos::is_view_v && SViewType::rank == 1, "SVD: s must be a rank-1 view"); + static_assert(Kokkos::is_view_v && VViewType::rank == 2, "SVD: V must be a rank-2 view"); + static_assert(Kokkos::is_view_v && WViewType::rank == 1, "SVD: W must be a rank-1 view"); + static_assert(!std::is_same_v, + "SVD: W must be contiguous (not LayoutStride)"); using 
value_type = typename AViewType::non_const_value_type; return KokkosBatched::SerialSVDInternal::invoke( - A.extent(0), A.extent(1), A.data(), A.stride(0), A.stride(1), U.data(), - U.stride(0), U.stride(1), Vt.data(), Vt.stride(0), Vt.stride(1), - sigma.data(), sigma.stride(0), work.data(), tol); + A.extent(0), A.extent(1), A.data(), A.stride(0), A.stride(1), U.data(), U.stride(0), U.stride(1), Vt.data(), + Vt.stride(0), Vt.stride(1), sigma.data(), sigma.stride(0), work.data(), tol); } // Version which computes only singular values template -KOKKOS_INLINE_FUNCTION int SerialSVD::invoke( - SVD_S_Tag, const AViewType &A, const SViewType &sigma, - const WViewType &work, typename AViewType::const_value_type tol) { - static_assert(Kokkos::is_view_v && AViewType::rank == 2, - "SVD: A must be a rank-2 view"); - static_assert(Kokkos::is_view_v && SViewType::rank == 1, - "SVD: s must be a rank-1 view"); - static_assert(Kokkos::is_view_v && WViewType::rank == 1, - "SVD: W must be a rank-1 view"); - static_assert( - !std::is_same_v, - "SVD: W must be contiguous (not LayoutStride)"); +KOKKOS_INLINE_FUNCTION int SerialSVD::invoke(SVD_S_Tag, const AViewType &A, const SViewType &sigma, + const WViewType &work, typename AViewType::const_value_type tol) { + static_assert(Kokkos::is_view_v && AViewType::rank == 2, "SVD: A must be a rank-2 view"); + static_assert(Kokkos::is_view_v && SViewType::rank == 1, "SVD: s must be a rank-1 view"); + static_assert(Kokkos::is_view_v && WViewType::rank == 1, "SVD: W must be a rank-1 view"); + static_assert(!std::is_same_v, + "SVD: W must be contiguous (not LayoutStride)"); using value_type = typename AViewType::non_const_value_type; - return KokkosBatched::SerialSVDInternal::invoke( - A.extent(0), A.extent(1), A.data(), A.stride(0), A.stride(1), nullptr, 0, - 0, nullptr, 0, 0, sigma.data(), sigma.stride(0), work.data(), tol); + return KokkosBatched::SerialSVDInternal::invoke(A.extent(0), A.extent(1), A.data(), A.stride(0), + A.stride(1), nullptr, 
0, 0, nullptr, 0, 0, sigma.data(), + sigma.stride(0), work.data(), tol); } } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp index 87ed65d81e..0b85b1e28e 100644 --- a/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp @@ -49,8 +49,7 @@ struct SerialSVDInternal { // however this is simpler because it exploits the symmetric structure, and // the realness of the eigenvalues. template - KOKKOS_INLINE_FUNCTION static void symEigen2x2(value_type a11, value_type a21, - value_type a22, value_type& e1, + KOKKOS_INLINE_FUNCTION static void symEigen2x2(value_type a11, value_type a21, value_type a22, value_type& e1, value_type& e2) { value_type a = Kokkos::ArithTraits::one(); value_type b = -a11 - a22; @@ -67,10 +66,8 @@ struct SerialSVDInternal { // // B22 is nsub * nsub, Usub is m * nsub, and Vtsub is nsub * n template - KOKKOS_INLINE_FUNCTION static void svdStep(value_type* B, value_type* U, - value_type* Vt, int um, int vn, - int n, int Bs0, int Bs1, int Us0, - int Us1, int Vts0, int Vts1) { + KOKKOS_INLINE_FUNCTION static void svdStep(value_type* B, value_type* U, value_type* Vt, int um, int vn, int n, + int Bs0, int Bs1, int Us0, int Us1, int Vts0, int Vts1) { using KAT = Kokkos::ArithTraits; // Compute the eigenvalues of trailing 2x2 value_type dn = SVDIND(B, n - 1, n - 1); @@ -91,34 +88,30 @@ struct SerialSVDInternal { // Use Givens to zero out z in [y; z] Kokkos::pair G; value_type discard; // Don't actually write [alpha; 0] anywhere - KokkosBatched::SerialGivensInternal::invoke(y, z, &G, - &discard); + KokkosBatched::SerialGivensInternal::invoke(y, z, &G, &discard); // apply the Givens transformation to B on the right, to columns k,k+1 // B := BG(k, k+1, theta) int minrow = KOKKOSKERNELS_MACRO_MAX(0, k - 1); int maxrow = KOKKOSKERNELS_MACRO_MIN(n, k + 2); - 
KokkosBatched::SerialApplyRightGivensInternal::invoke( - G, maxrow - minrow, &SVDIND(B, minrow, k + 1), Bs0, - &SVDIND(B, minrow, k), Bs0); + KokkosBatched::SerialApplyRightGivensInternal::invoke(G, maxrow - minrow, &SVDIND(B, minrow, k + 1), + Bs0, &SVDIND(B, minrow, k), Bs0); if (Vt) { - KokkosBatched::SerialApplyLeftGivensInternal::invoke( - G, vn, &SVDIND(Vt, k + 1, 0), Vts1, &SVDIND(Vt, k, 0), Vts1); + KokkosBatched::SerialApplyLeftGivensInternal::invoke(G, vn, &SVDIND(Vt, k + 1, 0), Vts1, + &SVDIND(Vt, k, 0), Vts1); } y = SVDIND(B, k, k); z = SVDIND(B, k + 1, k); - KokkosBatched::SerialGivensInternal::invoke(y, z, &G, - &SVDIND(B, k, k)); + KokkosBatched::SerialGivensInternal::invoke(y, z, &G, &SVDIND(B, k, k)); SVDIND(B, k + 1, k) = KAT::zero(); int mincol = k + 1; int maxcol = KOKKOSKERNELS_MACRO_MIN(n, k + 3); // apply Givens transformation to B on the left, to rows k, k + 1 // B := G(k, k+1, theta)^T * B - KokkosBatched::SerialApplyLeftGivensInternal::invoke( - G, maxcol - mincol, &SVDIND(B, k + 1, mincol), Bs1, - &SVDIND(B, k, mincol), Bs1); + KokkosBatched::SerialApplyLeftGivensInternal::invoke(G, maxcol - mincol, &SVDIND(B, k + 1, mincol), + Bs1, &SVDIND(B, k, mincol), Bs1); if (U) { - KokkosBatched::SerialApplyRightGivensInternal::invoke( - G, um, &SVDIND(U, 0, k + 1), Us0, &SVDIND(U, 0, k), Us0); + KokkosBatched::SerialApplyRightGivensInternal::invoke(G, um, &SVDIND(U, 0, k + 1), Us0, + &SVDIND(U, 0, k), Us0); } if (k < n - 2) { y = SVDIND(B, k, k + 1); @@ -131,71 +124,65 @@ struct SerialSVDInternal { // Assumes i is not the last row. 
// U is m*m, B is n*n template - KOKKOS_INLINE_FUNCTION static void svdZeroRow(int i, value_type* B, int n, - int Bs0, int Bs1, value_type* U, - int m, int Us0, int Us1) { + KOKKOS_INLINE_FUNCTION static void svdZeroRow(int i, value_type* B, int n, int Bs0, int Bs1, value_type* U, int m, + int Us0, int Us1) { Kokkos::pair G; for (int j = i + 1; j < n; j++) { // Zero out B(i, j) against diagonal j, introducing nonzero in B(i, j + 1) - KokkosBatched::SerialGivensInternal::invoke( - SVDIND(B, j, j), SVDIND(B, i, j), &G, &SVDIND(B, j, j)); + KokkosBatched::SerialGivensInternal::invoke(SVDIND(B, j, j), SVDIND(B, i, j), &G, &SVDIND(B, j, j)); SVDIND(B, i, j) = Kokkos::ArithTraits::zero(); // Now, only need to apply givens to a single column (if not already at // the end), introducing the next nonzero if (j < n - 1) { - KokkosBatched::SerialApplyLeftGivensInternal::invoke( - G, 1, &SVDIND(B, i, j + 1), Bs1, &SVDIND(B, j, j + 1), Bs1); + KokkosBatched::SerialApplyLeftGivensInternal::invoke(G, 1, &SVDIND(B, i, j + 1), Bs1, + &SVDIND(B, j, j + 1), Bs1); } if (U) { - KokkosBatched::SerialApplyRightGivensInternal::invoke( - G, m, &SVDIND(U, 0, i), Us0, &SVDIND(U, 0, j), Us0); + KokkosBatched::SerialApplyRightGivensInternal::invoke(G, m, &SVDIND(U, 0, i), Us0, &SVDIND(U, 0, j), + Us0); } } } template - KOKKOS_INLINE_FUNCTION static void svdZeroLastColumn(value_type* B, int n, - int Bs0, int Bs1, - value_type* Vt, int Vts0, + KOKKOS_INLINE_FUNCTION static void svdZeroLastColumn(value_type* B, int n, int Bs0, int Bs1, value_type* Vt, int Vts0, int Vts1) { // Deal with B(n-1, n-1) = 0, by chasing the superdiagonal nonzero up the // last column. 
Kokkos::pair G; for (int j = n - 2; j >= 0; j--) { - KokkosBatched::SerialGivensInternal::invoke( - SVDIND(B, j, j), SVDIND(B, j, n - 1), &G, &SVDIND(B, j, j)); + KokkosBatched::SerialGivensInternal::invoke(SVDIND(B, j, j), SVDIND(B, j, n - 1), &G, + &SVDIND(B, j, j)); SVDIND(B, j, n - 1) = Kokkos::ArithTraits::zero(); if (j != 0) { - KokkosBatched::SerialApplyRightGivensInternal::invoke( - G, 1, &SVDIND(B, j - 1, n - 1), Bs0, &SVDIND(B, j - 1, j), Bs0); + KokkosBatched::SerialApplyRightGivensInternal::invoke(G, 1, &SVDIND(B, j - 1, n - 1), Bs0, + &SVDIND(B, j - 1, j), Bs0); } if (Vt) { - KokkosBatched::SerialApplyLeftGivensInternal::invoke( - G, n, &SVDIND(Vt, n - 1, 0), Vts1, &SVDIND(Vt, j, 0), Vts1); + KokkosBatched::SerialApplyLeftGivensInternal::invoke(G, n, &SVDIND(Vt, n - 1, 0), Vts1, + &SVDIND(Vt, j, 0), Vts1); } } } template - KOKKOS_INLINE_FUNCTION static void bidiagonalize( - int m, int n, value_type* A, int As0, int As1, value_type* U, int Us0, - int Us1, value_type* Vt, int Vts0, int Vts1, value_type* work) { + KOKKOS_INLINE_FUNCTION static void bidiagonalize(int m, int n, value_type* A, int As0, int As1, value_type* U, + int Us0, int Us1, value_type* Vt, int Vts0, int Vts1, + value_type* work) { using KAT = Kokkos::ArithTraits; value_type tau; for (int i = 0; i < n; i++) { // Eliminating column i of A below the diagonal - KokkosBatched::SerialLeftHouseholderInternal::invoke( - m - i - 1, &SVDIND(A, i, i), &SVDIND(A, i + 1, i), As0, &tau); + KokkosBatched::SerialLeftHouseholderInternal::invoke(m - i - 1, &SVDIND(A, i, i), + &SVDIND(A, i + 1, i), As0, &tau); if (n - i > 1) { KokkosBatched::SerialApplyLeftHouseholderInternal::invoke( - m - i - 1, n - i - 1, &tau, &SVDIND(A, i + 1, i), As0, - &SVDIND(A, i, i + 1), As1, &SVDIND(A, i + 1, i + 1), As0, As1, - work); + m - i - 1, n - i - 1, &tau, &SVDIND(A, i + 1, i), As0, &SVDIND(A, i, i + 1), As1, &SVDIND(A, i + 1, i + 1), + As0, As1, work); } if (U) { 
KokkosBatched::SerialApplyRightHouseholderInternal::invoke( - m, m - i - 1, &tau, &SVDIND(A, i + 1, i), As0, &SVDIND(U, 0, i), - Us0, &SVDIND(U, 0, i + 1), Us0, Us1, work); + m, m - i - 1, &tau, &SVDIND(A, i + 1, i), As0, &SVDIND(U, 0, i), Us0, &SVDIND(U, 0, i + 1), Us0, Us1, work); } // Zero out A subdiag explicitly (NOTE: may not be necessary...) for (int j = i + 1; j < m; j++) { @@ -203,19 +190,17 @@ struct SerialSVDInternal { } if (i < n - 2) { // Eliminating row i of A to the right of the 1st superdiagonal - KokkosBatched::SerialLeftHouseholderInternal::invoke( - n - i - 2, &SVDIND(A, i, i + 1), &SVDIND(A, i, i + 2), As1, &tau); + KokkosBatched::SerialLeftHouseholderInternal::invoke(n - i - 2, &SVDIND(A, i, i + 1), + &SVDIND(A, i, i + 2), As1, &tau); if (m - i > 1) { - KokkosBatched::SerialApplyRightHouseholderInternal::invoke< - value_type>(m - i - 1, n - i - 2, &tau, &SVDIND(A, i, i + 2), As1, - &SVDIND(A, i + 1, i + 1), As0, - &SVDIND(A, i + 1, i + 2), As0, As1, work); + KokkosBatched::SerialApplyRightHouseholderInternal::invoke( + m - i - 1, n - i - 2, &tau, &SVDIND(A, i, i + 2), As1, &SVDIND(A, i + 1, i + 1), As0, + &SVDIND(A, i + 1, i + 2), As0, As1, work); } if (Vt) { KokkosBatched::SerialApplyLeftHouseholderInternal::invoke( - n - i - 2, n, &tau, &SVDIND(A, i, i + 2), As1, - &SVDIND(Vt, i + 1, 0), Vts1, &SVDIND(Vt, i + 2, 0), Vts0, Vts1, - work); + n - i - 2, n, &tau, &SVDIND(A, i, i + 2), As1, &SVDIND(Vt, i + 1, 0), Vts1, &SVDIND(Vt, i + 2, 0), Vts0, + Vts1, work); } // Zero out A superdiag row explicitly for (int j = i + 2; j < n; j++) { @@ -229,11 +214,8 @@ struct SerialSVDInternal { // U and Vt to maintain the product U*B*Vt. At the end, the singular values // are copied to sigma. 
template - KOKKOS_INLINE_FUNCTION static void bidiSVD(int m, int n, value_type* B, - int Bs0, int Bs1, value_type* U, - int Us0, int Us1, value_type* Vt, - int Vts0, int Vts1, - value_type* sigma, int ss, + KOKKOS_INLINE_FUNCTION static void bidiSVD(int m, int n, value_type* B, int Bs0, int Bs1, value_type* U, int Us0, + int Us1, value_type* Vt, int Vts0, int Vts1, value_type* sigma, int ss, const value_type& tol) { using KAT = Kokkos::ArithTraits; const value_type eps = Kokkos::ArithTraits::epsilon(); @@ -242,8 +224,7 @@ struct SerialSVDInternal { while (true) { // Zero out tiny superdiagonal entries for (int i = 0; i < n - 1; i++) { - if (fabs(SVDIND(B, i, i + 1)) < - eps * (fabs(SVDIND(B, i, i)) + fabs(SVDIND(B, i + 1, i + 1))) || + if (fabs(SVDIND(B, i, i + 1)) < eps * (fabs(SVDIND(B, i, i)) + fabs(SVDIND(B, i + 1, i + 1))) || fabs(SVDIND(B, i, i + 1)) < tol) { SVDIND(B, i, i + 1) = KAT::zero(); } @@ -283,8 +264,7 @@ struct SerialSVDInternal { } int nsub = q - p; // B22 is nsub * nsub, Usub is m * nsub, and Vtsub is nsub * n - svdStep(&SVDIND(B, p, p), &SVDIND(U, 0, p), &SVDIND(Vt, p, 0), m, n, nsub, - Bs0, Bs1, Us0, Us1, Vts0, Vts1); + svdStep(&SVDIND(B, p, p), &SVDIND(U, 0, p), &SVDIND(Vt, p, 0), m, n, nsub, Bs0, Bs1, Us0, Us1, Vts0, Vts1); } for (int i = 0; i < n; i++) { sigma[i * ss] = SVDIND(B, i, i); @@ -294,11 +274,8 @@ struct SerialSVDInternal { // Convert SVD into conventional form: singular values positive and in // descending order template - KOKKOS_INLINE_FUNCTION static void postprocessSVD(int m, int n, value_type* U, - int Us0, int Us1, - value_type* Vt, int Vts0, - int Vts1, value_type* sigma, - int ss) { + KOKKOS_INLINE_FUNCTION static void postprocessSVD(int m, int n, value_type* U, int Us0, int Us1, value_type* Vt, + int Vts0, int Vts1, value_type* sigma, int ss) { // First step: flip signs on negative singular values for (int i = 0; i < n; i++) { if (sigma[i * ss] < 0) { @@ -327,23 +304,19 @@ struct SerialSVDInternal { if (i != maxloc) { 
SVDSWAP(sigma[i * ss], sigma[maxloc * ss]); if (U) { - for (int j = 0; j < m; j++) - SVDSWAP(SVDIND(U, j, i), SVDIND(U, j, maxloc)) + for (int j = 0; j < m; j++) SVDSWAP(SVDIND(U, j, i), SVDIND(U, j, maxloc)) } if (Vt) { - for (int j = 0; j < n; j++) - SVDSWAP(SVDIND(Vt, i, j), SVDIND(Vt, maxloc, j)) + for (int j = 0; j < n; j++) SVDSWAP(SVDIND(Vt, i, j), SVDIND(Vt, maxloc, j)) } } } } template - KOKKOS_INLINE_FUNCTION static int invoke( - int m, int n, value_type* A, int As0, int As1, value_type* U, int Us0, - int Us1, value_type* Vt, int Vts0, int Vts1, value_type* sigma, int ss, - value_type* work, - value_type tol = Kokkos::ArithTraits::zero()) { + KOKKOS_INLINE_FUNCTION static int invoke(int m, int n, value_type* A, int As0, int As1, value_type* U, int Us0, + int Us1, value_type* Vt, int Vts0, int Vts1, value_type* sigma, int ss, + value_type* work, value_type tol = Kokkos::ArithTraits::zero()) { // First, if m < n, need to instead compute (V, s, U^T) = A^T. // This just means swapping U & Vt, and implicitly transposing A, U and Vt. 
if (m < n) { @@ -356,12 +329,10 @@ struct SerialSVDInternal { SVDSWAP(Us1, Vts0); } if (U) { - KokkosBatched::SerialSetIdentityInternal::invoke(m, m, U, Us0, - Us1); + KokkosBatched::SerialSetIdentityInternal::invoke(m, m, U, Us0, Us1); } if (Vt) { - KokkosBatched::SerialSetIdentityInternal::invoke(n, n, Vt, - Vts0, Vts1); + KokkosBatched::SerialSetIdentityInternal::invoke(n, n, Vt, Vts0, Vts1); } if (m == 0 || n == 0) { // sigma is length 0, so there's nothing left to compute diff --git a/batched/dense/impl/KokkosBatched_Schur2x2_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Schur2x2_Serial_Internal.hpp index 22a599ed58..41e525d2ba 100644 --- a/batched/dense/impl/KokkosBatched_Schur2x2_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Schur2x2_Serial_Internal.hpp @@ -30,12 +30,9 @@ namespace KokkosBatched { /// struct SerialSchur2x2Internal { template - KOKKOS_INLINE_FUNCTION static int invoke(RealType* alpha00, RealType* alpha01, - RealType* alpha10, RealType* alpha11, - Kokkos::pair* G, - Kokkos::complex* lambda1, - Kokkos::complex* lambda2, - bool* is_complex) { + KOKKOS_INLINE_FUNCTION static int invoke(RealType* alpha00, RealType* alpha01, RealType* alpha10, RealType* alpha11, + Kokkos::pair* G, Kokkos::complex* lambda1, + Kokkos::complex* lambda2, bool* is_complex) { typedef RealType real_type; typedef Kokkos::ArithTraits ats; const real_type zero(0), one(1), half(0.5), minus_one(-1); @@ -70,8 +67,7 @@ struct SerialSchur2x2Internal { *lambda1 = Kokkos::complex(*alpha00, zero); *lambda2 = Kokkos::complex(*alpha11, zero); *is_complex = false; - } else if (ats::abs(*alpha00 - *alpha11) < tol && - (*alpha01) * (*alpha10) > zero) { + } else if (ats::abs(*alpha00 - *alpha11) < tol && (*alpha01) * (*alpha10) > zero) { // no rotation (already the standard schur form) *G = Kokkos::pair(one, zero); /// two real eigen values @@ -84,9 +80,8 @@ struct SerialSchur2x2Internal { const real_type b = (*alpha01) + (*alpha10); const real_type l = ats::sqrt(a * 
a + b * b); const real_type c = ats::sqrt(half * (one + ats::abs(b) / l)); - const real_type s = - -((half * a) / (l * c)) * (b > zero ? one : minus_one); - *G = Kokkos::pair(c, s); + const real_type s = -((half * a) / (l * c)) * (b > zero ? one : minus_one); + *G = Kokkos::pair(c, s); /// [ gamma sigma ][ alpha00 alpha01 [ gamma -sigma --> [ alpha11 /// -alpha10 /// -sigma gamma ] alpha10 alpha11 ] sigma gamma ] 0 alpha00] @@ -105,19 +100,17 @@ struct SerialSchur2x2Internal { const real_type mult_alpha_offdiags = (*alpha10) * (*alpha01); if (mult_alpha_offdiags > zero) { /// transforms the matrix into a upper triangular - const real_type sqrt_mult_alpha_offdiags = - ats::sqrt(mult_alpha_offdiags); + const real_type sqrt_mult_alpha_offdiags = ats::sqrt(mult_alpha_offdiags); /// redefine the rotation matrix // const real_type sqrt_abs_alpha01 = ats::sqrt(ats::abs(*alpha01)); // const real_type sqrt_abs_alpha10 = ats::sqrt(ats::abs(*alpha10)); const real_type abs_sum_offidags = ats::abs((*alpha01) + (*alpha10)); - const real_type c1 = ats::sqrt(ats::abs(*alpha01) / abs_sum_offidags); - const real_type s1 = ats::sqrt(ats::abs(*alpha10) / abs_sum_offidags); - const real_type sign_alpha10 = *alpha10 > zero ? one : minus_one; + const real_type c1 = ats::sqrt(ats::abs(*alpha01) / abs_sum_offidags); + const real_type s1 = ats::sqrt(ats::abs(*alpha10) / abs_sum_offidags); + const real_type sign_alpha10 = *alpha10 > zero ? 
one : minus_one; - *G = Kokkos::pair(c * c1 - s * s1, - c * s1 + s * c1); + *G = Kokkos::pair(c * c1 - s * s1, c * s1 + s * c1); /// apply rotation to 2x2 matrix so that alpha10 becomes zero *alpha00 = tmp + sign_alpha10 * sqrt_mult_alpha_offdiags; @@ -131,12 +124,10 @@ struct SerialSchur2x2Internal { *is_complex = false; } else { /// two complex eigen values - const real_type sqrt_mult_alpha_offdiags = - ats::sqrt(-mult_alpha_offdiags); - *lambda1 = Kokkos::complex(tmp, sqrt_mult_alpha_offdiags); - *lambda2 = - Kokkos::complex(lambda1->real(), -lambda1->imag()); - *is_complex = true; + const real_type sqrt_mult_alpha_offdiags = ats::sqrt(-mult_alpha_offdiags); + *lambda1 = Kokkos::complex(tmp, sqrt_mult_alpha_offdiags); + *lambda2 = Kokkos::complex(lambda1->real(), -lambda1->imag()); + *is_complex = true; } } return 0; diff --git a/batched/dense/impl/KokkosBatched_Schur_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Schur_Serial_Internal.hpp index c7f35d5c4f..c6d55b301b 100644 --- a/batched/dense/impl/KokkosBatched_Schur_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Schur_Serial_Internal.hpp @@ -68,33 +68,27 @@ struct SerialSchurInternal { /// returns -1. template KOKKOS_INLINE_FUNCTION static int invoke(const int m, - /* */ RealType *H, const int hs0, - const int hs1, - /* */ RealType *Z, const int zs0, - const int zs1, - /* */ RealType *w, const int wlen, - const bool restart = false, + /* */ RealType *H, const int hs0, const int hs1, + /* */ RealType *Z, const int zs0, const int zs1, + /* */ RealType *w, const int wlen, const bool restart = false, const int user_max_iteration = -1) { typedef RealType real_type; typedef Kokkos::ArithTraits ats; const real_type /* one(1), */ zero(0), tol = 1e2 * ats::epsilon(); const int max_iteration = user_max_iteration < 0 ? 
300 : user_max_iteration; - if (wlen < m * 5) - Kokkos::abort("Error: provided workspace is smaller than 3*m"); + if (wlen < m * 5) Kokkos::abort("Error: provided workspace is smaller than 3*m"); int r_val = 0; if (restart) { - if (m <= 2) - Kokkos::abort("Error: restart option cannot be used for m=1 or m=2"); + if (m <= 2) Kokkos::abort("Error: restart option cannot be used for m=1 or m=2"); } else { /// do not touch input /// SerialSetIdentityInternal::invoke(m, Z, zs0, zs1); } // workspaces - real_type *subdiags = w; - Kokkos::pair *Gs = - (Kokkos::pair *)(w + m); + real_type *subdiags = w; + Kokkos::pair *Gs = (Kokkos::pair *)(w + m); if (!restart) { /// initialize workspace and Gs for (int i = 0; i < m; ++i) subdiags[i] = zero; @@ -111,8 +105,7 @@ struct SerialSchurInternal { bool is_complex; Kokkos::complex lambda1, lambda2; Kokkos::pair G; - SerialSchur2x2Internal::invoke(H, H + hs1, H + hs0, H + hs, &G, - &lambda1, &lambda2, &is_complex); + SerialSchur2x2Internal::invoke(H, H + hs1, H + hs0, H + hs, &G, &lambda1, &lambda2, &is_complex); G.second = -G.second; // transpose SerialApplyRightGivensInternal::invoke(G, 2, Z, zs0, Z + zs1, zs0); @@ -171,49 +164,37 @@ struct SerialSchurInternal { real_type *sub2x2 = H + (mend - 2) * hs; if (2 == mdiff) { Kokkos::pair G; - SerialSchur2x2Internal::invoke(sub2x2, sub2x2 + hs1, - sub2x2 + hs0, sub2x2 + hs, &G, - &lambda1, &lambda2, &is_complex); + SerialSchur2x2Internal::invoke(sub2x2, sub2x2 + hs1, sub2x2 + hs0, sub2x2 + hs, &G, &lambda1, &lambda2, + &is_complex); subdiags[mend - 1] = sub2x2[hs0]; /// apply G' from left G.second = -G.second; - SerialApplyLeftGivensInternal::invoke( - G, m - mend, sub2x2 + 2 * hs1, hs1, sub2x2 + hs0 + 2 * hs1, - hs1); + SerialApplyLeftGivensInternal::invoke(G, m - mend, sub2x2 + 2 * hs1, hs1, sub2x2 + hs0 + 2 * hs1, hs1); /// apply (G')' from right - SerialApplyRightGivensInternal::invoke( - G, mend - 2, sub2x2 - mend_minus_two_mult_hs0, hs0, - sub2x2 + hs1 - mend_minus_two_mult_hs0, 
hs0); + SerialApplyRightGivensInternal::invoke(G, mend - 2, sub2x2 - mend_minus_two_mult_hs0, hs0, + sub2x2 + hs1 - mend_minus_two_mult_hs0, hs0); sub2x2[hs0] = zero; /// apply (G')' from right to compute Z - SerialApplyRightGivensInternal::invoke( - G, m, Z + (mend - 2) * zs1, zs0, Z + (mend - 1) * zs1, zs0); + SerialApplyRightGivensInternal::invoke(G, m, Z + (mend - 2) * zs1, zs0, Z + (mend - 1) * zs1, zs0); } else { - SerialWilkinsonShiftInternal::invoke( - sub2x2[0], sub2x2[hs1], sub2x2[hs0], sub2x2[hs], &lambda1, - &lambda2, &is_complex); + SerialWilkinsonShiftInternal::invoke(sub2x2[0], sub2x2[hs1], sub2x2[hs0], sub2x2[hs], &lambda1, + &lambda2, &is_complex); - SerialFrancisInternal::invoke(mbeg, mend, m, H, hs0, hs1, - lambda1, lambda2, is_complex, Gs, - true); + SerialFrancisInternal::invoke(mbeg, mend, m, H, hs0, hs1, lambda1, lambda2, is_complex, Gs, true); /* */ auto &val1 = *(sub2x2 + hs0); /* */ auto &val2 = *(sub2x2 - hs1); const auto abs_val1 = ats::abs(val1); const auto abs_val2 = ats::abs(val2); for (int i = mbeg; i < (mend - 1); ++i) { - const Kokkos::pair G0( - Gs[2 * i].first, -Gs[2 * i].second); - const Kokkos::pair G1( - Gs[2 * i + 1].first, -Gs[2 * i + 1].second); - SerialApplyRightGivensInternal::invoke( - G0, m, Z + i * zs1, zs0, Z + i * zs1 + 1 * zs1, zs0); - SerialApplyRightGivensInternal::invoke( - G1, m, Z + i * zs1, zs0, Z + i * zs1 + 2 * zs1, zs0); + const Kokkos::pair G0(Gs[2 * i].first, -Gs[2 * i].second); + const Kokkos::pair G1(Gs[2 * i + 1].first, -Gs[2 * i + 1].second); + SerialApplyRightGivensInternal::invoke(G0, m, Z + i * zs1, zs0, Z + i * zs1 + 1 * zs1, zs0); + SerialApplyRightGivensInternal::invoke(G1, m, Z + i * zs1, zs0, Z + i * zs1 + 2 * zs1, zs0); } /// convergence check @@ -222,28 +203,23 @@ struct SerialSchurInternal { } else if (abs_val2 < tol) { /// preserve the standard schur form Kokkos::pair G; - SerialSchur2x2Internal::invoke( - sub2x2, sub2x2 + hs1, sub2x2 + hs0, sub2x2 + hs, &G, - &lambda1, &lambda2, 
&is_complex); + SerialSchur2x2Internal::invoke(sub2x2, sub2x2 + hs1, sub2x2 + hs0, sub2x2 + hs, &G, &lambda1, + &lambda2, &is_complex); subdiags[mend - 1] = val1; /// apply G' from left G.second = -G.second; - SerialApplyLeftGivensInternal::invoke( - G, m - mend, sub2x2 + 2 * hs1, hs1, - sub2x2 + hs0 + 2 * hs1, hs1); + SerialApplyLeftGivensInternal::invoke(G, m - mend, sub2x2 + 2 * hs1, hs1, sub2x2 + hs0 + 2 * hs1, + hs1); // apply (G')' from right - SerialApplyRightGivensInternal::invoke( - G, mend - 2, sub2x2 - mend_minus_two_mult_hs0, hs0, - sub2x2 + hs1 - mend_minus_two_mult_hs0, hs0); + SerialApplyRightGivensInternal::invoke(G, mend - 2, sub2x2 - mend_minus_two_mult_hs0, hs0, + sub2x2 + hs1 - mend_minus_two_mult_hs0, hs0); val1 = zero; val2 = zero; // apply (G')' from right - SerialApplyRightGivensInternal::invoke( - G, m, Z + (mend - 2) * zs1, zs0, Z + (mend - 1) * zs1, - zs0); + SerialApplyRightGivensInternal::invoke(G, m, Z + (mend - 2) * zs1, zs0, Z + (mend - 1) * zs1, zs0); } } } diff --git a/batched/dense/impl/KokkosBatched_SetIdentity_Impl.hpp b/batched/dense/impl/KokkosBatched_SetIdentity_Impl.hpp index e826c4cbb7..9219f3a9ec 100644 --- a/batched/dense/impl/KokkosBatched_SetIdentity_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_SetIdentity_Impl.hpp @@ -29,8 +29,7 @@ namespace KokkosBatched { template KOKKOS_INLINE_FUNCTION int SerialSetIdentity::invoke(const AViewType &A) { - return SerialSetIdentityInternal::invoke(A.extent(0), A.extent(1), A.data(), - A.stride_0(), A.stride_1()); + return SerialSetIdentityInternal::invoke(A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1()); } /// @@ -39,10 +38,8 @@ KOKKOS_INLINE_FUNCTION int SerialSetIdentity::invoke(const AViewType &A) { template template -KOKKOS_INLINE_FUNCTION int TeamSetIdentity::invoke( - const MemberType &member, const AViewType &A) { - return TeamSetIdentityInternal::invoke(member, A.extent(0), A.extent(1), - A.data(), A.stride_0(), A.stride_1()); +KOKKOS_INLINE_FUNCTION int 
TeamSetIdentity::invoke(const MemberType &member, const AViewType &A) { + return TeamSetIdentityInternal::invoke(member, A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1()); } } // end namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_SetIdentity_Internal.hpp b/batched/dense/impl/KokkosBatched_SetIdentity_Internal.hpp index 7a89767526..f5afb5c79c 100644 --- a/batched/dense/impl/KokkosBatched_SetIdentity_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_SetIdentity_Internal.hpp @@ -28,8 +28,7 @@ namespace KokkosBatched { struct SerialSetIdentityInternal { template KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { const ValueType one(1), zero(0); for (int j = 0; j < n; ++j) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) @@ -49,10 +48,8 @@ struct SerialSetIdentityInternal { /// ================== struct TeamSetIdentityInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, const int n, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { const ValueType one(1), zero(0); Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) @@ -70,15 +67,12 @@ struct TeamSetIdentityInternal { /// ======================== struct TeamVectorSetIdentityInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, const int n, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { const 
ValueType one(1), zero(0); Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, n), - [&](const int &j) { A[i * as0 + j * as1] = i == j ? one : zero; }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), + [&](const int &j) { A[i * as0 + j * as1] = i == j ? one : zero; }); }); return 0; diff --git a/batched/dense/impl/KokkosBatched_SetTriangular_Internal.hpp b/batched/dense/impl/KokkosBatched_SetTriangular_Internal.hpp index 844c3f72c5..09e94ab5f3 100644 --- a/batched/dense/impl/KokkosBatched_SetTriangular_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_SetTriangular_Internal.hpp @@ -27,11 +27,8 @@ namespace KokkosBatched { /// ==================== struct SerialSetLowerTriangularInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, - const int dist, - const ScalarType alpha, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, const int dist, const ScalarType alpha, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { for (int j = 0; j < n; ++j) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -47,18 +44,14 @@ struct SerialSetLowerTriangularInternal { struct TeamVectorSetLowerTriangularInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, const int n, - const int dist, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, const int dist, const ScalarType alpha, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &j) { const int jdist = j + dist; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, m), - [=](const int &i) { - if (i >= jdist) A[i * as0 + j * as1] = 
alpha; - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, m), [=](const int &i) { + if (i >= jdist) A[i * as0 + j * as1] = alpha; + }); }); return 0; } diff --git a/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp index 2e356f818e..c0963447c4 100644 --- a/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp @@ -36,19 +36,16 @@ namespace KokkosBatched { struct SerialShiftedTrsvInternalLower { template - KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType lambda, - const ValueTypeA *KOKKOS_RESTRICT A, + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType lambda, const ValueTypeA *KOKKOS_RESTRICT A, const int as0, const int as1, - /* */ ValueTypeB *KOKKOS_RESTRICT b, - const int bs0, + /* */ ValueTypeB *KOKKOS_RESTRICT b, const int bs0, const int *KOKKOS_RESTRICT blks) { const int as = as0 + as1; int p = 0; for (; p < m;) { const int blk = blks[p], iend = m - p - blk; - assert(((blk == 1) || (blk == 2)) && - "ShiftedTrsvLower: blocks are not 1x1 or 2x2"); + assert(((blk == 1) || (blk == 2)) && "ShiftedTrsvLower: blocks are not 1x1 or 2x2"); if (blk == 1) { const auto alpha11 = A[p * as] - lambda; ValueTypeB *KOKKOS_RESTRICT beta1 = b + p * bs0; @@ -84,9 +81,7 @@ struct SerialShiftedTrsvInternalLower { const ValueTypeA *KOKKOS_RESTRICT A21 = A + p * as + 2 * as0; ValueTypeB *KOKKOS_RESTRICT b2 = beta1 + 2 * bs0; - for (int i = 0; i < iend; ++i) - b2[i * bs0] -= - (A21[i * as0] * (*beta1) + A21[i * as0 + as1] * (*beta2)); + for (int i = 0; i < iend; ++i) b2[i * bs0] -= (A21[i * as0] * (*beta1) + A21[i * as0 + as1] * (*beta2)); } } p += blk; @@ -101,11 +96,9 @@ struct SerialShiftedTrsvInternalLower { struct SerialShiftedTrsvInternalUpper { template - KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType lambda, - const ValueTypeA 
*KOKKOS_RESTRICT A, + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType lambda, const ValueTypeA *KOKKOS_RESTRICT A, const int as0, const int as1, - /**/ ValueTypeB *KOKKOS_RESTRICT b, - const int bs0, + /**/ ValueTypeB *KOKKOS_RESTRICT b, const int bs0, const int *KOKKOS_RESTRICT blks) { const int as = as0 + as1; @@ -114,10 +107,9 @@ struct SerialShiftedTrsvInternalUpper { int p = m - 1; for (; p >= 0;) { const int blk = blks[p], iend = p + 1 - blk; - assert(((blk == 1) || (blk == 2)) && - "ShiftedTrsvUpper: blocks are not 1x1 or 2x2"); + assert(((blk == 1) || (blk == 2)) && "ShiftedTrsvUpper: blocks are not 1x1 or 2x2"); if (blk == 1) { - const auto alpha11 = A[p * as] - lambda; + const auto alpha11 = A[p * as] - lambda; /**/ ValueTypeB *KOKKOS_RESTRICT beta1 = b + p * bs0; // with KOKKOS_RESTRICT a compiler assumes that the pointer is not @@ -148,9 +140,7 @@ struct SerialShiftedTrsvInternalUpper { if (iend) { const ValueTypeA *KOKKOS_RESTRICT A01 = A + p_minus_one * as1; - for (int i = 0; i < iend; ++i) - b0[i * bs0] -= - (A01[i * as0] * (*beta1) + A01[i * as0 + as1] * (*beta2)); + for (int i = 0; i < iend; ++i) b0[i * bs0] -= (A01[i * as0] * (*beta1) + A01[i * as0 + as1] * (*beta2)); } } p -= blk; diff --git a/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Impl.hpp index 4f6f81216d..3b85a26294 100644 --- a/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Impl.hpp @@ -28,26 +28,21 @@ namespace KokkosBatched { /// =============== template struct TeamVectorSolveUTV { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int matrix_rank, const UViewType &U, - const TViewType &T, const VViewType &V, const pViewType &p, - const XViewType &X, const BViewType &B, const wViewType &w) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int 
matrix_rank, const UViewType &U, + const TViewType &T, const VViewType &V, const pViewType &p, + const XViewType &X, const BViewType &B, const wViewType &w) { if (BViewType::rank == 1) - TeamVectorSolveUTV_Internal::invoke( - member, matrix_rank, T.extent(0), V.extent(0), U.data(), U.stride(0), - U.stride(1), T.data(), T.stride(0), T.stride(1), V.data(), - V.stride(0), V.stride(1), p.data(), p.stride(0), X.data(), - X.stride(0), B.data(), B.stride(0), w.data()); + TeamVectorSolveUTV_Internal::invoke(member, matrix_rank, T.extent(0), V.extent(0), U.data(), U.stride(0), + U.stride(1), T.data(), T.stride(0), T.stride(1), V.data(), V.stride(0), + V.stride(1), p.data(), p.stride(0), X.data(), X.stride(0), B.data(), + B.stride(0), w.data()); else - TeamVectorSolveUTV_Internal::invoke( - member, matrix_rank, T.extent(0), V.extent(0), B.extent(1), U.data(), - U.stride(0), U.stride(1), T.data(), T.stride(0), T.stride(1), - V.data(), V.stride(0), V.stride(1), p.data(), p.stride(0), X.data(), - X.stride(0), X.stride(1), B.data(), B.stride(0), B.stride(1), - w.data()); + TeamVectorSolveUTV_Internal::invoke(member, matrix_rank, T.extent(0), V.extent(0), B.extent(1), U.data(), + U.stride(0), U.stride(1), T.data(), T.stride(0), T.stride(1), V.data(), + V.stride(0), V.stride(1), p.data(), p.stride(0), X.data(), X.stride(0), + X.stride(1), B.data(), B.stride(0), B.stride(1), w.data()); return 0; } }; diff --git a/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Internal.hpp index 71050504aa..18440745eb 100644 --- a/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Internal.hpp @@ -33,14 +33,13 @@ namespace KokkosBatched { /// =================== struct TeamVectorSolveUTV_Internal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int matrix_rank, const int m, - const int /*n*/, const ValueType *U, 
const int us0, const int us1, - const ValueType *T, const int ts0, const int ts1, const ValueType *V, - const int vs0, const int vs1, const IntType *p, const int ps0, - /* */ ValueType *x, const int xs0, - /* */ ValueType *b, const int bs0, - /* */ ValueType *w) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int matrix_rank, const int m, + const int /*n*/, const ValueType *U, const int us0, const int us1, + const ValueType *T, const int ts0, const int ts1, const ValueType *V, + const int vs0, const int vs1, const IntType *p, const int ps0, + /* */ ValueType *x, const int xs0, + /* */ ValueType *b, const int bs0, + /* */ ValueType *w) { typedef ValueType value_type; // typedef IntType int_type; @@ -49,40 +48,36 @@ struct TeamVectorSolveUTV_Internal { if (matrix_rank < m) { /// w = U^T b - KokkosBlas::Impl::TeamVectorGemvInternal::invoke( - member, matrix_rank, m, one, U, us1, us0, b, bs0, zero, w, ws0); + KokkosBlas::Impl::TeamVectorGemvInternal::invoke(member, matrix_rank, m, one, U, us1, us0, + b, bs0, zero, w, ws0); /// w = T^{-1} w - TeamVectorTrsvInternalLower::invoke( - member, false, matrix_rank, one, T, ts0, ts1, w, ws0); + TeamVectorTrsvInternalLower::invoke(member, false, matrix_rank, one, T, ts0, ts1, w, ws0); /// x = V^T w - KokkosBlas::Impl::TeamVectorGemvInternal::invoke( - member, m, matrix_rank, one, V, vs1, vs0, w, ws0, zero, x, xs0); + KokkosBlas::Impl::TeamVectorGemvInternal::invoke(member, m, matrix_rank, one, V, vs1, vs0, + w, ws0, zero, x, xs0); } else { - KokkosBlas::Impl::TeamVectorGemvInternal::invoke( - member, matrix_rank, m, one, U, us1, us0, b, bs0, zero, x, xs0); + KokkosBlas::Impl::TeamVectorGemvInternal::invoke(member, matrix_rank, m, one, U, us1, us0, + b, bs0, zero, x, xs0); - TeamVectorTrsvInternalUpper::invoke( - member, false, matrix_rank, one, T, ts0, ts1, x, xs0); + TeamVectorTrsvInternalUpper::invoke(member, false, matrix_rank, one, T, ts0, ts1, x, xs0); } /// x = P^T x - 
TeamVectorApplyPivotVectorBackwardInternal ::invoke(member, m, p, ps0, x, - xs0); + TeamVectorApplyPivotVectorBackwardInternal ::invoke(member, m, p, ps0, x, xs0); return 0; } template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int matrix_rank, const int m, const int n, - const int nrhs, const ValueType *U, const int us0, const int us1, - const ValueType *T, const int ts0, const int ts1, const ValueType *V, - const int vs0, const int vs1, const IntType *p, const int ps0, - /* */ ValueType *X, const int xs0, const int xs1, - /* */ ValueType *B, const int bs0, const int bs1, - /* */ ValueType *w) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int matrix_rank, const int m, const int n, + const int nrhs, const ValueType *U, const int us0, const int us1, + const ValueType *T, const int ts0, const int ts1, const ValueType *V, + const int vs0, const int vs1, const IntType *p, const int ps0, + /* */ ValueType *X, const int xs0, const int xs1, + /* */ ValueType *B, const int bs0, const int bs1, + /* */ ValueType *w) { typedef ValueType value_type; // typedef IntType int_type; @@ -96,37 +91,33 @@ struct TeamVectorSolveUTV_Internal { /// T is matrix_rank x matrix_rank /// V is matrix_rank x n /// W = U^T B - TeamVectorGemmInternal::invoke( - member, matrix_rank, nrhs, m, one, U, us1, us0, B, bs0, bs1, zero, W, - ws0, ws1); + TeamVectorGemmInternal::invoke(member, matrix_rank, nrhs, m, one, U, us1, us0, B, bs0, bs1, + zero, W, ws0, ws1); member.team_barrier(); /// W = T^{-1} W - TeamVectorTrsmInternalLeftLower::invoke( - member, false, matrix_rank, nrhs, one, T, ts0, ts1, W, ws0, ws1); + TeamVectorTrsmInternalLeftLower::invoke(member, false, matrix_rank, nrhs, one, T, ts0, ts1, + W, ws0, ws1); member.team_barrier(); /// X = V^T W - TeamVectorGemmInternal::invoke( - member, n, nrhs, matrix_rank, one, V, vs1, vs0, W, ws0, ws1, zero, X, - xs0, xs1); + TeamVectorGemmInternal::invoke(member, n, nrhs, matrix_rank, one, 
V, vs1, vs0, W, ws0, ws1, + zero, X, xs0, xs1); member.team_barrier(); } else { /// W = U^T B - TeamVectorGemmInternal::invoke( - member, matrix_rank, nrhs, m, one, U, us1, us0, B, bs0, bs1, zero, X, - xs0, xs1); + TeamVectorGemmInternal::invoke(member, matrix_rank, nrhs, m, one, U, us1, us0, B, bs0, bs1, + zero, X, xs0, xs1); member.team_barrier(); /// X = T^{-1} X - TeamVectorTrsmInternalLeftUpper::invoke( - member, false, matrix_rank, nrhs, one, T, ts0, ts1, X, xs0, xs1); + TeamVectorTrsmInternalLeftUpper::invoke(member, false, matrix_rank, nrhs, one, T, ts0, ts1, + X, xs0, xs1); member.team_barrier(); } /// X = P^T X - TeamVectorApplyPivotMatrixBackwardInternal ::invoke(member, nrhs, n, p, ps0, - X, xs0, xs1); + TeamVectorApplyPivotMatrixBackwardInternal ::invoke(member, nrhs, n, p, ps0, X, xs0, xs1); return 0; } diff --git a/batched/dense/impl/KokkosBatched_Tbsv_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Tbsv_Serial_Impl.hpp index 675e73f744..853e453b89 100644 --- a/batched/dense/impl/KokkosBatched_Tbsv_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Tbsv_Serial_Impl.hpp @@ -25,17 +25,12 @@ namespace KokkosBatched { template -KOKKOS_INLINE_FUNCTION static int checkTbsvInput( - [[maybe_unused]] const AViewType &A, [[maybe_unused]] const XViewType &x, - [[maybe_unused]] const int k) { - static_assert(Kokkos::is_view::value, - "KokkosBatched::tbsv: AViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::tbsv: XViewType is not a Kokkos::View."); - static_assert(AViewType::rank == 2, - "KokkosBatched::tbsv: AViewType must have rank 2."); - static_assert(XViewType::rank == 1, - "KokkosBatched::tbsv: XViewType must have rank 1."); +KOKKOS_INLINE_FUNCTION static int checkTbsvInput([[maybe_unused]] const AViewType &A, + [[maybe_unused]] const XViewType &x, [[maybe_unused]] const int k) { + static_assert(Kokkos::is_view::value, "KokkosBatched::tbsv: AViewType is not a Kokkos::View."); + 
static_assert(Kokkos::is_view::value, "KokkosBatched::tbsv: XViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, "KokkosBatched::tbsv: AViewType must have rank 2."); + static_assert(XViewType::rank == 1, "KokkosBatched::tbsv: XViewType must have rank 1."); #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) if (k < 0) { @@ -70,97 +65,79 @@ KOKKOS_INLINE_FUNCTION static int checkTbsvInput( //// Lower non-transpose //// template -struct SerialTbsv { +struct SerialTbsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, - const XViewType &x, const int k) { + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const XViewType &x, const int k) { auto info = checkTbsvInput(A, x, k); if (info) return info; return SerialTbsvInternalLower::invoke( - ArgDiag::use_unit_diag, A.extent(1), A.data(), A.stride_0(), - A.stride_1(), x.data(), x.stride_0(), k); + ArgDiag::use_unit_diag, A.extent(1), A.data(), A.stride_0(), A.stride_1(), x.data(), x.stride_0(), k); } }; //// Lower transpose //// template -struct SerialTbsv { +struct SerialTbsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, - const XViewType &x, const int k) { + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const XViewType &x, const int k) { auto info = checkTbsvInput(A, x, k); if (info) return info; return SerialTbsvInternalLowerTranspose::invoke( - ArgDiag::use_unit_diag, false, A.extent(1), A.data(), A.stride_0(), - A.stride_1(), x.data(), x.stride_0(), k); + ArgDiag::use_unit_diag, false, A.extent(1), A.data(), A.stride_0(), A.stride_1(), x.data(), x.stride_0(), k); } }; //// Lower conjugate-transpose //// template -struct SerialTbsv { +struct SerialTbsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, - const XViewType &x, const int k) { + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const XViewType &x, const int k) { auto info = checkTbsvInput(A, x, k); if (info) return info; return 
SerialTbsvInternalLowerTranspose::invoke( - ArgDiag::use_unit_diag, true, A.extent(1), A.data(), A.stride_0(), - A.stride_1(), x.data(), x.stride_0(), k); + ArgDiag::use_unit_diag, true, A.extent(1), A.data(), A.stride_0(), A.stride_1(), x.data(), x.stride_0(), k); } }; //// Upper non-transpose //// template -struct SerialTbsv { +struct SerialTbsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, - const XViewType &x, const int k) { + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const XViewType &x, const int k) { auto info = checkTbsvInput(A, x, k); if (info) return info; return SerialTbsvInternalUpper::invoke( - ArgDiag::use_unit_diag, A.extent(1), A.data(), A.stride_0(), - A.stride_1(), x.data(), x.stride_0(), k); + ArgDiag::use_unit_diag, A.extent(1), A.data(), A.stride_0(), A.stride_1(), x.data(), x.stride_0(), k); } }; //// Upper transpose //// template -struct SerialTbsv { +struct SerialTbsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, - const XViewType &x, const int k) { + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const XViewType &x, const int k) { auto info = checkTbsvInput(A, x, k); if (info) return info; return SerialTbsvInternalUpperTranspose::invoke( - ArgDiag::use_unit_diag, false, A.extent(1), A.data(), A.stride_0(), - A.stride_1(), x.data(), x.stride_0(), k); + ArgDiag::use_unit_diag, false, A.extent(1), A.data(), A.stride_0(), A.stride_1(), x.data(), x.stride_0(), k); } }; //// Upper conjugate-transpose //// template -struct SerialTbsv { +struct SerialTbsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, - const XViewType &x, const int k) { + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const XViewType &x, const int k) { auto info = checkTbsvInput(A, x, k); if (info) return info; return SerialTbsvInternalUpperTranspose::invoke( - ArgDiag::use_unit_diag, true, A.extent(1), A.data(), A.stride_0(), - A.stride_1(), x.data(), 
x.stride_0(), k); + ArgDiag::use_unit_diag, true, A.extent(1), A.data(), A.stride_0(), A.stride_1(), x.data(), x.stride_0(), k); } }; diff --git a/batched/dense/impl/KokkosBatched_Tbsv_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Tbsv_Serial_Internal.hpp index d2f5df4649..64221008cc 100644 --- a/batched/dense/impl/KokkosBatched_Tbsv_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Tbsv_Serial_Internal.hpp @@ -34,20 +34,15 @@ namespace KokkosBatched { template struct SerialTbsvInternalLower { template - KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, - const int an, - const ValueType *KOKKOS_RESTRICT A, + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, const int an, const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT x, - const int xs0, const int k); + /**/ ValueType *KOKKOS_RESTRICT x, const int xs0, const int k); }; template <> template -KOKKOS_INLINE_FUNCTION int -SerialTbsvInternalLower::invoke( - const bool use_unit_diag, const int an, const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int SerialTbsvInternalLower::invoke( + const bool use_unit_diag, const int an, const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT x, const int xs0, const int k) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -76,20 +71,16 @@ SerialTbsvInternalLower::invoke( template struct SerialTbsvInternalLowerTranspose { template - KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, - const bool do_conj, const int an, - const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT x, - const int xs0, const int k); + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, const bool do_conj, const int an, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, const int xs0, 
const int k); }; template <> template -KOKKOS_INLINE_FUNCTION int -SerialTbsvInternalLowerTranspose::invoke( - const bool use_unit_diag, const bool do_conj, const int an, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int SerialTbsvInternalLowerTranspose::invoke( + const bool use_unit_diag, const bool do_conj, const int an, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, /**/ ValueType *KOKKOS_RESTRICT x, const int xs0, const int k) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -102,12 +93,9 @@ SerialTbsvInternalLowerTranspose::invoke( #pragma unroll #endif for (int i = Kokkos::min(an - 1, j + k); i > j; --i) { - temp -= - Kokkos::ArithTraits::conj(A[(i - j) * as0 + j * as1]) * - x[i * xs0]; + temp -= Kokkos::ArithTraits::conj(A[(i - j) * as0 + j * as1]) * x[i * xs0]; } - if (!use_unit_diag) - temp = temp / Kokkos::ArithTraits::conj(A[0 + j * as1]); + if (!use_unit_diag) temp = temp / Kokkos::ArithTraits::conj(A[0 + j * as1]); } else { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -130,20 +118,15 @@ SerialTbsvInternalLowerTranspose::invoke( template struct SerialTbsvInternalUpper { template - KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, - const int an, - const ValueType *KOKKOS_RESTRICT A, + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, const int an, const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT x, - const int xs0, const int k); + /**/ ValueType *KOKKOS_RESTRICT x, const int xs0, const int k); }; template <> template -KOKKOS_INLINE_FUNCTION int -SerialTbsvInternalUpper::invoke( - const bool use_unit_diag, const int an, const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int SerialTbsvInternalUpper::invoke( + const bool use_unit_diag, const int an, const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType 
*KOKKOS_RESTRICT x, const int xs0, const int k) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -172,20 +155,16 @@ SerialTbsvInternalUpper::invoke( template struct SerialTbsvInternalUpperTranspose { template - KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, - const bool do_conj, const int an, - const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT x, - const int xs0, const int k); + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, const bool do_conj, const int an, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, const int xs0, const int k); }; template <> template -KOKKOS_INLINE_FUNCTION int -SerialTbsvInternalUpperTranspose::invoke( - const bool use_unit_diag, const bool do_conj, const int an, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int SerialTbsvInternalUpperTranspose::invoke( + const bool use_unit_diag, const bool do_conj, const int an, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, /**/ ValueType *KOKKOS_RESTRICT x, const int xs0, const int k) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -197,13 +176,9 @@ SerialTbsvInternalUpperTranspose::invoke( #pragma unroll #endif for (int i = Kokkos::max(0, j - k); i < j; ++i) { - temp -= Kokkos::ArithTraits::conj( - A[(i + k - j) * as0 + j * as1]) * - x[i * xs0]; + temp -= Kokkos::ArithTraits::conj(A[(i + k - j) * as0 + j * as1]) * x[i * xs0]; } - if (!use_unit_diag) - temp = - temp / Kokkos::ArithTraits::conj(A[k * as0 + j * as1]); + if (!use_unit_diag) temp = temp / Kokkos::ArithTraits::conj(A[k * as0 + j * as1]); } else { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll diff --git a/batched/dense/impl/KokkosBatched_Trmm_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Trmm_Serial_Impl.hpp index 044af0814c..6313d817c6 100644 --- 
a/batched/dense/impl/KokkosBatched_Trmm_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Trmm_Serial_Impl.hpp @@ -23,164 +23,116 @@ namespace KokkosBatched { //// Lower non-transpose //// template -struct SerialTrmm { +struct SerialTrmm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { return SerialTrmmInternalLeftLower::invoke( - ArgDiag::use_unit_diag, false, A.extent(0), A.extent(1), B.extent(0), - B.extent(1), alpha, A.data(), A.stride_0(), A.stride_1(), B.data(), - B.stride_0(), B.stride_1()); + ArgDiag::use_unit_diag, false, A.extent(0), A.extent(1), B.extent(0), B.extent(1), alpha, A.data(), + A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1()); } }; template -struct SerialTrmm { +struct SerialTrmm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { return SerialTrmmInternalRightLower::invoke( - ArgDiag::use_unit_diag, false, A.extent(0), A.extent(1), B.extent(0), - B.extent(1), alpha, A.data(), A.stride_0(), A.stride_1(), B.data(), - B.stride_0(), B.stride_1()); + ArgDiag::use_unit_diag, false, A.extent(0), A.extent(1), B.extent(0), B.extent(1), alpha, A.data(), + A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1()); } }; //// Lower transpose ///// template -struct SerialTrmm { +struct SerialTrmm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { return SerialTrmmInternalLeftUpper::invoke( - ArgDiag::use_unit_diag, false, A.extent(1), A.extent(0), B.extent(0), - B.extent(1), alpha, 
A.data(), A.stride_1(), A.stride_0(), B.data(), - B.stride_0(), B.stride_1()); + ArgDiag::use_unit_diag, false, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), + A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1()); } }; template -struct SerialTrmm { +struct SerialTrmm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { return SerialTrmmInternalRightUpper::invoke( - ArgDiag::use_unit_diag, false, A.extent(1), A.extent(0), B.extent(0), - B.extent(1), alpha, A.data(), A.stride_1(), A.stride_0(), B.data(), - B.stride_0(), B.stride_1()); + ArgDiag::use_unit_diag, false, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), + A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1()); } }; //// Lower conjugate-transpose //// template -struct SerialTrmm { +struct SerialTrmm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { return SerialTrmmInternalLeftUpper::invoke( - ArgDiag::use_unit_diag, true, A.extent(1), A.extent(0), B.extent(0), - B.extent(1), alpha, A.data(), A.stride_1(), A.stride_0(), B.data(), - B.stride_0(), B.stride_1()); + ArgDiag::use_unit_diag, true, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), A.stride_1(), + A.stride_0(), B.data(), B.stride_0(), B.stride_1()); } }; template -struct SerialTrmm { +struct SerialTrmm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { return SerialTrmmInternalRightUpper::invoke( - ArgDiag::use_unit_diag, true, 
A.extent(1), A.extent(0), B.extent(0), - B.extent(1), alpha, A.data(), A.stride_1(), A.stride_0(), B.data(), - B.stride_0(), B.stride_1()); + ArgDiag::use_unit_diag, true, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), A.stride_1(), + A.stride_0(), B.data(), B.stride_0(), B.stride_1()); } }; //// Upper non-transpose //// template -struct SerialTrmm { +struct SerialTrmm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { return SerialTrmmInternalLeftUpper::invoke( - ArgDiag::use_unit_diag, false, A.extent(0), A.extent(1), B.extent(0), - B.extent(1), alpha, A.data(), A.stride_0(), A.stride_1(), B.data(), - B.stride_0(), B.stride_1()); + ArgDiag::use_unit_diag, false, A.extent(0), A.extent(1), B.extent(0), B.extent(1), alpha, A.data(), + A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1()); } }; template -struct SerialTrmm { +struct SerialTrmm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { return SerialTrmmInternalRightUpper::invoke( - ArgDiag::use_unit_diag, false, A.extent(0), A.extent(1), B.extent(0), - B.extent(1), alpha, A.data(), A.stride_0(), A.stride_1(), B.data(), - B.stride_0(), B.stride_1()); + ArgDiag::use_unit_diag, false, A.extent(0), A.extent(1), B.extent(0), B.extent(1), alpha, A.data(), + A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1()); } }; //// Upper transpose ///// template -struct SerialTrmm { +struct SerialTrmm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType 
&B) { return SerialTrmmInternalLeftLower::invoke( - ArgDiag::use_unit_diag, false, A.extent(1), A.extent(0), B.extent(0), - B.extent(1), alpha, A.data(), A.stride_1(), A.stride_0(), B.data(), - B.stride_0(), B.stride_1()); + ArgDiag::use_unit_diag, false, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), + A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1()); } }; template -struct SerialTrmm { +struct SerialTrmm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { return SerialTrmmInternalRightLower::invoke( - ArgDiag::use_unit_diag, false, A.extent(1), A.extent(0), B.extent(0), - B.extent(1), alpha, A.data(), A.stride_1(), A.stride_0(), B.data(), - B.stride_0(), B.stride_1()); + ArgDiag::use_unit_diag, false, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), + A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1()); } }; //// Upper conjugate-transpose //// template -struct SerialTrmm { +struct SerialTrmm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { return SerialTrmmInternalLeftLower::invoke( - ArgDiag::use_unit_diag, true, A.extent(1), A.extent(0), B.extent(0), - B.extent(1), alpha, A.data(), A.stride_1(), A.stride_0(), B.data(), - B.stride_0(), B.stride_1()); + ArgDiag::use_unit_diag, true, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), A.stride_1(), + A.stride_0(), B.data(), B.stride_0(), B.stride_1()); } }; template -struct SerialTrmm { +struct SerialTrmm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int 
invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { return SerialTrmmInternalRightLower::invoke( - ArgDiag::use_unit_diag, true, A.extent(1), A.extent(0), B.extent(0), - B.extent(1), alpha, A.data(), A.stride_1(), A.stride_0(), B.data(), - B.stride_0(), B.stride_1()); + ArgDiag::use_unit_diag, true, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), A.stride_1(), + A.stride_0(), B.data(), B.stride_0(), B.stride_1()); } }; } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp index 3e4024974b..c36d04213d 100644 --- a/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp @@ -27,41 +27,37 @@ namespace KokkosBatched { template struct SerialTrmmInternalLeftLower { template - KOKKOS_INLINE_FUNCTION static int invoke( - const bool use_unit_diag, const bool do_conj, const int am, const int an, - const int bm, const int bn, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, const bool do_conj, const int am, const int an, + const int bm, const int bn, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); }; template struct SerialTrmmInternalLeftUpper { template - KOKKOS_INLINE_FUNCTION static int invoke( - const bool use_unit_diag, const bool do_conj, const int am, const int an, - const int bm, const int bn, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, const bool do_conj, const int am, const int an, + const 
int bm, const int bn, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); }; template struct SerialTrmmInternalRightLower { template - KOKKOS_INLINE_FUNCTION static int invoke( - const bool use_unit_diag, const bool do_conj, const int am, const int an, - const int bm, const int bn, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, const bool do_conj, const int am, const int an, + const int bm, const int bn, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); }; template struct SerialTrmmInternalRightUpper { template - KOKKOS_INLINE_FUNCTION static int invoke( - const bool use_unit_diag, const bool do_conj, const int am, const int an, - const int bm, const int bn, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, const bool do_conj, const int am, const int an, + const int bm, const int bn, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); }; // ech-note: use_unit_diag intentionally ignored for now. Compiler can optimize @@ -70,11 +66,9 @@ struct SerialTrmmInternalRightUpper { // if use_unit_diag. 
template <> template -KOKKOS_INLINE_FUNCTION int -SerialTrmmInternalLeftLower::invoke( - const bool /*use_unit_diag*/, const bool do_conj, const int am, - const int an, const int bm, const int bn, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int SerialTrmmInternalLeftLower::invoke( + const bool /*use_unit_diag*/, const bool do_conj, const int am, const int an, const int bm, const int bn, + const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { const ScalarType one(1.0), zero(0.0); typedef Kokkos::ArithTraits AT; @@ -87,27 +81,23 @@ SerialTrmmInternalLeftLower::invoke( //} // printf("SerialTrmmInternalLeftLower\n"); - auto dotLowerLeftConj = - [&](const ValueType *KOKKOS_RESTRICT __A, const int __as0, - const int __as1, const int __left_row, ValueType *KOKKOS_RESTRICT __B, - const int __bs0, const int __bs1, const int __right_col) { - auto B_elems = __left_row; - ScalarType sum = 0; + auto dotLowerLeftConj = [&](const ValueType *KOKKOS_RESTRICT __A, const int __as0, const int __as1, + const int __left_row, ValueType *KOKKOS_RESTRICT __B, const int __bs0, const int __bs1, + const int __right_col) { + auto B_elems = __left_row; + ScalarType sum = 0; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif - for (int i = 0; i <= B_elems; i++) { - // sum += A[left_row, i] * B[i, right_col] - sum += AT::conj(__A[__left_row * __as0 + i * __as1]) * - __B[i * __bs0 + __bs1 * __right_col]; - } - return sum; - }; + for (int i = 0; i <= B_elems; i++) { + // sum += A[left_row, i] * B[i, right_col] + sum += AT::conj(__A[__left_row * __as0 + i * __as1]) * __B[i * __bs0 + __bs1 * __right_col]; + } + return sum; + }; - auto dotLowerLeft = [&](const ValueType *KOKKOS_RESTRICT __A, const int __as0, - const int __as1, const int __left_row, - ValueType *KOKKOS_RESTRICT __B, const int __bs0, - const int __bs1, 
const int __right_col) { + auto dotLowerLeft = [&](const ValueType *KOKKOS_RESTRICT __A, const int __as0, const int __as1, const int __left_row, + ValueType *KOKKOS_RESTRICT __B, const int __bs0, const int __bs1, const int __right_col) { auto B_elems = __left_row; ScalarType sum = 0; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) @@ -115,8 +105,7 @@ SerialTrmmInternalLeftLower::invoke( #endif for (int i = 0; i <= B_elems; i++) { // sum += A[left_row, i] * B[i, right_col] - sum += __A[__left_row * __as0 + i * __as1] * - __B[i * __bs0 + __bs1 * __right_col]; + sum += __A[__left_row * __as0 + i * __as1] * __B[i * __bs0 + __bs1 * __right_col]; } return sum; }; @@ -126,8 +115,7 @@ SerialTrmmInternalLeftLower::invoke( if (alpha == zero) KokkosBlas::Impl::SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); + if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -138,8 +126,7 @@ SerialTrmmInternalLeftLower::invoke( #endif for (int n = 0; n < right_n; n++) { if (do_conj) { - B[m * bs0 + n * bs1] = - dotLowerLeftConj(A, as0, as1, m, B, bs0, bs1, n); + B[m * bs0 + n * bs1] = dotLowerLeftConj(A, as0, as1, m, B, bs0, bs1, n); } else { B[m * bs0 + n * bs1] = dotLowerLeft(A, as0, as1, m, B, bs0, bs1, n); } @@ -155,11 +142,9 @@ SerialTrmmInternalLeftLower::invoke( // if use_unit_diag. 
template <> template -KOKKOS_INLINE_FUNCTION int -SerialTrmmInternalRightLower::invoke( - const bool /*use_unit_diag*/, const bool do_conj, const int am, - const int an, const int bm, const int bn, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int SerialTrmmInternalRightLower::invoke( + const bool /*use_unit_diag*/, const bool do_conj, const int am, const int an, const int bm, const int bn, + const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { const ScalarType one(1.0), zero(0.0); typedef Kokkos::ArithTraits AT; @@ -174,11 +159,9 @@ SerialTrmmInternalRightLower::invoke( // Lower triangular matrix is on RHS with the base facing down. // Everytime we compute a new output row of B, we must shift over to the // right by one in A's column to ensure we skip the 0's. - auto dotLowerRightConj = [&](const ValueType *KOKKOS_RESTRICT __A, - const int __as0, const int __as1, const int __am, - const int __left_row, - ValueType *KOKKOS_RESTRICT __B, const int __bs0, - const int __bs1, const int __right_col) { + auto dotLowerRightConj = [&](const ValueType *KOKKOS_RESTRICT __A, const int __as0, const int __as1, const int __am, + const int __left_row, ValueType *KOKKOS_RESTRICT __B, const int __bs0, const int __bs1, + const int __right_col) { auto B_elems = __am - 1; ScalarType sum = 0; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) @@ -186,16 +169,13 @@ SerialTrmmInternalRightLower::invoke( #endif for (int i = __right_col; i <= B_elems; i++) { // sum += B[left_row, i] * A[i, right_col] - sum += __B[__bs0 * __left_row + i * __bs1] * - AT::conj(__A[i * __as0 + __right_col * __as1]); + sum += __B[__bs0 * __left_row + i * __bs1] * AT::conj(__A[i * __as0 + __right_col * __as1]); } return sum; }; - auto dotLowerRight = [&](const ValueType *KOKKOS_RESTRICT __A, - const int __as0, const int __as1, const int __am, - const 
int __left_row, ValueType *KOKKOS_RESTRICT __B, - const int __bs0, const int __bs1, + auto dotLowerRight = [&](const ValueType *KOKKOS_RESTRICT __A, const int __as0, const int __as1, const int __am, + const int __left_row, ValueType *KOKKOS_RESTRICT __B, const int __bs0, const int __bs1, const int __right_col) { auto B_elems = __am - 1; ScalarType sum = 0; @@ -204,8 +184,7 @@ SerialTrmmInternalRightLower::invoke( #endif for (int i = __right_col; i <= B_elems; i++) { // sum += B[left_row, i] * A[i, right_col] - sum += __B[__bs0 * __left_row + i * __bs1] * - __A[i * __as0 + __right_col * __as1]; + sum += __B[__bs0 * __left_row + i * __bs1] * __A[i * __as0 + __right_col * __as1]; } return sum; }; @@ -215,8 +194,7 @@ SerialTrmmInternalRightLower::invoke( if (alpha == zero) KokkosBlas::Impl::SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); + if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -227,11 +205,9 @@ SerialTrmmInternalRightLower::invoke( #endif for (int n = 0; n < right_n; n++) { if (do_conj) { - B[m * bs0 + n * bs1] = - dotLowerRightConj(A, as0, as1, am, m, B, bs0, bs1, n); + B[m * bs0 + n * bs1] = dotLowerRightConj(A, as0, as1, am, m, B, bs0, bs1, n); } else { - B[m * bs0 + n * bs1] = - dotLowerRight(A, as0, as1, am, m, B, bs0, bs1, n); + B[m * bs0 + n * bs1] = dotLowerRight(A, as0, as1, am, m, B, bs0, bs1, n); } } } @@ -241,11 +217,9 @@ SerialTrmmInternalRightLower::invoke( template <> template -KOKKOS_INLINE_FUNCTION int -SerialTrmmInternalLeftUpper::invoke( - const bool /*use_unit_diag*/, const bool do_conj, const int am, - const int an, const int bm, const int bn, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int SerialTrmmInternalLeftUpper::invoke( + const bool /*use_unit_diag*/, 
const bool do_conj, const int am, const int an, const int bm, const int bn, + const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { const ScalarType one(1.0), zero(0.0); typedef Kokkos::ArithTraits AT; @@ -257,11 +231,9 @@ SerialTrmmInternalLeftUpper::invoke( // conjOp = AT::conj; //} - auto dotUpperLeftConj = [&](const ValueType *KOKKOS_RESTRICT __A, - const int __as0, const int __as1, const int __an, - const int __left_row, - ValueType *KOKKOS_RESTRICT __B, const int __bs0, - const int __bs1, const int __right_col) { + auto dotUpperLeftConj = [&](const ValueType *KOKKOS_RESTRICT __A, const int __as0, const int __as1, const int __an, + const int __left_row, ValueType *KOKKOS_RESTRICT __B, const int __bs0, const int __bs1, + const int __right_col) { auto B_elems = __an - __left_row - 1; ScalarType sum = 0; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) @@ -275,10 +247,9 @@ SerialTrmmInternalLeftUpper::invoke( return sum; }; - auto dotUpperLeft = [&](const ValueType *KOKKOS_RESTRICT __A, const int __as0, - const int __as1, const int __an, const int __left_row, - ValueType *KOKKOS_RESTRICT __B, const int __bs0, - const int __bs1, const int __right_col) { + auto dotUpperLeft = [&](const ValueType *KOKKOS_RESTRICT __A, const int __as0, const int __as1, const int __an, + const int __left_row, ValueType *KOKKOS_RESTRICT __B, const int __bs0, const int __bs1, + const int __right_col) { auto B_elems = __an - __left_row - 1; ScalarType sum = 0; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) @@ -286,8 +257,7 @@ SerialTrmmInternalLeftUpper::invoke( #endif for (int i = 0; i <= B_elems; i++) { // sum += A[left_row, i+left_row] * B[i+left_row, right_col] - sum += __A[__left_row * __as0 + (i + __left_row) * __as1] * - __B[(i + __left_row) * __bs0 + __bs1 * __right_col]; + sum += __A[__left_row * __as0 + (i + __left_row) * __as1] * __B[(i + __left_row) * __bs0 + __bs1 * __right_col]; } 
return sum; }; @@ -297,8 +267,7 @@ SerialTrmmInternalLeftUpper::invoke( if (alpha == zero) KokkosBlas::Impl::SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); + if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -309,11 +278,9 @@ SerialTrmmInternalLeftUpper::invoke( #endif for (int n = 0; n < right_n; ++n) { if (do_conj) { - B[m * bs0 + n * bs1] = - dotUpperLeftConj(A, as0, as1, an, m, B, bs0, bs1, n); + B[m * bs0 + n * bs1] = dotUpperLeftConj(A, as0, as1, an, m, B, bs0, bs1, n); } else { - B[m * bs0 + n * bs1] = - dotUpperLeft(A, as0, as1, an, m, B, bs0, bs1, n); + B[m * bs0 + n * bs1] = dotUpperLeft(A, as0, as1, an, m, B, bs0, bs1, n); } } } @@ -323,11 +290,9 @@ SerialTrmmInternalLeftUpper::invoke( template <> template -KOKKOS_INLINE_FUNCTION int -SerialTrmmInternalRightUpper::invoke( - const bool /*use_unit_diag*/, const bool do_conj, const int am, - const int an, const int bm, const int bn, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int SerialTrmmInternalRightUpper::invoke( + const bool /*use_unit_diag*/, const bool do_conj, const int am, const int an, const int bm, const int bn, + const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { const ScalarType one(1.0), zero(0.0); typedef Kokkos::ArithTraits AT; @@ -339,47 +304,41 @@ SerialTrmmInternalRightUpper::invoke( // conjOp = AT::conj; //} - auto dotUpperRightConj = - [&](const ValueType *KOKKOS_RESTRICT __A, const int __as0, - const int __as1, const int __left_row, ValueType *KOKKOS_RESTRICT __B, - const int __bs0, const int __bs1, const int __right_col) { - auto B_elems = __right_col; - ScalarType sum = 0; + auto dotUpperRightConj = 
[&](const ValueType *KOKKOS_RESTRICT __A, const int __as0, const int __as1, + const int __left_row, ValueType *KOKKOS_RESTRICT __B, const int __bs0, const int __bs1, + const int __right_col) { + auto B_elems = __right_col; + ScalarType sum = 0; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif - for (int i = 0; i <= B_elems; i++) { - // sum += B[left_row, i] * A[i, right_col] - sum += __B[__left_row * __bs0 + i * __bs1] * - AT::conj(__A[i * __as0 + __right_col * __as1]); - } - return sum; - }; - - auto dotUpperRight = - [&](const ValueType *KOKKOS_RESTRICT __A, const int __as0, - const int __as1, const int __left_row, ValueType *KOKKOS_RESTRICT __B, - const int __bs0, const int __bs1, const int __right_col) { - auto B_elems = __right_col; - ScalarType sum = 0; + for (int i = 0; i <= B_elems; i++) { + // sum += B[left_row, i] * A[i, right_col] + sum += __B[__left_row * __bs0 + i * __bs1] * AT::conj(__A[i * __as0 + __right_col * __as1]); + } + return sum; + }; + + auto dotUpperRight = [&](const ValueType *KOKKOS_RESTRICT __A, const int __as0, const int __as1, const int __left_row, + ValueType *KOKKOS_RESTRICT __B, const int __bs0, const int __bs1, const int __right_col) { + auto B_elems = __right_col; + ScalarType sum = 0; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif - for (int i = 0; i <= B_elems; i++) { - // sum += B[left_row, i] * A[i, right_col] - sum += __B[__left_row * __bs0 + i * __bs1] * - __A[i * __as0 + __right_col * __as1]; - } - return sum; - }; + for (int i = 0; i <= B_elems; i++) { + // sum += B[left_row, i] * A[i, right_col] + sum += __B[__left_row * __bs0 + i * __bs1] * __A[i * __as0 + __right_col * __as1]; + } + return sum; + }; if (bm <= 0 || bn <= 0 || am <= 0 || an <= 0) return 0; if (alpha == zero) KokkosBlas::Impl::SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); + if (alpha != one) 
KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -390,8 +349,7 @@ SerialTrmmInternalRightUpper::invoke( #endif for (int n = right_n - 1; n >= 0; --n) { if (do_conj) { - B[m * bs0 + n * bs1] = - dotUpperRightConj(A, as0, as1, m, B, bs0, bs1, n); + B[m * bs0 + n * bs1] = dotUpperRightConj(A, as0, as1, m, B, bs0, bs1, n); } else { B[m * bs0 + n * bs1] = dotUpperRight(A, as0, as1, m, B, bs0, bs1, n); } diff --git a/batched/dense/impl/KokkosBatched_Trsm_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Trsm_Serial_Impl.hpp index 4d094c24d2..694ac36fa0 100644 --- a/batched/dense/impl/KokkosBatched_Trsm_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Trsm_Serial_Impl.hpp @@ -29,43 +29,32 @@ namespace KokkosBatched { /// B := inv(tril(A)) (alpha*B) /// A(m x m), B(m x n) -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && \ - defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { typedef typename BViewType::value_type vector_type; // typedef typename vector_type::value_type value_type; const int m = B.extent(0), n = B.extent(1); - static_assert(is_vector::value, - "value type is not vector type"); - static_assert( - vector_type::vector_length == 4 || vector_type::vector_length == 8, - "AVX, AVX2 and AVX512 is supported"); - const MKL_COMPACT_PACK format = - vector_type::vector_length == 8 ? 
MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + static_assert(is_vector::value, "value type is not vector type"); + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; // no error check int r_val = 0; if (A.stride_0() == 1 && B.stride_0() == 1) { mkl_dtrsm_compact(MKL_COL_MAJOR, MKL_LEFT, MKL_LOWER, MKL_NOTRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_1(), - (double *)B.data(), B.stride_1(), format, - (MKL_INT)vector_type::vector_length); + ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, alpha, (const double *)A.data(), + A.stride_1(), (double *)B.data(), B.stride_1(), format, (MKL_INT)vector_type::vector_length); } else if (A.stride_1() == 1 && B.stride_1() == 1) { mkl_dtrsm_compact(MKL_ROW_MAJOR, MKL_LEFT, MKL_LOWER, MKL_NOTRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_0(), - (double *)B.data(), B.stride_0(), format, - (MKL_INT)vector_type::vector_length); + ArgDiag::use_unit_diag ? 
MKL_UNIT : MKL_NONUNIT, m, n, alpha, (const double *)A.data(), + A.stride_0(), (double *)B.data(), B.stride_0(), format, (MKL_INT)vector_type::vector_length); } else { r_val = -1; } @@ -75,28 +64,22 @@ struct SerialTrsm -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { - return SerialTrsmInternalLeftLower::invoke( - ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { + return SerialTrsmInternalLeftLower::invoke(ArgDiag::use_unit_diag, B.extent(0), B.extent(1), + alpha, A.data(), A.stride_0(), A.stride_1(), + B.data(), B.stride_0(), B.stride_1()); } }; template -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { - return SerialTrsmInternalLeftLower::invoke( - ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { + return SerialTrsmInternalLeftLower::invoke(ArgDiag::use_unit_diag, B.extent(0), B.extent(1), + alpha, A.data(), A.stride_0(), A.stride_1(), + B.data(), B.stride_0(), B.stride_1()); } }; @@ -105,43 +88,32 @@ struct SerialTrsm -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { typedef typename BViewType::value_type vector_type; // typedef typename vector_type::value_type value_type; const int m = B.extent(0), n = B.extent(1); - 
static_assert(is_vector::value, - "value type is not vector type"); - static_assert( - vector_type::vector_length == 4 || vector_type::vector_length == 8, - "AVX, AVX2 and AVX512 is supported"); - const MKL_COMPACT_PACK format = - vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + static_assert(is_vector::value, "value type is not vector type"); + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; // no error check int r_val = 0; if (A.stride_0() == 1 && B.stride_0() == 1) { mkl_dtrsm_compact(MKL_COL_MAJOR, MKL_RIGHT, MKL_UPPER, MKL_NOTRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_1(), - (double *)B.data(), B.stride_1(), format, - (MKL_INT)vector_type::vector_length); + ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, alpha, (const double *)A.data(), + A.stride_1(), (double *)B.data(), B.stride_1(), format, (MKL_INT)vector_type::vector_length); } else if (A.stride_1() == 1 && B.stride_1() == 1) { mkl_dtrsm_compact(MKL_ROW_MAJOR, MKL_RIGHT, MKL_UPPER, MKL_NOTRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_0(), - (double *)B.data(), B.stride_0(), format, - (MKL_INT)vector_type::vector_length); + ArgDiag::use_unit_diag ? 
MKL_UNIT : MKL_NONUNIT, m, n, alpha, (const double *)A.data(), + A.stride_0(), (double *)B.data(), B.stride_0(), format, (MKL_INT)vector_type::vector_length); } else { r_val = -1; } @@ -151,54 +123,42 @@ struct SerialTrsm -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { - return SerialTrsmInternalLeftLower::invoke( - ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride_1(), A.stride_0(), B.data(), B.stride_1(), B.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { + return SerialTrsmInternalLeftLower::invoke(ArgDiag::use_unit_diag, B.extent(1), B.extent(0), + alpha, A.data(), A.stride_1(), A.stride_0(), + B.data(), B.stride_1(), B.stride_0()); } }; template -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { - return SerialTrsmInternalLeftLower::invoke( - ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride_1(), A.stride_0(), B.data(), B.stride_1(), B.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { + return SerialTrsmInternalLeftLower::invoke(ArgDiag::use_unit_diag, B.extent(1), B.extent(0), + alpha, A.data(), A.stride_1(), A.stride_0(), + B.data(), B.stride_1(), B.stride_0()); } }; template -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { - return SerialTrsmInternalLeftLower::invoke( - ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride_0(), A.stride_1(), B.data(), B.stride_1(), B.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { + return 
SerialTrsmInternalLeftLower::invoke(ArgDiag::use_unit_diag, B.extent(1), B.extent(0), + alpha, A.data(), A.stride_0(), A.stride_1(), + B.data(), B.stride_1(), B.stride_0()); } }; template -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { - return SerialTrsmInternalLeftLower::invoke( - ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride_0(), A.stride_1(), B.data(), B.stride_1(), B.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { + return SerialTrsmInternalLeftLower::invoke(ArgDiag::use_unit_diag, B.extent(1), B.extent(0), + alpha, A.data(), A.stride_0(), A.stride_1(), + B.data(), B.stride_1(), B.stride_0()); } }; @@ -207,43 +167,32 @@ struct SerialTrsm -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { typedef typename BViewType::value_type vector_type; // typedef typename vector_type::value_type value_type; const int m = B.extent(0), n = B.extent(1); - static_assert(is_vector::value, - "value type is not vector type"); - static_assert( - vector_type::vector_length == 4 || vector_type::vector_length == 8, - "AVX, AVX2 and AVX512 is supported"); - const MKL_COMPACT_PACK format = - vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + static_assert(is_vector::value, "value type is not vector type"); + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? 
MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; // no error check int r_val = 0; if (A.stride_0() == 1 && B.stride_0() == 1) { mkl_dtrsm_compact(MKL_COL_MAJOR, MKL_LEFT, MKL_UPPER, MKL_NOTRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_1(), - (double *)B.data(), B.stride_1(), format, - (MKL_INT)vector_type::vector_length); + ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, alpha, (const double *)A.data(), + A.stride_1(), (double *)B.data(), B.stride_1(), format, (MKL_INT)vector_type::vector_length); } else if (A.stride_1() == 1 && B.stride_1() == 1) { mkl_dtrsm_compact(MKL_ROW_MAJOR, MKL_LEFT, MKL_UPPER, MKL_NOTRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_0(), - (double *)B.data(), B.stride_0(), format, - (MKL_INT)vector_type::vector_length); + ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, alpha, (const double *)A.data(), + A.stride_0(), (double *)B.data(), B.stride_0(), format, (MKL_INT)vector_type::vector_length); } else { r_val = -1; } @@ -253,28 +202,22 @@ struct SerialTrsm -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { - return SerialTrsmInternalLeftUpper::invoke( - ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { + return SerialTrsmInternalLeftUpper::invoke(ArgDiag::use_unit_diag, B.extent(0), B.extent(1), + alpha, A.data(), A.stride_0(), A.stride_1(), + B.data(), B.stride_0(), B.stride_1()); } }; template -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { - return SerialTrsmInternalLeftUpper::invoke( - 
ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { + return SerialTrsmInternalLeftUpper::invoke(ArgDiag::use_unit_diag, B.extent(0), B.extent(1), + alpha, A.data(), A.stride_0(), A.stride_1(), + B.data(), B.stride_0(), B.stride_1()); } }; @@ -284,42 +227,31 @@ struct SerialTrsm -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { typedef typename BViewType::value_type vector_type; // typedef typename vector_type::value_type value_type; const int m = B.extent(0), n = B.extent(1); - static_assert(is_vector::value, - "value type is not vector type"); - static_assert( - vector_type::vector_length == 4 || vector_type::vector_length == 8, - "AVX, AVX2 and AVX512 is supported"); - const MKL_COMPACT_PACK format = - vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + static_assert(is_vector::value, "value type is not vector type"); + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; // no error check int r_val = 0; if (A.stride_0() == 1 && B.stride_0() == 1) { - mkl_dtrsm_compact(MKL_COL_MAJOR, MKL_LEFT, MKL_LOWER, MKL_TRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_1(), - (double *)B.data(), B.stride_1(), format, + mkl_dtrsm_compact(MKL_COL_MAJOR, MKL_LEFT, MKL_LOWER, MKL_TRANS, ArgDiag::use_unit_diag ? 
MKL_UNIT : MKL_NONUNIT, + m, n, alpha, (const double *)A.data(), A.stride_1(), (double *)B.data(), B.stride_1(), format, (MKL_INT)vector_type::vector_length); } else if (A.stride_1() == 1 && B.stride_1() == 1) { - mkl_dtrsm_compact(MKL_ROW_MAJOR, MKL_LEFT, MKL_LOWER, MKL_TRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_0(), - (double *)B.data(), B.stride_0(), format, + mkl_dtrsm_compact(MKL_ROW_MAJOR, MKL_LEFT, MKL_LOWER, MKL_TRANS, ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, + m, n, alpha, (const double *)A.data(), A.stride_0(), (double *)B.data(), B.stride_0(), format, (MKL_INT)vector_type::vector_length); } else { r_val = -1; @@ -330,28 +262,22 @@ struct SerialTrsm -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { - return SerialTrsmInternalLeftUpper::invoke( - ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { + return SerialTrsmInternalLeftUpper::invoke(ArgDiag::use_unit_diag, B.extent(0), B.extent(1), + alpha, A.data(), A.stride_1(), A.stride_0(), + B.data(), B.stride_0(), B.stride_1()); } }; template -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { - return SerialTrsmInternalLeftUpper::invoke( - ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { + return SerialTrsmInternalLeftUpper::invoke(ArgDiag::use_unit_diag, B.extent(0), B.extent(1), + alpha, A.data(), A.stride_1(), A.stride_0(), + 
B.data(), B.stride_0(), B.stride_1()); } }; /// @@ -359,42 +285,31 @@ struct SerialTrsm -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { typedef typename BViewType::value_type vector_type; // typedef typename vector_type::value_type value_type; const int m = B.extent(0), n = B.extent(1); - static_assert(is_vector::value, - "value type is not vector type"); - static_assert( - vector_type::vector_length == 4 || vector_type::vector_length == 8, - "AVX, AVX2 and AVX512 is supported"); - const MKL_COMPACT_PACK format = - vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + static_assert(is_vector::value, "value type is not vector type"); + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; // no error check int r_val = 0; if (A.stride_0() == 1 && B.stride_0() == 1) { - mkl_dtrsm_compact(MKL_COL_MAJOR, MKL_LEFT, MKL_UPPER, MKL_TRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_1(), - (double *)B.data(), B.stride_1(), format, + mkl_dtrsm_compact(MKL_COL_MAJOR, MKL_LEFT, MKL_UPPER, MKL_TRANS, ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, + m, n, alpha, (const double *)A.data(), A.stride_1(), (double *)B.data(), B.stride_1(), format, (MKL_INT)vector_type::vector_length); } else if (A.stride_1() == 1 && B.stride_1() == 1) { - mkl_dtrsm_compact(MKL_ROW_MAJOR, MKL_LEFT, MKL_UPPER, MKL_TRANS, - ArgDiag::use_unit_diag ? 
MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_0(), - (double *)B.data(), B.stride_0(), format, + mkl_dtrsm_compact(MKL_ROW_MAJOR, MKL_LEFT, MKL_UPPER, MKL_TRANS, ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, + m, n, alpha, (const double *)A.data(), A.stride_0(), (double *)B.data(), B.stride_0(), format, (MKL_INT)vector_type::vector_length); } else { r_val = -1; @@ -405,28 +320,22 @@ struct SerialTrsm -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { - return SerialTrsmInternalLeftLower::invoke( - ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { + return SerialTrsmInternalLeftLower::invoke(ArgDiag::use_unit_diag, B.extent(0), B.extent(1), + alpha, A.data(), A.stride_1(), A.stride_0(), + B.data(), B.stride_0(), B.stride_1()); } }; template -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { - return SerialTrsmInternalLeftLower::invoke( - ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { + return SerialTrsmInternalLeftLower::invoke(ArgDiag::use_unit_diag, B.extent(0), B.extent(1), + alpha, A.data(), A.stride_1(), A.stride_0(), + B.data(), B.stride_0(), B.stride_1()); } }; diff --git a/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp index a44943e5d6..0e65d269f0 100644 --- a/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp +++ 
b/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp @@ -34,40 +34,31 @@ namespace KokkosBatched { template struct SerialTrsmInternalLeftLower { template - KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, - const int m, const int n, - const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT B, - const int bs0, const int bs1); + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, const int m, const int n, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); }; template <> template -KOKKOS_INLINE_FUNCTION int -SerialTrsmInternalLeftLower::invoke( - const bool use_unit_diag, const int m, const int n, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int SerialTrsmInternalLeftLower::invoke( + const bool use_unit_diag, const int m, const int n, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { const ScalarType one(1.0), zero(0.0); if (alpha == zero) KokkosBlas::Impl::SerialSetInternal::invoke(m, n, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); + if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; for (int p = 0; p < m; ++p) { const int iend = m - p - 1, jend = n; - const ValueType *KOKKOS_RESTRICT a21 = - iend ? A + (p + 1) * as0 + p * as1 : NULL; + const ValueType *KOKKOS_RESTRICT a21 = iend ? A + (p + 1) * as0 + p * as1 : NULL; - ValueType *KOKKOS_RESTRICT b1t = B + p * bs0, - *KOKKOS_RESTRICT B2 = - iend ? B + (p + 1) * bs0 : NULL; + ValueType *KOKKOS_RESTRICT b1t = B + p * bs0, *KOKKOS_RESTRICT B2 = iend ? 
B + (p + 1) * bs0 : NULL; if (!use_unit_diag) { const ValueType alpha11 = A[p * as0 + p * as1]; @@ -83,8 +74,7 @@ SerialTrsmInternalLeftLower::invoke( #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif - for (int j = 0; j < jend; ++j) - B2[i * bs0 + j * bs1] -= a21[i * as0] * b1t[j * bs1]; + for (int j = 0; j < jend; ++j) B2[i * bs0 + j * bs1] -= a21[i * as0] * b1t[j * bs1]; } } return 0; @@ -92,10 +82,9 @@ SerialTrsmInternalLeftLower::invoke( template <> template -KOKKOS_INLINE_FUNCTION int -SerialTrsmInternalLeftLower::invoke( - const bool use_unit_diag, const int m, const int n, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int SerialTrsmInternalLeftLower::invoke( + const bool use_unit_diag, const int m, const int n, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { constexpr int mbAlgo = Algo::Trsm::Blocked::mb(); @@ -104,16 +93,14 @@ SerialTrsmInternalLeftLower::invoke( if (alpha == zero) KokkosBlas::Impl::SerialSetInternal::invoke(m, n, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); + if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; InnerTrsmLeftLowerUnitDiag trsm_u(as0, as1, bs0, bs1); InnerTrsmLeftLowerNonUnitDiag trsm_n(as0, as1, bs0, bs1); InnerGemmFixA gemm(as0, as1, bs0, bs1, bs0, bs1); - auto trsm = [&](const int ib, const int jb, - const ValueType *KOKKOS_RESTRICT AA, + auto trsm = [&](const int ib, const int jb, const ValueType *KOKKOS_RESTRICT AA, /**/ ValueType *KOKKOS_RESTRICT BB) { const int mb = mbAlgo; for (int p = 0; p < ib; p += mb) { @@ -121,7 +108,7 @@ SerialTrsmInternalLeftLower::invoke( // trsm update const ValueType *KOKKOS_RESTRICT Ap = AA + p * as0 + p * as1; - /**/ ValueType *KOKKOS_RESTRICT Bp = BB + 
p * bs0; + /**/ ValueType *KOKKOS_RESTRICT Bp = BB + p * bs0; if (use_unit_diag) trsm_u.serial_invoke(Ap, pb, jb, Bp); @@ -131,8 +118,7 @@ SerialTrsmInternalLeftLower::invoke( // gemm update for (int i = p + mb; i < ib; i += mb) { const int mm = (i + mb) > ib ? (ib - i) : mb; - gemm.serial_invoke(minus_one, AA + i * as0 + p * as1, BB + p * bs0, - mm, jb, pb, BB + i * bs0); + gemm.serial_invoke(minus_one, AA + i * as0 + p * as1, BB + p * bs0, mm, jb, pb, BB + i * bs0); } } }; @@ -151,29 +137,23 @@ SerialTrsmInternalLeftLower::invoke( template struct SerialTrsmInternalLeftUpper { template - KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, - const int m, const int n, - const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT B, - const int bs0, const int bs1); + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, const int m, const int n, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); }; template <> template -KOKKOS_INLINE_FUNCTION int -SerialTrsmInternalLeftUpper::invoke( - const bool use_unit_diag, const int m, const int n, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int SerialTrsmInternalLeftUpper::invoke( + const bool use_unit_diag, const int m, const int n, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { const ScalarType one(1.0), zero(0.0); if (alpha == zero) KokkosBlas::Impl::SerialSetInternal::invoke(m, n, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); + if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; ValueType 
*KOKKOS_RESTRICT B0 = B; @@ -199,8 +179,7 @@ SerialTrsmInternalLeftUpper::invoke( #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif - for (int j = 0; j < jend; ++j) - B0[i * bs0 + j * bs1] -= a01[i * as0] * b1t[j * bs1]; + for (int j = 0; j < jend; ++j) B0[i * bs0 + j * bs1] -= a01[i * as0] * b1t[j * bs1]; } } } @@ -209,10 +188,9 @@ SerialTrsmInternalLeftUpper::invoke( template <> template -KOKKOS_INLINE_FUNCTION int -SerialTrsmInternalLeftUpper::invoke( - const bool use_unit_diag, const int m, const int n, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int SerialTrsmInternalLeftUpper::invoke( + const bool use_unit_diag, const int m, const int n, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { const ScalarType one(1.0), zero(0.0), minus_one(-1.0); @@ -221,8 +199,7 @@ SerialTrsmInternalLeftUpper::invoke( if (alpha == zero) KokkosBlas::Impl::SerialSetInternal::invoke(m, n, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); + if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; InnerTrsmLeftUpperUnitDiag trsm_u(as0, as1, bs0, bs1); @@ -230,17 +207,15 @@ SerialTrsmInternalLeftUpper::invoke( InnerGemmFixA gemm(as0, as1, bs0, bs1, bs0, bs1); - auto trsm = [&](const int ib, const int jb, - const ValueType *KOKKOS_RESTRICT AA, + auto trsm = [&](const int ib, const int jb, const ValueType *KOKKOS_RESTRICT AA, /**/ ValueType *KOKKOS_RESTRICT BB) { const int mb = mbAlgo; for (int pp = 0; pp < ib; pp += mb) { - const int ptmp = ib - pp - mb, p = ptmp < 0 ? 0 : ptmp, - pb = mb + (ptmp < 0) * ptmp; + const int ptmp = ib - pp - mb, p = ptmp < 0 ? 
0 : ptmp, pb = mb + (ptmp < 0) * ptmp; // trsm update const ValueType *KOKKOS_RESTRICT Ap = AA + p * as0 + p * as1; - /**/ ValueType *KOKKOS_RESTRICT Bp = BB + p * bs0; + /**/ ValueType *KOKKOS_RESTRICT Bp = BB + p * bs0; if (use_unit_diag) trsm_u.serial_invoke(Ap, pb, jb, Bp); @@ -249,8 +224,7 @@ SerialTrsmInternalLeftUpper::invoke( // gemm update for (int i = 0; i < p; i += mb) { - gemm.serial_invoke(minus_one, AA + i * as0 + p * as1, Bp, - (i + mb) > p ? (p - i) : mb, jb, pb, BB + i * bs0); + gemm.serial_invoke(minus_one, AA + i * as0 + p * as1, Bp, (i + mb) > p ? (p - i) : mb, jb, pb, BB + i * bs0); } } }; diff --git a/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Impl.hpp index dbaba7fc6c..145f8e0c2d 100644 --- a/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Impl.hpp @@ -34,17 +34,13 @@ namespace KokkosBatched { /// A(m x m), B(m x n) template -struct TeamVectorTrsm { +struct TeamVectorTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { return TeamVectorTrsmInternalLeftLower::invoke( - member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, - A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_0(), - B.stride_1()); + member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), A.stride_0(), A.stride_1(), B.data(), + B.stride_0(), B.stride_1()); } }; @@ -55,17 +51,13 @@ struct TeamVectorTrsm -struct TeamVectorTrsm { +struct TeamVectorTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) 
{ return TeamVectorTrsmInternalLeftLower::invoke( - member, ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, - A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_1(), - B.stride_0()); + member, ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), A.stride_1(), A.stride_0(), B.data(), + B.stride_1(), B.stride_0()); } }; @@ -76,17 +68,13 @@ struct TeamVectorTrsm -struct TeamVectorTrsm { +struct TeamVectorTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { return TeamVectorTrsmInternalLeftUpper::invoke( - member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, - A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_0(), - B.stride_1()); + member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), A.stride_0(), A.stride_1(), B.data(), + B.stride_0(), B.stride_1()); } }; @@ -97,17 +85,13 @@ struct TeamVectorTrsm -struct TeamVectorTrsm { +struct TeamVectorTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { return TeamVectorTrsmInternalLeftUpper::invoke( - member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, - A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_0(), - B.stride_1()); + member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), A.stride_1(), A.stride_0(), B.data(), + B.stride_0(), B.stride_1()); } }; @@ -118,17 +102,13 @@ struct TeamVectorTrsm -struct TeamVectorTrsm { +struct TeamVectorTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION 
static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { return TeamVectorTrsmInternalLeftLower::invoke( - member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, - A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_0(), - B.stride_1()); + member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), A.stride_1(), A.stride_0(), B.data(), + B.stride_0(), B.stride_1()); } }; diff --git a/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp index 3ee13f0b80..c1781a001c 100644 --- a/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp @@ -32,30 +32,24 @@ namespace KokkosBatched { template struct TeamVectorTrsmInternalLeftLower { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const int n, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const bool use_unit_diag, const int m, const int n, + const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, + /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); }; template <> template -KOKKOS_INLINE_FUNCTION int -TeamVectorTrsmInternalLeftLower::invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const int n, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int TeamVectorTrsmInternalLeftLower::invoke( + const MemberType &member, const bool use_unit_diag, const int m, const int n, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int 
bs0, const int bs1) { const ScalarType one(1.0), zero(0.0); if (alpha == zero) - KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, n, zero, B, bs0, - bs1); + KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, n, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, alpha, B, - bs0, bs1); + if (alpha != one) KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; for (int p = 0; p < m; ++p) { @@ -63,29 +57,23 @@ TeamVectorTrsmInternalLeftLower::invoke( int iend = m - p - 1; int jend = n; - const ValueType *KOKKOS_RESTRICT a21 = - iend ? A + (p + 1) * as0 + p * as1 : NULL; + const ValueType *KOKKOS_RESTRICT a21 = iend ? A + (p + 1) * as0 + p * as1 : NULL; - ValueType *KOKKOS_RESTRICT b1t = B + p * bs0, - *KOKKOS_RESTRICT B2 = - iend ? B + (p + 1) * bs0 : NULL; + ValueType *KOKKOS_RESTRICT b1t = B + p * bs0, *KOKKOS_RESTRICT B2 = iend ? B + (p + 1) * bs0 : NULL; member.team_barrier(); if (!use_unit_diag) { const ValueType alpha11 = A[p * as0 + p * as1]; - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, jend), - [&](const int &j) { b1t[j * bs1] = b1t[j * bs1] / alpha11; }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, jend), + [&](const int &j) { b1t[j * bs1] = b1t[j * bs1] / alpha11; }); member.team_barrier(); } - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, iend), [&](const int &i) { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, jend), [&](const int &j) { - // assume layout right for batched computation - B2[i * bs0 + j * bs1] -= a21[i * as0] * b1t[j * bs1]; - }); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, iend), [&](const int &i) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, jend), [&](const int &j) { + // assume layout right for batched computation + B2[i * bs0 + j * bs1] -= a21[i * as0] * b1t[j * bs1]; + }); + }); } } return 0; @@ -94,31 +82,25 @@ 
TeamVectorTrsmInternalLeftLower::invoke( template struct TeamVectorTrsmInternalLeftUpper { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const int n, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const bool use_unit_diag, const int m, const int n, + const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, + /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); }; template <> template -KOKKOS_INLINE_FUNCTION int -TeamVectorTrsmInternalLeftUpper::invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const int n, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int TeamVectorTrsmInternalLeftUpper::invoke( + const MemberType &member, const bool use_unit_diag, const int m, const int n, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { const ScalarType one(1.0), zero(0.0); // note that parallel range is different ( m*n vs m-1*n); if (alpha == zero) - KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, n, zero, B, bs0, - bs1); + KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, n, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, alpha, B, - bs0, bs1); + if (alpha != one) KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; ValueType *KOKKOS_RESTRICT B0 = B; @@ -128,24 +110,20 @@ TeamVectorTrsmInternalLeftUpper::invoke( int jend = n; const ValueType *KOKKOS_RESTRICT a01 = A + p * as1; - /**/ ValueType *KOKKOS_RESTRICT b1t = B + p * bs0; 
+ /**/ ValueType *KOKKOS_RESTRICT b1t = B + p * bs0; member.team_barrier(); if (!use_unit_diag) { const ValueType alpha11 = A[p * as0 + p * as1]; - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, jend), - [&](const int &j) { b1t[j * bs1] = b1t[j * bs1] / alpha11; }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, jend), + [&](const int &j) { b1t[j * bs1] = b1t[j * bs1] / alpha11; }); member.team_barrier(); } - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, iend), [&](const int &i) { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, jend), [&](const int &j) { - B0[i * bs0 + j * bs1] -= a01[i * as0] * b1t[j * bs1]; - }); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, iend), [&](const int &i) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, jend), + [&](const int &j) { B0[i * bs0 + j * bs1] -= a01[i * as0] * b1t[j * bs1]; }); + }); } } return 0; diff --git a/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp b/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp index 9f5f857e44..371dbb483c 100644 --- a/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp @@ -34,32 +34,24 @@ namespace KokkosBatched { /// A(m x m), B(m x n) template -struct TeamTrsm { +struct TeamTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { - return TeamTrsmInternalLeftLower::invoke( - member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, - A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_0(), - B.stride_1()); + return TeamTrsmInternalLeftLower::invoke(member, ArgDiag::use_unit_diag, B.extent(0), + B.extent(1), alpha, A.data(), A.stride_0(), + A.stride_1(), B.data(), B.stride_0(), B.stride_1()); } }; template -struct TeamTrsm { +struct TeamTrsm 
{ template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { - return TeamTrsmInternalLeftLower::invoke( - member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, - A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_0(), - B.stride_1()); + return TeamTrsmInternalLeftLower::invoke(member, ArgDiag::use_unit_diag, B.extent(0), + B.extent(1), alpha, A.data(), A.stride_0(), + A.stride_1(), B.data(), B.stride_0(), B.stride_1()); } }; @@ -70,32 +62,24 @@ struct TeamTrsm -struct TeamTrsm { +struct TeamTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { - return TeamTrsmInternalLeftLower::invoke( - member, ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, - A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_1(), - B.stride_0()); + return TeamTrsmInternalLeftLower::invoke(member, ArgDiag::use_unit_diag, B.extent(1), + B.extent(0), alpha, A.data(), A.stride_1(), + A.stride_0(), B.data(), B.stride_1(), B.stride_0()); } }; template -struct TeamTrsm { +struct TeamTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { - return TeamTrsmInternalLeftLower::invoke( - member, ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, - A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_1(), - B.stride_0()); + return TeamTrsmInternalLeftLower::invoke(member, ArgDiag::use_unit_diag, B.extent(1), + B.extent(0), alpha, A.data(), 
A.stride_1(), + A.stride_0(), B.data(), B.stride_1(), B.stride_0()); } }; @@ -106,32 +90,24 @@ struct TeamTrsm -struct TeamTrsm { +struct TeamTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { - return TeamTrsmInternalLeftUpper::invoke( - member, ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, - A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_1(), - B.stride_0()); + return TeamTrsmInternalLeftUpper::invoke(member, ArgDiag::use_unit_diag, B.extent(1), + B.extent(0), alpha, A.data(), A.stride_1(), + A.stride_0(), B.data(), B.stride_1(), B.stride_0()); } }; template -struct TeamTrsm { +struct TeamTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { - return TeamTrsmInternalLeftUpper::invoke( - member, ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, - A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_1(), - B.stride_0()); + return TeamTrsmInternalLeftUpper::invoke(member, ArgDiag::use_unit_diag, B.extent(1), + B.extent(0), alpha, A.data(), A.stride_1(), + A.stride_0(), B.data(), B.stride_1(), B.stride_0()); } }; @@ -142,32 +118,24 @@ struct TeamTrsm -struct TeamTrsm { +struct TeamTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { - return TeamTrsmInternalLeftUpper::invoke( - member, ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, - A.data(), A.stride_0(), A.stride_1(), B.data(), 
B.stride_1(), - B.stride_0()); + return TeamTrsmInternalLeftUpper::invoke(member, ArgDiag::use_unit_diag, B.extent(1), + B.extent(0), alpha, A.data(), A.stride_0(), + A.stride_1(), B.data(), B.stride_1(), B.stride_0()); } }; template -struct TeamTrsm { +struct TeamTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { - return TeamTrsmInternalLeftUpper::invoke( - member, ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, - A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_1(), - B.stride_0()); + return TeamTrsmInternalLeftUpper::invoke(member, ArgDiag::use_unit_diag, B.extent(1), + B.extent(0), alpha, A.data(), A.stride_0(), + A.stride_1(), B.data(), B.stride_1(), B.stride_0()); } }; @@ -178,32 +146,24 @@ struct TeamTrsm -struct TeamTrsm { +struct TeamTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { - return TeamTrsmInternalLeftUpper::invoke( - member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, - A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_0(), - B.stride_1()); + return TeamTrsmInternalLeftUpper::invoke(member, ArgDiag::use_unit_diag, B.extent(0), + B.extent(1), alpha, A.data(), A.stride_0(), + A.stride_1(), B.data(), B.stride_0(), B.stride_1()); } }; template -struct TeamTrsm { +struct TeamTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { - return TeamTrsmInternalLeftUpper::invoke( - 
member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, - A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_0(), - B.stride_1()); + return TeamTrsmInternalLeftUpper::invoke(member, ArgDiag::use_unit_diag, B.extent(0), + B.extent(1), alpha, A.data(), A.stride_0(), + A.stride_1(), B.data(), B.stride_0(), B.stride_1()); } }; @@ -214,32 +174,24 @@ struct TeamTrsm -struct TeamTrsm { +struct TeamTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { - return TeamTrsmInternalLeftUpper::invoke( - member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, - A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_0(), - B.stride_1()); + return TeamTrsmInternalLeftUpper::invoke(member, ArgDiag::use_unit_diag, B.extent(0), + B.extent(1), alpha, A.data(), A.stride_1(), + A.stride_0(), B.data(), B.stride_0(), B.stride_1()); } }; template -struct TeamTrsm { +struct TeamTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { - return TeamTrsmInternalLeftUpper::invoke( - member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, - A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_0(), - B.stride_1()); + return TeamTrsmInternalLeftUpper::invoke(member, ArgDiag::use_unit_diag, B.extent(0), + B.extent(1), alpha, A.data(), A.stride_1(), + A.stride_0(), B.data(), B.stride_0(), B.stride_1()); } }; @@ -250,32 +202,24 @@ struct TeamTrsm -struct TeamTrsm { +struct TeamTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int 
invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { - return TeamTrsmInternalLeftLower::invoke( - member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, - A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_0(), - B.stride_1()); + return TeamTrsmInternalLeftLower::invoke(member, ArgDiag::use_unit_diag, B.extent(0), + B.extent(1), alpha, A.data(), A.stride_1(), + A.stride_0(), B.data(), B.stride_0(), B.stride_1()); } }; template -struct TeamTrsm { +struct TeamTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { - return TeamTrsmInternalLeftLower::invoke( - member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, - A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_0(), - B.stride_1()); + return TeamTrsmInternalLeftLower::invoke(member, ArgDiag::use_unit_diag, B.extent(0), + B.extent(1), alpha, A.data(), A.stride_1(), + A.stride_0(), B.data(), B.stride_0(), B.stride_1()); } }; diff --git a/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp b/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp index a880186ae9..a1a7062809 100644 --- a/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp @@ -35,29 +35,24 @@ namespace KokkosBatched { template struct TeamTrsmInternalLeftLower { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const int n, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const bool use_unit_diag, const int m, const int n, + const ScalarType alpha, 
const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, + /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); }; template <> template -KOKKOS_INLINE_FUNCTION int -TeamTrsmInternalLeftLower::invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const int n, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int TeamTrsmInternalLeftLower::invoke( + const MemberType &member, const bool use_unit_diag, const int m, const int n, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { const ScalarType one(1.0), zero(0.0); if (alpha == zero) KokkosBlas::Impl::TeamSetInternal::invoke(member, m, n, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, - bs1); + if (alpha != one) KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; for (int p = 0; p < m; ++p) { @@ -65,27 +60,22 @@ TeamTrsmInternalLeftLower::invoke( int iend = m - p - 1; int jend = n; - const ValueType *KOKKOS_RESTRICT a21 = - iend ? A + (p + 1) * as0 + p * as1 : NULL; + const ValueType *KOKKOS_RESTRICT a21 = iend ? A + (p + 1) * as0 + p * as1 : NULL; - ValueType *KOKKOS_RESTRICT b1t = B + p * bs0, - *KOKKOS_RESTRICT B2 = - iend ? B + (p + 1) * bs0 : NULL; + ValueType *KOKKOS_RESTRICT b1t = B + p * bs0, *KOKKOS_RESTRICT B2 = iend ? 
B + (p + 1) * bs0 : NULL; member.team_barrier(); if (!use_unit_diag) { const ValueType alpha11 = A[p * as0 + p * as1]; - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, jend), - [&](const int &j) { b1t[j * bs1] = b1t[j * bs1] / alpha11; }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, jend), + [&](const int &j) { b1t[j * bs1] = b1t[j * bs1] / alpha11; }); member.team_barrier(); } - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, iend * jend), [&](const int &ij) { - // assume layout right for batched computation - const int i = ij / jend, j = ij % jend; - B2[i * bs0 + j * bs1] -= a21[i * as0] * b1t[j * bs1]; - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, iend * jend), [&](const int &ij) { + // assume layout right for batched computation + const int i = ij / jend, j = ij % jend; + B2[i * bs0 + j * bs1] -= a21[i * as0] * b1t[j * bs1]; + }); } } return 0; @@ -93,11 +83,9 @@ TeamTrsmInternalLeftLower::invoke( template <> template -KOKKOS_INLINE_FUNCTION int -TeamTrsmInternalLeftLower::invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const int n, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int TeamTrsmInternalLeftLower::invoke( + const MemberType &member, const bool use_unit_diag, const int m, const int n, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { constexpr int mbAlgo = Algo::Trsm::Blocked::mb(); @@ -107,9 +95,7 @@ TeamTrsmInternalLeftLower::invoke( if (alpha == zero) KokkosBlas::Impl::TeamSetInternal::invoke(member, m, n, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, - bs1); + if (alpha != one) KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; /// @@ -120,8 +106,7 @@ 
TeamTrsmInternalLeftLower::invoke( InnerTrsmLeftLowerUnitDiag trsm_u(as0, as1, bs0, bs1); InnerTrsmLeftLowerNonUnitDiag trsm_n(as0, as1, bs0, bs1); - auto trsm = [&](const int ib, const int jb, - const ValueType *KOKKOS_RESTRICT AA, + auto trsm = [&](const int ib, const int jb, const ValueType *KOKKOS_RESTRICT AA, /**/ ValueType *KOKKOS_RESTRICT BB) { const int mb = mbAlgo; const int tsize = member.team_size(); @@ -134,25 +119,22 @@ TeamTrsmInternalLeftLower::invoke( // trsm update const ValueType *KOKKOS_RESTRICT Ap = AA + p * as0 + p * as1; - /**/ ValueType *KOKKOS_RESTRICT Bp = BB + p * bs0; + /**/ ValueType *KOKKOS_RESTRICT Bp = BB + p * bs0; member.team_barrier(); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, (jb / nb) + (np > 0)), - [&](const int jj) { - // Made this non-const in order to WORKAROUND issue #349 - int j = jj * nb, qb = (j + nb) > jb ? np : nb; - if (use_unit_diag) - trsm_u.serial_invoke(Ap, pb, qb, Bp + j * bs1); - else - trsm_n.serial_invoke(Ap, pb, qb, Bp + j * bs1); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, (jb / nb) + (np > 0)), [&](const int jj) { + // Made this non-const in order to WORKAROUND issue #349 + int j = jj * nb, qb = (j + nb) > jb ? 
np : nb; + if (use_unit_diag) + trsm_u.serial_invoke(Ap, pb, qb, Bp + j * bs1); + else + trsm_n.serial_invoke(Ap, pb, qb, Bp + j * bs1); + }); member.team_barrier(); // gemm update - TeamGemmInternal::invoke( - member, ib - p - pb, jb, pb, minus_one, Ap + pb * as0, as0, as1, Bp, - bs0, bs1, one, Bp + pb * bs0, bs0, bs1); + TeamGemmInternal::invoke(member, ib - p - pb, jb, pb, minus_one, Ap + pb * as0, as0, as1, + Bp, bs0, bs1, one, Bp + pb * bs0, bs0, bs1); } }; @@ -170,20 +152,17 @@ TeamTrsmInternalLeftLower::invoke( template struct TeamTrsmInternalLeftUpper { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const int n, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const bool use_unit_diag, const int m, const int n, + const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, + /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); }; template <> template -KOKKOS_INLINE_FUNCTION int -TeamTrsmInternalLeftUpper::invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const int n, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int TeamTrsmInternalLeftUpper::invoke( + const MemberType &member, const bool use_unit_diag, const int m, const int n, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { const ScalarType one(1.0), zero(0.0); @@ -191,9 +170,7 @@ TeamTrsmInternalLeftUpper::invoke( if (alpha == zero) KokkosBlas::Impl::TeamSetInternal::invoke(member, m, n, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, 
- bs1); + if (alpha != one) KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; ValueType *KOKKOS_RESTRICT B0 = B; @@ -203,30 +180,27 @@ TeamTrsmInternalLeftUpper::invoke( int jend = n; const ValueType *KOKKOS_RESTRICT a01 = A + p * as1; - /**/ ValueType *KOKKOS_RESTRICT b1t = B + p * bs0; + /**/ ValueType *KOKKOS_RESTRICT b1t = B + p * bs0; member.team_barrier(); if (!use_unit_diag) { const ValueType alpha11 = A[p * as0 + p * as1]; - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, jend), - [&](const int &j) { b1t[j * bs1] = b1t[j * bs1] / alpha11; }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, jend), + [&](const int &j) { b1t[j * bs1] = b1t[j * bs1] / alpha11; }); member.team_barrier(); } - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, iend * jend), [&](const int &ij) { - int i, j; - if (KokkosKernels::Impl::kk_is_gpu_exec_space< - typename MemberType::execution_space>()) { - i = ij % iend; - j = ij / iend; - } else { - i = ij / jend; - j = ij % jend; - } - B0[i * bs0 + j * bs1] -= a01[i * as0] * b1t[j * bs1]; - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, iend * jend), [&](const int &ij) { + int i, j; + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + i = ij % iend; + j = ij / iend; + } else { + i = ij / jend; + j = ij % jend; + } + B0[i * bs0 + j * bs1] -= a01[i * as0] * b1t[j * bs1]; + }); } } return 0; @@ -234,11 +208,9 @@ TeamTrsmInternalLeftUpper::invoke( template <> template -KOKKOS_INLINE_FUNCTION int -TeamTrsmInternalLeftUpper::invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const int n, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int TeamTrsmInternalLeftUpper::invoke( + const MemberType &member, const bool use_unit_diag, const int m, const int n, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, 
/**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { constexpr int mbAlgo = Algo::Trsm::Blocked::mb(); @@ -248,16 +220,13 @@ TeamTrsmInternalLeftUpper::invoke( if (alpha == zero) KokkosBlas::Impl::TeamSetInternal::invoke(member, m, n, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, - bs1); + if (alpha != one) KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; InnerTrsmLeftUpperUnitDiag trsm_u(as0, as1, bs0, bs1); InnerTrsmLeftUpperNonUnitDiag trsm_n(as0, as1, bs0, bs1); - auto trsm = [&](const int ib, const int jb, - const ValueType *KOKKOS_RESTRICT AA, + auto trsm = [&](const int ib, const int jb, const ValueType *KOKKOS_RESTRICT AA, /**/ ValueType *KOKKOS_RESTRICT BB) { const int mb = mbAlgo; //(ib <=5 ? ib : mbAlgo); const int tsize = member.team_size(); @@ -265,29 +234,25 @@ TeamTrsmInternalLeftUpper::invoke( int nb = (jb / tsize + jb % tsize > 0); int np = jb % nb; for (int pp = 0; pp < ib; pp += mb) { - const int ptmp = (ib - pp - mb), p = (ptmp < 0 ? 0 : ptmp), - pb = (mb + (ptmp < 0) * ptmp); + const int ptmp = (ib - pp - mb), p = (ptmp < 0 ? 0 : ptmp), pb = (mb + (ptmp < 0) * ptmp); // trsm update const ValueType *KOKKOS_RESTRICT Ap = AA + p * as0 + p * as1; - /**/ ValueType *KOKKOS_RESTRICT Bp = BB + p * bs0; + /**/ ValueType *KOKKOS_RESTRICT Bp = BB + p * bs0; member.team_barrier(); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, (jb / nb) + (np > 0)), - [&](const int &jj) { - const int j = jj * nb, qb = (j + nb) > jb ? np : nb; - if (use_unit_diag) - trsm_u.serial_invoke(Ap, pb, qb, Bp + j * bs1); - else - trsm_n.serial_invoke(Ap, pb, qb, Bp + j * bs1); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, (jb / nb) + (np > 0)), [&](const int &jj) { + const int j = jj * nb, qb = (j + nb) > jb ? 
np : nb; + if (use_unit_diag) + trsm_u.serial_invoke(Ap, pb, qb, Bp + j * bs1); + else + trsm_n.serial_invoke(Ap, pb, qb, Bp + j * bs1); + }); member.team_barrier(); // gemm update - TeamGemmInternal::invoke( - member, p, jb, pb, minus_one, Ap - p * as0, as0, as1, Bp, bs0, bs1, - one, BB, bs0, bs1); + TeamGemmInternal::invoke(member, p, jb, pb, minus_one, Ap - p * as0, as0, as1, Bp, bs0, + bs1, one, BB, bs0, bs1); } }; diff --git a/batched/dense/impl/KokkosBatched_Trsv_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Trsv_Serial_Impl.hpp index 0fc375a7b2..073970caa6 100644 --- a/batched/dense/impl/KokkosBatched_Trsv_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Trsv_Serial_Impl.hpp @@ -38,43 +38,32 @@ namespace KokkosBatched { /// L/NT /// -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && \ - defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template -struct SerialTrsv { +struct SerialTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const bViewType &b) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const bViewType &b) { typedef typename bViewType::value_type vector_type; // typedef typename vector_type::value_type value_type; const int m = b.extent(0), n = 1; - static_assert(is_vector::value, - "value type is not vector type"); - static_assert( - vector_type::vector_length == 4 || vector_type::vector_length == 8, - "AVX, AVX2 and AVX512 is supported"); - const MKL_COMPACT_PACK format = - vector_type::vector_length == 8 ? 
MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + static_assert(is_vector::value, "value type is not vector type"); + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; // no error check int r_val = 0; if (A.stride_0() == 1) { mkl_dtrsm_compact(MKL_COL_MAJOR, MKL_LEFT, MKL_LOWER, MKL_NOTRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_0(), - (double *)b.data(), b.stride_0(), format, - (MKL_INT)vector_type::vector_length); + ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, alpha, (const double *)A.data(), + A.stride_0(), (double *)b.data(), b.stride_0(), format, (MKL_INT)vector_type::vector_length); } else if (A.stride_1() == 1) { mkl_dtrsm_compact(MKL_ROW_MAJOR, MKL_LEFT, MKL_LOWER, MKL_NOTRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_0(), - (double *)b.data(), b.stride_0(), format, - (MKL_INT)vector_type::vector_length); + ArgDiag::use_unit_diag ? 
MKL_UNIT : MKL_NONUNIT, m, n, alpha, (const double *)A.data(), + A.stride_0(), (double *)b.data(), b.stride_0(), format, (MKL_INT)vector_type::vector_length); } else { r_val = -1; } @@ -84,28 +73,20 @@ struct SerialTrsv -struct SerialTrsv { +struct SerialTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const bViewType &b) { - return SerialTrsvInternalLower::invoke( - ArgDiag::use_unit_diag, A.extent(0), alpha, A.data(), A.stride_0(), - A.stride_1(), b.data(), b.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const bViewType &b) { + return SerialTrsvInternalLower::invoke(ArgDiag::use_unit_diag, A.extent(0), alpha, A.data(), + A.stride_0(), A.stride_1(), b.data(), b.stride_0()); } }; template -struct SerialTrsv { +struct SerialTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const bViewType &b) { - return SerialTrsvInternalLower::invoke( - ArgDiag::use_unit_diag, A.extent(0), alpha, A.data(), A.stride_0(), - A.stride_1(), b.data(), b.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const bViewType &b) { + return SerialTrsvInternalLower::invoke(ArgDiag::use_unit_diag, A.extent(0), alpha, A.data(), + A.stride_0(), A.stride_1(), b.data(), b.stride_0()); } }; @@ -113,42 +94,31 @@ struct SerialTrsv -struct SerialTrsv { +struct SerialTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const bViewType &b) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const bViewType &b) { typedef typename bViewType::value_type vector_type; // typedef typename vector_type::value_type value_type; const int m = b.extent(0), n = 1; - static_assert(is_vector::value, - "value type is not vector type"); - static_assert( - vector_type::vector_length == 4 || vector_type::vector_length == 
8, - "AVX, AVX2 and AVX512 is supported"); - const MKL_COMPACT_PACK format = - vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + static_assert(is_vector::value, "value type is not vector type"); + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; // no error check int r_val = 0; if (A.stride_0() == 1) { - mkl_dtrsm_compact(MKL_COL_MAJOR, MKL_LEFT, MKL_LOWER, MKL_TRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_0(), - (double *)b.data(), b.stride_0(), format, + mkl_dtrsm_compact(MKL_COL_MAJOR, MKL_LEFT, MKL_LOWER, MKL_TRANS, ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, + m, n, alpha, (const double *)A.data(), A.stride_0(), (double *)b.data(), b.stride_0(), format, (MKL_INT)vector_type::vector_length); } else if (A.stride_1() == 1) { - mkl_dtrsm_compact(MKL_ROW_MAJOR, MKL_LEFT, MKL_LOWER, MKL_TRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_0(), - (double *)b.data(), b.stride_0(), format, + mkl_dtrsm_compact(MKL_ROW_MAJOR, MKL_LEFT, MKL_LOWER, MKL_TRANS, ArgDiag::use_unit_diag ? 
MKL_UNIT : MKL_NONUNIT, + m, n, alpha, (const double *)A.data(), A.stride_0(), (double *)b.data(), b.stride_0(), format, (MKL_INT)vector_type::vector_length); } else { r_val = -1; @@ -159,27 +129,20 @@ struct SerialTrsv -struct SerialTrsv { +struct SerialTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const bViewType &b) { - return SerialTrsvInternalUpper::invoke( - ArgDiag::use_unit_diag, A.extent(1), alpha, A.data(), A.stride_1(), - A.stride_0(), b.data(), b.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const bViewType &b) { + return SerialTrsvInternalUpper::invoke(ArgDiag::use_unit_diag, A.extent(1), alpha, A.data(), + A.stride_1(), A.stride_0(), b.data(), b.stride_0()); } }; template struct SerialTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const bViewType &b) { - return SerialTrsvInternalUpper::invoke( - ArgDiag::use_unit_diag, A.extent(1), alpha, A.data(), A.stride_1(), - A.stride_0(), b.data(), b.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const bViewType &b) { + return SerialTrsvInternalUpper::invoke(ArgDiag::use_unit_diag, A.extent(1), alpha, A.data(), + A.stride_1(), A.stride_0(), b.data(), b.stride_0()); } }; @@ -187,43 +150,32 @@ struct SerialTrsv { /// U/NT /// -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && \ - defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template -struct SerialTrsv { +struct SerialTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const bViewType &b) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const bViewType &b) { typedef 
typename bViewType::value_type vector_type; // typedef typename vector_type::value_type value_type; const int m = b.extent(0), n = 1; - static_assert(is_vector::value, - "value type is not vector type"); - static_assert( - vector_type::vector_length == 4 || vector_type::vector_length == 8, - "AVX, AVX2 and AVX512 is supported"); - const MKL_COMPACT_PACK format = - vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + static_assert(is_vector::value, "value type is not vector type"); + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; // no error check int r_val = 0; if (A.stride_0() == 1) { mkl_dtrsm_compact(MKL_COL_MAJOR, MKL_LEFT, MKL_UPPER, MKL_NOTRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_0(), - (double *)b.data(), b.stride_0(), format, - (MKL_INT)vector_type::vector_length); + ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, alpha, (const double *)A.data(), + A.stride_0(), (double *)b.data(), b.stride_0(), format, (MKL_INT)vector_type::vector_length); } else if (A.stride_1() == 1) { mkl_dtrsm_compact(MKL_ROW_MAJOR, MKL_LEFT, MKL_UPPER, MKL_NOTRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_0(), - (double *)b.data(), b.stride_0(), format, - (MKL_INT)vector_type::vector_length); + ArgDiag::use_unit_diag ? 
MKL_UNIT : MKL_NONUNIT, m, n, alpha, (const double *)A.data(), + A.stride_0(), (double *)b.data(), b.stride_0(), format, (MKL_INT)vector_type::vector_length); } else { r_val = -1; } @@ -233,28 +185,20 @@ struct SerialTrsv -struct SerialTrsv { +struct SerialTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const bViewType &b) { - return SerialTrsvInternalUpper::invoke( - ArgDiag::use_unit_diag, A.extent(0), alpha, A.data(), A.stride_0(), - A.stride_1(), b.data(), b.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const bViewType &b) { + return SerialTrsvInternalUpper::invoke(ArgDiag::use_unit_diag, A.extent(0), alpha, A.data(), + A.stride_0(), A.stride_1(), b.data(), b.stride_0()); } }; template -struct SerialTrsv { +struct SerialTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const bViewType &b) { - return SerialTrsvInternalUpper::invoke( - ArgDiag::use_unit_diag, A.extent(0), alpha, A.data(), A.stride_0(), - A.stride_1(), b.data(), b.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const bViewType &b) { + return SerialTrsvInternalUpper::invoke(ArgDiag::use_unit_diag, A.extent(0), alpha, A.data(), + A.stride_0(), A.stride_1(), b.data(), b.stride_0()); } }; @@ -262,42 +206,31 @@ struct SerialTrsv -struct SerialTrsv { +struct SerialTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const bViewType &b) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const bViewType &b) { typedef typename bViewType::value_type vector_type; // typedef typename vector_type::value_type value_type; const int m = b.extent(0), n = 1; - static_assert(is_vector::value, - "value type is not vector type"); - static_assert( - vector_type::vector_length == 4 || vector_type::vector_length 
== 8, - "AVX, AVX2 and AVX512 is supported"); - const MKL_COMPACT_PACK format = - vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + static_assert(is_vector::value, "value type is not vector type"); + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; // no error check int r_val = 0; if (A.stride_0() == 1) { - mkl_dtrsm_compact(MKL_COL_MAJOR, MKL_LEFT, MKL_UPPER, MKL_TRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_0(), - (double *)b.data(), b.stride_0(), format, + mkl_dtrsm_compact(MKL_COL_MAJOR, MKL_LEFT, MKL_UPPER, MKL_TRANS, ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, + m, n, alpha, (const double *)A.data(), A.stride_0(), (double *)b.data(), b.stride_0(), format, (MKL_INT)vector_type::vector_length); } else if (A.stride_1() == 1) { - mkl_dtrsm_compact(MKL_ROW_MAJOR, MKL_LEFT, MKL_UPPER, MKL_TRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_0(), - (double *)b.data(), b.stride_0(), format, + mkl_dtrsm_compact(MKL_ROW_MAJOR, MKL_LEFT, MKL_UPPER, MKL_TRANS, ArgDiag::use_unit_diag ? 
MKL_UNIT : MKL_NONUNIT, + m, n, alpha, (const double *)A.data(), A.stride_0(), (double *)b.data(), b.stride_0(), format, (MKL_INT)vector_type::vector_length); } else { r_val = -1; @@ -308,27 +241,20 @@ struct SerialTrsv -struct SerialTrsv { +struct SerialTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const bViewType &b) { - return SerialTrsvInternalLower::invoke( - ArgDiag::use_unit_diag, A.extent(1), alpha, A.data(), A.stride_1(), - A.stride_0(), b.data(), b.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const bViewType &b) { + return SerialTrsvInternalLower::invoke(ArgDiag::use_unit_diag, A.extent(1), alpha, A.data(), + A.stride_1(), A.stride_0(), b.data(), b.stride_0()); } }; template struct SerialTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const bViewType &b) { - return SerialTrsvInternalLower::invoke( - ArgDiag::use_unit_diag, A.extent(1), alpha, A.data(), A.stride_1(), - A.stride_0(), b.data(), b.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const bViewType &b) { + return SerialTrsvInternalLower::invoke(ArgDiag::use_unit_diag, A.extent(1), alpha, A.data(), + A.stride_1(), A.stride_0(), b.data(), b.stride_0()); } }; diff --git a/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp index 3ae206cc09..43d95377d4 100644 --- a/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp @@ -38,39 +38,33 @@ namespace KokkosBatched { template struct SerialTrsvInternalLower { template - KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, - const int m, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT b, - const int bs0); + 
KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, const int m, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT b, const int bs0); }; template <> template -KOKKOS_INLINE_FUNCTION int -SerialTrsvInternalLower::invoke( - const bool use_unit_diag, const int m, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT b, const int bs0) { +KOKKOS_INLINE_FUNCTION int SerialTrsvInternalLower::invoke(const bool use_unit_diag, const int m, + const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT b, + const int bs0) { const ScalarType one(1.0), zero(0.0); if (alpha == zero) KokkosBlas::Impl::SerialSetInternal::invoke(m, zero, b, bs0); else { - if (alpha != one) - KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0); + if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0); if (m <= 0) return 0; for (int p = 0; p < m; ++p) { const int iend = m - p - 1; - const ValueType *KOKKOS_RESTRICT a21 = - iend ? A + (p + 1) * as0 + p * as1 : NULL; + const ValueType *KOKKOS_RESTRICT a21 = iend ? A + (p + 1) * as0 + p * as1 : NULL; - ValueType *KOKKOS_RESTRICT beta1 = b + p * bs0, - *KOKKOS_RESTRICT b2 = - iend ? beta1 + bs0 : NULL; + ValueType *KOKKOS_RESTRICT beta1 = b + p * bs0, *KOKKOS_RESTRICT b2 = iend ? 
beta1 + bs0 : NULL; // with KOKKOS_RESTRICT a compiler assumes that the pointer is not // accessed by others op(/=) uses this pointer and changes the associated @@ -85,10 +79,12 @@ SerialTrsvInternalLower::invoke( template <> template -KOKKOS_INLINE_FUNCTION int SerialTrsvInternalLower::invoke( - const bool use_unit_diag, const int m, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT b, const int bs0) { +KOKKOS_INLINE_FUNCTION int SerialTrsvInternalLower::invoke(const bool use_unit_diag, const int m, + const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT b, + const int bs0) { const ScalarType one(1.0), zero(0.0), minus_one(-1.0); constexpr int mbAlgo = Algo::Trsv::Blocked::mb(); @@ -96,8 +92,7 @@ KOKKOS_INLINE_FUNCTION int SerialTrsvInternalLower::invoke( if (alpha == zero) KokkosBlas::Impl::SerialSetInternal::invoke(m, zero, b, bs0); else { - if (alpha != one) - KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0); + if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0); if (m <= 0) return 0; /// case GPU: team size is large and blocksize (mb,nb) is small @@ -110,7 +105,7 @@ KOKKOS_INLINE_FUNCTION int SerialTrsvInternalLower::invoke( // trsm update const ValueType *KOKKOS_RESTRICT Ap = A + p * as0 + p * as1; - /**/ ValueType *KOKKOS_RESTRICT bp = b + p * bs0; + /**/ ValueType *KOKKOS_RESTRICT bp = b + p * bs0; if (use_unit_diag) trsm_u.serial_invoke(Ap, pb, 1, bp); @@ -118,9 +113,8 @@ KOKKOS_INLINE_FUNCTION int SerialTrsvInternalLower::invoke( trsm_n.serial_invoke(Ap, pb, 1, bp); // gemv update - KokkosBlas::Impl::SerialGemvInternal::invoke( - m - p - pb, pb, minus_one, Ap + pb * as0, as0, as1, bp, bs0, one, - bp + pb * bs0, bs0); + KokkosBlas::Impl::SerialGemvInternal::invoke(m - p - pb, pb, minus_one, Ap + pb * as0, as0, + as1, bp, bs0, one, bp + pb * bs0, bs0); } } return 0; 
@@ -133,36 +127,33 @@ KOKKOS_INLINE_FUNCTION int SerialTrsvInternalLower::invoke( template struct SerialTrsvInternalUpper { template - KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, - const int m, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT b, - const int bs0); + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, const int m, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT b, const int bs0); }; template <> template -KOKKOS_INLINE_FUNCTION int -SerialTrsvInternalUpper::invoke( - const bool use_unit_diag, const int m, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT b, const int bs0) { +KOKKOS_INLINE_FUNCTION int SerialTrsvInternalUpper::invoke(const bool use_unit_diag, const int m, + const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT b, + const int bs0) { const ScalarType one(1.0), zero(0.0); if (alpha == zero) KokkosBlas::Impl::SerialSetInternal::invoke(m, zero, b, bs0); else { - if (alpha != one) - KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0); + if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0); if (m <= 0) return 0; ValueType *KOKKOS_RESTRICT b0 = b; for (int p = (m - 1); p >= 0; --p) { const int iend = p; - const ValueType *KOKKOS_RESTRICT a01 = A + p * as1; - /**/ ValueType *KOKKOS_RESTRICT beta1 = b + p * bs0; + const ValueType *KOKKOS_RESTRICT a01 = A + p * as1; + /**/ ValueType *KOKKOS_RESTRICT beta1 = b + p * bs0; // with KOKKOS_RESTRICT a compiler assumes that the pointer is not // accessed by others op(/=) uses this pointer and changes the associated @@ -177,10 +168,12 @@ SerialTrsvInternalUpper::invoke( template <> template -KOKKOS_INLINE_FUNCTION int 
SerialTrsvInternalUpper::invoke( - const bool use_unit_diag, const int m, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT b, const int bs0) { +KOKKOS_INLINE_FUNCTION int SerialTrsvInternalUpper::invoke(const bool use_unit_diag, const int m, + const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT b, + const int bs0) { const ScalarType one(1.0), zero(0.0), minus_one(-1.0); constexpr int mbAlgo = Algo::Trsm::Blocked::mb(); @@ -189,8 +182,7 @@ KOKKOS_INLINE_FUNCTION int SerialTrsvInternalUpper::invoke( if (alpha == zero) KokkosBlas::Impl::SerialSetInternal::invoke(m, zero, b, bs0); else { - if (alpha != one) - KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0); + if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0); if (m <= 0) return 0; InnerTrsmLeftUpperUnitDiag trsm_u(as0, as1, bs0, 0); @@ -198,12 +190,11 @@ KOKKOS_INLINE_FUNCTION int SerialTrsvInternalUpper::invoke( const int mb = mbAlgo; for (int pp = 0; pp < m; pp += mb) { - const int ptmp = (m - pp - mb), p = (ptmp < 0 ? 0 : ptmp), - pb = (mb + (ptmp < 0) * ptmp); + const int ptmp = (m - pp - mb), p = (ptmp < 0 ? 
0 : ptmp), pb = (mb + (ptmp < 0) * ptmp); // trsm update const ValueType *KOKKOS_RESTRICT Ap = A + p * as0 + p * as1; - /**/ ValueType *KOKKOS_RESTRICT bp = b + p * bs0; + /**/ ValueType *KOKKOS_RESTRICT bp = b + p * bs0; if (use_unit_diag) trsm_u.serial_invoke(Ap, pb, 1, bp); @@ -211,8 +202,8 @@ KOKKOS_INLINE_FUNCTION int SerialTrsvInternalUpper::invoke( trsm_n.serial_invoke(Ap, pb, 1, bp); // gemv update - KokkosBlas::Impl::SerialGemvInternal::invoke( - p, pb, minus_one, Ap - p * as0, as0, as1, bp, bs0, one, b, bs0); + KokkosBlas::Impl::SerialGemvInternal::invoke(p, pb, minus_one, Ap - p * as0, as0, as1, bp, + bs0, one, b, bs0); } } return 0; diff --git a/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Impl.hpp index 8e14b5ef37..42c242414c 100644 --- a/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Impl.hpp @@ -38,16 +38,13 @@ namespace KokkosBatched { /// template -struct TeamVectorTrsv { +struct TeamVectorTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const bViewType &b) { - return TeamVectorTrsvInternalLower::invoke( - member, ArgDiag::use_unit_diag, A.extent(0), alpha, A.data(), - A.stride_0(), A.stride_1(), b.data(), b.stride_0()); + return TeamVectorTrsvInternalLower::invoke(member, ArgDiag::use_unit_diag, A.extent(0), + alpha, A.data(), A.stride_0(), A.stride_1(), + b.data(), b.stride_0()); } }; @@ -56,16 +53,13 @@ struct TeamVectorTrsv -struct TeamVectorTrsv { +struct TeamVectorTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const bViewType 
&b) { - return TeamVectorTrsvInternalUpper::invoke( - member, ArgDiag::use_unit_diag, A.extent(1), alpha, A.data(), - A.stride_1(), A.stride_0(), b.data(), b.stride_0()); + return TeamVectorTrsvInternalUpper::invoke(member, ArgDiag::use_unit_diag, A.extent(1), + alpha, A.data(), A.stride_1(), A.stride_0(), + b.data(), b.stride_0()); } }; @@ -74,16 +68,13 @@ struct TeamVectorTrsv -struct TeamVectorTrsv { +struct TeamVectorTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const bViewType &b) { - return TeamVectorTrsvInternalUpper::invoke( - member, ArgDiag::use_unit_diag, A.extent(0), alpha, A.data(), - A.stride_0(), A.stride_1(), b.data(), b.stride_0()); + return TeamVectorTrsvInternalUpper::invoke(member, ArgDiag::use_unit_diag, A.extent(0), + alpha, A.data(), A.stride_0(), A.stride_1(), + b.data(), b.stride_0()); } }; @@ -92,16 +83,13 @@ struct TeamVectorTrsv -struct TeamVectorTrsv { +struct TeamVectorTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const bViewType &b) { - return TeamVectorTrsvInternalLower::invoke( - member, ArgDiag::use_unit_diag, A.extent(1), alpha, A.data(), - A.stride_1(), A.stride_0(), b.data(), b.stride_0()); + return TeamVectorTrsvInternalLower::invoke(member, ArgDiag::use_unit_diag, A.extent(1), + alpha, A.data(), A.stride_1(), A.stride_0(), + b.data(), b.stride_0()); } }; diff --git a/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp index 40bca5a64a..894e684ef2 100644 --- a/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp +++ 
b/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp @@ -36,12 +36,10 @@ namespace KokkosBatched { template struct TeamVectorTrsvInternalLower { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType & /*member*/, const bool /*use_unit_diag*/, - const int /*m*/, const ScalarType /*alpha*/, - const ValueType *KOKKOS_RESTRICT /*A*/, const int /*as0*/, - const int /*as1*/, - /**/ ValueType *KOKKOS_RESTRICT /*b*/, const int /*bs0*/) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, const bool /*use_unit_diag*/, const int /*m*/, + const ScalarType /*alpha*/, const ValueType *KOKKOS_RESTRICT /*A*/, + const int /*as0*/, const int /*as1*/, + /**/ ValueType *KOKKOS_RESTRICT /*b*/, const int /*bs0*/) { assert(false && "Error: encounter dummy impl"); return 0; } @@ -49,31 +47,24 @@ struct TeamVectorTrsvInternalLower { template <> template -KOKKOS_INLINE_FUNCTION int -TeamVectorTrsvInternalLower::invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, +KOKKOS_INLINE_FUNCTION int TeamVectorTrsvInternalLower::invoke( + const MemberType &member, const bool use_unit_diag, const int m, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT b, const int bs0) { const ScalarType one(1.0), zero(0.0); if (alpha == zero) KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, zero, b, bs0); else { - if (alpha != one) - KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, alpha, b, - bs0); + if (alpha != one) KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, alpha, b, bs0); if (m <= 0) return 0; for (int p = 0; p < m; ++p) { const int iend = m - p - 1; - const ValueType *KOKKOS_RESTRICT a21 = - iend ? A + (p + 1) * as0 + p * as1 : NULL; + const ValueType *KOKKOS_RESTRICT a21 = iend ? 
A + (p + 1) * as0 + p * as1 : NULL; - ValueType *KOKKOS_RESTRICT beta1 = b + p * bs0, - *KOKKOS_RESTRICT b2 = - iend ? beta1 + bs0 : NULL; + ValueType *KOKKOS_RESTRICT beta1 = b + p * bs0, *KOKKOS_RESTRICT b2 = iend ? beta1 + bs0 : NULL; member.team_barrier(); ValueType local_beta1 = *beta1; @@ -82,12 +73,10 @@ TeamVectorTrsvInternalLower::invoke( local_beta1 = local_beta1 / alpha11; member.team_barrier(); - Kokkos::single(Kokkos::PerTeam(member), - [&]() { *beta1 = local_beta1; }); + Kokkos::single(Kokkos::PerTeam(member), [&]() { *beta1 = local_beta1; }); } - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, iend), - [&](const int &i) { b2[i * bs0] -= a21[i * as0] * local_beta1; }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, iend), + [&](const int &i) { b2[i * bs0] -= a21[i * as0] * local_beta1; }); } } return 0; @@ -100,12 +89,10 @@ TeamVectorTrsvInternalLower::invoke( template struct TeamVectorTrsvInternalUpper { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType & /*member*/, const bool /*use_unit_diag*/, - const int /*m*/, const ScalarType /*alpha*/, - const ValueType *KOKKOS_RESTRICT /*A*/, const int /*as0*/, - const int /*as1*/, - /**/ ValueType *KOKKOS_RESTRICT /*b*/, const int /*bs0*/) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, const bool /*use_unit_diag*/, const int /*m*/, + const ScalarType /*alpha*/, const ValueType *KOKKOS_RESTRICT /*A*/, + const int /*as0*/, const int /*as1*/, + /**/ ValueType *KOKKOS_RESTRICT /*b*/, const int /*bs0*/) { assert(false && "Error: encounter dummy impl"); return 0; } @@ -113,28 +100,24 @@ struct TeamVectorTrsvInternalUpper { template <> template -KOKKOS_INLINE_FUNCTION int -TeamVectorTrsvInternalUpper::invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, +KOKKOS_INLINE_FUNCTION int TeamVectorTrsvInternalUpper::invoke( + const 
MemberType &member, const bool use_unit_diag, const int m, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT b, const int bs0) { const ScalarType one(1.0), zero(0.0); if (alpha == zero) KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, zero, b, bs0); else { - if (alpha != one) - KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, alpha, b, - bs0); + if (alpha != one) KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, alpha, b, bs0); if (m <= 0) return 0; ValueType *KOKKOS_RESTRICT b0 = b; for (int p = (m - 1); p >= 0; --p) { const int iend = p; - const ValueType *KOKKOS_RESTRICT a01 = A + p * as1; - /**/ ValueType *KOKKOS_RESTRICT beta1 = b + p * bs0; + const ValueType *KOKKOS_RESTRICT a01 = A + p * as1; + /**/ ValueType *KOKKOS_RESTRICT beta1 = b + p * bs0; member.team_barrier(); ValueType local_beta1 = *beta1; @@ -143,12 +126,10 @@ TeamVectorTrsvInternalUpper::invoke( local_beta1 = local_beta1 / alpha11; member.team_barrier(); - Kokkos::single(Kokkos::PerTeam(member), - [&]() { *beta1 = local_beta1; }); + Kokkos::single(Kokkos::PerTeam(member), [&]() { *beta1 = local_beta1; }); } - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, iend), - [&](const int &i) { b0[i * bs0] -= a01[i * as0] * local_beta1; }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, iend), + [&](const int &i) { b0[i * bs0] -= a01[i * as0] * local_beta1; }); } } return 0; diff --git a/batched/dense/impl/KokkosBatched_Trsv_Team_Impl.hpp b/batched/dense/impl/KokkosBatched_Trsv_Team_Impl.hpp index 7f370c1f01..c658080dc2 100644 --- a/batched/dense/impl/KokkosBatched_Trsv_Team_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Trsv_Team_Impl.hpp @@ -38,30 +38,24 @@ namespace KokkosBatched { /// template -struct TeamTrsv { +struct TeamTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + 
KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const bViewType &b) { - return TeamTrsvInternalLower::invoke( - member, ArgDiag::use_unit_diag, A.extent(0), alpha, A.data(), - A.stride_0(), A.stride_1(), b.data(), b.stride_0()); + return TeamTrsvInternalLower::invoke(member, ArgDiag::use_unit_diag, A.extent(0), alpha, + A.data(), A.stride_0(), A.stride_1(), b.data(), + b.stride_0()); } }; template -struct TeamTrsv { +struct TeamTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const bViewType &b) { - return TeamTrsvInternalLower::invoke( - member, ArgDiag::use_unit_diag, A.extent(0), alpha, A.data(), - A.stride_0(), A.stride_1(), b.data(), b.stride_0()); + return TeamTrsvInternalLower::invoke(member, ArgDiag::use_unit_diag, A.extent(0), alpha, + A.data(), A.stride_0(), A.stride_1(), b.data(), + b.stride_0()); } }; @@ -70,30 +64,23 @@ struct TeamTrsv -struct TeamTrsv { +struct TeamTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const bViewType &b) { - return TeamTrsvInternalUpper::invoke( - member, ArgDiag::use_unit_diag, A.extent(1), alpha, A.data(), - A.stride_1(), A.stride_0(), b.data(), b.stride_0()); + return TeamTrsvInternalUpper::invoke(member, ArgDiag::use_unit_diag, A.extent(1), alpha, + A.data(), A.stride_1(), A.stride_0(), b.data(), + b.stride_0()); } }; template -struct TeamTrsv { +struct TeamTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, const 
ScalarType alpha, const AViewType &A, const bViewType &b) { - return TeamTrsvInternalUpper::invoke( - ArgDiag::use_unit_diag, A.extent(1), alpha, A.data(), A.stride_1(), - A.stride_0(), b.data(), b.stride_0()); + return TeamTrsvInternalUpper::invoke(ArgDiag::use_unit_diag, A.extent(1), alpha, A.data(), + A.stride_1(), A.stride_0(), b.data(), b.stride_0()); } }; @@ -102,30 +89,24 @@ struct TeamTrsv -struct TeamTrsv { +struct TeamTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const bViewType &b) { - return TeamTrsvInternalUpper::invoke( - member, ArgDiag::use_unit_diag, A.extent(0), alpha, A.data(), - A.stride_0(), A.stride_1(), b.data(), b.stride_0()); + return TeamTrsvInternalUpper::invoke(member, ArgDiag::use_unit_diag, A.extent(0), alpha, + A.data(), A.stride_0(), A.stride_1(), b.data(), + b.stride_0()); } }; template -struct TeamTrsv { +struct TeamTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const bViewType &b) { - return TeamTrsvInternalUpper::invoke( - member, ArgDiag::use_unit_diag, A.extent(0), alpha, A.data(), - A.stride_0(), A.stride_1(), b.data(), b.stride_0()); + return TeamTrsvInternalUpper::invoke(member, ArgDiag::use_unit_diag, A.extent(0), alpha, + A.data(), A.stride_0(), A.stride_1(), b.data(), + b.stride_0()); } }; @@ -134,30 +115,24 @@ struct TeamTrsv -struct TeamTrsv { +struct TeamTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const bViewType &b) { - return 
TeamTrsvInternalLower::invoke( - member, ArgDiag::use_unit_diag, A.extent(1), alpha, A.data(), - A.stride_1(), A.stride_0(), b.data(), b.stride_0()); + return TeamTrsvInternalLower::invoke(member, ArgDiag::use_unit_diag, A.extent(1), alpha, + A.data(), A.stride_1(), A.stride_0(), b.data(), + b.stride_0()); } }; template -struct TeamTrsv { +struct TeamTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const bViewType &b) { - return TeamTrsvInternalLower::invoke( - member, ArgDiag::use_unit_diag, A.extent(1), alpha, A.data(), - A.stride_1(), A.stride_0(), b.data(), b.stride_0()); + return TeamTrsvInternalLower::invoke(member, ArgDiag::use_unit_diag, A.extent(1), alpha, + A.data(), A.stride_1(), A.stride_0(), b.data(), + b.stride_0()); } }; } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp b/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp index 600a0c6e81..ba3b2ff7b5 100644 --- a/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp @@ -38,12 +38,10 @@ namespace KokkosBatched { template struct TeamTrsvInternalLower { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType & /*member*/, const bool /*use_unit_diag*/, - const int /*m*/, const ScalarType /*alpha*/, - const ValueType *KOKKOS_RESTRICT /*A*/, const int /*as0*/, - const int /*as1*/, - /**/ ValueType *KOKKOS_RESTRICT /*b*/, const int /*bs0*/) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, const bool /*use_unit_diag*/, const int /*m*/, + const ScalarType /*alpha*/, const ValueType *KOKKOS_RESTRICT /*A*/, + const int /*as0*/, const int /*as1*/, + /**/ ValueType *KOKKOS_RESTRICT /*b*/, const int /*bs0*/) { assert(false && "Error: encounter dummy impl"); return 
0; } @@ -52,28 +50,23 @@ struct TeamTrsvInternalLower { template <> template KOKKOS_INLINE_FUNCTION int TeamTrsvInternalLower::invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, + const MemberType &member, const bool use_unit_diag, const int m, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT b, const int bs0) { const ScalarType one(1.0), zero(0.0); if (alpha == zero) KokkosBlas::Impl::TeamSetInternal::invoke(member, m, zero, b, bs0); else { - if (alpha != one) - KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0); + if (alpha != one) KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0); if (m <= 0) return 0; for (int p = 0; p < m; ++p) { const int iend = m - p - 1; - const ValueType *KOKKOS_RESTRICT a21 = - iend ? A + (p + 1) * as0 + p * as1 : NULL; + const ValueType *KOKKOS_RESTRICT a21 = iend ? A + (p + 1) * as0 + p * as1 : NULL; - ValueType *KOKKOS_RESTRICT beta1 = b + p * bs0, - *KOKKOS_RESTRICT b2 = - iend ? beta1 + bs0 : NULL; + ValueType *KOKKOS_RESTRICT beta1 = b + p * bs0, *KOKKOS_RESTRICT b2 = iend ? 
beta1 + bs0 : NULL; member.team_barrier(); ValueType local_beta1 = *beta1; @@ -85,9 +78,8 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalLower::invoke( if (member.team_rank() == 0) *beta1 = local_beta1; } /// member.team_barrier(); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, iend), - [&](const int &i) { b2[i * bs0] -= a21[i * as0] * local_beta1; }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, iend), + [&](const int &i) { b2[i * bs0] -= a21[i * as0] * local_beta1; }); } } return 0; @@ -96,9 +88,8 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalLower::invoke( template <> template KOKKOS_INLINE_FUNCTION int TeamTrsvInternalLower::invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, + const MemberType &member, const bool use_unit_diag, const int m, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT b, const int bs0) { const ScalarType one(1.0), zero(0.0), minus_one(-1.0); @@ -107,8 +98,7 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalLower::invoke( if (alpha == zero) KokkosBlas::Impl::TeamSetInternal::invoke(member, m, zero, b, bs0); else { - if (alpha != one) - KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0); + if (alpha != one) KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0); if (m <= 0) return 0; /// case GPU: team size is large and blocksize (mb,nb) is small @@ -122,7 +112,7 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalLower::invoke( // trsm update const ValueType *KOKKOS_RESTRICT Ap = A + p * as0 + p * as1; - /**/ ValueType *KOKKOS_RESTRICT bp = b + p * bs0; + /**/ ValueType *KOKKOS_RESTRICT bp = b + p * bs0; member.team_barrier(); if (member.team_rank() == 0) { @@ -134,9 +124,8 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalLower::invoke( // gemv update member.team_barrier(); - 
KokkosBlas::Impl::TeamGemvInternal::invoke( - member, m - p - pb, pb, minus_one, Ap + pb * as0, as0, as1, bp, 1, - one, bp + pb * bs0, bs0); + KokkosBlas::Impl::TeamGemvInternal::invoke(member, m - p - pb, pb, minus_one, Ap + pb * as0, + as0, as1, bp, 1, one, bp + pb * bs0, bs0); } } return 0; @@ -149,12 +138,10 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalLower::invoke( template struct TeamTrsvInternalUpper { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType & /*member*/, const bool /*use_unit_diag*/, - const int /*m*/, const ScalarType /*alpha*/, - const ValueType *KOKKOS_RESTRICT /*A*/, const int /*as0*/, - const int /*as1*/, - /**/ ValueType *KOKKOS_RESTRICT /*b*/, const int /*bs0*/) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, const bool /*use_unit_diag*/, const int /*m*/, + const ScalarType /*alpha*/, const ValueType *KOKKOS_RESTRICT /*A*/, + const int /*as0*/, const int /*as1*/, + /**/ ValueType *KOKKOS_RESTRICT /*b*/, const int /*bs0*/) { assert(false && "Error: encounter dummy impl"); return 0; } @@ -163,25 +150,23 @@ struct TeamTrsvInternalUpper { template <> template KOKKOS_INLINE_FUNCTION int TeamTrsvInternalUpper::invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, + const MemberType &member, const bool use_unit_diag, const int m, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT b, const int bs0) { const ScalarType one(1.0), zero(0.0); if (alpha == zero) KokkosBlas::Impl::TeamSetInternal::invoke(member, m, zero, b, bs0); else { - if (alpha != one) - KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0); + if (alpha != one) KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0); if (m <= 0) return 0; ValueType *KOKKOS_RESTRICT b0 = b; for (int p = (m - 1); p >= 0; --p) { const int 
iend = p; - const ValueType *KOKKOS_RESTRICT a01 = A + p * as1; - /**/ ValueType *KOKKOS_RESTRICT beta1 = b + p * bs0; + const ValueType *KOKKOS_RESTRICT a01 = A + p * as1; + /**/ ValueType *KOKKOS_RESTRICT beta1 = b + p * bs0; member.team_barrier(); ValueType local_beta1 = *beta1; @@ -193,9 +178,8 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalUpper::invoke( if (member.team_rank() == 0) *beta1 = local_beta1; } // member.team_barrier(); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, iend), - [&](const int &i) { b0[i * bs0] -= a01[i * as0] * local_beta1; }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, iend), + [&](const int &i) { b0[i * bs0] -= a01[i * as0] * local_beta1; }); } } return 0; @@ -204,9 +188,8 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalUpper::invoke( template <> template KOKKOS_INLINE_FUNCTION int TeamTrsvInternalUpper::invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, + const MemberType &member, const bool use_unit_diag, const int m, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT b, const int bs0) { const ScalarType one(1.0), zero(0.0), minus_one(-1.0); @@ -216,8 +199,7 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalUpper::invoke( if (alpha == zero) KokkosBlas::Impl::TeamSetInternal::invoke(member, m, zero, b, bs0); else { - if (alpha != one) - KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0); + if (alpha != one) KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0); if (m <= 0) return 0; InnerTrsmLeftUpperUnitDiag trsm_u(as0, as1, bs0, 0); @@ -225,12 +207,11 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalUpper::invoke( const int mb = mbAlgo; for (int pp = 0; pp < m; pp += mb) { - const int ptmp = (m - pp - mb), p = (ptmp < 0 ? 
0 : ptmp), - pb = (mb + (ptmp < 0) * ptmp); + const int ptmp = (m - pp - mb), p = (ptmp < 0 ? 0 : ptmp), pb = (mb + (ptmp < 0) * ptmp); // trsm update const ValueType *KOKKOS_RESTRICT Ap = A + p * as0 + p * as1; - /**/ ValueType *KOKKOS_RESTRICT bp = b + p * bs0; + /**/ ValueType *KOKKOS_RESTRICT bp = b + p * bs0; member.team_barrier(); if (member.team_rank() == 0) { @@ -242,8 +223,8 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalUpper::invoke( // gemv update member.team_barrier(); - KokkosBlas::Impl::TeamGemvInternal::invoke( - member, p, pb, minus_one, Ap - p * as0, as0, as1, bp, 1, one, b, bs0); + KokkosBlas::Impl::TeamGemvInternal::invoke(member, p, pb, minus_one, Ap - p * as0, as0, + as1, bp, 1, one, b, bs0); } } return 0; diff --git a/batched/dense/impl/KokkosBatched_Trtri_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Trtri_Serial_Impl.hpp index 66c8f91ac9..1068bf9e54 100644 --- a/batched/dense/impl/KokkosBatched_Trtri_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Trtri_Serial_Impl.hpp @@ -25,18 +25,16 @@ template struct SerialTrtri { template KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A) { - return SerialTrtriInternalLower::invoke( - ArgDiag::use_unit_diag, A.extent(0), A.extent(1), A.data(), - A.stride_0(), A.stride_1()); + return SerialTrtriInternalLower::invoke(ArgDiag::use_unit_diag, A.extent(0), A.extent(1), + A.data(), A.stride_0(), A.stride_1()); } }; template struct SerialTrtri { template KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A) { - return SerialTrtriInternalUpper::invoke( - ArgDiag::use_unit_diag, A.extent(0), A.extent(1), A.data(), A.stride(0), - A.stride(1)); + return SerialTrtriInternalUpper::invoke(ArgDiag::use_unit_diag, A.extent(0), A.extent(1), + A.data(), A.stride(0), A.stride(1)); } }; } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_Trtri_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Trtri_Serial_Internal.hpp index 2941b03ccf..f6b0b4bf6d 100644 --- 
a/batched/dense/impl/KokkosBatched_Trtri_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Trtri_Serial_Internal.hpp @@ -25,27 +25,23 @@ namespace KokkosBatched { template struct SerialTrtriInternalLower { template - KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, - const int am, const int an, - ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1); + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, const int am, const int an, + ValueType *KOKKOS_RESTRICT A, const int as0, const int as1); }; template struct SerialTrtriInternalUpper { template - KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, - const int am, const int an, - ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1); + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, const int am, const int an, + ValueType *KOKKOS_RESTRICT A, const int as0, const int as1); }; template <> template -KOKKOS_INLINE_FUNCTION int -SerialTrtriInternalLower::invoke( - const bool use_unit_diag, const int am, const int /*an*/, - ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { +KOKKOS_INLINE_FUNCTION int SerialTrtriInternalLower::invoke(const bool use_unit_diag, + const int am, const int /*an*/, + ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1) { ValueType one(1.0), zero(0.0), A_ii; if (!use_unit_diag) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) @@ -74,14 +70,13 @@ SerialTrtriInternalLower::invoke( int A_col_vec_m = am - i - 1, A_col_vec_n = 1; // TRMV/TRMM −− x=Ax // A((j+1):n,j) = A((j+1):n,(j+1):n) ∗ A((j+1):n,j) ; - SerialTrmmInternalLeftLower::invoke( - use_unit_diag, false, A_subblock_m, A_subblock_n, A_col_vec_m, - A_col_vec_n, one, A_subblock, as0, as1, A_col_vec, as0, as1); + SerialTrmmInternalLeftLower::invoke(use_unit_diag, false, A_subblock_m, A_subblock_n, + A_col_vec_m, A_col_vec_n, one, A_subblock, as0, as1, + A_col_vec, as0, as1); // SCAL -- x=ax // A((j+1):n,j) = A_ii * A((j+1):n,j) - 
KokkosBlas::Impl::SerialScaleInternal::invoke(A_col_vec_m, A_col_vec_n, - A_ii, A_col_vec, as0, as1); + KokkosBlas::Impl::SerialScaleInternal::invoke(A_col_vec_m, A_col_vec_n, A_ii, A_col_vec, as0, as1); } } return 0; @@ -89,10 +84,10 @@ SerialTrtriInternalLower::invoke( template <> template -KOKKOS_INLINE_FUNCTION int -SerialTrtriInternalUpper::invoke( - const bool use_unit_diag, const int am, const int /*an*/, - ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { +KOKKOS_INLINE_FUNCTION int SerialTrtriInternalUpper::invoke(const bool use_unit_diag, + const int am, const int /*an*/, + ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1) { ValueType one(1.0), zero(0.0), A_ii; if (!use_unit_diag) { @@ -123,14 +118,13 @@ SerialTrtriInternalUpper::invoke( // TRMV/TRMM −− x=Ax // A(1:(j-1),j) = A(1:(j-1),1:(j-1)) ∗ A(1:(j-1),j) ; // SerialTrmm - SerialTrmmInternalLeftUpper::invoke( - use_unit_diag, false, A_subblock_m, A_subblock_n, A_col_vec_m, - A_col_vec_n, one, A_subblock, as0, as1, A_col_vec, as0, as1); + SerialTrmmInternalLeftUpper::invoke(use_unit_diag, false, A_subblock_m, A_subblock_n, + A_col_vec_m, A_col_vec_n, one, A_subblock, as0, as1, + A_col_vec, as0, as1); // SCAL -- x=ax // A((j+1):n,j) = A_ii * A((j+1):n,j) - KokkosBlas::Impl::SerialScaleInternal::invoke(A_col_vec_m, A_col_vec_n, - A_ii, A_col_vec, as0, as1); + KokkosBlas::Impl::SerialScaleInternal::invoke(A_col_vec_m, A_col_vec_n, A_ii, A_col_vec, as0, as1); } } return 0; diff --git a/batched/dense/impl/KokkosBatched_UTV_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_UTV_TeamVector_Impl.hpp index b57a145ccb..de5ecebf94 100644 --- a/batched/dense/impl/KokkosBatched_UTV_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_UTV_TeamVector_Impl.hpp @@ -29,16 +29,13 @@ namespace KokkosBatched { template struct TeamVectorUTV { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const AViewType &A, const pViewType &p, - const UViewType &U, const 
VViewType &V, const wViewType &w, - int &matrix_rank) { - return TeamVectorUTV_Internal::invoke( - member, A.extent(0), A.extent(1), A.data(), A.stride(0), A.stride(1), - p.data(), p.stride(0), U.data(), U.stride(0), U.stride(1), V.data(), - V.stride(0), V.stride(1), w.data(), matrix_rank); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const pViewType &p, + const UViewType &U, const VViewType &V, const wViewType &w, + int &matrix_rank) { + return TeamVectorUTV_Internal::invoke(member, A.extent(0), A.extent(1), A.data(), A.stride(0), A.stride(1), + p.data(), p.stride(0), U.data(), U.stride(0), U.stride(1), V.data(), + V.stride(0), V.stride(1), w.data(), matrix_rank); } }; diff --git a/batched/dense/impl/KokkosBatched_UTV_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_UTV_TeamVector_Internal.hpp index 1066467414..e39dba9a40 100644 --- a/batched/dense/impl/KokkosBatched_UTV_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_UTV_TeamVector_Internal.hpp @@ -32,15 +32,14 @@ namespace KokkosBatched { /// =================== struct TeamVectorUTV_Internal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int m, - const int n, // m = NumRows(A), n = NumCols(A) - /* */ ValueType *A, const int as0, const int as1, - /* */ IntType *p, const int ps0, - /* */ ValueType *U, const int us0, const int us1, - /* */ ValueType *V, const int vs0, const int vs1, - /* */ ValueType *w, // 3*m, tau, norm, householder workspace - /* */ int &matrix_rank) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, + const int n, // m = NumRows(A), n = NumCols(A) + /* */ ValueType *A, const int as0, const int as1, + /* */ IntType *p, const int ps0, + /* */ ValueType *U, const int us0, const int us1, + /* */ ValueType *V, const int vs0, const int vs1, + /* */ ValueType *w, // 3*m, tau, norm, householder workspace + /* */ int &matrix_rank) { typedef ValueType 
value_type; // typedef IntType int_type; @@ -51,25 +50,19 @@ struct TeamVectorUTV_Internal { value_type *work = w; matrix_rank = -1; - TeamVectorQR_WithColumnPivotingInternal ::invoke( - member, m, n, A, as0, as1, t, ts0, p, ps0, work, matrix_rank); + TeamVectorQR_WithColumnPivotingInternal ::invoke(member, m, n, A, as0, as1, t, ts0, p, ps0, work, matrix_rank); - TeamVectorQR_FormQ_Internal ::invoke(member, m, matrix_rank, matrix_rank, A, - as0, as1, t, ts0, U, us0, us1, work); + TeamVectorQR_FormQ_Internal ::invoke(member, m, matrix_rank, matrix_rank, A, as0, as1, t, ts0, U, us0, us1, work); member.team_barrier(); /// for rank deficient matrix if (matrix_rank < n) { const value_type zero(0); - TeamVectorSetLowerTriangularInternal ::invoke( - member, matrix_rank, matrix_rank, 1, zero, A, as0, as1); + TeamVectorSetLowerTriangularInternal ::invoke(member, matrix_rank, matrix_rank, 1, zero, A, as0, as1); - TeamVectorQR_Internal ::invoke(member, n, matrix_rank, A, as1, as0, t, - ts0, work); + TeamVectorQR_Internal ::invoke(member, n, matrix_rank, A, as1, as0, t, ts0, work); - TeamVectorQR_FormQ_Internal ::invoke(member, n, matrix_rank, matrix_rank, - A, as1, as0, t, ts0, V, vs1, vs0, - work); + TeamVectorQR_FormQ_Internal ::invoke(member, n, matrix_rank, matrix_rank, A, as1, as0, t, ts0, V, vs1, vs0, work); } return 0; diff --git a/batched/dense/impl/KokkosBatched_UpdateGivens_Internal.hpp b/batched/dense/impl/KokkosBatched_UpdateGivens_Internal.hpp index 54e2791dbb..3f56e71422 100644 --- a/batched/dense/impl/KokkosBatched_UpdateGivens_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_UpdateGivens_Internal.hpp @@ -30,9 +30,8 @@ namespace KokkosBatched { /// struct SerialUpdateGivensInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const Kokkos::pair &S, - /* */ Kokkos::pair &G) { + KOKKOS_INLINE_FUNCTION static int invoke(const Kokkos::pair &S, + /* */ Kokkos::pair &G) { const ValueType tmp = S.first * G.first - S.second * G.second; G.second = S.first 
* G.second + S.second * G.first; G.first = tmp; diff --git a/batched/dense/impl/KokkosBatched_Vector_SIMD_Arith.hpp b/batched/dense/impl/KokkosBatched_Vector_SIMD_Arith.hpp index f87492ea5a..08628729bc 100644 --- a/batched/dense/impl/KokkosBatched_Vector_SIMD_Arith.hpp +++ b/batched/dense/impl/KokkosBatched_Vector_SIMD_Arith.hpp @@ -24,23 +24,21 @@ namespace KokkosBatched { #define KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) Vector, l> -#define KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(T, l) \ - Vector, l> & +#define KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(T, l) Vector, l> & /// simd, simd #if defined(__KOKKOSBATCHED_ENABLE_AVX__) #if defined(__AVX512F__) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 8) operator+( - const Vector, 8> &a, const Vector, 8> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 8) operator+(const Vector, 8> &a, + const Vector, 8> &b) { return _mm512_add_pd(a, b); } #if !defined(KOKKOS_COMPILER_GNU) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 4) -operator+(const Vector >, 4> &a, - const Vector >, 4> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 4) operator+( + const Vector >, 4> &a, const Vector >, 4> &b) { return _mm512_add_pd(a, b); } #endif @@ -48,16 +46,15 @@ operator+(const Vector >, 4> &a, #endif #if defined(__AVX__) || defined(__AVX2__) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator+( - const Vector, 4> &a, const Vector, 4> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator+(const Vector, 4> &a, + const Vector, 4> &b) { return _mm256_add_pd(a, b); } #if !defined(KOKKOS_COMPILER_GNU) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 2) -operator+(const Vector >, 2> &a, - const Vector >, 2> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 2) operator+( + const Vector >, 2> &a, const Vector >, 
2> &b) { return _mm256_add_pd(a, b); } #endif @@ -66,8 +63,8 @@ operator+(const Vector >, 2> &a, #endif template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) -operator+(const Vector, l> &a, const Vector, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator+(const Vector, l> &a, + const Vector, l> &b) { Vector, l> r_val; if (std::is_fundamental::value) { KOKKOSKERNELS_FORCE_SIMD @@ -80,24 +77,24 @@ operator+(const Vector, l> &a, const Vector, l> &b) { #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 2) operator+( - const Vector, 2> &a, const Vector, 2> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 2) operator+(const Vector, 2> &a, + const Vector, 2> &b) { float2 r_val; r_val.x = a.float2().x + b.float2().x; r_val.y = a.float2().y + b.float2().y; return r_val; } KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 2) operator+( - const Vector, 2> &a, const Vector, 2> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 2) operator+(const Vector, 2> &a, + const Vector, 2> &b) { double2 r_val; r_val.x = a.double2().x + b.double2().x; r_val.y = a.double2().y + b.double2().y; return r_val; } KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 4) operator+( - const Vector, 4> &a, const Vector, 4> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 4) operator+(const Vector, 4> &a, + const Vector, 4> &b) { float4 r_val; r_val.x = a.float4().x + b.float4().x; r_val.y = a.float4().y + b.float4().y; @@ -106,8 +103,8 @@ static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 4) operator+( return r_val; } KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator+( - const Vector, 4> &a, const Vector, 4> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator+(const Vector, 4> &a, + const 
Vector, 4> &b) { double4 r_val; r_val.x = a.double4().x + b.double4().x; r_val.y = a.double4().y + b.double4().y; @@ -119,9 +116,8 @@ static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator+( #endif template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - T, l) -operator+=(Vector, l> &a, const Vector, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(T, l) operator+=( + Vector, l> &a, const Vector, l> &b) { a = a + b; return a; } @@ -129,37 +125,34 @@ operator+=(Vector, l> &a, const Vector, l> &b) { /// simd, real template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) -operator+(const Vector, l> &a, const T b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator+(const Vector, l> &a, + const T b) { return a + Vector, l>(b); } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) -operator+(const T a, const Vector, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator+(const T a, + const Vector, l> &b) { return Vector, l>(a) + b; } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - T, l) -operator+=(Vector, l> &a, const T b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(T, l) operator+=( + Vector, l> &a, const T b) { a = a + b; return a; } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) -operator++(Vector, l> &a, int) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator++(Vector, l> &a, int) { Vector, l> a0 = a; a = a + typename Kokkos::ArithTraits::mag_type(1); return a0; } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - T, l) -operator++(Vector, l> &a) { +KOKKOS_FORCEINLINE_FUNCTION static 
KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(T, l) operator++( + Vector, l> &a) { a = a + typename Kokkos::ArithTraits::mag_type(1); return a; } @@ -167,23 +160,20 @@ operator++(Vector, l> &a) { /// simd complex, real template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator+(const Vector >, l> &a, const T b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator+( + const Vector >, l> &a, const T b) { return a + Vector >, l>(b); } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator+(const T a, const Vector >, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator+( + const T a, const Vector >, l> &b) { return Vector >, l>(a) + b; } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - Kokkos::complex, l) -operator+=(Vector >, l> &a, const T b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(Kokkos::complex, l) operator+=( + Vector >, l> &a, const T b) { a = a + b; return a; } @@ -191,26 +181,20 @@ operator+=(Vector >, l> &a, const T b) { /// simd complex, complex template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator+(const Vector >, l> &a, - const Kokkos::complex b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator+( + const Vector >, l> &a, const Kokkos::complex b) { return a + Vector >, l>(b); } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator+(const Kokkos::complex a, - const Vector >, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator+( + const Kokkos::complex a, const Vector >, l> &b) { return Vector >, l>(a) + b; } 
template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - Kokkos::complex, l) -operator+=(Vector >, l> &a, - const Kokkos::complex b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(Kokkos::complex, l) operator+=( + Vector >, l> &a, const Kokkos::complex b) { a = a + b; return a; } @@ -222,16 +206,15 @@ operator+=(Vector >, l> &a, #if defined(__KOKKOSBATCHED_ENABLE_AVX__) #if defined(__AVX512F__) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 8) operator-( - const Vector, 8> &a, const Vector, 8> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 8) operator-(const Vector, 8> &a, + const Vector, 8> &b) { return _mm512_sub_pd(a, b); } #if !defined(KOKKOS_COMPILER_GNU) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 4) -operator-(const Vector >, 4> &a, - const Vector >, 4> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 4) operator-( + const Vector >, 4> &a, const Vector >, 4> &b) { return _mm512_sub_pd(a, b); } #endif @@ -239,16 +222,15 @@ operator-(const Vector >, 4> &a, #endif #if defined(__AVX__) || defined(__AVX2__) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator-( - const Vector, 4> &a, const Vector, 4> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator-(const Vector, 4> &a, + const Vector, 4> &b) { return _mm256_sub_pd(a, b); } #if !defined(KOKKOS_COMPILER_GNU) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 2) -operator-(const Vector >, 2> &a, - const Vector >, 2> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 2) operator-( + const Vector >, 2> &a, const Vector >, 2> &b) { return _mm256_sub_pd(a, b); } #endif @@ -257,8 +239,8 @@ operator-(const Vector >, 2> &a, #endif template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) 
-operator-(const Vector, l> &a, const Vector, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator-(const Vector, l> &a, + const Vector, l> &b) { Vector, l> r_val; if (std::is_fundamental::value) { KOKKOSKERNELS_FORCE_SIMD @@ -271,24 +253,24 @@ operator-(const Vector, l> &a, const Vector, l> &b) { #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 2) operator-( - const Vector, 2> &a, const Vector, 2> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 2) operator-(const Vector, 2> &a, + const Vector, 2> &b) { float2 r_val; r_val.x = a.float2().x - b.float2().x; r_val.y = a.float2().y - b.float2().y; return r_val; } KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 2) operator-( - const Vector, 2> &a, const Vector, 2> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 2) operator-(const Vector, 2> &a, + const Vector, 2> &b) { double2 r_val; r_val.x = a.double2().x - b.double2().x; r_val.y = a.double2().y - b.double2().y; return r_val; } KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 4) operator-( - const Vector, 4> &a, const Vector, 4> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 4) operator-(const Vector, 4> &a, + const Vector, 4> &b) { float4 r_val; r_val.x = a.float4().x - b.float4().x; r_val.y = a.float4().y - b.float4().y; @@ -297,8 +279,8 @@ static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 4) operator-( return r_val; } KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator-( - const Vector, 4> &a, const Vector, 4> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator-(const Vector, 4> &a, + const Vector, 4> &b) { double4 r_val; r_val.x = a.double4().x - b.double4().x; r_val.y = a.double4().y - b.double4().y; @@ -309,8 +291,7 @@ static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) 
operator-( #endif template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) -operator-(const Vector, l> &a) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator-(const Vector, l> &a) { Vector, l> r_val; if (std::is_fundamental::value) { KOKKOSKERNELS_FORCE_SIMD @@ -322,9 +303,8 @@ operator-(const Vector, l> &a) { } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - T, l) -operator-=(Vector, l> &a, const Vector, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(T, l) operator-=( + Vector, l> &a, const Vector, l> &b) { a = a - b; return a; } @@ -332,37 +312,34 @@ operator-=(Vector, l> &a, const Vector, l> &b) { /// simd, real template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) -operator-(const Vector, l> &a, const T b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator-(const Vector, l> &a, + const T b) { return a - Vector, l>(b); } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) -operator-(const T a, const Vector, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator-(const T a, + const Vector, l> &b) { return Vector, l>(a) - b; } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - T, l) -operator-=(Vector, l> &a, const T b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(T, l) operator-=( + Vector, l> &a, const T b) { a = a - b; return a; } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) -operator--(Vector, l> &a, int) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator--(Vector, l> &a, int) { Vector, l> a0 = a; a = a - typename Kokkos::ArithTraits::mag_type(1); return a0; } template 
-KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - T, l) -operator--(Vector, l> &a) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(T, l) operator--( + Vector, l> &a) { a = a - typename Kokkos::ArithTraits::mag_type(1); return a; } @@ -370,23 +347,20 @@ operator--(Vector, l> &a) { /// simd complex, real template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator-(const Vector >, l> &a, const T b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator-( + const Vector >, l> &a, const T b) { return a - Vector >, l>(b); } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator-(const T a, const Vector >, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator-( + const T a, const Vector >, l> &b) { return Vector >, l>(a) - b; } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - Kokkos::complex, l) -operator-=(Vector >, l> &a, const T b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(Kokkos::complex, l) operator-=( + Vector >, l> &a, const T b) { a = a - b; return a; } @@ -394,26 +368,20 @@ operator-=(Vector >, l> &a, const T b) { /// simd complex, complex template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator-(const Vector >, l> &a, - const Kokkos::complex b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator-( + const Vector >, l> &a, const Kokkos::complex b) { return a - Vector >, l>(b); } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator-(const Kokkos::complex a, - const Vector >, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static 
KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator-( + const Kokkos::complex a, const Vector >, l> &b) { return Vector >, l>(a) - b; } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - Kokkos::complex, l) -operator-=(Vector >, l> &a, - const Kokkos::complex b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(Kokkos::complex, l) operator-=( + Vector >, l> &a, const Kokkos::complex b) { a = a - b; return a; } @@ -425,30 +393,25 @@ operator-=(Vector >, l> &a, #if defined(__KOKKOSBATCHED_ENABLE_AVX__) #if defined(__AVX512F__) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 8) operator*( - const Vector, 8> &a, const Vector, 8> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 8) operator*(const Vector, 8> &a, + const Vector, 8> &b) { return _mm512_mul_pd(a, b); } #if !defined(KOKKOS_COMPILER_GNU) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 4) operator - *(const Vector >, 4> &a, - const Vector >, 4> &b) { - const __m512d as = _mm512_permute_pd(a, 0x55), - br = _mm512_permute_pd(b, 0x00), - bi = _mm512_permute_pd(b, 0xff); +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 4) operator*( + const Vector >, 4> &a, const Vector >, 4> &b) { + const __m512d as = _mm512_permute_pd(a, 0x55), br = _mm512_permute_pd(b, 0x00), bi = _mm512_permute_pd(b, 0xff); #if defined(__FMA__) // latency 7, throughput 0.5 return _mm512_fmaddsub_pd(a, br, _mm512_mul_pd(as, bi)); #else - return _mm512_add_pd( - _mm512_mul_pd(a, br), - _mm512_castsi512_pd(_mm512_xor_si512( - _mm512_castpd_si512(_mm512_mul_pd(as, bi)), - _mm512_castpd_si512(_mm512_mask_broadcast_f64x4( - _mm512_setzero_pd(), 0x55, _mm256_set1_pd(-0.0)))))); + return _mm512_add_pd(_mm512_mul_pd(a, br), + _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(_mm512_mul_pd(as, bi)), + _mm512_castpd_si512(_mm512_mask_broadcast_f64x4( + 
_mm512_setzero_pd(), 0x55, _mm256_set1_pd(-0.0)))))); // const __mm512d cc = _mm512_mul_pd(as, bi); // return _mm512_mask_sub_pd(_mm512_mask_add_pd(_mm512_mul_pd(a, br), 0x55, // cc), 0xaa, cc); @@ -459,25 +422,21 @@ static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 4) operator #endif #if defined(__AVX__) || defined(__AVX2__) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator*( - const Vector, 4> &a, const Vector, 4> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator*(const Vector, 4> &a, + const Vector, 4> &b) { return _mm256_mul_pd(a, b); } #if !defined(KOKKOS_COMPILER_GNU) KOKKOS_FORCEINLINE_FUNCTION -static Vector >, 2> operator*( - const Vector >, 2> &a, - const Vector >, 2> &b) { - const __m256d as = _mm256_permute_pd(a, 0x5), br = _mm256_permute_pd(b, 0x0), - bi = _mm256_permute_pd(b, 0xf); +static Vector >, 2> operator*(const Vector >, 2> &a, + const Vector >, 2> &b) { + const __m256d as = _mm256_permute_pd(a, 0x5), br = _mm256_permute_pd(b, 0x0), bi = _mm256_permute_pd(b, 0xf); #if defined(__FMA__) return _mm256_fmaddsub_pd(a, br, _mm256_mul_pd(as, bi)); #else - return _mm256_add_pd(_mm256_mul_pd(a, br), - _mm256_xor_pd(_mm256_mul_pd(as, bi), - _mm256_set_pd(0.0, -0.0, 0.0, -0.0))); + return _mm256_add_pd(_mm256_mul_pd(a, br), _mm256_xor_pd(_mm256_mul_pd(as, bi), _mm256_set_pd(0.0, -0.0, 0.0, -0.0))); #endif } #endif @@ -486,8 +445,8 @@ static Vector >, 2> operator*( #endif template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) -operator*(const Vector, l> &a, const Vector, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator*(const Vector, l> &a, + const Vector, l> &b) { Vector, l> r_val; if (std::is_fundamental::value) { KOKKOSKERNELS_FORCE_SIMD @@ -500,24 +459,24 @@ operator*(const Vector, l> &a, const Vector, l> &b) { #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION 
-static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 2) operator*( - const Vector, 2> &a, const Vector, 2> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 2) operator*(const Vector, 2> &a, + const Vector, 2> &b) { float2 r_val; r_val.x = a.float2().x * b.float2().x; r_val.y = a.float2().y * b.float2().y; return r_val; } KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 2) operator*( - const Vector, 2> &a, const Vector, 2> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 2) operator*(const Vector, 2> &a, + const Vector, 2> &b) { double2 r_val; r_val.x = a.double2().x * b.double2().x; r_val.y = a.double2().y * b.double2().y; return r_val; } KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 4) operator*( - const Vector, 4> &a, const Vector, 4> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 4) operator*(const Vector, 4> &a, + const Vector, 4> &b) { float4 r_val; r_val.x = a.float4().x * b.float4().x; r_val.y = a.float4().y * b.float4().y; @@ -526,8 +485,8 @@ static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 4) operator*( return r_val; } KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator*( - const Vector, 4> &a, const Vector, 4> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator*(const Vector, 4> &a, + const Vector, 4> &b) { double4 r_val; r_val.x = a.double4().x * b.double4().x; r_val.y = a.double4().y * b.double4().y; @@ -538,9 +497,8 @@ static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator*( #endif template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - T, l) -operator*=(Vector, l> &a, const Vector, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(T, l) operator*=( + Vector, l> &a, const Vector, l> &b) { a = a * b; return a; } @@ -548,21 +506,20 @@ operator*=(Vector, l> &a, const Vector, l> &b) { /// simd, real template 
-KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) -operator*(const Vector, l> &a, const T b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator*(const Vector, l> &a, + const T b) { return a * Vector, l>(b); } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) -operator*(const T a, const Vector, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator*(const T a, + const Vector, l> &b) { return Vector, l>(a) * b; } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - T, l) -operator*=(Vector, l> &a, const T b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(T, l) operator*=( + Vector, l> &a, const T b) { a = a * b; return a; } @@ -585,8 +542,8 @@ operator*(const Vector >, 4> &a, const double b) { #if !defined(KOKKOS_COMPILER_GNU) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 2) operator - *(const Vector >, 2> &a, const double b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 2) operator*( + const Vector >, 2> &a, const double b) { return _mm256_mul_pd(a, _mm256_set1_pd(b)); } #endif @@ -595,9 +552,8 @@ static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 2) operator #endif template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator*(const Vector >, l> &a, const T b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator*( + const Vector >, l> &a, const T b) { return a * Vector >, l>(b); } @@ -617,8 +573,8 @@ operator*(const double a, const Vector >, 4> &b) { #if !defined(KOKKOS_COMPILER_GNU) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 2) operator - *(const double a, const Vector >, 2> &b) { +static 
KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 2) operator*( + const double a, const Vector >, 2> &b) { return _mm256_mul_pd(_mm256_set1_pd(a), b); } #endif @@ -627,16 +583,14 @@ static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 2) operator #endif template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator*(const T a, const Vector >, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator*( + const T a, const Vector >, l> &b) { return Vector >, l>(a) * b; } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - Kokkos::complex, l) -operator*=(Vector >, l> &a, const T b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(Kokkos::complex, l) operator*=( + Vector >, l> &a, const T b) { a = a * b; return a; } @@ -644,26 +598,20 @@ operator*=(Vector >, l> &a, const T b) { /// simd complex, complex template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator*(const Vector >, l> &a, - const Kokkos::complex b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator*( + const Vector >, l> &a, const Kokkos::complex b) { return a * Vector >, l>(b); } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator*(const Kokkos::complex a, - const Vector >, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator*( + const Kokkos::complex a, const Vector >, l> &b) { return Vector >, l>(a) * b; } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - Kokkos::complex, l) -operator*=(Vector >, l> &a, - const Kokkos::complex b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(Kokkos::complex, l) 
operator*=( + Vector >, l> &a, const Kokkos::complex b) { a = a * b; return a; } @@ -675,36 +623,30 @@ operator*=(Vector >, l> &a, #if defined(__KOKKOSBATCHED_ENABLE_AVX__) #if defined(__AVX512F__) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 8) operator/( - const Vector, 8> &a, const Vector, 8> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 8) operator/(const Vector, 8> &a, + const Vector, 8> &b) { return _mm512_div_pd(a, b); } #if !defined(KOKKOS_COMPILER_GNU) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 4) -operator/(const Vector >, 4> &a, - const Vector >, 4> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 4) operator/( + const Vector >, 4> &a, const Vector >, 4> &b) { const __m512d as = _mm512_permute_pd(a, 0x55), cb = _mm512_castsi512_pd(_mm512_xor_si512( _mm512_castpd_si512(b), - _mm512_castpd_si512(_mm512_mask_broadcast_f64x4( - _mm512_setzero_pd(), 0xAA, _mm256_set1_pd(-0.0))))), - br = _mm512_permute_pd(cb, 0x00), - bi = _mm512_permute_pd(cb, 0xff); + _mm512_castpd_si512(_mm512_mask_broadcast_f64x4(_mm512_setzero_pd(), 0xAA, _mm256_set1_pd(-0.0))))), + br = _mm512_permute_pd(cb, 0x00), bi = _mm512_permute_pd(cb, 0xff); #if defined(__FMA__) return _mm512_div_pd(_mm512_fmaddsub_pd(a, br, _mm512_mul_pd(as, bi)), _mm512_fmadd_pd(br, br, _mm512_mul_pd(bi, bi))); #else - return _mm512_div_pd( - _mm512_add_pd( - _mm512_mul_pd(a, br), - _mm512_castsi512_pd(_mm512_xor_si512( - _mm512_castpd_si512(_mm512_mul_pd(as, bi)), - _mm512_castpd_si512(_mm512_mask_broadcast_f64x4( - _mm512_setzero_pd(), 0xAA, _mm256_set1_pd(-0.0)))))), - _mm512_add_pd(_mm512_mul_pd(br, br), _mm512_mul_pd(bi, bi))); + return _mm512_div_pd(_mm512_add_pd(_mm512_mul_pd(a, br), _mm512_castsi512_pd(_mm512_xor_si512( + _mm512_castpd_si512(_mm512_mul_pd(as, bi)), + _mm512_castpd_si512(_mm512_mask_broadcast_f64x4( + _mm512_setzero_pd(), 0xAA, _mm256_set1_pd(-0.0)))))), + 
_mm512_add_pd(_mm512_mul_pd(br, br), _mm512_mul_pd(bi, bi))); // const __mm512d cc = _mm512_mul_pd(as, bi); // return _mm512_div_pd(_mm512_mask_sub_pd(_mm512_mask_add_pd(_mm512_mul_pd(a, // br), 0x55, cc), 0xaa, cc), @@ -718,30 +660,24 @@ operator/(const Vector >, 4> &a, #if defined(__AVX__) || defined(__AVX2__) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator/( - const Vector, 4> &a, const Vector, 4> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator/(const Vector, 4> &a, + const Vector, 4> &b) { return _mm256_div_pd(a, b); } #if !defined(KOKKOS_COMPILER_GNU) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 2) -operator/(Vector >, 2> const &a, - Vector >, 2> const &b) { - const __m256d as = _mm256_permute_pd(a, 0x5), - cb = _mm256_xor_pd(b, _mm256_set_pd(-0.0, 0.0, -0.0, 0.0)), - br = _mm256_permute_pd(cb, 0x0), - bi = _mm256_permute_pd(cb, 0xf); +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 2) operator/( + Vector >, 2> const &a, Vector >, 2> const &b) { + const __m256d as = _mm256_permute_pd(a, 0x5), cb = _mm256_xor_pd(b, _mm256_set_pd(-0.0, 0.0, -0.0, 0.0)), + br = _mm256_permute_pd(cb, 0x0), bi = _mm256_permute_pd(cb, 0xf); #if defined(__FMA__) - return _mm256_div_pd( - _mm256_fmaddsub_pd(a, br, _mm256_mul_pd(as, bi)), - _mm256_add_pd(_mm256_mul_pd(br, br), _mm256_mul_pd(bi, bi))); + return _mm256_div_pd(_mm256_fmaddsub_pd(a, br, _mm256_mul_pd(as, bi)), + _mm256_add_pd(_mm256_mul_pd(br, br), _mm256_mul_pd(bi, bi))); #else return _mm256_div_pd( - _mm256_add_pd(_mm256_mul_pd(a, br), - _mm256_xor_pd(_mm256_mul_pd(as, bi), - _mm256_set_pd(0.0, -0.0, 0.0, -0.0))), + _mm256_add_pd(_mm256_mul_pd(a, br), _mm256_xor_pd(_mm256_mul_pd(as, bi), _mm256_set_pd(0.0, -0.0, 0.0, -0.0))), _mm256_add_pd(_mm256_mul_pd(br, br), _mm256_mul_pd(bi, bi))); #endif } @@ -751,8 +687,8 @@ operator/(Vector >, 2> const &a, #endif template -KOKKOS_FORCEINLINE_FUNCTION 
static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) -operator/(const Vector, l> &a, const Vector, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator/(const Vector, l> &a, + const Vector, l> &b) { Vector, l> r_val; if (std::is_fundamental::value) { KOKKOSKERNELS_FORCE_SIMD @@ -765,24 +701,24 @@ operator/(const Vector, l> &a, const Vector, l> &b) { #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 2) operator/( - const Vector, 2> &a, const Vector, 2> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 2) operator/(const Vector, 2> &a, + const Vector, 2> &b) { float2 r_val; r_val.x = a.float2().x / b.float2().x; r_val.y = a.float2().y / b.float2().y; return r_val; } KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 2) operator/( - const Vector, 2> &a, const Vector, 2> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 2) operator/(const Vector, 2> &a, + const Vector, 2> &b) { double2 r_val; r_val.x = a.double2().x / b.double2().x; r_val.y = a.double2().y / b.double2().y; return r_val; } KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 4) operator/( - const Vector, 4> &a, const Vector, 4> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 4) operator/(const Vector, 4> &a, + const Vector, 4> &b) { float4 r_val; r_val.x = a.float4().x / b.float4().x; r_val.y = a.float4().y / b.float4().y; @@ -791,8 +727,8 @@ static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 4) operator/( return r_val; } KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator/( - const Vector, 4> &a, const Vector, 4> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator/(const Vector, 4> &a, + const Vector, 4> &b) { double4 r_val; r_val.x = a.double4().x / b.double4().x; r_val.y = a.double4().y / b.double4().y; @@ -803,9 +739,8 @@ 
static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator/( #endif template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - T, l) -operator/=(Vector, l> &a, const Vector, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(T, l) operator/=( + Vector, l> &a, const Vector, l> &b) { a = a / b; return a; } @@ -816,8 +751,8 @@ operator/=(Vector, l> &a, const Vector, l> &b) { #if !defined(KOKKOS_COMPILER_GNU) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 4) -operator/(const Vector >, 4> &a, const double b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 4) operator/( + const Vector >, 4> &a, const double b) { return _mm512_div_pd(a, _mm512_set1_pd(b)); } #endif @@ -826,21 +761,20 @@ operator/(const Vector >, 4> &a, const double b) { #endif template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) -operator/(const Vector, l> &a, const T b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator/(const Vector, l> &a, + const T b) { return a / Vector, l>(b); } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) -operator/(const T a, const Vector, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator/(const T a, + const Vector, l> &b) { return Vector, l>(a) / b; } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - T, l) -operator/=(Vector, l> &a, const T b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(T, l) operator/=( + Vector, l> &a, const T b) { a = a / b; return a; } @@ -848,23 +782,20 @@ operator/=(Vector, l> &a, const T b) { /// simd complex, real template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator/(const Vector >, l> &a, const T b) { 
+KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator/( + const Vector >, l> &a, const T b) { return a / Vector >, l>(b); } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator/(const T a, const Vector >, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator/( + const T a, const Vector >, l> &b) { return Vector >, l>(a) / b; } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - Kokkos::complex, l) -operator/=(Vector >, l> &a, const T b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(Kokkos::complex, l) operator/=( + Vector >, l> &a, const T b) { a = a / b; return a; } @@ -872,26 +803,20 @@ operator/=(Vector >, l> &a, const T b) { /// simd complex, complex template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator/(const Vector >, l> &a, - const Kokkos::complex b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator/( + const Vector >, l> &a, const Kokkos::complex b) { return a / Vector >, l>(b); } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator/(const Kokkos::complex a, - const Vector >, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator/( + const Kokkos::complex a, const Vector >, l> &b) { return Vector >, l>(a) / b; } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - Kokkos::complex, l) -operator/=(Vector >, l> &a, - const Kokkos::complex b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(Kokkos::complex, l) operator/=( + Vector >, l> &a, const Kokkos::complex b) { a = a / b; return a; } diff --git 
a/batched/dense/impl/KokkosBatched_Vector_SIMD_Logical.hpp b/batched/dense/impl/KokkosBatched_Vector_SIMD_Logical.hpp index c8c07e97c4..f289d5be09 100644 --- a/batched/dense/impl/KokkosBatched_Vector_SIMD_Logical.hpp +++ b/batched/dense/impl/KokkosBatched_Vector_SIMD_Logical.hpp @@ -22,16 +22,13 @@ namespace KokkosBatched { -#define KOKKOSKERNELS_SIMD_LOGICAL_RETURN_BOOL_TYPE(T0, T1, l) \ - typename std::enable_if::value && \ - std::is_integral::value, \ +#define KOKKOSKERNELS_SIMD_LOGICAL_RETURN_BOOL_TYPE(T0, T1, l) \ + typename std::enable_if::value && std::is_integral::value, \ const Vector, l> >::type template -KOKKOS_INLINE_FUNCTION static - typename std::enable_if::value, - const Vector, l> >::type - operator!(const Vector, l> &a) { +KOKKOS_INLINE_FUNCTION static typename std::enable_if::value, const Vector, l> >::type +operator!(const Vector, l> &a) { Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -44,9 +41,8 @@ KOKKOS_INLINE_FUNCTION static } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_LOGICAL_RETURN_BOOL_TYPE(T0, - T1, l) -operator||(const Vector, l> &a, const Vector, l> &b) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_LOGICAL_RETURN_BOOL_TYPE(T0, T1, l) operator||( + const Vector, l> &a, const Vector, l> &b) { Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -59,9 +55,8 @@ operator||(const Vector, l> &a, const Vector, l> &b) { } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_LOGICAL_RETURN_BOOL_TYPE(T0, - T1, l) -operator&&(const Vector, l> &a, const Vector, l> &b) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_LOGICAL_RETURN_BOOL_TYPE(T0, T1, l) operator&&( + const Vector, l> &a, const Vector, l> &b) { Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -74,9 +69,8 @@ operator&&(const Vector, l> &a, const Vector, l> &b) { } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_LOGICAL_RETURN_BOOL_TYPE(T0, - T1, l) -operator||(const 
Vector, l> &a, const T1 &b) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_LOGICAL_RETURN_BOOL_TYPE(T0, T1, l) operator||( + const Vector, l> &a, const T1 &b) { Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -89,9 +83,8 @@ operator||(const Vector, l> &a, const T1 &b) { } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_LOGICAL_RETURN_BOOL_TYPE(T0, - T1, l) -operator&&(const Vector, l> &a, const T1 &b) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_LOGICAL_RETURN_BOOL_TYPE(T0, T1, l) operator&&( + const Vector, l> &a, const T1 &b) { Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -104,9 +97,8 @@ operator&&(const Vector, l> &a, const T1 &b) { } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_LOGICAL_RETURN_BOOL_TYPE(T0, - T1, l) -operator||(const T0 &a, const Vector, l> &b) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_LOGICAL_RETURN_BOOL_TYPE(T0, T1, l) operator||( + const T0 &a, const Vector, l> &b) { Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -119,9 +111,8 @@ operator||(const T0 &a, const Vector, l> &b) { } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_LOGICAL_RETURN_BOOL_TYPE(T0, - T1, l) -operator&&(const T0 &a, const Vector, l> &b) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_LOGICAL_RETURN_BOOL_TYPE(T0, T1, l) operator&&( + const T0 &a, const Vector, l> &b) { Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep diff --git a/batched/dense/impl/KokkosBatched_Vector_SIMD_Math.hpp b/batched/dense/impl/KokkosBatched_Vector_SIMD_Math.hpp index 69bbb53c6b..eefaf4ce0d 100644 --- a/batched/dense/impl/KokkosBatched_Vector_SIMD_Math.hpp +++ b/batched/dense/impl/KokkosBatched_Vector_SIMD_Math.hpp @@ -24,14 +24,12 @@ namespace KokkosBatched { #define KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) Vector, l> #define KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) \ - typename std::enable_if::value, \ - Vector, l> 
>::type + typename std::enable_if::value, Vector, l> >::type /// simd template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) - sqrt(const Vector, l> &a) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) sqrt(const Vector, l> &a) { typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -46,8 +44,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) - cbrt(const Vector, l> &a) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) cbrt(const Vector, l> &a) { typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -62,8 +59,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) - log(const Vector, l> &a) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) log(const Vector, l> &a) { typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -78,8 +74,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) - log10(const Vector, l> &a) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) log10(const Vector, l> &a) { typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -94,8 +89,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) - exp(const Vector, l> &a) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) exp(const Vector, l> &a) { typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -138,8 +132,7 @@ KOKKOS_INLINE_FUNCTION 
static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T0, l) } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) - sin(const Vector, l> &a) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) sin(const Vector, l> &a) { typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -154,8 +147,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) - cos(const Vector, l> &a) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) cos(const Vector, l> &a) { typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -170,8 +162,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) - tan(const Vector, l> &a) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) tan(const Vector, l> &a) { typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -186,8 +177,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) - sinh(const Vector, l> &a) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) sinh(const Vector, l> &a) { typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -202,8 +192,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) - cosh(const Vector, l> &a) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) cosh(const Vector, l> &a) { typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if 
defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -218,8 +207,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) - tanh(const Vector, l> &a) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) tanh(const Vector, l> &a) { typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -234,8 +222,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) - asin(const Vector, l> &a) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) asin(const Vector, l> &a) { typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -250,8 +237,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) - acos(const Vector, l> &a) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) acos(const Vector, l> &a) { typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -266,8 +252,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) - atan(const Vector, l> &a) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) atan(const Vector, l> &a) { typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) diff --git a/batched/dense/impl/KokkosBatched_Vector_SIMD_Misc.hpp b/batched/dense/impl/KokkosBatched_Vector_SIMD_Misc.hpp index a95a752779..02f717d458 100644 --- a/batched/dense/impl/KokkosBatched_Vector_SIMD_Misc.hpp +++ b/batched/dense/impl/KokkosBatched_Vector_SIMD_Misc.hpp @@ -30,17 +30,13 @@ 
namespace KokkosBatched { // scalar, scalar template -KOKKOS_INLINE_FUNCTION static T conditional_assign(const bool cond, - const T &if_true_val, - const T &if_false_val) { +KOKKOS_INLINE_FUNCTION static T conditional_assign(const bool cond, const T &if_true_val, const T &if_false_val) { return cond ? if_true_val : if_false_val; } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MISC_CONVERTIBLE_RETURN_VOID_TYPE( - T0, T1, T2, l) - conditional_assign(/* */ T0 &r_val, const bool cond, const T1 &if_true_val, - const T2 &if_false_val) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MISC_CONVERTIBLE_RETURN_VOID_TYPE(T0, T1, T2, l) + conditional_assign(/* */ T0 &r_val, const bool cond, const T1 &if_true_val, const T2 &if_false_val) { r_val = cond ? if_true_val : if_false_val; } @@ -48,23 +44,18 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MISC_CONVERTIBLE_RETURN_VOID_TY template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MISC_RETURN_TYPE(T, l) - conditional_assign(const Vector, l> &cond, - const Vector, l> &if_true_val, + conditional_assign(const Vector, l> &cond, const Vector, l> &if_true_val, const T &if_false_val) { Vector, l> r_val; - for (int i = 0; i < l; ++i) - r_val[i] = cond[i] ? if_true_val[i] : if_false_val; + for (int i = 0; i < l; ++i) r_val[i] = cond[i] ? if_true_val[i] : if_false_val; return r_val; } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MISC_CONVERTIBLE_RETURN_VOID_TYPE( - T0, T1, T2, l) conditional_assign(/* */ Vector, l> &r_val, - const Vector, l> &cond, - const Vector, l> &if_true_val, - const T2 &if_false_val) { - for (int i = 0; i < l; ++i) - r_val[i] = cond[i] ? if_true_val[i] : if_false_val; +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MISC_CONVERTIBLE_RETURN_VOID_TYPE(T0, T1, T2, l) + conditional_assign(/* */ Vector, l> &r_val, const Vector, l> &cond, + const Vector, l> &if_true_val, const T2 &if_false_val) { + for (int i = 0; i < l; ++i) r_val[i] = cond[i] ? 
if_true_val[i] : if_false_val; } // scalar, vector @@ -74,74 +65,57 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MISC_RETURN_TYPE(T, l) conditional_assign(const Vector, l> &cond, const T &if_true_val, const Vector, l> &if_false_val) { Vector, l> r_val; - for (int i = 0; i < l; ++i) - r_val[i] = cond[i] ? if_true_val : if_false_val[i]; + for (int i = 0; i < l; ++i) r_val[i] = cond[i] ? if_true_val : if_false_val[i]; return r_val; } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MISC_CONVERTIBLE_RETURN_VOID_TYPE( - T0, T1, T2, l) - conditional_assign(/* */ Vector, l> &r_val, - const Vector, l> &cond, const T1 &if_true_val, +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MISC_CONVERTIBLE_RETURN_VOID_TYPE(T0, T1, T2, l) + conditional_assign(/* */ Vector, l> &r_val, const Vector, l> &cond, const T1 &if_true_val, const Vector, l> &if_false_val) { - for (int i = 0; i < l; ++i) - r_val[i] = cond[i] ? if_true_val : if_false_val[i]; + for (int i = 0; i < l; ++i) r_val[i] = cond[i] ? if_true_val : if_false_val[i]; } // vector, vector template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MISC_RETURN_TYPE(T, l) - conditional_assign(const Vector, l> &cond, - const Vector, l> &if_true_val, + conditional_assign(const Vector, l> &cond, const Vector, l> &if_true_val, const Vector, l> &if_false_val) { Vector, l> r_val; - for (int i = 0; i < l; ++i) - r_val[i] = cond[i] ? if_true_val[i] : if_false_val[i]; + for (int i = 0; i < l; ++i) r_val[i] = cond[i] ? if_true_val[i] : if_false_val[i]; return r_val; } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MISC_CONVERTIBLE_RETURN_VOID_TYPE( - T0, T1, T2, l) conditional_assign(/* */ Vector, l> &r_val, - const Vector, l> &cond, - const Vector, l> &if_true_val, - const Vector, l> &if_false_val) { - for (int i = 0; i < l; ++i) - r_val[i] = cond[i] ? 
if_true_val[i] : if_false_val[i]; +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MISC_CONVERTIBLE_RETURN_VOID_TYPE(T0, T1, T2, l) + conditional_assign(/* */ Vector, l> &r_val, const Vector, l> &cond, + const Vector, l> &if_true_val, const Vector, l> &if_false_val) { + for (int i = 0; i < l; ++i) r_val[i] = cond[i] ? if_true_val[i] : if_false_val[i]; } template -KOKKOS_INLINE_FUNCTION static T reduce(const Vector, l> &val, - const BinaryOp &func) { +KOKKOS_INLINE_FUNCTION static T reduce(const Vector, l> &val, const BinaryOp &func) { T r_val = val[0]; for (int i = 1; i < l; ++i) r_val = func(r_val, val[i]); return r_val; } template -KOKKOS_INLINE_FUNCTION static T reduce(const Vector, l> &val, - const BinaryOp &func, const T init) { +KOKKOS_INLINE_FUNCTION static T reduce(const Vector, l> &val, const BinaryOp &func, const T init) { T r_val = init; for (int i = 0; i < l; ++i) r_val = func(r_val, val[i]); return r_val; } template -KOKKOS_INLINE_FUNCTION static bool is_all_true( - const Vector, l> &cond) { - return reduce(cond, [](const bool left, const bool right) -> bool { - return (left && right); - }); +KOKKOS_INLINE_FUNCTION static bool is_all_true(const Vector, l> &cond) { + return reduce(cond, [](const bool left, const bool right) -> bool { return (left && right); }); } template -KOKKOS_INLINE_FUNCTION static bool is_any_true( - const Vector, l> &cond) { - return reduce(cond, [](const bool left, const bool right) -> bool { - return left || right; - }); +KOKKOS_INLINE_FUNCTION static bool is_any_true(const Vector, l> &cond) { + return reduce(cond, [](const bool left, const bool right) -> bool { return left || right; }); } template diff --git a/batched/dense/impl/KokkosBatched_Vector_SIMD_Relation.hpp b/batched/dense/impl/KokkosBatched_Vector_SIMD_Relation.hpp index d49c6f35f9..c956780192 100644 --- a/batched/dense/impl/KokkosBatched_Vector_SIMD_Relation.hpp +++ b/batched/dense/impl/KokkosBatched_Vector_SIMD_Relation.hpp @@ -25,13 +25,13 @@ namespace 
KokkosBatched { // vector, vector #undef KOKKOSBATCHED_RELATION_OPERATOR -#define KOKKOSBATCHED_RELATION_OPERATOR(op) \ - template \ - KOKKOS_INLINE_FUNCTION const Vector, l> operator op( \ - const Vector, l> &a, const Vector, l> &b) { \ - Vector, l> r_val; \ - for (int i = 0; i < l; ++i) r_val[i] = a[i] op b[i]; \ - return r_val; \ +#define KOKKOSBATCHED_RELATION_OPERATOR(op) \ + template \ + KOKKOS_INLINE_FUNCTION const Vector, l> operator op(const Vector, l> &a, \ + const Vector, l> &b) { \ + Vector, l> r_val; \ + for (int i = 0; i < l; ++i) r_val[i] = a[i] op b[i]; \ + return r_val; \ } KOKKOSBATCHED_RELATION_OPERATOR(<) @@ -43,13 +43,12 @@ KOKKOSBATCHED_RELATION_OPERATOR(!=) // vector, scalar #undef KOKKOSBATCHED_RELATION_OPERATOR -#define KOKKOSBATCHED_RELATION_OPERATOR(op) \ - template \ - KOKKOS_INLINE_FUNCTION const Vector, l> operator op( \ - const Vector, l> &a, const T2 &b) { \ - Vector, l> r_val; \ - for (int i = 0; i < l; ++i) r_val[i] = a[i] op b; \ - return r_val; \ +#define KOKKOSBATCHED_RELATION_OPERATOR(op) \ + template \ + KOKKOS_INLINE_FUNCTION const Vector, l> operator op(const Vector, l> &a, const T2 &b) { \ + Vector, l> r_val; \ + for (int i = 0; i < l; ++i) r_val[i] = a[i] op b; \ + return r_val; \ } KOKKOSBATCHED_RELATION_OPERATOR(<) @@ -61,13 +60,12 @@ KOKKOSBATCHED_RELATION_OPERATOR(!=) // scalar, vector #undef KOKKOSBATCHED_RELATION_OPERATOR -#define KOKKOSBATCHED_RELATION_OPERATOR(op) \ - template \ - KOKKOS_INLINE_FUNCTION const Vector, l> operator op( \ - const T1 &a, const Vector, l> &b) { \ - Vector, l> r_val; \ - for (int i = 0; i < l; ++i) r_val[i] = a op b[i]; \ - return r_val; \ +#define KOKKOSBATCHED_RELATION_OPERATOR(op) \ + template \ + KOKKOS_INLINE_FUNCTION const Vector, l> operator op(const T1 &a, const Vector, l> &b) { \ + Vector, l> r_val; \ + for (int i = 0; i < l; ++i) r_val[i] = a op b[i]; \ + return r_val; \ } KOKKOSBATCHED_RELATION_OPERATOR(<) diff --git a/batched/dense/impl/KokkosBatched_Vector_SIMD_View.hpp 
b/batched/dense/impl/KokkosBatched_Vector_SIMD_View.hpp index 3fb7ac872b..60e5e43e57 100644 --- a/batched/dense/impl/KokkosBatched_Vector_SIMD_View.hpp +++ b/batched/dense/impl/KokkosBatched_Vector_SIMD_View.hpp @@ -63,52 +63,38 @@ struct SimdViewAccess { } template - KOKKOS_INLINE_FUNCTION constexpr - typename std::enable_if::value, size_t>::type - extent(const iType &r) const { + KOKKOS_INLINE_FUNCTION constexpr typename std::enable_if::value, size_t>::type extent( + const iType &r) const { return _a.extent(r) * (r == PackDim::value ? vector_length : 1); } template - KOKKOS_INLINE_FUNCTION constexpr - typename std::enable_if::value, int>::type - extent_int(const iType &r) const { - return static_cast(_a.extent(r) * - (r == PackDim::value ? vector_length : 1)); + KOKKOS_INLINE_FUNCTION constexpr typename std::enable_if::value, int>::type extent_int( + const iType &r) const { + return static_cast(_a.extent(r) * (r == PackDim::value ? vector_length : 1)); } - KOKKOS_INLINE_FUNCTION constexpr size_t size() const { - return (_a.size() * vector_length); - } + KOKKOS_INLINE_FUNCTION constexpr size_t size() const { return (_a.size() * vector_length); } - KOKKOS_INLINE_FUNCTION constexpr size_t span() const { - return _a.span() * vector_length; - } - KOKKOS_INLINE_FUNCTION constexpr bool span_span_is_contiguous() const { - return _a.span_span_is_contiguous(); - } - KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { - return _a.data(); - } + KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return _a.span() * vector_length; } + KOKKOS_INLINE_FUNCTION constexpr bool span_span_is_contiguous() const { return _a.span_span_is_contiguous(); } + KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { return _a.data(); } /// rank 0 /// this does not make sense as this is flat view to simd view /// rank 1 template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - KokkosKernels::Impl::are_integral_v && 1 == ViewType::rank, - reference_type> - operator()(const I0 
&i0, Args... /*args*/) const { + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t && 1 == ViewType::rank, reference_type> + operator()(const I0 &i0, Args... /*args*/) const { return _a(i0 / vector_length)[i0 % vector_length]; } /// rank 2 template KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t && - 2 == ViewType::rank, - reference_type> + std::enable_if_t && 2 == ViewType::rank, reference_type> operator()(const I0 &i0, const I1 &i1, Args... /*args*/) const { switch (PackDim::value) { case 0: return _a(i0 / vector_length, i1)[i0 % vector_length]; @@ -120,11 +106,9 @@ struct SimdViewAccess { /// rank 3 template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - KokkosKernels::Impl::are_integral_v && - 3 == ViewType::rank, - reference_type> - operator()(const I0 &i0, const I1 &i1, const I2 &i2, Args... /*args*/) const { + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t && 3 == ViewType::rank, reference_type> + operator()(const I0 &i0, const I1 &i1, const I2 &i2, Args... /*args*/) const { switch (PackDim::value) { case 0: return _a(i0 / vector_length, i1, i2)[i0 % vector_length]; case 1: return _a(i0, i1 / vector_length, i2)[i1 % vector_length]; @@ -137,11 +121,8 @@ struct SimdViewAccess { /// rank 4 template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - KokkosKernels::Impl::are_integral_v && - 4 == ViewType::rank, - reference_type> - operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, - Args... /*args*/) const { + KokkosKernels::Impl::are_integral_v && 4 == ViewType::rank, reference_type> + operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, Args... 
/*args*/) const { switch (PackDim::value) { case 0: return _a(i0 / vector_length, i1, i2, i3)[i0 % vector_length]; case 1: return _a(i0, i1 / vector_length, i2, i3)[i1 % vector_length]; @@ -153,14 +134,10 @@ struct SimdViewAccess { } /// rank 5 - template + template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - KokkosKernels::Impl::are_integral_v && - 5 == ViewType::rank, - reference_type> - operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, - const I4 &i4, Args... /*args*/) const { + KokkosKernels::Impl::are_integral_v && 5 == ViewType::rank, reference_type> + operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, const I4 &i4, Args... /*args*/) const { switch (PackDim::value) { case 0: return _a(i0 / vector_length, i1, i2, i3, i4)[i0 % vector_length]; case 1: return _a(i0, i1 / vector_length, i2, i3, i4)[i1 % vector_length]; @@ -173,25 +150,17 @@ struct SimdViewAccess { } /// rank 6 - template + template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - KokkosKernels::Impl::are_integral_v && - 6 == ViewType::rank, - reference_type> - operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, - const I4 &i4, const I5 &i5, Args... /*args*/) const { + KokkosKernels::Impl::are_integral_v && 6 == ViewType::rank, reference_type> + operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, const I4 &i4, const I5 &i5, + Args... 
/*args*/) const { switch (PackDim::value) { - case 0: - return _a(i0 / vector_length, i1, i2, i3, i4, i5)[i0 % vector_length]; - case 1: - return _a(i0, i1 / vector_length, i2, i3, i4, i5)[i1 % vector_length]; - case 2: - return _a(i0, i1, i2 / vector_length, i3, i4, i5)[i2 % vector_length]; - case 3: - return _a(i0, i1, i2, i3 / vector_length, i4, i5)[i3 % vector_length]; - case 4: - return _a(i0, i1, i2, i3, i4 / vector_length, i5)[i4 % vector_length]; + case 0: return _a(i0 / vector_length, i1, i2, i3, i4, i5)[i0 % vector_length]; + case 1: return _a(i0, i1 / vector_length, i2, i3, i4, i5)[i1 % vector_length]; + case 2: return _a(i0, i1, i2 / vector_length, i3, i4, i5)[i2 % vector_length]; + case 3: return _a(i0, i1, i2, i3 / vector_length, i4, i5)[i3 % vector_length]; + case 4: return _a(i0, i1, i2, i3, i4 / vector_length, i5)[i4 % vector_length]; case 5: break; default: break; } @@ -199,35 +168,18 @@ struct SimdViewAccess { } /// rank 7 - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t && - 7 == ViewType::rank, - reference_type> - operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, - const I4 &i4, const I5 &i5, const I6 &i6, - Args... /*args*/) const { + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + KokkosKernels::Impl::are_integral_v && 7 == ViewType::rank, reference_type> + operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, const I4 &i4, const I5 &i5, const I6 &i6, + Args... 
/*args*/) const { switch (PackDim::value) { - case 0: - return _a(i0 / vector_length, i1, i2, i3, i4, i5, - i6)[i0 % vector_length]; - case 1: - return _a(i0, i1 / vector_length, i2, i3, i4, i5, - i6)[i1 % vector_length]; - case 2: - return _a(i0, i1, i2 / vector_length, i3, i4, i5, - i6)[i2 % vector_length]; - case 3: - return _a(i0, i1, i2, i3 / vector_length, i4, i5, - i6)[i3 % vector_length]; - case 4: - return _a(i0, i1, i2, i3, i4 / vector_length, i5, - i6)[i4 % vector_length]; - case 5: - return _a(i0, i1, i2, i3, i4, i5 / vector_length, - i6)[i5 % vector_length]; + case 0: return _a(i0 / vector_length, i1, i2, i3, i4, i5, i6)[i0 % vector_length]; + case 1: return _a(i0, i1 / vector_length, i2, i3, i4, i5, i6)[i1 % vector_length]; + case 2: return _a(i0, i1, i2 / vector_length, i3, i4, i5, i6)[i2 % vector_length]; + case 3: return _a(i0, i1, i2, i3 / vector_length, i4, i5, i6)[i3 % vector_length]; + case 4: return _a(i0, i1, i2, i3, i4 / vector_length, i5, i6)[i4 % vector_length]; + case 5: return _a(i0, i1, i2, i3, i4, i5 / vector_length, i6)[i5 % vector_length]; case 6: break; default: break; } @@ -235,43 +187,25 @@ struct SimdViewAccess { } /// rank 8 - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t && - 8 == ViewType::rank, - reference_type> - operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, - const I4 &i4, const I5 &i5, const I6 &i6, const I7 &i7, - Args... /*args*/) const { + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + KokkosKernels::Impl::are_integral_v && 8 == ViewType::rank, + reference_type> + operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, const I4 &i4, const I5 &i5, const I6 &i6, + const I7 &i7, Args... 
/*args*/) const { switch (PackDim::value) { - case 0: - return _a(i0 / vector_length, i1, i2, i3, i4, i5, i6, - i7)[i0 % vector_length]; - case 1: - return _a(i0, i1 / vector_length, i2, i3, i4, i5, i6, - i7)[i1 % vector_length]; - case 2: - return _a(i0, i1, i2 / vector_length, i3, i4, i5, i6, - i7)[i2 % vector_length]; - case 3: - return _a(i0, i1, i2, i3 / vector_length, i4, i5, i6, - i7)[i3 % vector_length]; - case 4: - return _a(i0, i1, i2, i3, i4 / vector_length, i5, i6, - i7)[i4 % vector_length]; - case 5: - return _a(i0, i1, i2, i3, i4, i5 / vector_length, i6, - i7)[i5 % vector_length]; - case 6: - return _a(i0, i1, i2, i3, i4, i5, i6 / vector_length, - i7)[i6 % vector_length]; + case 0: return _a(i0 / vector_length, i1, i2, i3, i4, i5, i6, i7)[i0 % vector_length]; + case 1: return _a(i0, i1 / vector_length, i2, i3, i4, i5, i6, i7)[i1 % vector_length]; + case 2: return _a(i0, i1, i2 / vector_length, i3, i4, i5, i6, i7)[i2 % vector_length]; + case 3: return _a(i0, i1, i2, i3 / vector_length, i4, i5, i6, i7)[i3 % vector_length]; + case 4: return _a(i0, i1, i2, i3, i4 / vector_length, i5, i6, i7)[i4 % vector_length]; + case 5: return _a(i0, i1, i2, i3, i4, i5 / vector_length, i6, i7)[i5 % vector_length]; + case 6: return _a(i0, i1, i2, i3, i4, i5, i6 / vector_length, i7)[i6 % vector_length]; case 7: break; default: break; } - return _a(i0, i1, i2, i3, i4, i5, i6, - i7 / vector_length)[i7 % vector_length]; + return _a(i0, i1, i2, i3, i4, i5, i6, i7 / vector_length)[i7 % vector_length]; } }; } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_WilkinsonShift_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_WilkinsonShift_Serial_Internal.hpp index 0d3a9b3df9..a23a9ea4d0 100644 --- a/batched/dense/impl/KokkosBatched_WilkinsonShift_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_WilkinsonShift_Serial_Internal.hpp @@ -29,12 +29,10 @@ namespace KokkosBatched { /// struct SerialWilkinsonShiftInternal { template - 
KOKKOS_INLINE_FUNCTION static int invoke( - const ValueType a, const ValueType b, const ValueType c, - const ValueType d, - /* */ Kokkos::complex* lambda1, - /* */ Kokkos::complex* lambda2, - /* */ bool* is_complex) { + KOKKOS_INLINE_FUNCTION static int invoke(const ValueType a, const ValueType b, const ValueType c, const ValueType d, + /* */ Kokkos::complex* lambda1, + /* */ Kokkos::complex* lambda2, + /* */ bool* is_complex) { /// compute eigenvalues of 2x2 system [a b; /// c d] /// when the system has a real complex values, diff --git a/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp b/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp index 52e1425041..988bd30c93 100644 --- a/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp @@ -27,11 +27,9 @@ namespace KokkosBatched { /// ==================== struct SerialXpayInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType alpha, - const ValueType* KOKKOS_RESTRICT X, + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType alpha, const ValueType* KOKKOS_RESTRICT X, const int xs0, - /* */ ValueType* KOKKOS_RESTRICT Y, - const int ys0) { + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif @@ -44,10 +42,9 @@ struct SerialXpayInternal { } template - KOKKOS_INLINE_FUNCTION static int invoke( - const int m, const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, - const ValueType* KOKKOS_RESTRICT X, const int xs0, - /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, + const ValueType* KOKKOS_RESTRICT X, const int xs0, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif @@ -60,17 +57,14 @@ struct SerialXpayInternal { } template - KOKKOS_INLINE_FUNCTION static int invoke( - const 
int m, const int n, const ScalarType* KOKKOS_RESTRICT alpha, - const int alphas0, const ValueType* KOKKOS_RESTRICT X, const int xs0, - const int xs1, - /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1) { + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, const ScalarType* KOKKOS_RESTRICT alpha, + const int alphas0, const ValueType* KOKKOS_RESTRICT X, const int xs0, + const int xs1, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1) { if (xs0 > xs1) - for (int i = 0; i < m; ++i) - invoke(n, alpha[i * alphas0], X + i * xs0, xs1, Y + i * ys0, ys1); + for (int i = 0; i < m; ++i) invoke(n, alpha[i * alphas0], X + i * xs0, xs1, Y + i * ys0, ys1); else - for (int j = 0; j < n; ++j) - invoke(m, alpha, alphas0, X + j * xs1, xs0, Y + j * ys1, ys0); + for (int j = 0; j < n; ++j) invoke(m, alpha, alphas0, X + j * xs1, xs0, Y + j * ys1, ys0); return 0; } @@ -81,12 +75,9 @@ struct SerialXpayInternal { /// ==================== struct TeamXpayInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, - const int m, const ScalarType alpha, - const ValueType* KOKKOS_RESTRICT X, - const int xs0, - /* */ ValueType* KOKKOS_RESTRICT Y, - const int ys0) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const int m, const ScalarType alpha, + const ValueType* KOKKOS_RESTRICT X, const int xs0, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int& i) { Y[i * ys0] *= alpha; Y[i * ys0] += X[i * xs0]; @@ -96,11 +87,10 @@ struct TeamXpayInternal { } template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const int m, - const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, - const ValueType* KOKKOS_RESTRICT X, const int xs0, - /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const int m, + const ScalarType* KOKKOS_RESTRICT 
alpha, const int alphas0, + const ValueType* KOKKOS_RESTRICT X, const int xs0, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int& i) { Y[i * ys0] *= alpha[i * alphas0]; Y[i * ys0] += X[i * xs0]; @@ -110,23 +100,18 @@ struct TeamXpayInternal { } template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const int m, const int n, - const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, - const ValueType* KOKKOS_RESTRICT X, const int xs0, const int xs1, - /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const int m, const int n, + const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, + const ValueType* KOKKOS_RESTRICT X, const int xs0, const int xs1, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1) { if (m > n) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, m), [&](const int& i) { - SerialXpayInternal::invoke(n, alpha[i * alphas0], X + i * xs0, xs1, - Y + i * ys0, ys1); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int& i) { + SerialXpayInternal::invoke(n, alpha[i * alphas0], X + i * xs0, xs1, Y + i * ys0, ys1); + }); } else { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, n), [&](const int& j) { - SerialXpayInternal::invoke(m, alpha, alphas0, X + j * xs1, xs0, - Y + j * ys1, ys0); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int& j) { + SerialXpayInternal::invoke(m, alpha, alphas0, X + j * xs1, xs0, Y + j * ys1, ys0); + }); } // member.team_barrier(); return 0; @@ -138,12 +123,9 @@ struct TeamXpayInternal { /// ======================== struct TeamVectorXpayInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, - const int m, const ScalarType alpha, - const ValueType* KOKKOS_RESTRICT X, - const int xs0, - /* */ ValueType* KOKKOS_RESTRICT Y, 
- const int ys0) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const int m, const ScalarType alpha, + const ValueType* KOKKOS_RESTRICT X, const int xs0, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int& i) { Y[i * ys0] *= alpha; Y[i * ys0] += X[i * xs0]; @@ -153,11 +135,10 @@ struct TeamVectorXpayInternal { } template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const int m, - const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, - const ValueType* KOKKOS_RESTRICT X, const int xs0, - /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const int m, + const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, + const ValueType* KOKKOS_RESTRICT X, const int xs0, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int& i) { Y[i * ys0] *= alpha[i * alphas0]; Y[i * ys0] += X[i * xs0]; @@ -166,20 +147,17 @@ struct TeamVectorXpayInternal { return 0; } - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const int m, const int n, - const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, - const ValueType* KOKKOS_RESTRICT X, const int xs0, const int xs1, - /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1) { - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, m * n), - [&](const int& iTemp) { - int i, j; - getIndices(iTemp, n, m, j, i); - Y[i * ys0 + j * ys1] *= alpha[i * alphas0]; - Y[i * ys0 + j * ys1] += X[i * xs0 + j * xs1]; - }); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const int m, const int n, + const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, + const ValueType* KOKKOS_RESTRICT X, const int xs0, const int xs1, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1) { + 
Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, m * n), [&](const int& iTemp) { + int i, j; + getIndices(iTemp, n, m, j, i); + Y[i * ys0 + j * ys1] *= alpha[i * alphas0]; + Y[i * ys0 + j * ys1] += X[i * xs0 + j * xs1]; + }); // member.team_barrier(); return 0; } @@ -189,18 +167,12 @@ struct TeamVectorXpayInternal { /// Serial Impl /// =========== template -KOKKOS_INLINE_FUNCTION int SerialXpay::invoke(const alphaViewType& alpha, - const ViewType& X, - const ViewType& Y) { +KOKKOS_INLINE_FUNCTION int SerialXpay::invoke(const alphaViewType& alpha, const ViewType& X, const ViewType& Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::xpay: ViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::xpay: alphaViewType is not a Kokkos::View."); - static_assert(ViewType::rank == 2, - "KokkosBatched::xpay: ViewType must have rank 2."); - static_assert(alphaViewType::rank == 1, - "KokkosBatched::xpay: alphaViewType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::xpay: ViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::xpay: alphaViewType is not a Kokkos::View."); + static_assert(ViewType::rank == 2, "KokkosBatched::xpay: ViewType must have rank 2."); + static_assert(alphaViewType::rank == 1, "KokkosBatched::xpay: alphaViewType must have rank 1."); // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { @@ -219,11 +191,10 @@ KOKKOS_INLINE_FUNCTION int SerialXpay::invoke(const alphaViewType& alpha, } #endif - return SerialXpayInternal::template invoke< - typename alphaViewType::non_const_value_type, - typename ViewType::non_const_value_type>( - X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), X.data(), - X.stride_0(), X.stride_1(), Y.data(), Y.stride_0(), Y.stride_1()); + return SerialXpayInternal::template invoke( + X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), X.data(), X.stride_0(), X.stride_1(), Y.data(), + Y.stride_0(), Y.stride_1()); } /// @@ -232,18 +203,13 @@ KOKKOS_INLINE_FUNCTION int SerialXpay::invoke(const alphaViewType& alpha, template template -KOKKOS_INLINE_FUNCTION int TeamXpay::invoke( - const MemberType& member, const alphaViewType& alpha, const ViewType& X, - const ViewType& Y) { +KOKKOS_INLINE_FUNCTION int TeamXpay::invoke(const MemberType& member, const alphaViewType& alpha, + const ViewType& X, const ViewType& Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::xpay: ViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::xpay: alphaViewType is not a Kokkos::View."); - static_assert(ViewType::rank == 2, - "KokkosBatched::xpay: ViewType must have rank 2."); - static_assert(alphaViewType::rank == 1, - "KokkosBatched::xpay: alphaViewType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::xpay: ViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::xpay: alphaViewType is not a Kokkos::View."); + static_assert(ViewType::rank == 2, "KokkosBatched::xpay: ViewType must have rank 2."); + static_assert(alphaViewType::rank == 1, "KokkosBatched::xpay: alphaViewType must have rank 1."); // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { @@ -262,12 +228,10 @@ KOKKOS_INLINE_FUNCTION int TeamXpay::invoke( } #endif - return TeamXpayInternal::template invoke< - MemberType, typename alphaViewType::non_const_value_type, - typename ViewType::non_const_value_type>( - member, X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), - X.data(), X.stride_0(), X.stride_1(), Y.data(), Y.stride_0(), - Y.stride_1()); + return TeamXpayInternal::template invoke( + member, X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), X.data(), X.stride_0(), X.stride_1(), Y.data(), + Y.stride_0(), Y.stride_1()); } /// @@ -276,18 +240,13 @@ KOKKOS_INLINE_FUNCTION int TeamXpay::invoke( template template -KOKKOS_INLINE_FUNCTION int TeamVectorXpay::invoke( - const MemberType& member, const alphaViewType& alpha, const ViewType& X, - const ViewType& Y) { +KOKKOS_INLINE_FUNCTION int TeamVectorXpay::invoke(const MemberType& member, const alphaViewType& alpha, + const ViewType& X, const ViewType& Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::xpay: ViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::xpay: alphaViewType is not a Kokkos::View."); - static_assert(ViewType::rank == 2, - "KokkosBatched::xpay: ViewType must have rank 2."); - static_assert(alphaViewType::rank == 1, - "KokkosBatched::xpay: alphaViewType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::xpay: ViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::xpay: alphaViewType is not a Kokkos::View."); + static_assert(ViewType::rank == 2, "KokkosBatched::xpay: ViewType must have rank 2."); + static_assert(alphaViewType::rank == 1, "KokkosBatched::xpay: alphaViewType must have rank 1."); // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { @@ -306,12 +265,10 @@ KOKKOS_INLINE_FUNCTION int TeamVectorXpay::invoke( } #endif - return TeamVectorXpayInternal::invoke< - MemberType, typename alphaViewType::non_const_value_type, - typename ViewType::non_const_value_type, typename ViewType::array_layout>( - member, X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), - X.data(), X.stride_0(), X.stride_1(), Y.data(), Y.stride_0(), - Y.stride_1()); + return TeamVectorXpayInternal::invoke( + member, X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), X.data(), X.stride_0(), X.stride_1(), Y.data(), + Y.stride_0(), Y.stride_1()); } } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_AddRadial_Decl.hpp b/batched/dense/src/KokkosBatched_AddRadial_Decl.hpp index 6b75a11dc7..7eadc43269 100644 --- a/batched/dense/src/KokkosBatched_AddRadial_Decl.hpp +++ b/batched/dense/src/KokkosBatched_AddRadial_Decl.hpp @@ -34,8 +34,7 @@ namespace KokkosBatched { struct SerialAddRadial { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType tiny, - const AViewType &A); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType tiny, const AViewType &A); }; /// @@ -45,9 +44,7 @@ struct SerialAddRadial { template struct TeamAddRadial { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType tiny, - const AViewType &A); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType tiny, const AViewType &A); }; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_ApplyHouseholder_Decl.hpp b/batched/dense/src/KokkosBatched_ApplyHouseholder_Decl.hpp index 3fe51f3138..bee7d3a645 100644 --- a/batched/dense/src/KokkosBatched_ApplyHouseholder_Decl.hpp +++ b/batched/dense/src/KokkosBatched_ApplyHouseholder_Decl.hpp @@ -29,21 +29,16 @@ namespace KokkosBatched { // level 1 operation template struct SerialApplyHouseholder { - template - 
KOKKOS_INLINE_FUNCTION static int invoke(const uViewType &u2, - const tauViewType &tau, + template + KOKKOS_INLINE_FUNCTION static int invoke(const uViewType &u2, const tauViewType &tau, const AViewType const wViewType &w); }; // level 1 operation template struct TeamVectorApplyHouseholder { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const uViewType &u2, - const tauViewType &tau, + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const uViewType &u2, const tauViewType &tau, const AViewType const wViewType &w); }; diff --git a/batched/dense/src/KokkosBatched_ApplyPivot_Decl.hpp b/batched/dense/src/KokkosBatched_ApplyPivot_Decl.hpp index fb9bef60ae..2aa00bf8c2 100644 --- a/batched/dense/src/KokkosBatched_ApplyPivot_Decl.hpp +++ b/batched/dense/src/KokkosBatched_ApplyPivot_Decl.hpp @@ -28,13 +28,10 @@ namespace KokkosBatched { template struct TeamVectorApplyPivot { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int piv, const AViewType &A); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int piv, const AViewType &A); template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const PivViewType piv, - const AViewType &A); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const PivViewType piv, const AViewType &A); }; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_ApplyQ_Decl.hpp b/batched/dense/src/KokkosBatched_ApplyQ_Decl.hpp index 177c338a98..7f78e31700 100644 --- a/batched/dense/src/KokkosBatched_ApplyQ_Decl.hpp +++ b/batched/dense/src/KokkosBatched_ApplyQ_Decl.hpp @@ -28,11 +28,8 @@ namespace KokkosBatched { template struct SerialApplyQ { - template - KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, - const tViewType &t, - const BViewType &B, + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const tViewType &t, const BViewType &B, 
const wViewType &w); }; @@ -40,56 +37,39 @@ struct SerialApplyQ { /// Team ApplyQ /// -template +template struct TeamApplyQ { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const tViewType &t, - const BViewType &B, - const wViewType &w); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const tViewType &t, + const BViewType &B, const wViewType &w); }; /// /// TeamVector ApplyQ /// -template +template struct TeamVectorApplyQ { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const tViewType &t, - const BViewType &B, - const wViewType &w); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const tViewType &t, + const BViewType &B, const wViewType &w); }; /// /// Selective Interface /// -template +template struct ApplyQ { - template - KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const tViewType &t, - const BViewType &B, - const wViewType &w) { + template + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const tViewType &t, + const BViewType &B, const wViewType &w) { int r_val = 0; if (std::is_same::value) { r_val = SerialApplyQ::invoke(A, t, B, w); } else if (std::is_same::value) { - r_val = TeamApplyQ::invoke( - member, A, t, B, w); + r_val = TeamApplyQ::invoke(member, A, t, B, w); } else if (std::is_same::value) { - r_val = TeamVectorApplyQ::invoke( - member, A, t, B, w); + r_val = TeamVectorApplyQ::invoke(member, A, t, B, w); } return r_val; } diff --git a/batched/dense/src/KokkosBatched_Axpy.hpp b/batched/dense/src/KokkosBatched_Axpy.hpp index b76772f3b2..5b89c0862e 100644 --- a/batched/dense/src/KokkosBatched_Axpy.hpp +++ b/batched/dense/src/KokkosBatched_Axpy.hpp @@ -44,9 +44,7 @@ namespace KokkosBatched { struct SerialAxpy { template - KOKKOS_INLINE_FUNCTION 
static int invoke(const alphaViewType &alpha, - const XViewType &X, - const YViewType &Y); + KOKKOS_INLINE_FUNCTION static int invoke(const alphaViewType &alpha, const XViewType &X, const YViewType &Y); }; /// \brief Team Batched AXPY: @@ -72,9 +70,7 @@ struct SerialAxpy { template struct TeamAxpy { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const alphaViewType &alpha, - const XViewType &X, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const alphaViewType &alpha, const XViewType &X, const YViewType &Y); }; @@ -102,9 +98,7 @@ struct TeamAxpy { template struct TeamVectorAxpy { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const alphaViewType &alpha, - const XViewType &X, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const alphaViewType &alpha, const XViewType &X, const YViewType &Y); }; diff --git a/batched/dense/src/KokkosBatched_Copy_Decl.hpp b/batched/dense/src/KokkosBatched_Copy_Decl.hpp index 07e6ea42da..0e2b24e91d 100644 --- a/batched/dense/src/KokkosBatched_Copy_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Copy_Decl.hpp @@ -29,46 +29,36 @@ namespace KokkosBatched { template struct SerialCopy { template - KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, - const BViewType &B); + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const BViewType &B); }; /// /// Team Copy /// -template +template struct TeamCopy { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const BViewType &B); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B); }; /// /// TeamVector Copy /// -template +template struct TeamVectorCopy { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const BViewType &B); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, 
const BViewType &B); }; /// /// Selective Interface /// -template +template struct Copy { template - KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const BViewType &B) { + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B) { int r_val = 0; if (std::is_same::value) { r_val = SerialCopy::invoke(A, B); @@ -85,29 +75,23 @@ struct Copy { #include "KokkosBatched_Copy_Impl.hpp" -#define KOKKOSBATCHED_SERIAL_COPY_MATRIX_NO_TRANSPOSE_INTERNAL_INVOKE( \ - M, N, A, AS0, AS1, B, BS0, BS1) \ +#define KOKKOSBATCHED_SERIAL_COPY_MATRIX_NO_TRANSPOSE_INTERNAL_INVOKE(M, N, A, AS0, AS1, B, BS0, BS1) \ KokkosBatched::SerialCopyInternal ::invoke(M, N, A, AS0, AS1, B, BS0, BS1) -#define KOKKOSBATCHED_TEAM_COPY_MATRIX_NO_TRANSPOSE_INTERNAL_INVOKE( \ - MEMBER, M, N, A, AS0, AS1, B, BS0, BS1) \ - KokkosBatched::TeamCopyInternal ::invoke(MEMBER, M, N, A, AS0, AS1, B, BS0, \ - BS1) +#define KOKKOSBATCHED_TEAM_COPY_MATRIX_NO_TRANSPOSE_INTERNAL_INVOKE(MEMBER, M, N, A, AS0, AS1, B, BS0, BS1) \ + KokkosBatched::TeamCopyInternal ::invoke(MEMBER, M, N, A, AS0, AS1, B, BS0, BS1) #define KOKKOSBATCHED_SERIAL_COPY_VECTOR_INTERNAL_INVOKE(M, A, AS, B, BS) \ KokkosBatched::SerialCopyInternal ::invoke(M, A, AS, B, BS) -#define KOKKOSBATCHED_TEAM_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE( \ - MEMBER, M, A, AS, B, BS) \ +#define KOKKOSBATCHED_TEAM_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(MEMBER, M, A, AS, B, BS) \ KokkosBatched::TeamCopyInternal ::invoke(MEMBER, M, A, AS, B, BS) -#define KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE( \ - MODETYPE, MEMBER, M, A, AS, B, BS) \ - if (std::is_same::value) { \ - KOKKOSBATCHED_SERIAL_COPY_VECTOR_INTERNAL_INVOKE(M, A, AS, B, BS); \ - } else if (std::is_same::value) { \ - KOKKOSBATCHED_TEAM_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(MEMBER, M, A, \ - AS, B, BS); \ +#define KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(MODETYPE, MEMBER, M, A, 
AS, B, BS) \ + if (std::is_same::value) { \ + KOKKOSBATCHED_SERIAL_COPY_VECTOR_INTERNAL_INVOKE(M, A, AS, B, BS); \ + } else if (std::is_same::value) { \ + KOKKOSBATCHED_TEAM_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(MEMBER, M, A, AS, B, BS); \ } #endif diff --git a/batched/dense/src/KokkosBatched_Dot.hpp b/batched/dense/src/KokkosBatched_Dot.hpp index c04914e220..545a4954ce 100644 --- a/batched/dense/src/KokkosBatched_Dot.hpp +++ b/batched/dense/src/KokkosBatched_Dot.hpp @@ -52,9 +52,7 @@ namespace KokkosBatched { template struct SerialDot { template - KOKKOS_INLINE_FUNCTION static int invoke(const XViewType &X, - const YViewType &Y, - const NormViewType &dot); + KOKKOS_INLINE_FUNCTION static int invoke(const XViewType &X, const YViewType &Y, const NormViewType &dot); }; /// \brief Team Batched DOT: @@ -86,9 +84,7 @@ struct SerialDot { template struct TeamDot { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const XViewType &X, - const YViewType &Y, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const XViewType &X, const YViewType &Y, const NormViewType &dot); }; @@ -122,9 +118,7 @@ struct TeamDot { template struct TeamVectorDot { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const XViewType &X, - const YViewType &Y, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const XViewType &X, const YViewType &Y, const NormViewType &dot); }; diff --git a/batched/dense/src/KokkosBatched_Eigendecomposition_Decl.hpp b/batched/dense/src/KokkosBatched_Eigendecomposition_Decl.hpp index 4ba24d519b..39ead9e26c 100644 --- a/batched/dense/src/KokkosBatched_Eigendecomposition_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Eigendecomposition_Decl.hpp @@ -49,21 +49,17 @@ namespace KokkosBatched { /// dimension of matrix A. 
struct SerialEigendecomposition { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const AViewType &A, const EViewType &er, const EViewType &ei, - const UViewType &UL, const UViewType &UR, const WViewType &W); + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const EViewType &er, const EViewType &ei, + const UViewType &UL, const UViewType &UR, const WViewType &W); }; template struct TeamVectorEigendecomposition { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const AViewType &A, const EViewType &er, - const EViewType &ei, const UViewType &UL, const UViewType &UR, - const WViewType &W); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const EViewType &er, + const EViewType &ei, const UViewType &UL, const UViewType &UR, + const WViewType &W); }; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_Gemm_Decl.hpp b/batched/dense/src/KokkosBatched_Gemm_Decl.hpp index 1febcaa771..9f4b745561 100644 --- a/batched/dense/src/KokkosBatched_Gemm_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Gemm_Decl.hpp @@ -25,61 +25,46 @@ namespace KokkosBatched { template struct SerialGemm { - template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B, - const ScalarType beta, - const CViewType &C); + template + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B, + const ScalarType beta, const CViewType &C); }; /// /// Team Gemm /// -template +template struct TeamGemm { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, const CViewType &C); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, const CViewType 
&C); }; /// /// TeamVector Gemm /// -template +template struct TeamVectorGemm { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, const CViewType &C); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, const CViewType &C); }; /// /// Selective Interface /// -template +template struct Gemm { - template - KOKKOS_FORCEINLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, const CViewType &C) { + template + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, const CViewType &C) { int r_val = 0; if (std::is_same::value) { - r_val = SerialGemm::invoke(alpha, A, B, - beta, C); + r_val = SerialGemm::invoke(alpha, A, B, beta, C); } else if (std::is_same::value) { - r_val = TeamGemm::invoke( - member, alpha, A, B, beta, C); + r_val = TeamGemm::invoke(member, alpha, A, B, beta, C); } return r_val; } diff --git a/batched/dense/src/KokkosBatched_Gemv_Decl.hpp b/batched/dense/src/KokkosBatched_Gemv_Decl.hpp index 825efa9dc5..9ab86d9e07 100644 --- a/batched/dense/src/KokkosBatched_Gemv_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Gemv_Decl.hpp @@ -29,13 +29,9 @@ namespace KokkosBatched { template struct SerialGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType /*alpha*/, - const AViewType & /*A*/, - const xViewType & /*x*/, - const ScalarType /*beta*/, - const yViewType & /*y*/) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType /*alpha*/, const AViewType & /*A*/, const xViewType & /*x*/, + const ScalarType /*beta*/, const yViewType & /*y*/) { Kokkos::abort( "Error: KokkosBatched::SerialGemv 
has been deprecated - use " "KokkosBlas::SerialGemv instead"); @@ -49,13 +45,9 @@ struct SerialGemv { template struct TeamGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, - const ScalarType /*alpha*/, - const AViewType & /*A*/, - const xViewType & /*x*/, - const ScalarType /*beta*/, + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, const ScalarType /*alpha*/, + const AViewType & /*A*/, const xViewType & /*x*/, const ScalarType /*beta*/, const yViewType & /*y*/) { assert(false && "Error: encounter dummy impl"); return 0; @@ -68,13 +60,9 @@ struct TeamGemv { template struct TeamVectorGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, - const ScalarType /*alpha*/, - const AViewType & /*A*/, - const xViewType & /*x*/, - const ScalarType /*beta*/, + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, const ScalarType /*alpha*/, + const AViewType & /*A*/, const xViewType & /*x*/, const ScalarType /*beta*/, const yViewType & /*y*/) { assert(false && "Error: encounter dummy impl"); return 0; @@ -84,23 +72,18 @@ struct TeamVectorGemv { /// /// Selective Interface /// -template +template struct Gemv { - template - KOKKOS_FORCEINLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const xViewType &x, const ScalarType beta, const yViewType &y) { + template + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const xViewType &x, const ScalarType beta, const yViewType &y) { int r_val = 0; if (std::is_same::value) { r_val = SerialGemv::invoke(alpha, A, x, beta, y); } else if (std::is_same::value) { - r_val = TeamGemv::invoke(member, alpha, A, - x, beta, y); + r_val = TeamGemv::invoke(member, alpha, A, x, beta, y); } else if (std::is_same::value) { - r_val = TeamVectorGemv::invoke( - member, alpha, A, x, beta, y); 
+ r_val = TeamVectorGemv::invoke(member, alpha, A, x, beta, y); } return r_val; } @@ -112,44 +95,35 @@ struct Gemv { #include "KokkosBatched_Gemv_TeamVector_Impl.hpp" #include "KokkosBlas2_serial_gemv_internal.hpp" -#define KOKKOSBATCHED_SERIAL_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS) \ - KokkosBlas::Impl::SerialGemvInternal::invoke( \ - M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS) - -#define KOKKOSBATCHED_SERIAL_GEMV_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS) \ - KokkosBlas::Impl::SerialGemvInternal::invoke( \ - N, M, ALPHA, A, AS1, AS0, X, XS, BETA, Y, YS) - -#define KOKKOSBATCHED_TEAM_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS) \ - KokkosBlas::Impl::TeamGemvInternal::invoke( \ - MEMBER, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS) - -#define KOKKOSBATCHED_TEAM_GEMV_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS) \ - KokkosBlas::Impl::TeamGemvInternal::invoke( \ - MEMBER, N, M, ALPHA, A, AS1, AS0, X, XS, BETA, Y, YS) - -#define KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE( \ - MODETYPE, ALGOTYPE, MEMBER, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS) \ - if (std::is_same::value) { \ - KOKKOSBATCHED_SERIAL_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS); \ - } else if (std::is_same::value) { \ - KOKKOSBATCHED_TEAM_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS); \ +#define KOKKOSBATCHED_SERIAL_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS) \ + KokkosBlas::Impl::SerialGemvInternal::invoke(M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS) + +#define KOKKOSBATCHED_SERIAL_GEMV_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS) \ + KokkosBlas::Impl::SerialGemvInternal::invoke(N, M, ALPHA, A, AS1, 
AS0, X, XS, BETA, Y, YS) + +#define KOKKOSBATCHED_TEAM_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, \ + Y, YS) \ + KokkosBlas::Impl::TeamGemvInternal::invoke(MEMBER, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS) + +#define KOKKOSBATCHED_TEAM_GEMV_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, \ + YS) \ + KokkosBlas::Impl::TeamGemvInternal::invoke(MEMBER, N, M, ALPHA, A, AS1, AS0, X, XS, BETA, Y, YS) + +#define KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(MODETYPE, ALGOTYPE, MEMBER, M, N, ALPHA, A, AS0, AS1, X, XS, \ + BETA, Y, YS) \ + if (std::is_same::value) { \ + KOKKOSBATCHED_SERIAL_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS); \ + } else if (std::is_same::value) { \ + KOKKOSBATCHED_TEAM_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, \ + YS); \ } -#define KOKKOSBATCHED_GEMV_TRANSPOSE_INTERNAL_INVOKE( \ - MODETYPE, ALGOTYPE, MEMBER, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS) \ - if (std::is_same::value) { \ - KOKKOSBATCHED_SERIAL_GEMV_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS); \ - } else if (std::is_same::value) { \ - KOKKOSBATCHED_TEAM_GEMV_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS); \ +#define KOKKOSBATCHED_GEMV_TRANSPOSE_INTERNAL_INVOKE(MODETYPE, ALGOTYPE, MEMBER, M, N, ALPHA, A, AS0, AS1, X, XS, \ + BETA, Y, YS) \ + if (std::is_same::value) { \ + KOKKOSBATCHED_SERIAL_GEMV_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS); \ + } else if (std::is_same::value) { \ + KOKKOSBATCHED_TEAM_GEMV_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS); \ } #endif diff --git a/batched/dense/src/KokkosBatched_Gesv.hpp b/batched/dense/src/KokkosBatched_Gesv.hpp index c4821db459..77922e4da0 100644 --- 
a/batched/dense/src/KokkosBatched_Gesv.hpp +++ b/batched/dense/src/KokkosBatched_Gesv.hpp @@ -64,15 +64,12 @@ struct Gesv { template struct SerialGesv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType A, - const XVectorType X, - const YVectorType Y, + KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType A, const XVectorType X, const YVectorType Y, const MatrixType tmp); template - [[deprecated]] KOKKOS_INLINE_FUNCTION static int invoke( - const MatrixType A, const VectorType X, const VectorType Y, - const MatrixType tmp) { + [[deprecated]] KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType A, const VectorType X, const VectorType Y, + const MatrixType tmp) { return invoke(A, X, Y, tmp); } }; @@ -109,9 +106,7 @@ struct SerialGesv { template struct TeamGesv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const MatrixType A, - const VectorType X, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const MatrixType A, const VectorType X, const VectorType Y); }; @@ -148,9 +143,7 @@ struct TeamGesv { template struct TeamVectorGesv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const MatrixType A, - const VectorType X, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const MatrixType A, const VectorType X, const VectorType Y); }; diff --git a/batched/dense/src/KokkosBatched_HadamardProduct.hpp b/batched/dense/src/KokkosBatched_HadamardProduct.hpp index fadd4b5774..f21aa8bae2 100644 --- a/batched/dense/src/KokkosBatched_HadamardProduct.hpp +++ b/batched/dense/src/KokkosBatched_HadamardProduct.hpp @@ -42,9 +42,7 @@ namespace KokkosBatched { struct SerialHadamardProduct { template - KOKKOS_INLINE_FUNCTION static int invoke(const XViewType &X, - const YViewType &Y, - const VViewType &V); + KOKKOS_INLINE_FUNCTION static int invoke(const XViewType &X, const YViewType &Y, const VViewType &V); }; /// \brief Team Batched Hadamard Product: @@ 
-68,9 +66,7 @@ struct SerialHadamardProduct { template struct TeamHadamardProduct { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const XViewType &X, - const YViewType &Y, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const XViewType &X, const YViewType &Y, const VViewType &V); }; @@ -96,31 +92,22 @@ struct TeamHadamardProduct { template struct TeamVectorHadamardProduct { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const XViewType &X, - const YViewType &Y, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const XViewType &X, const YViewType &Y, const VViewType &V); }; template struct HadamardProduct { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const XViewType &X, - const YViewType &Y, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const XViewType &X, const YViewType &Y, const VViewType &V) { int r_val = 0; if (std::is_same::value) { - r_val = SerialHadamardProduct::template invoke(X, Y, V); + r_val = SerialHadamardProduct::template invoke(X, Y, V); } else if (std::is_same::value) { - r_val = - TeamHadamardProduct::template invoke(member, X, - Y, V); + r_val = TeamHadamardProduct::template invoke(member, X, Y, V); } else if (std::is_same::value) { - r_val = TeamVectorHadamardProduct::template invoke< - XViewType, YViewType, VViewType>(member, X, Y, V); + r_val = TeamVectorHadamardProduct::template invoke(member, X, Y, V); } return r_val; } diff --git a/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp b/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp index 4725e0220d..0741b5b41e 100644 --- a/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp +++ b/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp @@ -82,32 +82,23 @@ namespace KokkosBatched { /// BatchedGemm(handle, alpha, A, B, beta, C); // clang-format on -template -inline int BatchedGemm(BatchedGemmHandleType *const handle, - const 
ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, - const CViewType &C) { +template +inline int BatchedGemm(BatchedGemmHandleType *const handle, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, const CViewType &C) { // Minimize the number of ImplBatchedGemmWrapper instantiations, by // standardizing on particular View specializations for its template // parameters. - using UnifiedAVT = Kokkos::View< - typename AViewType::value_type ***, typename AViewType::array_layout, - typename AViewType::device_type, Kokkos::MemoryTraits>; - using UnifiedBVT = Kokkos::View< - typename BViewType::value_type ***, typename BViewType::array_layout, - typename BViewType::device_type, Kokkos::MemoryTraits>; - using UnifiedCVT = Kokkos::View>; + using UnifiedAVT = Kokkos::View>; + using UnifiedBVT = Kokkos::View>; + using UnifiedCVT = Kokkos::View>; // Go through specialization layer in case ETI'd symbols are available. - return Impl::BatchedGemmSpec::run(handle, alpha, A, B, - beta, C); + return Impl::BatchedGemmSpec::run(handle, alpha, A, B, beta, C); } } // namespace KokkosBatched #endif // __KOKKOSBATCHED_HOSTLEVEL_GEMM_DECL_HPP__ diff --git a/batched/dense/src/KokkosBatched_HostLevel_Gemm_Handle.hpp b/batched/dense/src/KokkosBatched_HostLevel_Gemm_Handle.hpp index 95e8f36bc2..2aa6f47cb0 100644 --- a/batched/dense/src/KokkosBatched_HostLevel_Gemm_Handle.hpp +++ b/batched/dense/src/KokkosBatched_HostLevel_Gemm_Handle.hpp @@ -40,15 +40,11 @@ enum GEMM_KOKKOS_BATCHED_ALGOS : int { }; } -#define GEMM_ALGO_STRS \ - "GemmTplAlgos::CUBLAS", "GemmTplAlgos::MAGMA", \ - "GemmKokkosBatchedAlgos::KK_TEAM", \ - "GemmKokkosBatchedAlgos::KK_TEAMVECTOR", \ - "GemmKokkosBatchedAlgos::KK_SERIALSIMD", \ - "GemmKokkosBatchedAlgos::KK_TEAMSIMD", \ - "GemmKokkosBatchedAlgos::KK_SERIAL_RANK0", \ - "GemmKokkosBatchedAlgos::KK_SERIAL_SHMEM", \ - "GemmKokkosBatchedAlgos::KK_DBLBUF" +#define GEMM_ALGO_STRS \ + "GemmTplAlgos::CUBLAS", 
"GemmTplAlgos::MAGMA", "GemmKokkosBatchedAlgos::KK_TEAM", \ + "GemmKokkosBatchedAlgos::KK_TEAMVECTOR", "GemmKokkosBatchedAlgos::KK_SERIALSIMD", \ + "GemmKokkosBatchedAlgos::KK_TEAMSIMD", "GemmKokkosBatchedAlgos::KK_SERIAL_RANK0", \ + "GemmKokkosBatchedAlgos::KK_SERIAL_SHMEM", "GemmKokkosBatchedAlgos::KK_DBLBUF" // clang-format off /// \brief Handle for selecting runtime behavior of the BatchedGemm interface. /// @@ -96,8 +92,7 @@ enum GEMM_KOKKOS_BATCHED_ALGOS : int { // clang-format on class BatchedGemmHandle : public BatchedKernelHandle { public: - BatchedGemmHandle(int kernelAlgoType = BaseHeuristicAlgos::SQUARE, - int teamSize = 0, int vecLength = 0) + BatchedGemmHandle(int kernelAlgoType = BaseHeuristicAlgos::SQUARE, int teamSize = 0, int vecLength = 0) : BatchedKernelHandle(kernelAlgoType, teamSize, vecLength) { #if defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) if (!_tplParamsSet && kernelAlgoType == GemmTplAlgos::CUBLAS) { @@ -116,26 +111,23 @@ class BatchedGemmHandle : public BatchedKernelHandle { #endif // MAGMA }; - BatchedGemmHandle(bool tplParamsSet, - int kernelAlgoType = BaseHeuristicAlgos::SQUARE, - int teamSize = 0, int vecLength = 0) + BatchedGemmHandle(bool tplParamsSet, int kernelAlgoType = BaseHeuristicAlgos::SQUARE, int teamSize = 0, + int vecLength = 0) : BatchedKernelHandle(kernelAlgoType, teamSize, vecLength) { _tplParamsSet = tplParamsSet; }; #if defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) - BatchedGemmHandle(cublasHandle_t &cublas_handle, - int kernelAlgoType = BaseHeuristicAlgos::SQUARE, - int teamSize = 0, int vecLength = 0) + BatchedGemmHandle(cublasHandle_t &cublas_handle, int kernelAlgoType = BaseHeuristicAlgos::SQUARE, int teamSize = 0, + int vecLength = 0) : BatchedGemmHandle(true, kernelAlgoType, teamSize, vecLength) { _tplParamsSingleton.cublas_handle = &cublas_handle; }; #endif // CUBLAS #if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) - BatchedGemmHandle(magma_queue_t &magma_queue, - int kernelAlgoType = BaseHeuristicAlgos::SQUARE, - int 
teamSize = 0, int vecLength = 0) + BatchedGemmHandle(magma_queue_t &magma_queue, int kernelAlgoType = BaseHeuristicAlgos::SQUARE, int teamSize = 0, + int vecLength = 0) : BatchedGemmHandle(true, kernelAlgoType, teamSize, vecLength) { _tplParamsSingleton.magma_queue = &magma_queue; }; @@ -151,13 +143,10 @@ class BatchedGemmHandle : public BatchedKernelHandle { #endif } - std::string get_kernel_algo_type_str() const { - return gemm_algo_type_strs[_kernelAlgoType]; - } + std::string get_kernel_algo_type_str() const { return gemm_algo_type_strs[_kernelAlgoType]; } private: - const char *gemm_algo_type_strs[GemmKokkosBatchedAlgos::N] = {BASE_ALGO_STRS, - GEMM_ALGO_STRS}; + const char *gemm_algo_type_strs[GemmKokkosBatchedAlgos::N] = {BASE_ALGO_STRS, GEMM_ALGO_STRS}; }; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_Householder_Decl.hpp b/batched/dense/src/KokkosBatched_Householder_Decl.hpp index 6d749bd73a..0a48457551 100644 --- a/batched/dense/src/KokkosBatched_Householder_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Householder_Decl.hpp @@ -30,8 +30,7 @@ namespace KokkosBatched { template struct SerialHouseholder { template - KOKKOS_INLINE_FUNCTION static int invoke(const aViewType &a, - const tauViewType &tau); + KOKKOS_INLINE_FUNCTION static int invoke(const aViewType &a, const tauViewType &tau); }; /// @@ -42,9 +41,7 @@ struct SerialHouseholder { template struct TeamVectorHouseholder { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const aViewType &a, - const tauViewType &tau); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const aViewType &a, const tauViewType &tau); }; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_InnerGemmFixA_Decl.hpp b/batched/dense/src/KokkosBatched_InnerGemmFixA_Decl.hpp index 90f2cdb643..757a92ca21 100644 --- a/batched/dense/src/KokkosBatched_InnerGemmFixA_Decl.hpp +++ b/batched/dense/src/KokkosBatched_InnerGemmFixA_Decl.hpp @@ 
-25,25 +25,19 @@ struct InnerGemmFixA { const int _as0, _as1, _bs0, _bs1, _cs0, _cs1; KOKKOS_INLINE_FUNCTION - InnerGemmFixA(const int as0, const int as1, const int bs0, const int bs1, - const int cs0, const int cs1) + InnerGemmFixA(const int as0, const int as1, const int bs0, const int bs1, const int cs0, const int cs1) : _as0(as0), _as1(as1), _bs0(bs0), _bs1(bs1), _cs0(cs0), _cs1(cs1) {} // serial rank update template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, - const int n, + KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, /**/ ValueType *KOKKOS_RESTRICT C); // serial rank update for remainder template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, - const int m, const int n, - const int k, + KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, /**/ ValueType *KOKKOS_RESTRICT C); }; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_InnerGemmFixB_Decl.hpp b/batched/dense/src/KokkosBatched_InnerGemmFixB_Decl.hpp index 67d968a356..b2f885970f 100644 --- a/batched/dense/src/KokkosBatched_InnerGemmFixB_Decl.hpp +++ b/batched/dense/src/KokkosBatched_InnerGemmFixB_Decl.hpp @@ -25,25 +25,19 @@ struct InnerGemmFixB { const int _as0, _as1, _bs0, _bs1, _cs0, _cs1; KOKKOS_INLINE_FUNCTION - InnerGemmFixA(const int as0, const int as1, const int bs0, const int bs1, - const int cs0, const int cs1) + InnerGemmFixA(const int as0, const int as1, const int bs0, const int bs1, const int cs0, const int cs1) : _as0(as0), _as1(as1), _bs0(bs0), _bs1(bs1), _cs0(cs0), _cs1(cs1) {} // serial rank update template - KOKKOS_INLINE_FUNCTION int 
serial_invoke(const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, - const int n, + KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, /**/ ValueType *KOKKOS_RESTRICT C); // serial rank update for remainder template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, - const int m, const int n, - const int k, + KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, /**/ ValueType *KOKKOS_RESTRICT C); }; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_InnerGemmFixC_Decl.hpp b/batched/dense/src/KokkosBatched_InnerGemmFixC_Decl.hpp index 64d00845ee..c61d966f77 100644 --- a/batched/dense/src/KokkosBatched_InnerGemmFixC_Decl.hpp +++ b/batched/dense/src/KokkosBatched_InnerGemmFixC_Decl.hpp @@ -25,49 +25,37 @@ struct InnerGemmFixC { const int _as0, _as1, _bs0, _bs1, _cs0, _cs1; KOKKOS_INLINE_FUNCTION - InnerGemmFixC(const int as0, const int as1, const int bs0, const int bs1, - const int cs0, const int cs1) + InnerGemmFixC(const int as0, const int as1, const int bs0, const int bs1, const int cs0, const int cs1) : _as0(as0), _as1(as1), _bs0(bs0), _bs1(bs1), _cs0(cs0), _cs1(cs1) {} // serial rank update template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, - const int k, + KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, /**/ ValueType *KOKKOS_RESTRICT C); // serial rank update for remainder template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, - const ValueType 
*KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, - const int m, const int k, + KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, const int k, /**/ ValueType *KOKKOS_RESTRICT C); // serial rank update for remainder template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, - const int m, const int n, - const int k, + KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, /**/ ValueType *KOKKOS_RESTRICT C); template - KOKKOS_INLINE_FUNCTION int team_invoke(const MemberType &member, - const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, + KOKKOS_INLINE_FUNCTION int team_invoke(const MemberType &member, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const ValueType *KOKKOS_RESTRICT B, const int k, /**/ ValueType *KOKKOS_RESTRICT C); // team rank update for remainder template - KOKKOS_INLINE_FUNCTION int team_invoke(const MemberType &member, - const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, + KOKKOS_INLINE_FUNCTION int team_invoke(const MemberType &member, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, /**/ ValueType *KOKKOS_RESTRICT C); }; diff --git a/batched/dense/src/KokkosBatched_InnerLU_Decl.hpp b/batched/dense/src/KokkosBatched_InnerLU_Decl.hpp index d0d50a146c..c355185b74 100644 --- a/batched/dense/src/KokkosBatched_InnerLU_Decl.hpp +++ b/batched/dense/src/KokkosBatched_InnerLU_Decl.hpp @@ -33,13 +33,11 @@ struct InnerLU { // for remainder square template - KOKKOS_INLINE_FUNCTION int serial_invoke(const int m, - ValueType *KOKKOS_RESTRICT 
A); + KOKKOS_INLINE_FUNCTION int serial_invoke(const int m, ValueType *KOKKOS_RESTRICT A); // for remainder template - KOKKOS_INLINE_FUNCTION int serial_invoke(const int m, const int n, - ValueType *KOKKOS_RESTRICT A); + KOKKOS_INLINE_FUNCTION int serial_invoke(const int m, const int n, ValueType *KOKKOS_RESTRICT A); }; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_InnerTrsm_Decl.hpp b/batched/dense/src/KokkosBatched_InnerTrsm_Decl.hpp index 22395c9201..5b5b9bb147 100644 --- a/batched/dense/src/KokkosBatched_InnerTrsm_Decl.hpp +++ b/batched/dense/src/KokkosBatched_InnerTrsm_Decl.hpp @@ -27,20 +27,17 @@ struct InnerTrsmLeftLowerUnitDiag { const int _as0, _as1, _bs0, _bs1; KOKKOS_INLINE_FUNCTION - InnerTrsmLeftLowerUnitDiag(const int as0, const int as1, const int bs0, - const int bs1) + InnerTrsmLeftLowerUnitDiag(const int as0, const int as1, const int bs0, const int bs1) : _as0(as0), _as1(as1), _bs0(bs0), _bs1(bs1) {} // trisolve template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ValueType *KOKKOS_RESTRICT A, - const int n, + KOKKOS_INLINE_FUNCTION int serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int n, /**/ ValueType *KOKKOS_RESTRICT B); // for remainder template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ValueType *KOKKOS_RESTRICT A, - const int m, const int n, + KOKKOS_INLINE_FUNCTION int serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int m, const int n, /**/ ValueType *KOKKOS_RESTRICT B); }; @@ -51,20 +48,17 @@ struct InnerTrsmLeftLowerNonUnitDiag { const int _as0, _as1, _bs0, _bs1; KOKKOS_INLINE_FUNCTION - InnerTrsmLeftLowerNonUnitDiag(const int as0, const int as1, const int bs0, - const int bs1) + InnerTrsmLeftLowerNonUnitDiag(const int as0, const int as1, const int bs0, const int bs1) : _as0(as0), _as1(as1), _bs0(bs0), _bs1(bs1) {} // trisolve template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ValueType *KOKKOS_RESTRICT A, - const int n, + KOKKOS_INLINE_FUNCTION int serial_invoke(const 
ValueType *KOKKOS_RESTRICT A, const int n, /**/ ValueType *KOKKOS_RESTRICT B); // for remainder template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ValueType *KOKKOS_RESTRICT A, - const int m, const int n, + KOKKOS_INLINE_FUNCTION int serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int m, const int n, /**/ ValueType *KOKKOS_RESTRICT B); }; @@ -75,20 +69,17 @@ struct InnerTrsmLeftUpperUnitDiag { const int _as0, _as1, _bs0, _bs1; KOKKOS_INLINE_FUNCTION - InnerTrsmLeftUpperUnitDiag(const int as0, const int as1, const int bs0, - const int bs1) + InnerTrsmLeftUpperUnitDiag(const int as0, const int as1, const int bs0, const int bs1) : _as0(as0), _as1(as1), _bs0(bs0), _bs1(bs1) {} // trisolve template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ValueType *KOKKOS_RESTRICT A, - const int n, + KOKKOS_INLINE_FUNCTION int serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int n, /**/ ValueType *KOKKOS_RESTRICT B); // for remainder template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ValueType *KOKKOS_RESTRICT A, - const int m, const int n, + KOKKOS_INLINE_FUNCTION int serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int m, const int n, /**/ ValueType *KOKKOS_RESTRICT B); }; @@ -99,20 +90,17 @@ struct InnerTrsmLeftUpperNonUnitDiag { const int _as0, _as1, _bs0, _bs1; KOKKOS_INLINE_FUNCTION - InnerTrsmLeftUpperNonUnitDiag(const int as0, const int as1, const int bs0, - const int bs1) + InnerTrsmLeftUpperNonUnitDiag(const int as0, const int as1, const int bs0, const int bs1) : _as0(as0), _as1(as1), _bs0(bs0), _bs1(bs1) {} // trisolve template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ValueType *KOKKOS_RESTRICT A, - const int n, + KOKKOS_INLINE_FUNCTION int serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int n, /**/ ValueType *KOKKOS_RESTRICT B); // for remainder template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ValueType *KOKKOS_RESTRICT A, - const int m, const int n, + KOKKOS_INLINE_FUNCTION int serial_invoke(const 
ValueType *KOKKOS_RESTRICT A, const int m, const int n, /**/ ValueType *KOKKOS_RESTRICT B); }; diff --git a/batched/dense/src/KokkosBatched_InverseLU_Decl.hpp b/batched/dense/src/KokkosBatched_InverseLU_Decl.hpp index e28a0151ed..930bc790b0 100644 --- a/batched/dense/src/KokkosBatched_InverseLU_Decl.hpp +++ b/batched/dense/src/KokkosBatched_InverseLU_Decl.hpp @@ -30,12 +30,10 @@ namespace KokkosBatched { template struct SerialInverseLU { template - KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, - const wViewType &w) { + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const wViewType &w) { typedef typename wViewType::value_type value_type; // workspace w is always 1D view; reinterpret it - Kokkos::View W( - w.data(), A.extent(0), A.extent(1)); + Kokkos::View W(w.data(), A.extent(0), A.extent(1)); int r_val[3] = {}; r_val[0] = SerialCopy::invoke(A, W); @@ -48,19 +46,15 @@ struct SerialInverseLU { template struct TeamInverseLU { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const wViewType &w) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const wViewType &w) { typedef typename wViewType::value_type value_type; // workspace w is always 1D view; reinterpret it - Kokkos::View W( - w.data(), A.extent(0), A.extent(1)); + Kokkos::View W(w.data(), A.extent(0), A.extent(1)); int r_val[3] = {}; - r_val[0] = TeamCopy::invoke(member, A, W); - r_val[1] = TeamSetIdentity::invoke(member, A); - r_val[2] = TeamSolveLU::invoke( - member, W, A); + r_val[0] = TeamCopy::invoke(member, A, W); + r_val[1] = TeamSetIdentity::invoke(member, A); + r_val[2] = TeamSolveLU::invoke(member, W, A); return r_val[0] + r_val[1] + r_val[2]; } }; diff --git a/batched/dense/src/KokkosBatched_Kernel_Handle.hpp b/batched/dense/src/KokkosBatched_Kernel_Handle.hpp index 051f78979d..bd73b4e267 100644 --- a/batched/dense/src/KokkosBatched_Kernel_Handle.hpp +++ 
b/batched/dense/src/KokkosBatched_Kernel_Handle.hpp @@ -56,10 +56,9 @@ enum BASE_KOKKOS_BATCHED_ALGOS : int { KK_SERIAL = BaseTplAlgos::N, N }; } #define N_BASE_ALGOS BaseKokkosBatchedAlgos::N -#define BASE_ALGO_STRS \ - "BaseHeuristicAlgos::SQUARE", "BaseHeuristicAlgos::TALL", \ - "BaseHeuristicAlgos::WIDE", "BaseTplAlgos::ARMPL", "BaseTplAlgosMKL", \ - "BaseKokkosBatchedAlgos::KK_SERIAL" +#define BASE_ALGO_STRS \ + "BaseHeuristicAlgos::SQUARE", "BaseHeuristicAlgos::TALL", "BaseHeuristicAlgos::WIDE", "BaseTplAlgos::ARMPL", \ + "BaseTplAlgosMKL", "BaseKokkosBatchedAlgos::KK_SERIAL" /// \brief TplParams abstracts underlying handle or execution queue type. struct TplParams { @@ -145,8 +144,7 @@ class BatchedKernelHandle { int vecLen = 0; bool enableDebug = false; - BatchedKernelHandle(int kernelAlgoType = BaseHeuristicAlgos::SQUARE, - int teamSize = 0, int vecLength = 0) + BatchedKernelHandle(int kernelAlgoType = BaseHeuristicAlgos::SQUARE, int teamSize = 0, int vecLength = 0) : teamSz(teamSize), vecLen(vecLength), _kernelAlgoType(kernelAlgoType) { #if !defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) || ARMPL_BUILD < 1058 if (_kernelAlgoType == BaseTplAlgos::ARMPL) { @@ -161,9 +159,7 @@ class BatchedKernelHandle { int get_kernel_algo_type() const { return _kernelAlgoType; } - std::string get_kernel_algo_type_str() const { - return algo_type_strs[_kernelAlgoType]; - } + std::string get_kernel_algo_type_str() const { return algo_type_strs[_kernelAlgoType]; } decltype(auto) get_tpl_params() const { #if _kernelAlgoType == ARMPL && defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) diff --git a/batched/dense/src/KokkosBatched_LU_Decl.hpp b/batched/dense/src/KokkosBatched_LU_Decl.hpp index fcba6e20f8..363193c147 100644 --- a/batched/dense/src/KokkosBatched_LU_Decl.hpp +++ b/batched/dense/src/KokkosBatched_LU_Decl.hpp @@ -28,9 +28,7 @@ struct SerialLU { // no piv version template KOKKOS_INLINE_FUNCTION static int invoke( - const AViewType &A, - const typename MagnitudeScalarType< - typename 
AViewType::non_const_value_type>::type tiny = 0); + const AViewType &A, const typename MagnitudeScalarType::type tiny = 0); }; template @@ -39,8 +37,7 @@ struct TeamLU { template KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, const AViewType &A, - const typename MagnitudeScalarType< - typename AViewType::non_const_value_type>::type tiny = 0); + const typename MagnitudeScalarType::type tiny = 0); }; /// @@ -52,8 +49,7 @@ struct LU { template KOKKOS_FORCEINLINE_FUNCTION static int invoke( const MemberType &member, const AViewType &A, - const typename MagnitudeScalarType< - typename AViewType::non_const_value_type>::type tiny = 0) { + const typename MagnitudeScalarType::type tiny = 0) { int r_val = 0; if (std::is_same::value) { r_val = SerialLU::invoke(A, tiny); diff --git a/batched/dense/src/KokkosBatched_Pttrf.hpp b/batched/dense/src/KokkosBatched_Pttrf.hpp index 4fcc944dc8..787e5aeee3 100644 --- a/batched/dense/src/KokkosBatched_Pttrf.hpp +++ b/batched/dense/src/KokkosBatched_Pttrf.hpp @@ -41,8 +41,7 @@ namespace KokkosBatched { template struct SerialPttrf { template - KOKKOS_INLINE_FUNCTION static int invoke(const DViewType &d, - const EViewType &e); + KOKKOS_INLINE_FUNCTION static int invoke(const DViewType &d, const EViewType &e); }; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_QR_Decl.hpp b/batched/dense/src/KokkosBatched_QR_Decl.hpp index 993e9345fb..78bdcd4d4b 100644 --- a/batched/dense/src/KokkosBatched_QR_Decl.hpp +++ b/batched/dense/src/KokkosBatched_QR_Decl.hpp @@ -29,9 +29,7 @@ namespace KokkosBatched { template struct SerialQR { template - KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, - const tViewType &t, - const wViewType &w); + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const tViewType &t, const wViewType &w); }; /// @@ -41,10 +39,8 @@ struct SerialQR { template struct TeamQR { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, - const 
AViewType & /*A*/, - const tViewType & /*t*/, - const wViewType & /*w*/) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, const AViewType & /*A*/, + const tViewType & /*t*/, const wViewType & /*w*/) { /// not implemented return -1; } @@ -57,9 +53,7 @@ struct TeamQR { template struct TeamVectorQR { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const tViewType &t, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const tViewType &t, const wViewType &w); }; @@ -69,9 +63,7 @@ struct TeamVectorQR { template struct QR { template - KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const tViewType &t, + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const tViewType &t, const wViewType &w) { int r_val = 0; if (std::is_same::value) { diff --git a/batched/dense/src/KokkosBatched_QR_WithColumnPivoting_Decl.hpp b/batched/dense/src/KokkosBatched_QR_WithColumnPivoting_Decl.hpp index 134a97ed73..b08e5277a0 100644 --- a/batched/dense/src/KokkosBatched_QR_WithColumnPivoting_Decl.hpp +++ b/batched/dense/src/KokkosBatched_QR_WithColumnPivoting_Decl.hpp @@ -28,13 +28,9 @@ namespace KokkosBatched { template struct TeamVectorQR_WithColumnPivoting { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const tViewType &t, - const pViewType &p, - const wViewType &w, + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const tViewType &t, + const pViewType &p, const wViewType &w, /* */ int &matrix_rank); }; diff --git a/batched/dense/src/KokkosBatched_SVD_Decl.hpp b/batched/dense/src/KokkosBatched_SVD_Decl.hpp index e84008cb69..efade8029b 100644 --- a/batched/dense/src/KokkosBatched_SVD_Decl.hpp +++ b/batched/dense/src/KokkosBatched_SVD_Decl.hpp @@ -56,20 +56,16 @@ struct SVD_S_Tag {}; 
struct SerialSVD { // Version to compute full factorization: A == U * diag(s) * Vt - template + template KOKKOS_INLINE_FUNCTION static int invoke( - SVD_USV_Tag, const AViewType &A, const UViewType &U, const SViewType &s, - const VtViewType &Vt, const WViewType &W, - typename AViewType::const_value_type tol = - Kokkos::ArithTraits::zero()); + SVD_USV_Tag, const AViewType &A, const UViewType &U, const SViewType &s, const VtViewType &Vt, const WViewType &W, + typename AViewType::const_value_type tol = Kokkos::ArithTraits::zero()); // Version which computes only singular values template KOKKOS_INLINE_FUNCTION static int invoke( SVD_S_Tag, const AViewType &A, const SViewType &s, const WViewType &W, - typename AViewType::const_value_type tol = - Kokkos::ArithTraits::zero()); + typename AViewType::const_value_type tol = Kokkos::ArithTraits::zero()); }; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_Scale_Decl.hpp b/batched/dense/src/KokkosBatched_Scale_Decl.hpp index dbb9a43ffb..94453a5ede 100644 --- a/batched/dense/src/KokkosBatched_Scale_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Scale_Decl.hpp @@ -26,49 +26,45 @@ namespace KokkosBatched { /// Serial Scale /// -struct [[deprecated]] SerialScale{ - template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A){Kokkos::abort( +struct [[deprecated]] SerialScale { + template + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A) { + Kokkos::abort( "KokkosBatched::SerialScale is deprecated: use KokkosBlas::SerialScale " "instead"); -return 0; -} // namespace KokkosBatched -} -; + return 0; + } // namespace KokkosBatched +}; /// /// Team Scale /// template -struct [[deprecated]] TeamScale{ - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A){Kokkos::abort( +struct [[deprecated]] TeamScale { + template + KOKKOS_INLINE_FUNCTION static int invoke(const 
MemberType &member, const ScalarType alpha, const AViewType &A) { + Kokkos::abort( "KokkosBatched::TeamScale is deprecated: use KokkosBlas::TeamScale " "instead"); -return 0; -} -} -; + return 0; + } +}; /// /// TeamVector Scale /// template -struct [[deprecated]] TeamVectorScale{ - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A){ - Kokkos::abort("KokkosBatched::TeamVectorScale is deprecated: use " - "KokkosBlas::TeamVectorScale instead"); -return 0; -} -} -; +struct [[deprecated]] TeamVectorScale { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A) { + Kokkos::abort( + "KokkosBatched::TeamVectorScale is deprecated: use " + "KokkosBlas::TeamVectorScale instead"); + return 0; + } +}; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_SetIdentity_Decl.hpp b/batched/dense/src/KokkosBatched_SetIdentity_Decl.hpp index b78d3e7b05..27c2b22ed7 100644 --- a/batched/dense/src/KokkosBatched_SetIdentity_Decl.hpp +++ b/batched/dense/src/KokkosBatched_SetIdentity_Decl.hpp @@ -39,8 +39,7 @@ struct SerialSetIdentity { template struct TeamSetIdentity { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A); }; /// @@ -49,8 +48,7 @@ struct TeamSetIdentity { template struct SetIdentity { template - KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A) { + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A) { int r_val = 0; if (std::is_same::value) { r_val = SerialSetIdentity::invoke(A); diff --git a/batched/dense/src/KokkosBatched_Set_Decl.hpp b/batched/dense/src/KokkosBatched_Set_Decl.hpp index ebddb72a4a..d33d186275 100644 --- a/batched/dense/src/KokkosBatched_Set_Decl.hpp +++ 
b/batched/dense/src/KokkosBatched_Set_Decl.hpp @@ -25,49 +25,45 @@ namespace KokkosBatched { /// Serial Set /// -struct [[deprecated]] SerialSet{ - template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A){Kokkos::abort( +struct [[deprecated]] SerialSet { + template + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A) { + Kokkos::abort( "KokkosBatched::SerialSet is deprecated: use KokkosBlas::SerialSet " "instead"); -return 0; -} // namespace KokkosBatched -} -; + return 0; + } // namespace KokkosBatched +}; /// /// Team Set /// template -struct [[deprecated]] TeamSet{ - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A){Kokkos::abort( +struct [[deprecated]] TeamSet { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A) { + Kokkos::abort( "KokkosBatched::TeamSet is deprecated: use KokkosBlas::TeamSet " "instead"); -return 0; -} -} -; + return 0; + } +}; /// /// TeamVector Set /// template -struct [[deprecated]] TeamVectorSet{ - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A){ - Kokkos::abort("KokkosBatched::TeamVectorSet is deprecated: use " - "KokkosBlas::TeamVectorSet instead"); -return 0; -} -} -; +struct [[deprecated]] TeamVectorSet { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A) { + Kokkos::abort( + "KokkosBatched::TeamVectorSet is deprecated: use " + "KokkosBlas::TeamVectorSet instead"); + return 0; + } +}; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_SolveLU_Decl.hpp b/batched/dense/src/KokkosBatched_SolveLU_Decl.hpp index 8e731e2666..119f5c6916 100644 --- a/batched/dense/src/KokkosBatched_SolveLU_Decl.hpp +++ b/batched/dense/src/KokkosBatched_SolveLU_Decl.hpp 
@@ -30,25 +30,19 @@ template struct SerialSolveLU { // no piv version template - KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const BViewType &B) { int r_val[2] = {}; const typename AViewType::non_const_value_type one(1.0); if (std::is_same::value) { // First, compute Y (= U*X) by solving the system L*Y = B for Y - r_val[0] = SerialTrsm::invoke(one, A, B); + r_val[0] = SerialTrsm::invoke(one, A, B); // Second, compute X by solving the system U*X = Y for X - r_val[1] = SerialTrsm::invoke(one, A, B); - } else if (std::is_same::value || - std::is_same::value) { + r_val[1] = SerialTrsm::invoke(one, A, B); + } else if (std::is_same::value || std::is_same::value) { // First, compute Y (= L'*X) by solving the system U'*Y = B for Y - r_val[0] = SerialTrsm::invoke(one, A, B); + r_val[0] = SerialTrsm::invoke(one, A, B); // Second, compute X by solving the system L'*X = Y for X - r_val[1] = SerialTrsm::invoke(one, A, B); + r_val[1] = SerialTrsm::invoke(one, A, B); } return r_val[0] + r_val[1]; } @@ -58,26 +52,23 @@ template struct TeamSolveLU { // no piv version template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B) { int r_val[2] = {}; const typename AViewType::non_const_value_type one(1.0); if (std::is_same::value) { // First, compute Y (= U*X) by solving the system L*Y = B for Y - r_val[0] = TeamTrsm::invoke(member, one, A, B); + r_val[0] = + TeamTrsm::invoke(member, one, A, B); // Second, compute X by solving the system U*X = Y for X - r_val[1] = TeamTrsm::invoke(member, one, A, B); - } else if (std::is_same::value || - std::is_same::value) { + r_val[1] = + TeamTrsm::invoke(member, one, A, B); + } else if (std::is_same::value || std::is_same::value) { // First, compute Y (= L'*X) by solving the 
system U'*Y = B for Y - r_val[0] = TeamTrsm::invoke(member, one, A, B); + r_val[0] = + TeamTrsm::invoke(member, one, A, B); // Second, compute X by solving the system L'*X = Y for X - r_val[1] = TeamTrsm::invoke(member, one, A, B); + r_val[1] = + TeamTrsm::invoke(member, one, A, B); } return r_val[0] + r_val[1]; } @@ -86,14 +77,11 @@ struct TeamSolveLU { /// /// Selective Interface /// -template +template struct SolveLU { // no piv version template - KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const BViewType &B) { + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B) { int r_val = 0; if (std::is_same::value) { r_val = SerialSolveLU::invoke(A, B); diff --git a/batched/dense/src/KokkosBatched_SolveUTV_Decl.hpp b/batched/dense/src/KokkosBatched_SolveUTV_Decl.hpp index e55836de6c..c881a0b0f7 100644 --- a/batched/dense/src/KokkosBatched_SolveUTV_Decl.hpp +++ b/batched/dense/src/KokkosBatched_SolveUTV_Decl.hpp @@ -46,13 +46,11 @@ namespace KokkosBatched { template struct TeamVectorSolveUTV { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int matrix_rank, const UViewType &U, - const TViewType &T, const VViewType &V, const pViewType &p, - const XViewType &X, const BViewType &B, const wViewType &w); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int matrix_rank, const UViewType &U, + const TViewType &T, const VViewType &V, const pViewType &p, + const XViewType &X, const BViewType &B, const wViewType &w); }; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_Tbsv.hpp b/batched/dense/src/KokkosBatched_Tbsv.hpp index 7510c07969..f7d700be44 100644 --- a/batched/dense/src/KokkosBatched_Tbsv.hpp +++ b/batched/dense/src/KokkosBatched_Tbsv.hpp @@ -41,12 +41,10 @@ namespace KokkosBatched { /// No nested parallel_for is used inside of the function. 
/// -template +template struct SerialTbsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, - const XViewType &X, const int k); + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const XViewType &X, const int k); }; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_Trmm_Decl.hpp b/batched/dense/src/KokkosBatched_Trmm_Decl.hpp index 81d1f8d073..c284ed63b2 100644 --- a/batched/dense/src/KokkosBatched_Trmm_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Trmm_Decl.hpp @@ -22,13 +22,10 @@ namespace KokkosBatched { -template +template struct SerialTrmm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B); }; } // namespace KokkosBatched #endif // __KOKKOSBATCHED_TRMM_DECL_HPP__ diff --git a/batched/dense/src/KokkosBatched_Trsm_Decl.hpp b/batched/dense/src/KokkosBatched_Trsm_Decl.hpp index e0aee4659f..d2220953cc 100644 --- a/batched/dense/src/KokkosBatched_Trsm_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Trsm_Decl.hpp @@ -23,54 +23,42 @@ namespace KokkosBatched { -template +template struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B); }; -template +template struct TeamTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B); }; -template +template struct TeamVectorTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int 
invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B); }; /// /// Selective Interface /// -template +template struct Trsm { template - KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { int r_val = 0; if (std::is_same::value) { - r_val = SerialTrsm::invoke( - alpha, A, B); + r_val = SerialTrsm::invoke(alpha, A, B); } else if (std::is_same::value) { - r_val = TeamTrsm::invoke(member, alpha, A, B); + r_val = TeamTrsm::invoke(member, alpha, A, B); } return r_val; } diff --git a/batched/dense/src/KokkosBatched_Trsv_Decl.hpp b/batched/dense/src/KokkosBatched_Trsv_Decl.hpp index ed9f5cca26..e3da43a95d 100644 --- a/batched/dense/src/KokkosBatched_Trsv_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Trsv_Decl.hpp @@ -27,12 +27,10 @@ namespace KokkosBatched { /// Serial Trsv /// -template +template struct SerialTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType /*alpha*/, - const AViewType & /*A*/, + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType /*alpha*/, const AViewType & /*A*/, const bViewType & /*b*/) { assert(false && "Error: encounter dummy impl"); return 0; @@ -43,14 +41,11 @@ struct SerialTrsv { /// Team Trsv /// -template +template struct TeamTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, - const ScalarType /*alpha*/, - const AViewType & /*A*/, - const bViewType & /*b*/) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, const ScalarType /*alpha*/, + const AViewType & /*A*/, const bViewType & /*b*/) { assert(false && "Error: encounter dummy impl"); return 0; } @@ -60,14 +55,11 @@ struct TeamTrsv { /// TeamVector Trsv /// -template +template struct TeamVectorTrsv { template - KOKKOS_INLINE_FUNCTION static int 
invoke(const MemberType & /*member*/, - const ScalarType /*alpha*/, - const AViewType & /*A*/, - const bViewType & /*b*/) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, const ScalarType /*alpha*/, + const AViewType & /*A*/, const bViewType & /*b*/) { assert(false && "Error: encounter dummy impl"); return 0; } @@ -76,24 +68,19 @@ struct TeamVectorTrsv { /// /// Selective Interface /// -template +template struct Trsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const bViewType &b) { int r_val = 0; if (std::is_same::value) { - r_val = - SerialTrsv::invoke(alpha, A, b); + r_val = SerialTrsv::invoke(alpha, A, b); } else if (std::is_same::value) { - r_val = TeamTrsv::invoke( - member, alpha, A, b); + r_val = TeamTrsv::invoke(member, alpha, A, b); } else if (std::is_same::value) { - r_val = TeamVectorTrsv::invoke(member, alpha, A, b); + r_val = TeamVectorTrsv::invoke(member, alpha, A, b); } return r_val; } @@ -105,116 +92,98 @@ struct Trsv { #include "KokkosBatched_Trsv_Team_Impl.hpp" #include "KokkosBatched_Trsv_TeamVector_Impl.hpp" -#define KOKKOSBATCHED_SERIAL_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - KokkosBatched::SerialTrsvInternalLower::invoke( \ - DIAG::use_unit_diag, M, ALPHA, A, AS0, AS1, B, BS) - -#define KOKKOSBATCHED_SERIAL_TRSV_LOWER_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - KokkosBatched::SerialTrsvInternalUpper::invoke( \ - DIAG::use_unit_diag, N, ALPHA, A, AS1, AS0, B, BS) - -#define KOKKOSBATCHED_SERIAL_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - KokkosBatched::SerialTrsvInternalUpper::invoke( \ - DIAG::use_unit_diag, M, ALPHA, A, AS0, AS1, B, BS) - -#define 
KOKKOSBATCHED_SERIAL_TRSV_UPPER_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - KokkosBatched::SerialTrsvInternalLower::invoke( \ - DIAG::use_unit_diag, N, ALPHA, A, AS1, AS0, B, BS) - -#define KOKKOSBATCHED_TEAM_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - KokkosBatched::TeamTrsvInternalLower::invoke( \ - MEMBER, DIAG::use_unit_diag, M, ALPHA, A, AS0, AS1, B, BS) - -#define KOKKOSBATCHED_TEAM_TRSV_LOWER_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - KokkosBatched::TeamTrsvInternalUpper::invoke( \ - MEMBER, DIAG::use_unit_diag, N, ALPHA, A, AS1, AS0, B, BS) - -#define KOKKOSBATCHED_TEAM_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - KokkosBatched::TeamTrsvInternalUpper::invoke( \ - MEMBER, DIAG::use_unit_diag, M, ALPHA, A, AS0, AS1, B, BS) - -#define KOKKOSBATCHED_TEAM_TRSV_UPPER_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - KokkosBatched::TeamTrsvInternalLower::invoke( \ - MEMBER, DIAG::use_unit_diag, N, ALPHA, A, AS1, AS0, B, BS) - -#define KOKKOSBATCHED_TEAMVECTOR_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - KokkosBatched::TeamVectorTrsvInternalLower::invoke( \ - MEMBER, DIAG::use_unit_diag, M, ALPHA, A, AS0, AS1, B, BS) - -#define KOKKOSBATCHED_TEAMVECTOR_TRSV_LOWER_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - KokkosBatched::TeamVectorTrsvInternalUpper::invoke( \ - MEMBER, DIAG::use_unit_diag, N, ALPHA, A, AS1, AS0, B, BS) - -#define KOKKOSBATCHED_TEAMVECTOR_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - KokkosBatched::TeamVectorTrsvInternalUpper::invoke( \ - MEMBER, DIAG::use_unit_diag, M, ALPHA, A, AS0, AS1, B, BS) - -#define 
KOKKOSBATCHED_TEAMVECTOR_TRSV_UPPER_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - KokkosBatched::TeamVectorTrsvInternalLower::invoke( \ - MEMBER, DIAG::use_unit_diag, N, ALPHA, A, AS1, AS0, B, BS) - -#define KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE( \ - MODETYPE, ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - if (std::is_same::value) { \ - KOKKOSBATCHED_SERIAL_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ - } else if (std::is_same::value) { \ - KOKKOSBATCHED_TEAM_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ - } else if (std::is_same::value) { \ - KOKKOSBATCHED_TEAMVECTOR_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ +#define KOKKOSBATCHED_SERIAL_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ + KokkosBatched::SerialTrsvInternalLower::invoke(DIAG::use_unit_diag, M, ALPHA, A, AS0, AS1, B, BS) + +#define KOKKOSBATCHED_SERIAL_TRSV_LOWER_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ + KokkosBatched::SerialTrsvInternalUpper::invoke(DIAG::use_unit_diag, N, ALPHA, A, AS1, AS0, B, BS) + +#define KOKKOSBATCHED_SERIAL_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ + KokkosBatched::SerialTrsvInternalUpper::invoke(DIAG::use_unit_diag, M, ALPHA, A, AS0, AS1, B, BS) + +#define KOKKOSBATCHED_SERIAL_TRSV_UPPER_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ + KokkosBatched::SerialTrsvInternalLower::invoke(DIAG::use_unit_diag, N, ALPHA, A, AS1, AS0, B, BS) + +#define KOKKOSBATCHED_TEAM_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, \ + B, BS) \ + KokkosBatched::TeamTrsvInternalLower::invoke(MEMBER, DIAG::use_unit_diag, M, ALPHA, A, AS0, AS1, B, BS) + 
+#define KOKKOSBATCHED_TEAM_TRSV_LOWER_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, \ + BS) \ + KokkosBatched::TeamTrsvInternalUpper::invoke(MEMBER, DIAG::use_unit_diag, N, ALPHA, A, AS1, AS0, B, BS) + +#define KOKKOSBATCHED_TEAM_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, \ + B, BS) \ + KokkosBatched::TeamTrsvInternalUpper::invoke(MEMBER, DIAG::use_unit_diag, M, ALPHA, A, AS0, AS1, B, BS) + +#define KOKKOSBATCHED_TEAM_TRSV_UPPER_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, \ + BS) \ + KokkosBatched::TeamTrsvInternalLower::invoke(MEMBER, DIAG::use_unit_diag, N, ALPHA, A, AS1, AS0, B, BS) + +#define KOKKOSBATCHED_TEAMVECTOR_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, \ + AS1, B, BS) \ + KokkosBatched::TeamVectorTrsvInternalLower::invoke(MEMBER, DIAG::use_unit_diag, M, ALPHA, A, AS0, AS1, B, \ + BS) + +#define KOKKOSBATCHED_TEAMVECTOR_TRSV_LOWER_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, \ + AS1, B, BS) \ + KokkosBatched::TeamVectorTrsvInternalUpper::invoke(MEMBER, DIAG::use_unit_diag, N, ALPHA, A, AS1, AS0, B, \ + BS) + +#define KOKKOSBATCHED_TEAMVECTOR_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, \ + AS1, B, BS) \ + KokkosBatched::TeamVectorTrsvInternalUpper::invoke(MEMBER, DIAG::use_unit_diag, M, ALPHA, A, AS0, AS1, B, \ + BS) + +#define KOKKOSBATCHED_TEAMVECTOR_TRSV_UPPER_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, \ + AS1, B, BS) \ + KokkosBatched::TeamVectorTrsvInternalLower::invoke(MEMBER, DIAG::use_unit_diag, N, ALPHA, A, AS1, AS0, B, \ + BS) + +#define KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(MODETYPE, ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, \ + AS1, B, BS) \ + if (std::is_same::value) { \ + KOKKOSBATCHED_SERIAL_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, 
BS); \ + } else if (std::is_same::value) { \ + KOKKOSBATCHED_TEAM_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, \ + BS); \ + } else if (std::is_same::value) { \ + KOKKOSBATCHED_TEAMVECTOR_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, \ + B, BS); \ } -#define KOKKOSBATCHED_TRSV_LOWER_TRANSPOSE_INTERNAL_INVOKE( \ - MODETYPE, ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - if (std::is_same::value) { \ - KOKKOSBATCHED_SERIAL_TRSV_LOWER_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ - } else if (std::is_same::value) { \ - KOKKOSBATCHED_TEAM_TRSV_LOWER_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ - } else if (std::is_same::value) { \ - KOKKOSBATCHED_TEAMVECTOR_TRSV_LOWER_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ +#define KOKKOSBATCHED_TRSV_LOWER_TRANSPOSE_INTERNAL_INVOKE(MODETYPE, ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, \ + B, BS) \ + if (std::is_same::value) { \ + KOKKOSBATCHED_SERIAL_TRSV_LOWER_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ + } else if (std::is_same::value) { \ + KOKKOSBATCHED_TEAM_TRSV_LOWER_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ + } else if (std::is_same::value) { \ + KOKKOSBATCHED_TEAMVECTOR_TRSV_LOWER_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, \ + BS); \ } -#define KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE( \ - MODETYPE, ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - if (std::is_same::value) { \ - KOKKOSBATCHED_SERIAL_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ - } else if (std::is_same::value) { \ - KOKKOSBATCHED_TEAM_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, 
BS); \ - } else if (std::is_same::value) { \ - KOKKOSBATCHED_TEAMVECTOR_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ +#define KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(MODETYPE, ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, \ + AS1, B, BS) \ + if (std::is_same::value) { \ + KOKKOSBATCHED_SERIAL_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ + } else if (std::is_same::value) { \ + KOKKOSBATCHED_TEAM_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, \ + BS); \ + } else if (std::is_same::value) { \ + KOKKOSBATCHED_TEAMVECTOR_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, \ + B, BS); \ } -#define KOKKOSBATCHED_TRSV_UPPER_TRANSPOSE_INTERNAL_INVOKE( \ - MODETYPE, ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - if (std::is_same::value) { \ - KOKKOSBATCHED_SERIAL_TRSV_UPPER_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ - } else if (std::is_same::value) { \ - KOKKOSBATCHED_TEAM_TRSV_UPPER_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ - } else if (std::is_same::value) { \ - KOKKOSBATCHED_TEAMVECTOR_TRSV_UPPER_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ +#define KOKKOSBATCHED_TRSV_UPPER_TRANSPOSE_INTERNAL_INVOKE(MODETYPE, ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, \ + B, BS) \ + if (std::is_same::value) { \ + KOKKOSBATCHED_SERIAL_TRSV_UPPER_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ + } else if (std::is_same::value) { \ + KOKKOSBATCHED_TEAM_TRSV_UPPER_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ + } else if (std::is_same::value) { \ + KOKKOSBATCHED_TEAMVECTOR_TRSV_UPPER_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, \ + BS); 
\ } #endif diff --git a/batched/dense/src/KokkosBatched_UTV_Decl.hpp b/batched/dense/src/KokkosBatched_UTV_Decl.hpp index 792236a14f..bae2780e10 100644 --- a/batched/dense/src/KokkosBatched_UTV_Decl.hpp +++ b/batched/dense/src/KokkosBatched_UTV_Decl.hpp @@ -57,12 +57,10 @@ namespace KokkosBatched { template struct TeamVectorUTV { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const AViewType &A, const pViewType &p, - const UViewType &U, const VViewType &V, const wViewType &w, - int &matrix_rank); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const pViewType &p, + const UViewType &U, const VViewType &V, const wViewType &w, + int &matrix_rank); }; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_Vector.hpp b/batched/dense/src/KokkosBatched_Vector.hpp index 71d159cb03..e44af7bc04 100644 --- a/batched/dense/src/KokkosBatched_Vector.hpp +++ b/batched/dense/src/KokkosBatched_Vector.hpp @@ -143,9 +143,7 @@ struct DefaultInternalVectorLength { }; template struct DefaultInternalVectorLength { - enum : int { - value = DefaultVectorLength::value - }; + enum : int { value = DefaultVectorLength::value }; }; #if defined(KOKKOS_ENABLE_CUDA) @@ -174,13 +172,11 @@ struct DefaultInternalVectorLength { enum : int { value = 2 }; }; template <> -struct DefaultInternalVectorLength, - Kokkos::CudaUVMSpace> { +struct DefaultInternalVectorLength, Kokkos::CudaUVMSpace> { enum : int { value = 2 }; }; template <> -struct DefaultInternalVectorLength, - Kokkos::CudaUVMSpace> { +struct DefaultInternalVectorLength, Kokkos::CudaUVMSpace> { enum : int { value = 1 }; }; #endif @@ -256,18 +252,12 @@ class ArithTraits, l>> { typedef typename ArithTraits::val_type val_scalar_type; typedef typename ArithTraits::mag_type mag_scalar_type; - typedef KokkosBatched::Vector, l> - val_type; - typedef KokkosBatched::Vector, l> - mag_type; + typedef KokkosBatched::Vector, l> val_type; + typedef 
KokkosBatched::Vector, l> mag_type; - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type &val) { - return val; - } + static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type &val) { return val; } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type &val) { - return val; - } + static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type &val) { return val; } static KOKKOS_FORCEINLINE_FUNCTION val_type abs(const val_type &val) { using KAT = ArithTraits; @@ -286,17 +276,13 @@ class ArithTraits, l>> { }; template -class ArithTraits< - KokkosBatched::Vector>, l>> { +class ArithTraits>, l>> { public: typedef typename ArithTraits::val_type val_scalar_type; typedef typename ArithTraits::mag_type mag_scalar_type; - typedef KokkosBatched::Vector< - KokkosBatched::SIMD>, l> - val_type; - typedef KokkosBatched::Vector, l> - mag_type; + typedef KokkosBatched::Vector>, l> val_type; + typedef KokkosBatched::Vector, l> mag_type; static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type &val) { mag_type r_val; diff --git a/batched/dense/src/KokkosBatched_Vector_SIMD.hpp b/batched/dense/src/KokkosBatched_Vector_SIMD.hpp index 753904dbb9..52a73deda4 100644 --- a/batched/dense/src/KokkosBatched_Vector_SIMD.hpp +++ b/batched/dense/src/KokkosBatched_Vector_SIMD.hpp @@ -63,8 +63,7 @@ class Vector, l> { for (int i = 0; i < vector_length; ++i) _data[i] = val; } template - KOKKOS_INLINE_FUNCTION Vector( - const Vector, vector_length> &b) { + KOKKOS_INLINE_FUNCTION Vector(const Vector, vector_length> &b) { KOKKOSKERNELS_FORCE_SIMD for (int i = 0; i < vector_length; ++i) _data[i] = b[i]; } @@ -140,8 +139,7 @@ class Vector, 2> { } template - KOKKOS_INLINE_FUNCTION Vector( - const Vector, vector_length> &b) { + KOKKOS_INLINE_FUNCTION Vector(const Vector, vector_length> &b) { _data.x = b[0]; _data.y = b[1]; } @@ -183,9 +181,7 @@ class Vector, 2> { } KOKKOS_INLINE_FUNCTION - value_type &operator[](const int &i) const { - return 
reinterpret_cast(&_data)[i]; - } + value_type &operator[](const int &i) const { return reinterpret_cast(&_data)[i]; } }; template <> @@ -232,8 +228,7 @@ class Vector, 2> { } template - KOKKOS_INLINE_FUNCTION Vector( - const Vector, vector_length> &b) { + KOKKOS_INLINE_FUNCTION Vector(const Vector, vector_length> &b) { _data.x = b[0]; _data.y = b[1]; } @@ -275,9 +270,7 @@ class Vector, 2> { } KOKKOS_INLINE_FUNCTION - value_type &operator[](const int &i) const { - return reinterpret_cast(&_data)[i]; - } + value_type &operator[](const int &i) const { return reinterpret_cast(&_data)[i]; } }; template <> @@ -334,8 +327,7 @@ class Vector, 4> { } template - KOKKOS_INLINE_FUNCTION Vector( - const Vector, vector_length> &b) { + KOKKOS_INLINE_FUNCTION Vector(const Vector, vector_length> &b) { _data.x = b[0]; _data.y = b[1]; _data.z = b[2]; @@ -389,9 +381,7 @@ class Vector, 4> { } KOKKOS_INLINE_FUNCTION - value_type &operator[](const int &i) const { - return reinterpret_cast(&_data)[i]; - } + value_type &operator[](const int &i) const { return reinterpret_cast(&_data)[i]; } }; template <> @@ -448,8 +438,7 @@ class Vector, 4> { } template - KOKKOS_INLINE_FUNCTION Vector( - const Vector, vector_length> &b) { + KOKKOS_INLINE_FUNCTION Vector(const Vector, vector_length> &b) { _data.x = b[0]; _data.y = b[1]; _data.z = b[2]; @@ -503,9 +492,7 @@ class Vector, 4> { } KOKKOS_INLINE_FUNCTION - value_type &operator[](const int &i) const { - return reinterpret_cast(&_data)[i]; - } + value_type &operator[](const int &i) const { return reinterpret_cast(&_data)[i]; } }; } // namespace KokkosBatched @@ -580,13 +567,9 @@ class Vector, 4> { inline void storeAligned(value_type *p) const { _mm256_store_pd(p, _data); } - inline void storeUnaligned(value_type *p) const { - _mm256_storeu_pd(p, _data); - } + inline void storeUnaligned(value_type *p) const { _mm256_storeu_pd(p, _data); } - inline value_type &operator[](const int &i) const { - return reinterpret_cast(&_data)[i]; - } + inline 
value_type &operator[](const int &i) const { return reinterpret_cast(&_data)[i]; } }; template <> @@ -657,17 +640,11 @@ class Vector >, 2> { return *this; } - inline void storeAligned(value_type *p) const { - _mm256_store_pd((mag_type *)p, _data); - } + inline void storeAligned(value_type *p) const { _mm256_store_pd((mag_type *)p, _data); } - inline void storeUnaligned(value_type *p) const { - _mm256_storeu_pd((mag_type *)p, _data); - } + inline void storeUnaligned(value_type *p) const { _mm256_storeu_pd((mag_type *)p, _data); } - inline value_type &operator[](const int &i) const { - return reinterpret_cast(&_data)[i]; - } + inline value_type &operator[](const int &i) const { return reinterpret_cast(&_data)[i]; } }; } // namespace KokkosBatched #endif /* #if defined(__AVX__) || defined(__AVX2__) */ @@ -737,13 +714,9 @@ class Vector, 8> { inline void storeAligned(value_type *p) const { _mm512_store_pd(p, _data); } - inline void storeUnaligned(value_type *p) const { - _mm512_storeu_pd(p, _data); - } + inline void storeUnaligned(value_type *p) const { _mm512_storeu_pd(p, _data); } - inline value_type &operator[](const int &i) const { - return reinterpret_cast(&_data)[i]; - } + inline value_type &operator[](const int &i) const { return reinterpret_cast(&_data)[i]; } }; template <> @@ -767,13 +740,11 @@ class Vector >, 4> { public: inline Vector() { _data = _mm512_setzero_pd(); } inline Vector(const value_type &val) { - _data = _mm512_mask_broadcast_f64x4(_mm512_set1_pd(val.imag()), 0x55, - _mm256_set1_pd(val.real())); + _data = _mm512_mask_broadcast_f64x4(_mm512_set1_pd(val.imag()), 0x55, _mm256_set1_pd(val.real())); KOKKOSKERNELS_GNU_COMPILER_FENCE } inline Vector(const mag_type &val) { - _data = _mm512_mask_broadcast_f64x4(_mm512_setzero_pd(), 0x55, - _mm256_set1_pd(val)); + _data = _mm512_mask_broadcast_f64x4(_mm512_setzero_pd(), 0x55, _mm256_set1_pd(val)); KOKKOSKERNELS_GNU_COMPILER_FENCE } inline Vector(const type &b) { _data = b._data; } @@ -810,17 +781,11 @@ 
class Vector >, 4> { return *this; } - inline void storeAligned(value_type *p) const { - _mm512_store_pd((mag_type *)p, _data); - } + inline void storeAligned(value_type *p) const { _mm512_store_pd((mag_type *)p, _data); } - inline void storeUnaligned(value_type *p) const { - _mm512_storeu_pd((mag_type *)p, _data); - } + inline void storeUnaligned(value_type *p) const { _mm512_storeu_pd((mag_type *)p, _data); } - inline value_type &operator[](const int &i) const { - return reinterpret_cast(&_data)[i]; - } + inline value_type &operator[](const int &i) const { return reinterpret_cast(&_data)[i]; } }; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_Xpay.hpp b/batched/dense/src/KokkosBatched_Xpay.hpp index 1e9a08623b..51418fd81a 100644 --- a/batched/dense/src/KokkosBatched_Xpay.hpp +++ b/batched/dense/src/KokkosBatched_Xpay.hpp @@ -44,9 +44,7 @@ namespace KokkosBatched { struct SerialXpay { template - KOKKOS_INLINE_FUNCTION static int invoke(const alphaViewType &alpha, - const ViewType &X, - const ViewType &Y); + KOKKOS_INLINE_FUNCTION static int invoke(const alphaViewType &alpha, const ViewType &X, const ViewType &Y); }; /// \brief Team Batched XPAY: @@ -72,9 +70,7 @@ struct SerialXpay { template struct TeamXpay { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const alphaViewType &alpha, - const ViewType &X, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const alphaViewType &alpha, const ViewType &X, const ViewType &Y); }; @@ -102,9 +98,7 @@ struct TeamXpay { template struct TeamVectorXpay { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const alphaViewType &alpha, - const ViewType &X, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const alphaViewType &alpha, const ViewType &X, const ViewType &Y); }; diff --git a/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp b/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp index 
3c00b4f477..6c2c359f00 100644 --- a/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp @@ -25,14 +25,10 @@ using namespace KokkosBatched; namespace Test { -template -void impl_test_batched_gemm_with_handle(BatchedGemmHandle* batchedGemmHandle, - const int N, const int matAdim1, - const int matAdim2, const int matBdim1, - const int matBdim2, const int matCdim1, - const int matCdim2, ScalarType alpha, - ScalarType beta) { +template +void impl_test_batched_gemm_with_handle(BatchedGemmHandle* batchedGemmHandle, const int N, const int matAdim1, + const int matAdim2, const int matBdim1, const int matBdim2, const int matCdim1, + const int matCdim2, ScalarType alpha, ScalarType beta) { using execution_space = typename DeviceType::execution_space; using transA = typename ParamTagType::transA; using transB = typename ParamTagType::transB; @@ -43,15 +39,11 @@ void impl_test_batched_gemm_with_handle(BatchedGemmHandle* batchedGemmHandle, auto algo_type = batchedGemmHandle->get_kernel_algo_type(); ViewType a_expected, a_actual, b_expected, b_actual, c_expected, c_actual; std::string fmsg; - std::string fmsg_rhs = - "algo_type:" + batchedGemmHandle->get_kernel_algo_type_str() + ", "; + std::string fmsg_rhs = "algo_type:" + batchedGemmHandle->get_kernel_algo_type_str() + ", "; fmsg_rhs += ("N:" + std::to_string(N) + ", "); - fmsg_rhs += - ("A:" + std::to_string(matAdim1) + "x" + std::to_string(matAdim2) + ", "); - fmsg_rhs += - ("B:" + std::to_string(matBdim1) + "x" + std::to_string(matBdim2) + ", "); - fmsg_rhs += - ("C:" + std::to_string(matCdim1) + "x" + std::to_string(matCdim2) + "\n"); + fmsg_rhs += ("A:" + std::to_string(matAdim1) + "x" + std::to_string(matAdim2) + ", "); + fmsg_rhs += ("B:" + std::to_string(matBdim1) + "x" + std::to_string(matBdim2) + ", "); + fmsg_rhs += ("C:" + std::to_string(matCdim1) + "x" + std::to_string(matCdim2) + "\n"); if (std::is_same::value) { a_expected = ViewType("a_expected", N, 
matAdim1, matAdim2); @@ -86,10 +78,8 @@ void impl_test_batched_gemm_with_handle(BatchedGemmHandle* batchedGemmHandle, // Check for DblBuf runtime errors related to team_size try { fmsg = kk_failure_str(__FILE__, __FUNCTION__, __LINE__); - Impl::BatchedDblBufGemm( + Impl::BatchedDblBufGemm( batchedGemmHandle, alpha, a_actual, b_actual, beta, c_actual) .invoke(); FAIL() << (fmsg + fmsg_rhs); @@ -100,11 +90,9 @@ void impl_test_batched_gemm_with_handle(BatchedGemmHandle* batchedGemmHandle, // Check for DblBuf runtime errors related to vector_len try { fmsg = kk_failure_str(__FILE__, __FUNCTION__, __LINE__); - Impl::BatchedDblBufGemm< - transA, transB, batchLayout, BatchedGemmHandle, ScalarType, - decltype(a_actual), decltype(b_actual), decltype(c_actual), - BoundsCheck::No, AlphaTag::No, 65536, 65536 * 2, 65536>( - batchedGemmHandle, alpha, a_actual, b_actual, beta, c_actual) + Impl::BatchedDblBufGemm(batchedGemmHandle, alpha, a_actual, b_actual, beta, c_actual) .invoke(); FAIL() << (fmsg + fmsg_rhs); } catch (const std::runtime_error& error) { @@ -123,9 +111,8 @@ void impl_test_batched_gemm_with_handle(BatchedGemmHandle* batchedGemmHandle, #endif fmsg = kk_failure_str(__FILE__, __FUNCTION__, __LINE__); - ret = BatchedGemm( - batchedGemmHandle, alpha, a_actual, b_actual, beta, - c_actual); // Compute c_actual + ret = BatchedGemm(batchedGemmHandle, alpha, a_actual, b_actual, beta, + c_actual); // Compute c_actual } catch (const std::runtime_error& error) { std::string error_msg = error.what(); if (algo_type == BaseHeuristicAlgos::SQUARE && matCdim1 != matCdim2) { @@ -135,8 +122,7 @@ void impl_test_batched_gemm_with_handle(BatchedGemmHandle* batchedGemmHandle, auto ninter = batchedGemmHandle->get_tpl_params()[0]; // No runtime errors expected since layout is valid, double is a supported // type, and ninter != 0 - if (std::is_same::value && - ninter != 0) { + if (std::is_same::value && ninter != 0) { FAIL() << (error_msg + fmsg + fmsg_rhs); } #else @@ -149,12 +135,10 @@ 
void impl_test_batched_gemm_with_handle(BatchedGemmHandle* batchedGemmHandle, } ASSERT_EQ(ret, 0) << (fmsg + fmsg_rhs); - Functor_BatchedVanillaGEMM - vgemm; - vgemm.A_t = std::is_same::value; - vgemm.B_t = std::is_same::value; - vgemm.batch_size_last_dim = - std::is_same::value; + Functor_BatchedVanillaGEMM vgemm; + vgemm.A_t = std::is_same::value; + vgemm.B_t = std::is_same::value; + vgemm.batch_size_last_dim = std::is_same::value; vgemm.A_c = vgemm.B_c = false; vgemm.A = a_expected; vgemm.B = b_expected; @@ -165,10 +149,8 @@ void impl_test_batched_gemm_with_handle(BatchedGemmHandle* batchedGemmHandle, Kokkos::fence(); - typename ViewType::HostMirror c_expected_host = - Kokkos::create_mirror_view(c_expected); - typename ViewType::HostMirror c_actual_host = - Kokkos::create_mirror_view(c_actual); + typename ViewType::HostMirror c_expected_host = Kokkos::create_mirror_view(c_expected); + typename ViewType::HostMirror c_actual_host = Kokkos::create_mirror_view(c_actual); // Copy to host Kokkos::deep_copy(c_expected_host, c_expected); @@ -205,26 +187,21 @@ void impl_test_batched_gemm_with_handle(BatchedGemmHandle* batchedGemmHandle, EXPECT_NEAR_KK(diff / sum, 0, eps, fmsg + fmsg_rhs); } -template -void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, - const int matBdim1, const int matBdim2, +template +void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, const int matBdim1, const int matBdim2, const int matCdim1, const int matCdim2) { { BatchedGemmHandle batchedGemmHandle; - ASSERT_EQ(batchedGemmHandle.get_kernel_algo_type(), - BaseHeuristicAlgos::SQUARE); + ASSERT_EQ(batchedGemmHandle.get_kernel_algo_type(), BaseHeuristicAlgos::SQUARE); ASSERT_EQ(batchedGemmHandle.teamSz, 0); ASSERT_EQ(batchedGemmHandle.vecLen, 0); #if defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) cublasHandle_t cublas_handle; - BatchedGemmHandle batchedGemmHandleCublas(cublas_handle, - GemmTplAlgos::CUBLAS, 0, 0); + BatchedGemmHandle 
batchedGemmHandleCublas(cublas_handle, GemmTplAlgos::CUBLAS, 0, 0); ASSERT_EQ(&cublas_handle, batchedGemmHandleCublas.get_tpl_params()); - ASSERT_EQ(batchedGemmHandleCublas.get_kernel_algo_type(), - (int)GemmTplAlgos::CUBLAS); + ASSERT_EQ(batchedGemmHandleCublas.get_kernel_algo_type(), (int)GemmTplAlgos::CUBLAS); ASSERT_EQ(batchedGemmHandleCublas.teamSz, 0); ASSERT_EQ(batchedGemmHandleCublas.vecLen, 0); #endif @@ -232,53 +209,37 @@ void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, // FIXME temporary workaround to run this magma test only if cublas is not // enabled the design of the BatchedGemmHandle currently does not allow // simultanous testing in this way. See issue #2177 -#if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) && \ - !defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) +#if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) && !defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) magma_queue_t magma_queue; - BatchedGemmHandle batchedGemmHandleMagma(magma_queue, GemmTplAlgos::MAGMA, - 0, 0); + BatchedGemmHandle batchedGemmHandleMagma(magma_queue, GemmTplAlgos::MAGMA, 0, 0); ASSERT_EQ(&magma_queue, batchedGemmHandleMagma.get_tpl_params()); - ASSERT_EQ(batchedGemmHandleMagma.get_kernel_algo_type(), - (int)GemmTplAlgos::MAGMA); + ASSERT_EQ(batchedGemmHandleMagma.get_kernel_algo_type(), (int)GemmTplAlgos::MAGMA); ASSERT_EQ(batchedGemmHandleMagma.teamSz, 0); ASSERT_EQ(batchedGemmHandleMagma.vecLen, 0); #endif } - for (int algo_type = BaseHeuristicAlgos::SQUARE; - algo_type < GemmKokkosBatchedAlgos::N; ++algo_type) { + for (int algo_type = BaseHeuristicAlgos::SQUARE; algo_type < GemmKokkosBatchedAlgos::N; ++algo_type) { { try { BatchedGemmHandle batchedGemmHandle(algo_type); ASSERT_EQ(batchedGemmHandle.get_kernel_algo_type(), algo_type); - if (algo_type == BaseTplAlgos::ARMPL || - algo_type == BaseKokkosBatchedAlgos::KK_SERIAL || - algo_type == GemmKokkosBatchedAlgos::KK_SERIAL_RANK0 || - algo_type == GemmKokkosBatchedAlgos::KK_DBLBUF) { - 
impl_test_batched_gemm_with_handle( - &batchedGemmHandle, N, matAdim1, matAdim2, matBdim1, matBdim2, - matCdim1, matCdim2, 1.5, 3.0); + if (algo_type == BaseTplAlgos::ARMPL || algo_type == BaseKokkosBatchedAlgos::KK_SERIAL || + algo_type == GemmKokkosBatchedAlgos::KK_SERIAL_RANK0 || algo_type == GemmKokkosBatchedAlgos::KK_DBLBUF) { + impl_test_batched_gemm_with_handle( + &batchedGemmHandle, N, matAdim1, matAdim2, matBdim1, matBdim2, matCdim1, matCdim2, 1.5, 3.0); } else if (algo_type == BaseHeuristicAlgos::SQUARE) { // Invoke 4 times to ensure we cover all paths for alpha and beta - impl_test_batched_gemm_with_handle( - &batchedGemmHandle, N, matAdim1, matAdim2, matBdim1, matBdim2, - matCdim1, matCdim2, 0.0, 0.0); - impl_test_batched_gemm_with_handle( - &batchedGemmHandle, N, matAdim1, matAdim2, matBdim1, matBdim2, - matCdim1, matCdim2, 1.0, 0.0); - impl_test_batched_gemm_with_handle( - &batchedGemmHandle, N, matAdim1, matAdim2, matBdim1, matBdim2, - matCdim1, matCdim2, 0.0, 1.0); - impl_test_batched_gemm_with_handle( - &batchedGemmHandle, N, matAdim1, matAdim2, matBdim1, matBdim2, - matCdim1, matCdim2, 1.5, 3.0); + impl_test_batched_gemm_with_handle( + &batchedGemmHandle, N, matAdim1, matAdim2, matBdim1, matBdim2, matCdim1, matCdim2, 0.0, 0.0); + impl_test_batched_gemm_with_handle( + &batchedGemmHandle, N, matAdim1, matAdim2, matBdim1, matBdim2, matCdim1, matCdim2, 1.0, 0.0); + impl_test_batched_gemm_with_handle( + &batchedGemmHandle, N, matAdim1, matAdim2, matBdim1, matBdim2, matCdim1, matCdim2, 0.0, 1.0); + impl_test_batched_gemm_with_handle( + &batchedGemmHandle, N, matAdim1, matAdim2, matBdim1, matBdim2, matCdim1, matCdim2, 1.5, 3.0); } else { try { // Allocate these views to invoke BatchedGemm with an unsupported @@ -291,8 +252,7 @@ void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, using bl = typename ParamTagType::batchLayout; ScalarType alpha = 0.34; ScalarType beta = 0.43; - BatchedGemm(&batchedGemmHandle, alpha, a_actual, - 
b_actual, beta, c_actual); + BatchedGemm(&batchedGemmHandle, alpha, a_actual, b_actual, beta, c_actual); std::string fmsg = kk_failure_str(__FILE__, __FUNCTION__, __LINE__); FAIL() << fmsg; } catch (const std::runtime_error& error) { @@ -314,26 +274,21 @@ void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, } } // namespace Test -template +template void test_batched_gemm_with_layout(int N) { // Square cases { int i = 0; - Test::impl_test_batched_gemm(N, i, i, i, i, i, i); + Test::impl_test_batched_gemm(N, i, i, i, i, i, i); i = 10; - Test::impl_test_batched_gemm(N, i, i, i, i, i, i); + Test::impl_test_batched_gemm(N, i, i, i, i, i, i); i = 25; - Test::impl_test_batched_gemm(N, i, i, i, i, i, i); + Test::impl_test_batched_gemm(N, i, i, i, i, i, i); i = 32; - Test::impl_test_batched_gemm(N, i, i, i, i, i, i); + Test::impl_test_batched_gemm(N, i, i, i, i, i, i); } // Non-square cases @@ -341,63 +296,42 @@ void test_batched_gemm_with_layout(int N) { int dimM = 1 * i; int dimN = 2 * i; int dimK = 3 * i; - if ((std::is_same::value) && - (std::is_same::value)) { - Test::impl_test_batched_gemm(N, dimM, dimK, dimK, dimN, - dimM, dimN); + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_gemm(N, dimM, dimK, dimK, dimN, dimM, + dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::impl_test_batched_gemm(N, dimM, dimK, dimN, dimK, - dimM, dimN); + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_gemm(N, dimM, dimK, dimN, dimK, dimM, + dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::impl_test_batched_gemm(N, dimK, dimM, dimK, dimN, - dimM, dimN); + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_gemm(N, dimK, dimM, dimK, dimN, dimM, + dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::impl_test_batched_gemm(N, dimK, dimM, dimN, dimK, - dimM, dimN); + if ((std::is_same::value) && + 
(std::is_same::value)) { + Test::impl_test_batched_gemm(N, dimK, dimM, dimN, dimK, dimM, + dimN); } } } -template +template int test_batched_gemm() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - if constexpr (std::is_same_v) { - using param_tag_type = ::Test::SharedParamTag; + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + if constexpr (std::is_same_v) { + using param_tag_type = + ::Test::SharedParamTag; typedef Kokkos::View llVt; - test_batched_gemm_with_layout(0); - test_batched_gemm_with_layout(1); - test_batched_gemm_with_layout(4); - test_batched_gemm_with_layout(8); - test_batched_gemm_with_layout(16); + test_batched_gemm_with_layout(0); + test_batched_gemm_with_layout(1); + test_batched_gemm_with_layout(4); + test_batched_gemm_with_layout(8); + test_batched_gemm_with_layout(16); } else { std::cerr << "TEST SKIPPED since BatchLayout is not Right." << std::endl; } @@ -406,24 +340,16 @@ int test_batched_gemm() { #endif // KOKKOSKERNELS_INST_LAYOUTLEFT #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - if constexpr (std::is_same_v) { - using param_tag_type = ::Test::SharedParamTag; + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + if constexpr (std::is_same_v) { + using param_tag_type = + ::Test::SharedParamTag; typedef Kokkos::View lrVt; - test_batched_gemm_with_layout(0); - test_batched_gemm_with_layout(1); - test_batched_gemm_with_layout(4); - test_batched_gemm_with_layout(8); - test_batched_gemm_with_layout(16); + test_batched_gemm_with_layout(0); + test_batched_gemm_with_layout(1); + test_batched_gemm_with_layout(4); + test_batched_gemm_with_layout(8); + test_batched_gemm_with_layout(16); } else { std::cerr << "TEST SKIPPED since BatchLayout is not Left." 
<< std::endl; } diff --git a/batched/dense/unit_test/Test_Batched_BatchedGemm_Complex.hpp b/batched/dense/unit_test/Test_Batched_BatchedGemm_Complex.hpp index 3c58f432ec..4e9bfa42ef 100644 --- a/batched/dense/unit_test/Test_Batched_BatchedGemm_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_BatchedGemm_Complex.hpp @@ -16,139 +16,89 @@ #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) /********************* BatchLayout::Left *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_scomplex_scomplex_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, Kokkos::complex, - param_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_scomplex_scomplex_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, Kokkos::complex, - param_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_scomplex_scomplex_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, Kokkos::complex, - param_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_scomplex_scomplex_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, Kokkos::complex, - param_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type>(); } /********************* BatchLayout::Right *********************/ -TEST_F(TestCategory, - batched_scalar_batched_gemm_nt_nt_scomplex_scomplex_right) { - typedef ::Test::SharedParamTag - param_tag_type; +TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_scomplex_scomplex_right) { + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, 
Kokkos::complex, - param_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_scomplex_scomplex_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, Kokkos::complex, - param_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_scomplex_scomplex_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, Kokkos::complex, - param_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_scomplex_scomplex_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, Kokkos::complex, - param_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type>(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) /********************* BatchLayout::Left *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_dcomplex_dcomplex_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_dcomplex_dcomplex_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_dcomplex_dcomplex_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, Kokkos::complex, 
param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_dcomplex_dcomplex_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type>(); } /********************* BatchLayout::Right *********************/ -TEST_F(TestCategory, - batched_scalar_batched_gemm_nt_nt_dcomplex_dcomplex_right) { - typedef ::Test::SharedParamTag - param_tag_type; +TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_dcomplex_dcomplex_right) { + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_dcomplex_dcomplex_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_dcomplex_dcomplex_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_dcomplex_dcomplex_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_BatchedGemm_Real.hpp b/batched/dense/unit_test/Test_Batched_BatchedGemm_Real.hpp index 62a4a291a8..d2e9fe48d7 100644 --- a/batched/dense/unit_test/Test_Batched_BatchedGemm_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_BatchedGemm_Real.hpp @@ -16,206 +16,140 @@ // We do 
not ETI half-types. Only test this if ETI ONLY is off // and bhalf_t is not an alias to float. -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) && \ +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) && \ defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT /********************* BatchLayout::Left *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_bhalf_bhalf_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_bhalf_bhalf_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_bhalf_bhalf_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_bhalf_bhalf_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } /********************* BatchLayout::Right *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_bhalf_bhalf_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_bhalf_bhalf_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_bhalf_bhalf_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - 
test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_bhalf_bhalf_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } #endif // KOKKOS_BHALF_T_IS_FLOAT // We do not ETI half-types. Only test this if ETI ONLY is off // and half_t is not an alias to float. -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) && \ +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) && \ defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT /********************* BatchLayout::Left *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_half_half_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_half_half_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_half_half_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_half_half_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } /********************* BatchLayout::Right *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_half_half_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_half_half_right) { - typedef ::Test::SharedParamTag - 
param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_half_half_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_half_half_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } #endif // KOKKOS_HALF_T_IS_FLOAT #if defined(KOKKOSKERNELS_INST_FLOAT) /********************* BatchLayout::Left *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_float_float_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_float_float_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_float_float_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_float_float_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; test_batched_gemm(); } /********************* BatchLayout::Right *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_float_float_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_float_float_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; test_batched_gemm(); } TEST_F(TestCategory, 
batched_scalar_batched_gemm_nt_t_float_float_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_float_float_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; test_batched_gemm(); } @@ -224,59 +158,43 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_float_float_right) { #if defined(KOKKOSKERNELS_INST_DOUBLE) /********************* BatchLayout::Left *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_double_double_left) { - using param_tag_type = - ::Test::SharedParamTag; + using param_tag_type = ::Test::SharedParamTag; test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_double_double_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_double_double_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_double_double_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; test_batched_gemm(); } /********************* BatchLayout::Right *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_double_double_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_double_double_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_double_double_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag 
param_tag_type; test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_double_double_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; test_batched_gemm(); } diff --git a/batched/dense/unit_test/Test_Batched_DenseUtils.hpp b/batched/dense/unit_test/Test_Batched_DenseUtils.hpp index c1328291fb..f536f220d3 100644 --- a/batched/dense/unit_test/Test_Batched_DenseUtils.hpp +++ b/batched/dense/unit_test/Test_Batched_DenseUtils.hpp @@ -20,14 +20,9 @@ namespace KokkosBatched { template -void create_tridiagonal_batched_matrices(const MatrixViewType& A, - const VectorViewType& B) { - Kokkos::Random_XorShift64_Pool< - typename VectorViewType::device_type::execution_space> - random(13718); - Kokkos::fill_random( - B, random, - Kokkos::reduction_identity::prod()); +void create_tridiagonal_batched_matrices(const MatrixViewType& A, const VectorViewType& B) { + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(B, random, Kokkos::reduction_identity::prod()); auto A_host = Kokkos::create_mirror_view(A); @@ -58,8 +53,7 @@ void create_tridiagonal_batched_matrices(const MatrixViewType& A, } template -void create_banded_triangular_matrix(InViewType& in, OutViewType& out, - int k = 1, bool band_storage = true) { +void create_banded_triangular_matrix(InViewType& in, OutViewType& out, int k = 1, bool band_storage = true) { auto h_in = Kokkos::create_mirror_view(in); auto h_out = Kokkos::create_mirror_view(out); const int N = in.extent(0), BlkSize = in.extent(1); diff --git a/batched/dense/unit_test/Test_Batched_SerialAxpy.hpp b/batched/dense/unit_test/Test_Batched_SerialAxpy.hpp index 90ce5addc3..df6f0ee069 100644 --- a/batched/dense/unit_test/Test_Batched_SerialAxpy.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialAxpy.hpp @@ -36,8 +36,7 @@ struct Functor_TestBatchedSerialAxpy { const ViewType _Y; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedSerialAxpy(const alphaViewType &alpha, const 
ViewType &X, - const ViewType &Y) + Functor_TestBatchedSerialAxpy(const alphaViewType &alpha, const ViewType &X, const ViewType &Y) : _alpha(alpha), _X(X), _Y(Y) {} KOKKOS_INLINE_FUNCTION @@ -68,13 +67,11 @@ void impl_test_batched_axpy(const int N, const int BlkSize) { typedef typename alphaViewType::const_value_type alpha_const_value_type; typedef Kokkos::ArithTraits ats; - ViewType X0("x0", N, BlkSize), X1("x1", N, BlkSize), Y0("y0", N, BlkSize), - Y1("y1", N, BlkSize); + ViewType X0("x0", N, BlkSize), X1("x1", N, BlkSize), Y0("y0", N, BlkSize), Y1("y1", N, BlkSize); alphaViewType alpha("alpha", N); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(X0, random, const_value_type(1.0)); Kokkos::fill_random(Y0, random, const_value_type(1.0)); Kokkos::fill_random(alpha, random, alpha_const_value_type(1.0)); @@ -94,12 +91,9 @@ void impl_test_batched_axpy(const int N, const int BlkSize) { Kokkos::deep_copy(Y0_host, Y0); for (int l = 0; l < N; ++l) - for (int i = 0; i < BlkSize; ++i) - Y0_host(l, i) += alpha_host(l) * X0_host(l, i); + for (int i = 0; i < BlkSize; ++i) Y0_host(l, i) += alpha_host(l) * X0_host(l, i); - Functor_TestBatchedSerialAxpy(alpha, X1, - Y1) - .run(); + Functor_TestBatchedSerialAxpy(alpha, X1, Y1).run(); Kokkos::fence(); @@ -128,25 +122,20 @@ int test_batched_axpy() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { typedef Kokkos::View ViewType; - typedef Kokkos::View - alphaViewType; + typedef Kokkos::View alphaViewType; for (int i = 3; i < 10; ++i) { - Test::Axpy::impl_test_batched_axpy( - 1024, i); + Test::Axpy::impl_test_batched_axpy(1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - typedef Kokkos::View - alphaViewType; + typedef Kokkos::View ViewType; + typedef Kokkos::View alphaViewType; for (int i = 3; i < 10; ++i) { - Test::Axpy::impl_test_batched_axpy( - 1024, i); + Test::Axpy::impl_test_batched_axpy(1024, i); } } #endif 
diff --git a/batched/dense/unit_test/Test_Batched_SerialAxpy_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialAxpy_Complex.hpp index ed647f1e3b..7d1b3301f1 100644 --- a/batched/dense/unit_test/Test_Batched_SerialAxpy_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialAxpy_Complex.hpp @@ -16,8 +16,7 @@ #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_axpy_nt_dcomplex_dcomplex) { - test_batched_axpy, - Kokkos::complex>(); + test_batched_axpy, Kokkos::complex>(); } TEST_F(TestCategory, batched_scalar_serial_axpy_nt_dcomplex_double) { diff --git a/batched/dense/unit_test/Test_Batched_SerialAxpy_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialAxpy_Real.hpp index 3f1f6af2fd..a0c49287f7 100644 --- a/batched/dense/unit_test/Test_Batched_SerialAxpy_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialAxpy_Real.hpp @@ -15,13 +15,9 @@ //@HEADER #if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, batched_scalar_serial_axpy_nt_float_float) { - test_batched_axpy(); -} +TEST_F(TestCategory, batched_scalar_serial_axpy_nt_float_float) { test_batched_axpy(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, batched_scalar_serial_axpy_nt_double_double) { - test_batched_axpy(); -} +TEST_F(TestCategory, batched_scalar_serial_axpy_nt_double_double) { test_batched_axpy(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialGemm.hpp b/batched/dense/unit_test/Test_Batched_SerialGemm.hpp index 7f27fa7dcf..144bb2251e 100644 --- a/batched/dense/unit_test/Test_Batched_SerialGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialGemm.hpp @@ -19,7 +19,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -//#include "KokkosBatched_Vector.hpp" +// #include "KokkosBatched_Vector.hpp" #include "KokkosBatched_Gemm_Decl.hpp" #include "KokkosBatched_Gemm_Serial_Impl.hpp" @@ -37,8 +37,7 @@ struct ParamTag { typedef TB transB; }; -template +template struct 
Functor_TestBatchedSerialGemm { using execution_space = typename DeviceType::execution_space; ViewType _a, _b, _c; @@ -46,8 +45,7 @@ struct Functor_TestBatchedSerialGemm { ScalarType _alpha, _beta; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedSerialGemm(const ScalarType alpha, const ViewType &a, - const ViewType &b, const ScalarType beta, + Functor_TestBatchedSerialGemm(const ScalarType alpha, const ViewType &a, const ViewType &b, const ScalarType beta, const ViewType &c) : _a(a), _b(b), _c(c), _alpha(alpha), _beta(beta) {} @@ -57,8 +55,8 @@ struct Functor_TestBatchedSerialGemm { auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); auto cc = Kokkos::subview(_c, k, Kokkos::ALL(), Kokkos::ALL()); - SerialGemm::invoke(_alpha, aa, bb, _beta, cc); + SerialGemm::invoke(_alpha, aa, bb, _beta, + cc); } inline void run() { @@ -73,10 +71,8 @@ struct Functor_TestBatchedSerialGemm { } }; -template -void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, - const int matBdim1, const int matBdim2, +template +void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, const int matBdim1, const int matBdim2, const int matCdim1, const int matCdim2) { using execution_space = typename DeviceType::execution_space; using transA = typename ParamTagType::transA; @@ -88,12 +84,9 @@ void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, ScalarType alpha = ScalarType(1.5); ScalarType beta = ScalarType(3.0); - ViewType a_expected("a_expected", N, matAdim1, matAdim2), - a_actual("a_actual", N, matAdim1, matAdim2), - b_expected("b_expected", N, matBdim1, matBdim2), - b_actual("b_actual", N, matBdim1, matBdim2), - c_expected("c_expected", N, matCdim1, matCdim2), - c_actual("c_actual", N, matCdim1, matCdim2); + ViewType a_expected("a_expected", N, matAdim1, matAdim2), a_actual("a_actual", N, matAdim1, matAdim2), + b_expected("b_expected", N, matBdim1, matBdim2), b_actual("b_actual", N, matBdim1, matBdim2), + 
c_expected("c_expected", N, matCdim1, matCdim2), c_actual("c_actual", N, matCdim1, matCdim2); Kokkos::Random_XorShift64_Pool random(13718); @@ -107,8 +100,7 @@ void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, Kokkos::deep_copy(b_actual, b_expected); Kokkos::deep_copy(c_actual, c_expected); - Functor_BatchedVanillaGEMM - vgemm; + Functor_BatchedVanillaGEMM vgemm; vgemm.A_t = std::is_same::value; vgemm.B_t = std::is_same::value; vgemm.A_c = vgemm.B_c = false; @@ -118,15 +110,12 @@ void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, vgemm.alpha = alpha; vgemm.beta = beta; vgemm.run(); // Compute c_expected - Functor_TestBatchedSerialGemm(alpha, a_actual, b_actual, beta, - c_actual) + Functor_TestBatchedSerialGemm(alpha, a_actual, b_actual, + beta, c_actual) .run(); - typename ViewType::HostMirror c_expected_host = - Kokkos::create_mirror_view(c_expected); - typename ViewType::HostMirror c_actual_host = - Kokkos::create_mirror_view(c_actual); + typename ViewType::HostMirror c_expected_host = Kokkos::create_mirror_view(c_expected); + typename ViewType::HostMirror c_actual_host = Kokkos::create_mirror_view(c_actual); // Copy to host for comparison Kokkos::deep_copy(c_expected_host, c_expected); @@ -157,57 +146,41 @@ void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, } // namespace Gemm } // namespace Test -template +template int test_batched_gemm() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - ViewType; - Test::Gemm::impl_test_batched_gemm(0, 10, 10, 10, - 10, 10, 10); + typedef Kokkos::View ViewType; + Test::Gemm::impl_test_batched_gemm(0, 10, 10, 10, 10, + 10, 10); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::Gemm::impl_test_batched_gemm(1024, i, i, - i, i, i, i); + Test::Gemm::impl_test_batched_gemm(1024, i, i, i, i, + i, i); } for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); 
int dimM = i; int dimN = 2 * i; int dimK = 3 * i; - if ((std::is_same::value) && - (std::is_same::value)) { - Test::Gemm::impl_test_batched_gemm( + if ((std::is_same::value) && + (std::is_same::value)) { + Test::Gemm::impl_test_batched_gemm( 1024, dimM, dimK, dimK, dimN, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::Gemm::impl_test_batched_gemm( + if ((std::is_same::value) && + (std::is_same::value)) { + Test::Gemm::impl_test_batched_gemm( 1024, dimM, dimK, dimN, dimK, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::Gemm::impl_test_batched_gemm( + if ((std::is_same::value) && + (std::is_same::value)) { + Test::Gemm::impl_test_batched_gemm( 1024, dimK, dimM, dimK, dimN, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::Gemm::impl_test_batched_gemm( + if ((std::is_same::value) && + (std::is_same::value)) { + Test::Gemm::impl_test_batched_gemm( 1024, dimK, dimM, dimN, dimK, dimM, dimN); } } @@ -215,52 +188,37 @@ int test_batched_gemm() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - Test::Gemm::impl_test_batched_gemm(0, 10, 10, 10, - 10, 10, 10); + typedef Kokkos::View ViewType; + Test::Gemm::impl_test_batched_gemm(0, 10, 10, 10, 10, + 10, 10); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutRight, Blksize %d\n", i); - Test::Gemm::impl_test_batched_gemm(1024, i, i, - i, i, i, i); + Test::Gemm::impl_test_batched_gemm(1024, i, i, i, i, + i, i); } for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); int dimM = i; int dimN = 2 * i; int dimK = 3 * i; - if ((std::is_same::value) && - (std::is_same::value)) { - Test::Gemm::impl_test_batched_gemm( + if ((std::is_same::value) && + (std::is_same::value)) { + Test::Gemm::impl_test_batched_gemm( 1024, dimM, dimK, dimK, dimN, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::Gemm::impl_test_batched_gemm( + if 
((std::is_same::value) && + (std::is_same::value)) { + Test::Gemm::impl_test_batched_gemm( 1024, dimM, dimK, dimN, dimK, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::Gemm::impl_test_batched_gemm( + if ((std::is_same::value) && + (std::is_same::value)) { + Test::Gemm::impl_test_batched_gemm( 1024, dimK, dimM, dimK, dimN, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::Gemm::impl_test_batched_gemm( + if ((std::is_same::value) && + (std::is_same::value)) { + Test::Gemm::impl_test_batched_gemm( 1024, dimK, dimM, dimN, dimK, dimM, dimN); } } diff --git a/batched/dense/unit_test/Test_Batched_SerialGemm_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialGemm_Complex.hpp index f671292c98..f785965602 100644 --- a/batched/dense/unit_test/Test_Batched_SerialGemm_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialGemm_Complex.hpp @@ -18,32 +18,24 @@ /// dcomplex, dcomplex TEST_F(TestCategory, batched_scalar_serial_gemm_nt_nt_dcomplex_dcomplex) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_nt_dcomplex_dcomplex) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_gemm_nt_t_dcomplex_dcomplex) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type, algo_tag_type>(); + 
test_batched_gemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_t_dcomplex_dcomplex) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_serial_gemm_ct_nt_dcomplex_dcomplex ) { // typedef ::Test::Gemm::ParamTag @@ -59,32 +51,24 @@ TEST_F(TestCategory, batched_scalar_serial_gemm_t_t_dcomplex_dcomplex) { /// dcomplex, double TEST_F(TestCategory, batched_scalar_serial_gemm_nt_nt_dcomplex_double) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, double, param_tag_type, - algo_tag_type>(); + test_batched_gemm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_nt_dcomplex_double) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, double, param_tag_type, - algo_tag_type>(); + test_batched_gemm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_gemm_nt_t_dcomplex_double) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, double, param_tag_type, - algo_tag_type>(); + test_batched_gemm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_t_dcomplex_double) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, double, param_tag_type, - algo_tag_type>(); + test_batched_gemm, 
double, param_tag_type, algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_serial_gemm_ct_nt_dcomplex_double ) { // typedef ::Test::Gemm::ParamTag diff --git a/batched/dense/unit_test/Test_Batched_SerialGemm_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialGemm_Real.hpp index 6f074867d9..afe5744688 100644 --- a/batched/dense/unit_test/Test_Batched_SerialGemm_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialGemm_Real.hpp @@ -15,112 +15,88 @@ //@HEADER #if defined(KOKKOS_BHALF_T_IS_FLOAT) TEST_F(TestCategory, batched_scalar_serial_gemm_nt_nt_bhalf_bhalf) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_nt_bhalf_bhalf) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_nt_t_bhalf_bhalf) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_t_bhalf_bhalf) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } #endif // KOKKOS_BHALF_T_IS_FLOAT #if defined(KOKKOS_HALF_T_IS_FLOAT) TEST_F(TestCategory, batched_scalar_serial_gemm_nt_nt_half_half) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_nt_half_half) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, 
batched_scalar_serial_gemm_nt_t_half_half) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_t_half_half) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); + test_batched_gemm(); } #endif // KOKKOS_HALF_T_IS_FLOAT #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_serial_gemm_nt_nt_float_float) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_nt_float_float) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_nt_t_float_float) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_t_float_float) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; test_batched_gemm(); } @@ -128,31 +104,23 @@ TEST_F(TestCategory, batched_scalar_serial_gemm_t_t_float_float) { #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_gemm_nt_nt_double_double) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_nt_double_double) { - typedef ::Test::Gemm::ParamTag - 
param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_nt_t_double_double) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_t_double_double) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_gemm(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialGesv.hpp b/batched/dense/unit_test/Test_Batched_SerialGesv.hpp index bb05fab3bb..8ec0dd8189 100644 --- a/batched/dense/unit_test/Test_Batched_SerialGesv.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialGesv.hpp @@ -32,8 +32,7 @@ using namespace KokkosBatched; namespace Test { namespace Gesv { -template +template struct Functor_TestBatchedSerialGesv { using execution_space = typename DeviceType::execution_space; const MatrixType _A; @@ -42,8 +41,7 @@ struct Functor_TestBatchedSerialGesv { const VectorType _B; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedSerialGesv(const MatrixType &A, const MatrixType &tmp, - const VectorType &X, const VectorType &B) + Functor_TestBatchedSerialGesv(const MatrixType &A, const MatrixType &tmp, const VectorType &X, const VectorType &B) : _A(A), _tmp(tmp), _X(X), _B(B) {} KOKKOS_INLINE_FUNCTION @@ -68,21 +66,18 @@ struct Functor_TestBatchedSerialGesv { } }; -template +template void impl_test_batched_gesv(const int N, const int BlkSize) { typedef typename MatrixType::value_type value_type; typedef Kokkos::ArithTraits ats; using MagnitudeType = typename Kokkos::ArithTraits::mag_type; - using NormViewType = - Kokkos::View; + using NormViewType = Kokkos::View; NormViewType sqr_norm_j("sqr_norm_j", N); auto 
sqr_norm_j_host = Kokkos::create_mirror_view(sqr_norm_j); - MatrixType A("A", N, BlkSize, BlkSize), A2("A", N, BlkSize, BlkSize), - tmp("tmp", N, BlkSize, BlkSize + 4); + MatrixType A("A", N, BlkSize, BlkSize), A2("A", N, BlkSize, BlkSize), tmp("tmp", N, BlkSize, BlkSize + 4); VectorType B("b", N, BlkSize), B2("b", N, BlkSize), X("x", N, BlkSize); create_tridiagonal_batched_matrices(A, B); @@ -98,23 +93,18 @@ void impl_test_batched_gesv(const int N, const int BlkSize) { Kokkos::fence(); - Functor_TestBatchedSerialGesv(A, tmp, X, B) - .run(); + Functor_TestBatchedSerialGesv(A, tmp, X, B).run(); Kokkos::fence(); Kokkos::deep_copy(X_host, X); for (int l = 0; l < N; ++l) - KokkosBlas::SerialGemv:: - invoke(-1, Kokkos::subview(A_host, l, Kokkos::ALL, Kokkos::ALL), - Kokkos::subview(X_host, l, Kokkos::ALL), 1, - Kokkos::subview(B_host, l, Kokkos::ALL)); + KokkosBlas::SerialGemv::invoke( + -1, Kokkos::subview(A_host, l, Kokkos::ALL, Kokkos::ALL), Kokkos::subview(X_host, l, Kokkos::ALL), 1, + Kokkos::subview(B_host, l, Kokkos::ALL)); - KokkosBatched::SerialDot::invoke(B_host, B_host, - sqr_norm_j_host); + KokkosBatched::SerialDot::invoke(B_host, B_host, sqr_norm_j_host); const MagnitudeType eps = 1.0e3 * ats::epsilon(); @@ -127,27 +117,21 @@ template int test_batched_gesv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - MatrixType; - typedef Kokkos::View - VectorType; + typedef Kokkos::View MatrixType; + typedef Kokkos::View VectorType; for (int i = 3; i < 10; ++i) { - Test::Gesv::impl_test_batched_gesv(1024, i); + Test::Gesv::impl_test_batched_gesv(1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - MatrixType; - typedef Kokkos::View - VectorType; + typedef Kokkos::View MatrixType; + typedef Kokkos::View VectorType; for (int i = 3; i < 10; ++i) { - Test::Gesv::impl_test_batched_gesv(1024, i); + Test::Gesv::impl_test_batched_gesv(1024, i); } } #endif diff --git 
a/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp b/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp index 23ded73e25..6f11154471 100644 --- a/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp @@ -19,14 +19,14 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -//#include "KokkosBatched_Vector.hpp" +// #include "KokkosBatched_Vector.hpp" #include "KokkosBatched_Gemm_Decl.hpp" #include "KokkosBatched_Gemm_Serial_Impl.hpp" #include "KokkosBatched_LU_Decl.hpp" #include "KokkosBatched_LU_Serial_Impl.hpp" #include "KokkosBatched_InverseLU_Decl.hpp" -//#include "KokkosBatched_InverseLU_Serial_Impl.hpp" +// #include "KokkosBatched_InverseLU_Serial_Impl.hpp" #include "KokkosKernels_TestUtils.hpp" @@ -41,8 +41,7 @@ struct ParamTag { typedef TB transB; }; -template +template struct Functor_BatchedSerialGemm { using execution_space = typename DeviceType::execution_space; ViewType _a, _b, _c; @@ -50,8 +49,7 @@ struct Functor_BatchedSerialGemm { ScalarType _alpha, _beta; KOKKOS_INLINE_FUNCTION - Functor_BatchedSerialGemm(const ScalarType alpha, const ViewType &a, - const ViewType &b, const ScalarType beta, + Functor_BatchedSerialGemm(const ScalarType alpha, const ViewType &a, const ViewType &b, const ScalarType beta, const ViewType &c) : _a(a), _b(b), _c(c), _alpha(alpha), _beta(beta) {} @@ -63,8 +61,8 @@ struct Functor_BatchedSerialGemm { for (int i = 0; i < static_cast(aa.extent(0)); ++i) aa(i, i) += 10.0; - SerialGemm::invoke(_alpha, aa, bb, _beta, cc); + SerialGemm::invoke(_alpha, aa, bb, _beta, + cc); } inline void run() { @@ -108,16 +106,14 @@ struct Functor_BatchedSerialLU { } }; -template +template struct Functor_TestBatchedSerialInverseLU { using execution_space = typename DeviceType::execution_space; AViewType _a; WViewType _w; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedSerialInverseLU(const AViewType &a, const WViewType &w) - : _a(a), _w(w) {} + 
Functor_TestBatchedSerialInverseLU(const AViewType &a, const WViewType &w) : _a(a), _w(w) {} KOKKOS_INLINE_FUNCTION void operator()(const int k) const { @@ -139,8 +135,7 @@ struct Functor_TestBatchedSerialInverseLU { } }; -template +template void impl_test_batched_inverselu(const int N, const int BlkSize) { typedef typename AViewType::value_type value_type; typedef Kokkos::ArithTraits ats; @@ -151,8 +146,7 @@ void impl_test_batched_inverselu(const int N, const int BlkSize) { WViewType w("w", N, BlkSize * BlkSize); AViewType c0("c0", N, BlkSize, BlkSize); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a0, random, value_type(1.0)); Kokkos::fence(); @@ -162,16 +156,12 @@ void impl_test_batched_inverselu(const int N, const int BlkSize) { Functor_BatchedSerialLU(a1).run(); - Functor_TestBatchedSerialInverseLU(a1, w) - .run(); + Functor_TestBatchedSerialInverseLU(a1, w).run(); value_type alpha = 1.0, beta = 0.0; - typedef SerialInverseLU::ParamTag - param_tag_type; + typedef SerialInverseLU::ParamTag param_tag_type; - Functor_BatchedSerialGemm(alpha, a0, a1, beta, c0) + Functor_BatchedSerialGemm(alpha, a0, a1, beta, c0) .run(); Kokkos::fence(); @@ -202,31 +192,21 @@ template int test_batched_inverselu() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - AViewType; - typedef Kokkos::View - WViewType; - Test::SerialInverseLU::impl_test_batched_inverselu( - 0, 10); + typedef Kokkos::View AViewType; + typedef Kokkos::View WViewType; + Test::SerialInverseLU::impl_test_batched_inverselu(0, 10); for (int i = 0; i < 10; ++i) { - Test::SerialInverseLU::impl_test_batched_inverselu< - DeviceType, AViewType, WViewType, AlgoTagType>(1024, i); + Test::SerialInverseLU::impl_test_batched_inverselu(1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - AViewType; - typedef Kokkos::View - WViewType; - Test::SerialInverseLU::impl_test_batched_inverselu( - 0, 
10); + typedef Kokkos::View AViewType; + typedef Kokkos::View WViewType; + Test::SerialInverseLU::impl_test_batched_inverselu(0, 10); for (int i = 0; i < 10; ++i) { - Test::SerialInverseLU::impl_test_batched_inverselu< - DeviceType, AViewType, WViewType, AlgoTagType>(1024, i); + Test::SerialInverseLU::impl_test_batched_inverselu(1024, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialInverseLU_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialInverseLU_Complex.hpp index 243ed21908..01e6372471 100644 --- a/batched/dense/unit_test/Test_Batched_SerialInverseLU_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialInverseLU_Complex.hpp @@ -18,11 +18,9 @@ TEST_F(TestCategory, batched_scalar_serial_inverselu_dcomplex) { // printf("Batched serial inverse LU - double complex - algorithm type: // Unblocked\n"); - test_batched_inverselu, - Algo::InverseLU::Unblocked>(); + test_batched_inverselu, Algo::InverseLU::Unblocked>(); // printf("Batched serial inverse LU - double complex - algorithm type: // Blocked\n"); - test_batched_inverselu, - Algo::InverseLU::Blocked>(); + test_batched_inverselu, Algo::InverseLU::Blocked>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialLU.hpp b/batched/dense/unit_test/Test_Batched_SerialLU.hpp index 87224aa888..33e079dd9b 100644 --- a/batched/dense/unit_test/Test_Batched_SerialLU.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialLU.hpp @@ -19,7 +19,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -//#include "KokkosBatched_Vector.hpp" +// #include "KokkosBatched_Vector.hpp" #include "KokkosBatched_LU_Decl.hpp" #include "KokkosBatched_LU_Serial_Impl.hpp" @@ -67,16 +67,14 @@ void impl_test_batched_lu(const int N, const int BlkSize) { /// randomized input testing views ViewType a0("a0", N, BlkSize, BlkSize), a1("a1", N, BlkSize, BlkSize); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a0, random, 
value_type(1.0)); Kokkos::fence(); Kokkos::deep_copy(a1, a0); - Functor_TestBatchedSerialLU(a0) - .run(); + Functor_TestBatchedSerialLU(a0).run(); Functor_TestBatchedSerialLU(a1).run(); Kokkos::fence(); @@ -107,8 +105,7 @@ template int test_batched_lu() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - ViewType; + typedef Kokkos::View ViewType; Test::impl_test_batched_lu(0, 10); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); @@ -118,8 +115,7 @@ int test_batched_lu() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; + typedef Kokkos::View ViewType; Test::impl_test_batched_lu(0, 10); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); diff --git a/batched/dense/unit_test/Test_Batched_SerialPttrf.hpp b/batched/dense/unit_test/Test_Batched_SerialPttrf.hpp index 6ee7818ddc..11274fc311 100644 --- a/batched/dense/unit_test/Test_Batched_SerialPttrf.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialPttrf.hpp @@ -27,16 +27,14 @@ using namespace KokkosBatched; namespace Test { namespace Pttrf { -template +template struct Functor_BatchedSerialPttrf { using execution_space = typename DeviceType::execution_space; DViewType _d; EViewType _e; KOKKOS_INLINE_FUNCTION - Functor_BatchedSerialPttrf(const DViewType &d, const EViewType &e) - : _d(d), _e(e) {} + Functor_BatchedSerialPttrf(const DViewType &d, const EViewType &e) : _d(d), _e(e) {} KOKKOS_INLINE_FUNCTION void operator()(const int k, int &info) const { @@ -60,8 +58,8 @@ struct Functor_BatchedSerialPttrf { } }; -template +template struct Functor_BatchedSerialGemm { using execution_space = typename DeviceType::execution_space; AViewType _a; @@ -70,8 +68,7 @@ struct Functor_BatchedSerialGemm { ScalarType _alpha, _beta; KOKKOS_INLINE_FUNCTION - Functor_BatchedSerialGemm(const ScalarType alpha, const AViewType &a, - const BViewType &b, const ScalarType beta, + Functor_BatchedSerialGemm(const ScalarType 
alpha, const AViewType &a, const BViewType &b, const ScalarType beta, const CViewType &c) : _a(a), _b(b), _c(c), _alpha(alpha), _beta(beta) {} @@ -81,9 +78,7 @@ struct Functor_BatchedSerialGemm { auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); auto cc = Kokkos::subview(_c, k, Kokkos::ALL(), Kokkos::ALL()); - KokkosBatched::SerialGemm::invoke(_alpha, aa, bb, - _beta, cc); + KokkosBatched::SerialGemm::invoke(_alpha, aa, bb, _beta, cc); } inline void run() { @@ -96,8 +91,7 @@ struct Functor_BatchedSerialGemm { } }; -template +template /// \brief Implementation details of batched pttrf test for random matrix /// /// \param N [in] Batch size of matrix A @@ -109,16 +103,13 @@ void impl_test_batched_pttrf(const int N, const int BlkSize) { using View2DType = Kokkos::View; using View3DType = Kokkos::View; - View3DType A("A", N, BlkSize, BlkSize), - A_reconst("A_reconst", N, BlkSize, BlkSize); - View3DType EL("EL", N, BlkSize, BlkSize), EU("EU", N, BlkSize, BlkSize), - D("D", N, BlkSize, BlkSize), LD("LD", N, BlkSize, BlkSize), - L("L", N, BlkSize, BlkSize), I("I", N, BlkSize, BlkSize); + View3DType A("A", N, BlkSize, BlkSize), A_reconst("A_reconst", N, BlkSize, BlkSize); + View3DType EL("EL", N, BlkSize, BlkSize), EU("EU", N, BlkSize, BlkSize), D("D", N, BlkSize, BlkSize), + LD("LD", N, BlkSize, BlkSize), L("L", N, BlkSize, BlkSize), I("I", N, BlkSize, BlkSize); RealView2DType d("d", N, BlkSize), // Diagonal components ones(Kokkos::view_alloc("ones", Kokkos::WithoutInitializing), N, BlkSize); - View2DType e_upper("e_upper", N, BlkSize - 1), - e_lower("e_lower", N, - BlkSize - 1); // upper and lower diagonal components + View2DType e_upper("e_upper", N, BlkSize - 1), e_lower("e_lower", N, + BlkSize - 1); // upper and lower diagonal components using execution_space = typename DeviceType::execution_space; Kokkos::Random_XorShift64_Pool rand_pool(13718); @@ -129,19 +120,16 @@ void impl_test_batched_pttrf(const int N, const int BlkSize) { 
KokkosKernels::Impl::getRandomBounds(1.0, randStart, randEnd); // Add BlkSize to ensure positive definiteness - Kokkos::fill_random(d, rand_pool, realRandStart + BlkSize, - realRandEnd + BlkSize); + Kokkos::fill_random(d, rand_pool, realRandStart + BlkSize, realRandEnd + BlkSize); Kokkos::fill_random(e_upper, rand_pool, randStart, randEnd); - auto h_e_upper = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), e_upper); + auto h_e_upper = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), e_upper); auto h_e_lower = Kokkos::create_mirror_view(e_lower); for (int ib = 0; ib < N; ib++) { for (int i = 0; i < BlkSize - 1; i++) { // Fill the lower diagonal with conjugate of the upper diagonal - h_e_lower(ib, i) = - Kokkos::ArithTraits::conj(h_e_upper(ib, i)); + h_e_lower(ib, i) = Kokkos::ArithTraits::conj(h_e_upper(ib, i)); } } @@ -157,23 +145,21 @@ void impl_test_batched_pttrf(const int N, const int BlkSize) { // Matrix matrix addition by Gemm // D + EU by D * I + EU (result stored in EU) - Functor_BatchedSerialGemm(1.0, D, I, 1.0, EU) + Functor_BatchedSerialGemm(1.0, D, I, + 1.0, EU) .run(); // Copy EL to A Kokkos::deep_copy(A, EL); // EU + EL by EU * I + A (result stored in A) - Functor_BatchedSerialGemm(1.0, EU, I, 1.0, A) + Functor_BatchedSerialGemm(1.0, EU, I, + 1.0, A) .run(); // Factorize matrix A -> L * D * L**H // d and e are updated by pttrf - auto info = Functor_BatchedSerialPttrf(d, e_lower) - .run(); + auto info = Functor_BatchedSerialPttrf(d, e_lower).run(); Kokkos::fence(); @@ -189,14 +175,14 @@ void impl_test_batched_pttrf(const int N, const int BlkSize) { Kokkos::deep_copy(L, I); // EL + I by EL * I + L (result stored in L) - Functor_BatchedSerialGemm(1.0, EL, I, 1.0, L) + Functor_BatchedSerialGemm(1.0, EL, I, + 1.0, L) .run(); // Reconstruct A by L*D*L**H // Gemm to compute L*D -> LD - Functor_BatchedSerialGemm(1.0, L, D, 0.0, LD) + Functor_BatchedSerialGemm(1.0, L, D, + 0.0, LD) .run(); // FIXME: We should use SerialGemm 
Trans::ConjTranspose. @@ -222,9 +208,8 @@ void impl_test_batched_pttrf(const int N, const int BlkSize) { Kokkos::deep_copy(L, h_L); // Gemm to compute (L*D)*(conj(L))**T -> A_reconst - Functor_BatchedSerialGemm(1.0, LD, L, 0.0, - A_reconst) + Functor_BatchedSerialGemm( + 1.0, LD, L, 0.0, A_reconst) .run(); Kokkos::fence(); @@ -232,9 +217,8 @@ void impl_test_batched_pttrf(const int N, const int BlkSize) { // this eps is about 10^-14 RealType eps = 1.0e3 * ats::epsilon(); - auto h_A = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A); - auto h_A_reconst = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_reconst); + auto h_A = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A); + auto h_A_reconst = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_reconst); // Check A = L*D*L**H for (int ib = 0; ib < N; ib++) { @@ -246,8 +230,7 @@ void impl_test_batched_pttrf(const int N, const int BlkSize) { } } -template +template /// \brief Implementation details of batched pttrf test for early return /// BlkSize must be 0 or 1 /// @@ -263,8 +246,7 @@ void impl_test_batched_pttrf_quick_return(const int N, const int BlkSize) { const int BlkSize_minus_1 = BlkSize > 0 ? 
BlkSize - 1 : 0; - RealView2DType d("d", N, BlkSize), - d2("d2", N, BlkSize); // Diagonal components + RealView2DType d("d", N, BlkSize), d2("d2", N, BlkSize); // Diagonal components View2DType e("e", N, BlkSize_minus_1); // lower diagonal components @@ -277,14 +259,10 @@ void impl_test_batched_pttrf_quick_return(const int N, const int BlkSize) { // Factorize matrix A -> L * D * L**H // d and e are updated by pttrf // Early return if BlkSize is 0 or 1 - auto info = Functor_BatchedSerialPttrf(d, e) - .run(); + auto info = Functor_BatchedSerialPttrf(d, e).run(); // For negative values, info should be 1 for BlkSize = 1 - auto info2 = Functor_BatchedSerialPttrf(d2, e) - .run(); + auto info2 = Functor_BatchedSerialPttrf(d2, e).run(); Kokkos::fence(); @@ -307,8 +285,7 @@ void impl_test_batched_pttrf_quick_return(const int N, const int BlkSize) { } } -template +template /// \brief Implementation details of batched pttrf test /// /// \param N [in] Batch size of matrix A @@ -320,11 +297,9 @@ void impl_test_batched_pttrf_analytical(const int N, const int BlkSize) { using View2DType = Kokkos::View; using View3DType = Kokkos::View; - View3DType A("A", N, BlkSize, BlkSize), - A_reconst("A_reconst", N, BlkSize, BlkSize); - View3DType EL("EL", N, BlkSize, BlkSize), EU("EU", N, BlkSize, BlkSize), - D("D", N, BlkSize, BlkSize), LD("LD", N, BlkSize, BlkSize), - L("L", N, BlkSize, BlkSize), I("I", N, BlkSize, BlkSize); + View3DType A("A", N, BlkSize, BlkSize), A_reconst("A_reconst", N, BlkSize, BlkSize); + View3DType EL("EL", N, BlkSize, BlkSize), EU("EU", N, BlkSize, BlkSize), D("D", N, BlkSize, BlkSize), + LD("LD", N, BlkSize, BlkSize), L("L", N, BlkSize, BlkSize), I("I", N, BlkSize, BlkSize); RealView2DType d(Kokkos::view_alloc("d", Kokkos::WithoutInitializing), N, BlkSize), // Diagonal components ones(Kokkos::view_alloc("ones", Kokkos::WithoutInitializing), N, BlkSize); @@ -344,23 +319,21 @@ void impl_test_batched_pttrf_analytical(const int N, const int BlkSize) { // Matrix 
matrix addition by Gemm // D + EU by D * I + EU (result stored in EU) - Functor_BatchedSerialGemm(1.0, D, I, 1.0, EU) + Functor_BatchedSerialGemm(1.0, D, I, + 1.0, EU) .run(); // Copy EL to A Kokkos::deep_copy(A, EL); // EU + EL by EU * I + A (result stored in A) - Functor_BatchedSerialGemm(1.0, EU, I, 1.0, A) + Functor_BatchedSerialGemm(1.0, EU, I, + 1.0, A) .run(); // Factorize matrix A -> L * D * L**T // d and e are updated by pttrf - auto info = Functor_BatchedSerialPttrf(d, e) - .run(); + auto info = Functor_BatchedSerialPttrf(d, e).run(); Kokkos::fence(); @@ -376,20 +349,19 @@ void impl_test_batched_pttrf_analytical(const int N, const int BlkSize) { Kokkos::deep_copy(L, I); // EL + I by EL * I + L (result stored in L) - Functor_BatchedSerialGemm(1.0, EL, I, 1.0, L) + Functor_BatchedSerialGemm(1.0, EL, I, + 1.0, L) .run(); // Reconstruct A by L*D*L**T // Gemm to compute L*D -> LD - Functor_BatchedSerialGemm(1.0, L, D, 0.0, LD) + Functor_BatchedSerialGemm(1.0, L, D, + 0.0, LD) .run(); // Gemm to compute (L*D)*L**T -> A_reconst - Functor_BatchedSerialGemm(1.0, LD, L, 0.0, - A_reconst) + Functor_BatchedSerialGemm( + 1.0, LD, L, 0.0, A_reconst) .run(); Kokkos::fence(); @@ -397,9 +369,8 @@ void impl_test_batched_pttrf_analytical(const int N, const int BlkSize) { // this eps is about 10^-14 RealType eps = 1.0e3 * ats::epsilon(); - auto h_A = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A); - auto h_A_reconst = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_reconst); + auto h_A = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A); + auto h_A_reconst = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_reconst); // Check A = L*D*L.T for (int ib = 0; ib < N; ib++) { @@ -420,22 +391,14 @@ int test_batched_pttrf() { { using LayoutType = Kokkos::LayoutLeft; for (int i = 0; i < 2; i++) { - Test::Pttrf::impl_test_batched_pttrf_quick_return< - DeviceType, ScalarType, LayoutType, AlgoTagType>(1, i); - 
Test::Pttrf::impl_test_batched_pttrf_quick_return< - DeviceType, ScalarType, LayoutType, AlgoTagType>(2, i); + Test::Pttrf::impl_test_batched_pttrf_quick_return(1, i); + Test::Pttrf::impl_test_batched_pttrf_quick_return(2, i); } for (int i = 2; i < 10; i++) { - Test::Pttrf::impl_test_batched_pttrf(1, i); - Test::Pttrf::impl_test_batched_pttrf(2, i); - Test::Pttrf::impl_test_batched_pttrf_analytical( - 1, i); - Test::Pttrf::impl_test_batched_pttrf_analytical( - 2, i); + Test::Pttrf::impl_test_batched_pttrf(1, i); + Test::Pttrf::impl_test_batched_pttrf(2, i); + Test::Pttrf::impl_test_batched_pttrf_analytical(1, i); + Test::Pttrf::impl_test_batched_pttrf_analytical(2, i); } } #endif @@ -443,22 +406,14 @@ int test_batched_pttrf() { { using LayoutType = Kokkos::LayoutRight; for (int i = 0; i < 2; i++) { - Test::Pttrf::impl_test_batched_pttrf_quick_return< - DeviceType, ScalarType, LayoutType, AlgoTagType>(1, i); - Test::Pttrf::impl_test_batched_pttrf_quick_return< - DeviceType, ScalarType, LayoutType, AlgoTagType>(2, i); + Test::Pttrf::impl_test_batched_pttrf_quick_return(1, i); + Test::Pttrf::impl_test_batched_pttrf_quick_return(2, i); } for (int i = 2; i < 10; i++) { - Test::Pttrf::impl_test_batched_pttrf(1, i); - Test::Pttrf::impl_test_batched_pttrf(2, i); - Test::Pttrf::impl_test_batched_pttrf_analytical( - 1, i); - Test::Pttrf::impl_test_batched_pttrf_analytical( - 2, i); + Test::Pttrf::impl_test_batched_pttrf(1, i); + Test::Pttrf::impl_test_batched_pttrf(2, i); + Test::Pttrf::impl_test_batched_pttrf_analytical(1, i); + Test::Pttrf::impl_test_batched_pttrf_analytical(2, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialSVD.hpp b/batched/dense/unit_test/Test_Batched_SerialSVD.hpp index 099fa9219f..9bf9d43578 100644 --- a/batched/dense/unit_test/Test_Batched_SerialSVD.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialSVD.hpp @@ -70,8 +70,7 @@ void verifyOrthogonal(const Mat& X) { } template -void verifySVD(const AView& A, const UView& U, const 
VtView& Vt, - const SigmaView& sigma) { +void verifySVD(const AView& A, const UView& U, const VtView& Vt, const SigmaView& sigma) { using Scalar = typename AView::non_const_value_type; using KAT = Kokkos::ArithTraits; // Check that U/V columns are unit length and orthogonal, and that U * @@ -85,10 +84,8 @@ void verifySVD(const AView& A, const UView& U, const VtView& Vt, verifyOrthogonal(Vt); Kokkos::View usvt("USV^T", m, n); for (int i = 0; i < maxrank; i++) { - auto Ucol = - Kokkos::subview(U, Kokkos::ALL(), Kokkos::make_pair(i, i + 1)); - auto Vtrow = - Kokkos::subview(Vt, Kokkos::make_pair(i, i + 1), Kokkos::ALL()); + auto Ucol = Kokkos::subview(U, Kokkos::ALL(), Kokkos::make_pair(i, i + 1)); + auto Vtrow = Kokkos::subview(Vt, Kokkos::make_pair(i, i + 1), Kokkos::ALL()); Test::vanillaGEMM(sigma(i), Ucol, Vtrow, 1.0, usvt); } for (int i = 0; i < m; i++) { @@ -113,8 +110,7 @@ Matrix createRandomMatrix(int m, int n, int deficiency, double maxval = 1.0) { auto mhost = Kokkos::create_mirror_view(mat); // Fill mat with random values first if (maxval != 0.0) { - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); Scalar minrand, maxrand; Test::getRandomBounds(maxval, minrand, maxrand); Kokkos::fill_random(mhost, rand_pool, minrand, maxrand); @@ -143,15 +139,14 @@ Matrix createRandomMatrix(int m, int n, int deficiency, double maxval = 1.0) { template struct SerialSVDFunctor_Full { - SerialSVDFunctor_Full(const Matrix& A_, const Matrix& U_, const Matrix& Vt_, - const Vector& sigma_, const Vector& work_) + SerialSVDFunctor_Full(const Matrix& A_, const Matrix& U_, const Matrix& Vt_, const Vector& sigma_, + const Vector& work_) : A(A_), U(U_), Vt(Vt_), sigma(sigma_), work(work_) {} // NOTE: this functor is only meant to be launched with a single element range // policy KOKKOS_INLINE_FUNCTION void operator()(int) const { - KokkosBatched::SerialSVD::invoke(KokkosBatched::SVD_USV_Tag(), A, U, sigma, - Vt, work); + 
KokkosBatched::SerialSVD::invoke(KokkosBatched::SVD_USV_Tag(), A, U, sigma, Vt, work); } Matrix A; @@ -163,15 +158,13 @@ struct SerialSVDFunctor_Full { template struct SerialSVDFunctor_SingularValuesOnly { - SerialSVDFunctor_SingularValuesOnly(const Matrix& A_, const Vector& sigma_, - const Vector& work_) + SerialSVDFunctor_SingularValuesOnly(const Matrix& A_, const Vector& sigma_, const Vector& work_) : A(A_), sigma(sigma_), work(work_) {} // NOTE: this functor is only meant to be launched with a single element range // policy KOKKOS_INLINE_FUNCTION void operator()(int) const { - KokkosBatched::SerialSVD::invoke(KokkosBatched::SVD_S_Tag(), A, sigma, - work); + KokkosBatched::SerialSVD::invoke(KokkosBatched::SVD_S_Tag(), A, sigma, work); } Matrix A; @@ -201,14 +194,12 @@ void testSerialSVD(int m, int n, int deficiency, double maxval = 1.0) { typename Matrix::HostMirror Acopy("Acopy", m, n); Kokkos::deep_copy(Acopy, A); // Run the SVD - Kokkos::parallel_for( - Kokkos::RangePolicy(0, 1), - SerialSVDFunctor_Full(A, U, Vt, sigma, work)); + Kokkos::parallel_for(Kokkos::RangePolicy(0, 1), + SerialSVDFunctor_Full(A, U, Vt, sigma, work)); // Get the results back - auto Uhost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), U); - auto Vthost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Vt); - auto sigmaHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), sigma); + auto Uhost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), U); + auto Vthost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Vt); + auto sigmaHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), sigma); // Verify the SVD is correct verifySVD(Acopy, Uhost, Vthost, sigmaHost); } @@ -237,22 +228,17 @@ void testSerialSVDSingularValuesOnly(int m, int n) { typename Matrix::HostMirror Acopy("Acopy", m, n); Kokkos::deep_copy(Acopy, A); // Run the SVD (full mode) - Kokkos::parallel_for( - Kokkos::RangePolicy(0, 1), - SerialSVDFunctor_Full(A, U, Vt, 
sigma1, work)); + Kokkos::parallel_for(Kokkos::RangePolicy(0, 1), + SerialSVDFunctor_Full(A, U, Vt, sigma1, work)); Kokkos::deep_copy(A, Acopy); // Run the same SVD (singular values only mode) - Kokkos::parallel_for( - Kokkos::RangePolicy(0, 1), - SerialSVDFunctor_SingularValuesOnly(A, sigma2, work)); - auto sigma1Host = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), sigma1); - auto sigma2Host = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), sigma2); + Kokkos::parallel_for(Kokkos::RangePolicy(0, 1), + SerialSVDFunctor_SingularValuesOnly(A, sigma2, work)); + auto sigma1Host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), sigma1); + auto sigma2Host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), sigma2); // Make sure they match for (int i = 0; i < maxrank; i++) { - Test::EXPECT_NEAR_KK(sigma1Host(i), sigma2Host(i), - Test::svdEpsilon()); + Test::EXPECT_NEAR_KK(sigma1Host(i), sigma2Host(i), Test::svdEpsilon()); } } @@ -279,9 +265,8 @@ void testSerialSVDZeroLastRow(int n) { Matrix BVt("UBVt", n, n); Test::vanillaGEMM(1.0, B, Vt, 0.0, BVt); // Run the routine (just on host) - KokkosBatched::SerialSVDInternal::svdZeroLastColumn( - B.data(), n, B.stride(0), B.stride(1), Vt.data(), Vt.stride(0), - Vt.stride(1)); + KokkosBatched::SerialSVDInternal::svdZeroLastColumn(B.data(), n, B.stride(0), B.stride(1), Vt.data(), + Vt.stride(0), Vt.stride(1)); // Check that B is still bidiagonal (to a tight tolerance, but not exactly // zero) for (int i = 0; i < n; i++) { @@ -292,8 +277,7 @@ void testSerialSVDZeroLastRow(int n) { } } // Check that the last superdiagonal is now zero - Test::EXPECT_NEAR_KK(B(n - 2, n - 1), KAT::zero(), - Test::svdEpsilon()); + Test::EXPECT_NEAR_KK(B(n - 2, n - 1), KAT::zero(), Test::svdEpsilon()); // Check that the product is still maintained Matrix BVt2("UBVt", n, n); Test::vanillaGEMM(1.0, B, Vt, 0.0, BVt2); @@ -312,8 +296,8 @@ void testSerialSVDZeroDiagonal(int n, int row) { // Generate a bidiagonal matrix 
using Matrix = Kokkos::View; using KAT = Kokkos::ArithTraits; - int m = n + 2; // Make U somewhat bigger to make sure the Givens transforms - // are applied correctly + int m = n + 2; // Make U somewhat bigger to make sure the Givens transforms + // are applied correctly Matrix B = createRandomMatrix(m, n, 0, 1.0); // Zero out entries to make B bidiagonal for (int i = 0; i < m; i++) { @@ -331,9 +315,8 @@ void testSerialSVDZeroDiagonal(int n, int row) { Matrix UB("UB", m, n); Test::vanillaGEMM(1.0, U, B, 0.0, UB); // Run the routine (just on host) - KokkosBatched::SerialSVDInternal::svdZeroRow( - row, B.data(), n, B.stride(0), B.stride(1), U.data(), m, U.stride(0), - U.stride(1)); + KokkosBatched::SerialSVDInternal::svdZeroRow(row, B.data(), n, B.stride(0), B.stride(1), U.data(), m, + U.stride(0), U.stride(1)); // Check that B is still bidiagonal (to a tight tolerance, but not exactly // zero) for (int i = 0; i < m; i++) { @@ -381,12 +364,9 @@ void testSVD() { template KOKKOS_INLINE_FUNCTION constexpr auto Determinant(ViewT F) - -> std::enable_if_t::value && ViewT::rank == 2, - double> { - return (F(0, 0) * F(1, 1) * F(2, 2) + F(0, 1) * F(1, 2) * F(2, 0) + - F(0, 2) * F(1, 0) * F(2, 1) - - (F(0, 2) * F(1, 1) * F(2, 0) + F(0, 1) * F(1, 0) * F(2, 2) + - F(0, 0) * F(1, 2) * F(2, 1))); + -> std::enable_if_t::value && ViewT::rank == 2, double> { + return (F(0, 0) * F(1, 1) * F(2, 2) + F(0, 1) * F(1, 2) * F(2, 0) + F(0, 2) * F(1, 0) * F(2, 1) - + (F(0, 2) * F(1, 1) * F(2, 0) + F(0, 1) * F(1, 0) * F(2, 2) + F(0, 0) * F(1, 2) * F(2, 1))); } template @@ -411,39 +391,31 @@ void testIssue1786() { using execution_space = typename Device::execution_space; using memory_space = typename Device::memory_space; constexpr int num_tests = 4; - Kokkos::View matrices("data", - num_tests); + Kokkos::View matrices("data", num_tests); GenerateTestData(matrices); - Kokkos::View Us("Us", - matrices.extent(0)); - Kokkos::View Ss("Ss", matrices.extent(0)); - Kokkos::View Vts("Vts", - 
matrices.extent(0)); + Kokkos::View Us("Us", matrices.extent(0)); + Kokkos::View Ss("Ss", matrices.extent(0)); + Kokkos::View Vts("Vts", matrices.extent(0)); // Make sure the 2nd dimension of works is contiguous - Kokkos::View works( - "works", matrices.extent(0)); - Kokkos::View matrices_copy( - "matrices_copy", matrices.extent(0)); + Kokkos::View works("works", matrices.extent(0)); + Kokkos::View matrices_copy("matrices_copy", matrices.extent(0)); // make a copy of the input data to avoid overwriting it Kokkos::deep_copy(matrices_copy, matrices); auto policy = Kokkos::RangePolicy(0, matrices.extent(0)); Kokkos::parallel_for( "polar decomposition", policy, KOKKOS_LAMBDA(int i) { - auto matrix_copy = - Kokkos::subview(matrices_copy, i, Kokkos::ALL(), Kokkos::ALL()); - auto U = Kokkos::subview(Us, i, Kokkos::ALL(), Kokkos::ALL()); - auto S = Kokkos::subview(Ss, i, Kokkos::ALL()); - auto Vt = Kokkos::subview(Vts, i, Kokkos::ALL(), Kokkos::ALL()); - auto work = Kokkos::subview(works, i, Kokkos::ALL()); - KokkosBatched::SerialSVD::invoke(KokkosBatched::SVD_USV_Tag{}, - matrix_copy, U, S, Vt, work); + auto matrix_copy = Kokkos::subview(matrices_copy, i, Kokkos::ALL(), Kokkos::ALL()); + auto U = Kokkos::subview(Us, i, Kokkos::ALL(), Kokkos::ALL()); + auto S = Kokkos::subview(Ss, i, Kokkos::ALL()); + auto Vt = Kokkos::subview(Vts, i, Kokkos::ALL(), Kokkos::ALL()); + auto work = Kokkos::subview(works, i, Kokkos::ALL()); + KokkosBatched::SerialSVD::invoke(KokkosBatched::SVD_USV_Tag{}, matrix_copy, U, S, Vt, work); }); - auto Us_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, Us); - auto Ss_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, Ss); - auto Vts_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, Vts); - auto matrices_h = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, matrices); + auto Us_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, Us); + auto Ss_h = 
Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, Ss); + auto Vts_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, Vts); + auto matrices_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, matrices); for (int i = 0; i < num_tests; i++) { auto A = Kokkos::subview(matrices_h, i, Kokkos::ALL(), Kokkos::ALL()); auto U = Kokkos::subview(Us_h, i, Kokkos::ALL(), Kokkos::ALL()); diff --git a/batched/dense/unit_test/Test_Batched_SerialSolveLU.hpp b/batched/dense/unit_test/Test_Batched_SerialSolveLU.hpp index 43cb8fab2f..734eda28bd 100644 --- a/batched/dense/unit_test/Test_Batched_SerialSolveLU.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialSolveLU.hpp @@ -19,14 +19,14 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -//#include "KokkosBatched_Vector.hpp" +// #include "KokkosBatched_Vector.hpp" #include "KokkosBatched_Gemm_Decl.hpp" #include "KokkosBatched_Gemm_Serial_Impl.hpp" #include "KokkosBatched_LU_Decl.hpp" #include "KokkosBatched_LU_Serial_Impl.hpp" #include "KokkosBatched_SolveLU_Decl.hpp" -//#include "KokkosBatched_SolveLU_Serial_Impl.hpp" +// #include "KokkosBatched_SolveLU_Serial_Impl.hpp" #include "KokkosKernels_TestUtils.hpp" @@ -41,8 +41,7 @@ struct ParamTag { typedef TB transB; }; -template +template struct Functor_BatchedSerialGemm { using execution_space = typename DeviceType::execution_space; ViewType _a, _b, _c; @@ -50,8 +49,7 @@ struct Functor_BatchedSerialGemm { ScalarType _alpha, _beta; KOKKOS_INLINE_FUNCTION - Functor_BatchedSerialGemm(const ScalarType alpha, const ViewType &a, - const ViewType &b, const ScalarType beta, + Functor_BatchedSerialGemm(const ScalarType alpha, const ViewType &a, const ViewType &b, const ScalarType beta, const ViewType &c) : _a(a), _b(b), _c(c), _alpha(alpha), _beta(beta) {} @@ -63,8 +61,8 @@ struct Functor_BatchedSerialGemm { for (int i = 0; i < static_cast(aa.extent(0)); ++i) aa(i, i) += 10.0; - SerialGemm::invoke(_alpha, aa, bb, _beta, cc); + SerialGemm::invoke(_alpha, aa, bb, 
_beta, + cc); } inline void run() { @@ -108,16 +106,14 @@ struct Functor_BatchedSerialLU { } }; -template +template struct Functor_TestBatchedSerialSolveLU { using execution_space = typename DeviceType::execution_space; ViewType _a; ViewType _b; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedSerialSolveLU(const ViewType &a, const ViewType &b) - : _a(a), _b(b) {} + Functor_TestBatchedSerialSolveLU(const ViewType &a, const ViewType &b) : _a(a), _b(b) {} KOKKOS_INLINE_FUNCTION void operator()(const int k) const { @@ -152,8 +148,7 @@ void impl_test_batched_solvelu(const int N, const int BlkSize) { // ViewType a0_T("a0_T", N, BlkSize, BlkSize); // ViewType b_T ("b_T", N, BlkSize, 5 ); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a0, random, value_type(1.0)); Kokkos::fill_random(x0, random, value_type(1.0)); @@ -165,15 +160,12 @@ void impl_test_batched_solvelu(const int N, const int BlkSize) { value_type alpha = 1.0, beta = 0.0; typedef ParamTag param_tag_type; - Functor_BatchedSerialGemm(alpha, a0, x0, beta, b) + Functor_BatchedSerialGemm(alpha, a0, x0, beta, b) .run(); Functor_BatchedSerialLU(a1).run(); - Functor_TestBatchedSerialSolveLU(a1, b) - .run(); + Functor_TestBatchedSerialSolveLU(a1, b).run(); Kokkos::fence(); @@ -230,25 +222,19 @@ template int test_batched_solvelu() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - ViewType; - Test::SerialSolveLU::impl_test_batched_solvelu(0, 10); + typedef Kokkos::View ViewType; + Test::SerialSolveLU::impl_test_batched_solvelu(0, 10); for (int i = 0; i < 10; ++i) { - Test::SerialSolveLU::impl_test_batched_solvelu(1024, i); + Test::SerialSolveLU::impl_test_batched_solvelu(1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - Test::SerialSolveLU::impl_test_batched_solvelu(0, 10); + typedef Kokkos::View ViewType; + Test::SerialSolveLU::impl_test_batched_solvelu(0, 10); for (int i = 0; i 
< 10; ++i) { - Test::SerialSolveLU::impl_test_batched_solvelu(1024, i); + Test::SerialSolveLU::impl_test_batched_solvelu(1024, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialSolveLU_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialSolveLU_Complex.hpp index 6eaf9ca5aa..66a99e28d2 100644 --- a/batched/dense/unit_test/Test_Batched_SerialSolveLU_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialSolveLU_Complex.hpp @@ -18,11 +18,9 @@ TEST_F(TestCategory, batched_scalar_serial_solvelu_dcomplex) { // printf("Batched serial solveLU - double complex - algorithm type: // Unblocked\n"); - test_batched_solvelu, - Algo::SolveLU::Unblocked>(); + test_batched_solvelu, Algo::SolveLU::Unblocked>(); // printf("Batched serial solveLU - double complex - algorithm type: // Blocked\n"); - test_batched_solvelu, - Algo::SolveLU::Blocked>(); + test_batched_solvelu, Algo::SolveLU::Blocked>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTbsv.hpp b/batched/dense/unit_test/Test_Batched_SerialTbsv.hpp index 572e02053b..cd52235dd6 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTbsv.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTbsv.hpp @@ -34,8 +34,8 @@ struct ParamTag { using diag = D; }; -template +template struct Functor_BatchedSerialTrsv { using execution_space = typename DeviceType::execution_space; AViewType _a; @@ -44,8 +44,7 @@ struct Functor_BatchedSerialTrsv { ScalarType _alpha; KOKKOS_INLINE_FUNCTION - Functor_BatchedSerialTrsv(const ScalarType alpha, const AViewType &a, - const BViewType &b) + Functor_BatchedSerialTrsv(const ScalarType alpha, const AViewType &a, const BViewType &b) : _a(a), _b(b), _alpha(alpha) {} KOKKOS_INLINE_FUNCTION @@ -53,9 +52,8 @@ struct Functor_BatchedSerialTrsv { auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(_b, k, Kokkos::ALL()); - KokkosBatched::SerialTrsv< - typename ParamTagType::uplo, typename ParamTagType::trans, - typename 
ParamTagType::diag, AlgoTagType>::invoke(_alpha, aa, bb); + KokkosBatched::SerialTrsv::invoke(_alpha, aa, bb); } inline void run() { @@ -68,8 +66,7 @@ struct Functor_BatchedSerialTrsv { } }; -template +template struct Functor_BatchedSerialTbsv { using execution_space = typename DeviceType::execution_space; AViewType _a; @@ -77,17 +74,15 @@ struct Functor_BatchedSerialTbsv { int _k; KOKKOS_INLINE_FUNCTION - Functor_BatchedSerialTbsv(const AViewType &a, const BViewType &b, const int k) - : _a(a), _b(b), _k(k) {} + Functor_BatchedSerialTbsv(const AViewType &a, const BViewType &b, const int k) : _a(a), _b(b), _k(k) {} KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, const int k) const { auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(_b, k, Kokkos::ALL()); - KokkosBatched::SerialTbsv< - typename ParamTagType::uplo, typename ParamTagType::trans, - typename ParamTagType::diag, AlgoTagType>::invoke(aa, bb, _k); + KokkosBatched::SerialTbsv::invoke(aa, bb, _k); } inline void run() { @@ -102,8 +97,7 @@ struct Functor_BatchedSerialTbsv { } }; -template +template /// \brief Implementation details of batched tbsv test /// /// \param N [in] Batch size of RHS (banded matrix can also be batched matrix) @@ -111,8 +105,8 @@ template ; - using View3DType = Kokkos::View; + using View2DType = Kokkos::View; + using View3DType = Kokkos::View; // Reference is created by trsv from triangular matrix View3DType A("A", N, BlkSize, BlkSize), Ref("Ref", N, BlkSize, BlkSize); @@ -128,22 +122,16 @@ void impl_test_batched_tbsv(const int N, const int k, const int BlkSize) { Kokkos::deep_copy(x1, x0); // Create triangluar or banded matrix - create_banded_triangular_matrix(Ref, A, k, - false); - create_banded_triangular_matrix(Ref, Ab, k, - true); + create_banded_triangular_matrix(Ref, A, k, false); + create_banded_triangular_matrix(Ref, Ab, k, true); // Reference trsv - Functor_BatchedSerialTrsv(1.0, A, x0) + Functor_BatchedSerialTrsv(1.0, A, + 
x0) .run(); // tbsv - Functor_BatchedSerialTbsv(Ab, x1, k) - .run(); + Functor_BatchedSerialTbsv(Ab, x1, k).run(); Kokkos::fence(); @@ -162,17 +150,15 @@ void impl_test_batched_tbsv(const int N, const int k, const int BlkSize) { } } -template +template /// \brief Implementation details of batched tbsv test /// /// \param N [in] Batch size of RHS (banded matrix can also be batched matrix) void impl_test_batched_tbsv_analytical(const std::size_t N) { - using execution_space = typename DeviceType::execution_space; - using View2DType = Kokkos::View; - using StridedView2DType = - Kokkos::View; - using View3DType = Kokkos::View; + using execution_space = typename DeviceType::execution_space; + using View2DType = Kokkos::View; + using StridedView2DType = Kokkos::View; + using View3DType = Kokkos::View; // Reference is created by trsv from triangular matrix constexpr std::size_t BlkSize = 3, k = 2, incx = 2; @@ -187,8 +173,7 @@ void impl_test_batched_tbsv_analytical(const std::size_t N) { Kokkos::RangePolicy policy(0, N); Kokkos::parallel_for( - "KokkosBatched::Test::SerialTbsv::Initialize", policy, - KOKKOS_LAMBDA(const std::size_t ib) { + "KokkosBatched::Test::SerialTbsv::Initialize", policy, KOKKOS_LAMBDA(const std::size_t ib) { for (std::size_t i = 0; i < BlkSize; i++) { for (std::size_t j = 0; j < BlkSize; j++) { ref(ib, i, j) = i + 1; @@ -199,10 +184,8 @@ void impl_test_batched_tbsv_analytical(const std::size_t N) { x1(ib, j) = 1; } - if (std::is_same_v) { - if (std::is_same_v) { + if (std::is_same_v) { + if (std::is_same_v) { if (std::is_same_v) { x_ref(ib, 0) = 1.0 / 2.0; x_ref(ib, 1) = 1.0 / 6.0; @@ -224,8 +207,7 @@ void impl_test_batched_tbsv_analytical(const std::size_t N) { } } } else { - if (std::is_same_v) { + if (std::is_same_v) { if (std::is_same_v) { x_ref(ib, 0) = 1.0; x_ref(ib, 1) = -1.0 / 2.0; @@ -252,22 +234,14 @@ void impl_test_batched_tbsv_analytical(const std::size_t N) { Kokkos::fence(); // Create triangluar or banded matrix - 
create_banded_triangular_matrix(ref, A, k, - false); - create_banded_triangular_matrix(ref, Ab, k, - true); + create_banded_triangular_matrix(ref, A, k, false); + create_banded_triangular_matrix(ref, Ab, k, true); // tbsv - Functor_BatchedSerialTbsv(Ab, x0, k) - .run(); + Functor_BatchedSerialTbsv(Ab, x0, k).run(); // tbsv with incx == 2 - Functor_BatchedSerialTbsv(Ab, x1, k) - .run(); + Functor_BatchedSerialTbsv(Ab, x1, k).run(); Kokkos::fence(); @@ -280,8 +254,7 @@ void impl_test_batched_tbsv_analytical(const std::size_t N) { // Pack x1 into x0 for contiguous storage Kokkos::parallel_for( - "KokkosBatched::Test::SerialTbsv::Copy", policy, - KOKKOS_LAMBDA(const std::size_t ib) { + "KokkosBatched::Test::SerialTbsv::Copy", policy, KOKKOS_LAMBDA(const std::size_t ib) { for (std::size_t j = 0; j < BlkSize; j++) { x0(ib, j) = x1(ib, j); } @@ -295,8 +268,7 @@ void impl_test_batched_tbsv_analytical(const std::size_t N) { using mag_type = typename ats::mag_type; mag_type eps = 1.0e3 * ats::epsilon(); - auto h_x_ref = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), x_ref); + auto h_x_ref = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), x_ref); for (std::size_t ib = 0; ib < N; ib++) { for (std::size_t j = 0; j < BlkSize; j++) { // Check x0 = x_ref @@ -311,36 +283,27 @@ void impl_test_batched_tbsv_analytical(const std::size_t N) { } // namespace Tbsv } // namespace Test -template +template int test_batched_tbsv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { using LayoutType = Kokkos::LayoutLeft; - Test::Tbsv::impl_test_batched_tbsv_analytical< - DeviceType, ScalarType, LayoutType, ParamTagType, AlgoTagType>(0); - Test::Tbsv::impl_test_batched_tbsv_analytical< - DeviceType, ScalarType, LayoutType, ParamTagType, AlgoTagType>(1); - Test::Tbsv::impl_test_batched_tbsv(0, 1, 10); + Test::Tbsv::impl_test_batched_tbsv_analytical(0); + Test::Tbsv::impl_test_batched_tbsv_analytical(1); + Test::Tbsv::impl_test_batched_tbsv(0, 1, 10); for (int i = 0; i < 10; i++) 
{ - Test::Tbsv::impl_test_batched_tbsv(1, 1, i); + Test::Tbsv::impl_test_batched_tbsv(1, 1, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { using LayoutType = Kokkos::LayoutRight; - Test::Tbsv::impl_test_batched_tbsv_analytical< - DeviceType, ScalarType, LayoutType, ParamTagType, AlgoTagType>(0); - Test::Tbsv::impl_test_batched_tbsv_analytical< - DeviceType, ScalarType, LayoutType, ParamTagType, AlgoTagType>(1); - Test::Tbsv::impl_test_batched_tbsv(0, 1, 10); + Test::Tbsv::impl_test_batched_tbsv_analytical(0); + Test::Tbsv::impl_test_batched_tbsv_analytical(1); + Test::Tbsv::impl_test_batched_tbsv(0, 1, 10); for (int i = 0; i < 10; i++) { - Test::Tbsv::impl_test_batched_tbsv(1, 1, i); + Test::Tbsv::impl_test_batched_tbsv(1, 1, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTbsv_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialTbsv_Complex.hpp index 8789cc6931..005a6e92c0 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTbsv_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTbsv_Complex.hpp @@ -17,69 +17,53 @@ #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) // NO TRANSPOSE TEST_F(TestCategory, batched_serial_tbsv_l_nt_u_dcomplex) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; - test_batched_tbsv, param_tag_type, - algo_tag_type>(); + test_batched_tbsv, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_serial_tbsv_l_nt_n_dcomplex) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; - test_batched_tbsv, param_tag_type, - algo_tag_type>(); + test_batched_tbsv, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_serial_tbsv_u_nt_u_dcomplex) { - using 
param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; - test_batched_tbsv, param_tag_type, - algo_tag_type>(); + test_batched_tbsv, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_serial_tbsv_u_nt_n_dcomplex) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; - test_batched_tbsv, param_tag_type, - algo_tag_type>(); + test_batched_tbsv, param_tag_type, algo_tag_type>(); } // TRANSPOSE TEST_F(TestCategory, batched_serial_tbsv_l_t_u_dcomplex) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; - test_batched_tbsv, param_tag_type, - algo_tag_type>(); + test_batched_tbsv, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_serial_tbsv_l_t_n_dcomplex) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; - test_batched_tbsv, param_tag_type, - algo_tag_type>(); + test_batched_tbsv, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_serial_tbsv_u_t_u_dcomplex) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; - test_batched_tbsv, param_tag_type, - algo_tag_type>(); + test_batched_tbsv, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_serial_tbsv_u_t_n_dcomplex) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using 
algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; - test_batched_tbsv, param_tag_type, - algo_tag_type>(); + test_batched_tbsv, param_tag_type, algo_tag_type>(); } /* [FIXME] These tests need Trans::ConjTranspose in trsv. diff --git a/batched/dense/unit_test/Test_Batched_SerialTbsv_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialTbsv_Real.hpp index 8915b4ad05..c8f10adf5c 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTbsv_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTbsv_Real.hpp @@ -17,59 +17,51 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) // NO TRANSPOSE TEST_F(TestCategory, batched_serial_tbsv_l_nt_u_float) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; test_batched_tbsv(); } TEST_F(TestCategory, batched_serial_tbsv_l_nt_n_float) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; test_batched_tbsv(); } TEST_F(TestCategory, batched_serial_tbsv_u_nt_u_float) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; test_batched_tbsv(); } TEST_F(TestCategory, batched_serial_tbsv_u_nt_n_float) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; test_batched_tbsv(); } // TRANSPOSE TEST_F(TestCategory, batched_serial_tbsv_l_t_u_float) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type 
= typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; test_batched_tbsv(); } TEST_F(TestCategory, batched_serial_tbsv_l_t_n_float) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; test_batched_tbsv(); } TEST_F(TestCategory, batched_serial_tbsv_u_t_u_float) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; test_batched_tbsv(); } TEST_F(TestCategory, batched_serial_tbsv_u_t_n_float) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; test_batched_tbsv(); } @@ -78,59 +70,51 @@ TEST_F(TestCategory, batched_serial_tbsv_u_t_n_float) { #if defined(KOKKOSKERNELS_INST_DOUBLE) // NO TRANSPOSE TEST_F(TestCategory, batched_serial_tbsv_l_nt_u_double) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; test_batched_tbsv(); } TEST_F(TestCategory, batched_serial_tbsv_l_nt_n_double) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; test_batched_tbsv(); } TEST_F(TestCategory, batched_serial_tbsv_u_nt_u_double) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename 
Algo::Tbsv::Unblocked; test_batched_tbsv(); } TEST_F(TestCategory, batched_serial_tbsv_u_nt_n_double) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; test_batched_tbsv(); } // TRANSPOSE TEST_F(TestCategory, batched_serial_tbsv_l_t_u_double) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; test_batched_tbsv(); } TEST_F(TestCategory, batched_serial_tbsv_l_t_n_double) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; test_batched_tbsv(); } TEST_F(TestCategory, batched_serial_tbsv_u_t_u_double) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; test_batched_tbsv(); } TEST_F(TestCategory, batched_serial_tbsv_u_t_n_double) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; test_batched_tbsv(); } diff --git a/batched/dense/unit_test/Test_Batched_SerialTrmm.hpp b/batched/dense/unit_test/Test_Batched_SerialTrmm.hpp index 7a7e89ebf8..610f9e700a 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrmm.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrmm.hpp @@ -49,8 +49,7 @@ struct NonUnitDiagTRMM { KOKKOS_INLINE_FUNCTION void operator()(const int& i) const { A_(i, i) = A_(i, i) + 10; } }; -template +template struct VanillaGEMM { bool A_t, B_t, A_c, B_c; int N, 
K; @@ -67,12 +66,9 @@ struct VanillaGEMM { ScalarC beta; KOKKOS_INLINE_FUNCTION - void operator()( - const typename Kokkos::TeamPolicy::member_type& team) - const { + void operator()(const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && \ - !defined(__HIP_DEVICE_COMPILE__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int i = team.league_rank(); #else const int i = team.league_rank(); @@ -110,8 +106,7 @@ struct ParamTag { typedef D diag; }; -template +template struct Functor_TestBatchedSerialTrmm { using execution_space = typename DeviceType::execution_space; ViewType _a, _b; @@ -119,8 +114,7 @@ struct Functor_TestBatchedSerialTrmm { ScalarType _alpha; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedSerialTrmm(const ScalarType alpha, const ViewType& a, - const ViewType& b) + Functor_TestBatchedSerialTrmm(const ScalarType alpha, const ViewType& a, const ViewType& b) : _a(a), _b(b), _alpha(alpha) {} KOKKOS_INLINE_FUNCTION @@ -128,9 +122,8 @@ struct Functor_TestBatchedSerialTrmm { auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); - SerialTrmm::invoke(_alpha, aa, bb); + SerialTrmm::invoke(_alpha, aa, bb); } inline void run() { @@ -145,10 +138,8 @@ struct Functor_TestBatchedSerialTrmm { } }; -template -void impl_test_batched_trmm(const int N, const int nRows, const int nCols, - const char* trans) { +template +void impl_test_batched_trmm(const int N, const int nRows, const int nCols, const char* trans) { typedef typename ViewType::value_type value_type; typedef typename DeviceType::execution_space execution_space; typedef Kokkos::ArithTraits ats; @@ -156,56 +147,40 @@ void impl_test_batched_trmm(const int N, const int nRows, const int nCols, ScalarType alpha(1.0); ScalarType beta(0.0); - const bool is_side_right = - std::is_same::value; - const 
bool is_A_lower = - std::is_same::value; - const int K = is_side_right ? nCols : nRows; - ViewType A("A", N, K, K), B_actual("B_actual", N, nRows, nCols), - B_expected("B_expected", N, nRows, nCols); - typename ViewType::HostMirror A_host = Kokkos::create_mirror_view(A); - typename ViewType::HostMirror B_actual_host = - Kokkos::create_mirror_view(B_actual); - typename ViewType::HostMirror B_expected_host = - Kokkos::create_mirror_view(B_expected); - uint64_t seed = - std::chrono::high_resolution_clock::now().time_since_epoch().count(); - - using ViewTypeSubA = - decltype(Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL())); - using ViewTypeSubB = - decltype(Kokkos::subview(B_actual, 0, Kokkos::ALL(), Kokkos::ALL())); + const bool is_side_right = std::is_same::value; + const bool is_A_lower = std::is_same::value; + const int K = is_side_right ? nCols : nRows; + ViewType A("A", N, K, K), B_actual("B_actual", N, nRows, nCols), B_expected("B_expected", N, nRows, nCols); + typename ViewType::HostMirror A_host = Kokkos::create_mirror_view(A); + typename ViewType::HostMirror B_actual_host = Kokkos::create_mirror_view(B_actual); + typename ViewType::HostMirror B_expected_host = Kokkos::create_mirror_view(B_expected); + uint64_t seed = std::chrono::high_resolution_clock::now().time_since_epoch().count(); + + using ViewTypeSubA = decltype(Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL())); + using ViewTypeSubB = decltype(Kokkos::subview(B_actual, 0, Kokkos::ALL(), Kokkos::ALL())); Kokkos::Random_XorShift64_Pool rand_pool(seed); if (std::is_same::value) { // Initialize A with deterministic random numbers - Kokkos::fill_random(A, rand_pool, - Kokkos::rand, - ScalarType>::max()); + Kokkos::fill_random(A, rand_pool, Kokkos::rand, ScalarType>::max()); using functor_type = UnitDiagTRMM; for (int k = 0; k < N; ++k) { functor_type udtrmm(Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL())); // Initialize As diag with 1s - Kokkos::parallel_for("KokkosBlas::Test::UnitDiagTRMM", - 
Kokkos::RangePolicy(0, K), udtrmm); + Kokkos::parallel_for("KokkosBlas::Test::UnitDiagTRMM", Kokkos::RangePolicy(0, K), udtrmm); } } else { //(diag[0]=='N')||(diag[0]=='n') // Initialize A with random numbers - Kokkos::fill_random(A, rand_pool, - Kokkos::rand, - ScalarType>::max()); + Kokkos::fill_random(A, rand_pool, Kokkos::rand, ScalarType>::max()); using functor_type = NonUnitDiagTRMM; for (int k = 0; k < N; ++k) { functor_type nudtrmm(Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL())); // Initialize As diag with A(i,i)+10 - Kokkos::parallel_for("KokkosBlas::Test::NonUnitDiagTRMM", - Kokkos::RangePolicy(0, K), nudtrmm); + Kokkos::parallel_for("KokkosBlas::Test::NonUnitDiagTRMM", Kokkos::RangePolicy(0, K), nudtrmm); } } - Kokkos::fill_random(B_actual, rand_pool, - Kokkos::rand, - ScalarType>::max()); + Kokkos::fill_random(B_actual, rand_pool, Kokkos::rand, ScalarType>::max()); Kokkos::fence(); Kokkos::deep_copy(B_expected, B_actual); @@ -227,9 +202,7 @@ void impl_test_batched_trmm(const int N, const int nRows, const int nCols, if (!is_side_right) { // B_expected = alpha * op(A) * B + beta * C = 1 * op(A) * B + 0 * C - struct VanillaGEMM - vgemm; + struct VanillaGEMM vgemm; vgemm.A_t = (trans[0] != 'N') && (trans[0] != 'n'); vgemm.B_t = false; vgemm.A_c = (trans[0] == 'C') || (trans[0] == 'c'); @@ -244,15 +217,12 @@ void impl_test_batched_trmm(const int N, const int nRows, const int nCols, ; vgemm.C = Kokkos::subview(B_expected, i, Kokkos::ALL(), Kokkos::ALL()); ; - Kokkos::parallel_for( - "KokkosBlas::Test::VanillaGEMM", - Kokkos::TeamPolicy(nRows, Kokkos::AUTO, 16), vgemm); + Kokkos::parallel_for("KokkosBlas::Test::VanillaGEMM", + Kokkos::TeamPolicy(nRows, Kokkos::AUTO, 16), vgemm); } } else { // B_expected = alpha * B * op(A) + beta * C = 1 * B * op(A) + 0 * C - struct VanillaGEMM - vgemm; + struct VanillaGEMM vgemm; vgemm.A_t = false; vgemm.B_t = (trans[0] != 'N') && (trans[0] != 'n'); vgemm.A_c = false; @@ -267,14 +237,13 @@ void 
impl_test_batched_trmm(const int N, const int nRows, const int nCols, ; vgemm.C = Kokkos::subview(B_expected, i, Kokkos::ALL(), Kokkos::ALL()); ; - Kokkos::parallel_for( - "KokkosBlas::Test::VanillaGEMM", - Kokkos::TeamPolicy(nRows, Kokkos::AUTO, 16), vgemm); + Kokkos::parallel_for("KokkosBlas::Test::VanillaGEMM", + Kokkos::TeamPolicy(nRows, Kokkos::AUTO, 16), vgemm); } } - Functor_TestBatchedSerialTrmm(alpha, A, B_actual) + Functor_TestBatchedSerialTrmm(alpha, A, + B_actual) .run(); Kokkos::fence(); @@ -308,50 +277,35 @@ void impl_test_batched_trmm(const int N, const int nRows, const int nCols, } // namespace Trmm } // namespace Test -template +template int test_batched_trmm(int batchSize = 512) { - char trans = - std::is_same::value - ? 'N' - : std::is_same::value - ? 'T' - : std::is_same::value - ? 'C' - : 'E'; + char trans = std::is_same::value ? 'N' + : std::is_same::value ? 'T' + : std::is_same::value ? 'C' + : 'E'; #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { typedef Kokkos::View ViewType; - Test::Trmm::impl_test_batched_trmm(0, 10, 4, - &trans); + Test::Trmm::impl_test_batched_trmm(0, 10, 4, &trans); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::Trmm::impl_test_batched_trmm( - batchSize, i, 4, &trans); - Test::Trmm::impl_test_batched_trmm( - batchSize, i, 1, &trans); + Test::Trmm::impl_test_batched_trmm(batchSize, i, 4, + &trans); + Test::Trmm::impl_test_batched_trmm(batchSize, i, 1, + &trans); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - Test::Trmm::impl_test_batched_trmm(0, 10, 4, - &trans); + typedef Kokkos::View ViewType; + Test::Trmm::impl_test_batched_trmm(0, 10, 4, &trans); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutRight, Blksize %d\n", i); - Test::Trmm::impl_test_batched_trmm( - batchSize, i, 4, &trans); - Test::Trmm::impl_test_batched_trmm( - batchSize, i, 1, &trans); + Test::Trmm::impl_test_batched_trmm(batchSize, i, 4, + &trans); + 
Test::Trmm::impl_test_batched_trmm(batchSize, i, 1, + &trans); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTrmm_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialTrmm_Complex.hpp index 8ab6e2810c..2d9eab7c4c 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrmm_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrmm_Complex.hpp @@ -17,353 +17,227 @@ #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) // NO TRANSPOSE TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_u_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_n_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_u_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_n_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, 
batched_scalar_serial_trmm_r_u_nt_u_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_n_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } // TRANSPOSE TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_u_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_n_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_u_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_n_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - 
param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_u_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_n_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } // CONJUGATE TRANSPOSE TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_u_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_n_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_u_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef 
Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_n_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_u_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_n_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) // NO TRANSPOSE TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_u_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_n_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - 
test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_u_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_n_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_u_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_n_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } // TRANSPOSE TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_u_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + 
test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_n_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_u_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_n_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_u_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_n_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } // CONJUGATE TRANSPOSE 
TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_u_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_n_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_u_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_n_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_u_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_n_dcomplex_dcomplex) { - typedef 
::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTrmm_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialTrmm_Real.hpp index 1cfc259dd3..10a4f38ed2 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrmm_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrmm_Real.hpp @@ -17,147 +17,111 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) // NO TRANSPOSE TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_u_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_n_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_u_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_n_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_u_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_n_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef 
::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } // TRANSPOSE TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_u_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_n_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_u_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_n_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_u_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_n_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } // CONJUGATE TRANSPOSE TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_u_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_n_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef 
Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_u_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_n_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_u_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_n_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); @@ -167,167 +131,113 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_n_float_float) { #if defined(KOKKOSKERNELS_INST_DOUBLE) // NO TRANSPOSE TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_u_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_n_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_u_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, 
batched_scalar_serial_trmm_l_u_nt_n_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_u_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_n_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } // TRANSPOSE TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_u_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_n_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_u_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_n_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_u_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef 
::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_n_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } // CONJUGATE TRANSPOSE TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_u_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_n_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_u_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_n_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_u_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_n_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + 
test_batched_trmm(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTrsm.hpp b/batched/dense/unit_test/Test_Batched_SerialTrsm.hpp index f9418a804a..62f4b4de69 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrsm.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrsm.hpp @@ -37,8 +37,7 @@ struct ParamTag { typedef D diag; }; -template +template struct Functor_TestBatchedSerialTrsm { using execution_space = typename DeviceType::execution_space; ViewType _a, _b; @@ -46,8 +45,7 @@ struct Functor_TestBatchedSerialTrsm { ScalarType _alpha; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedSerialTrsm(const ScalarType alpha, const ViewType &a, - const ViewType &b) + Functor_TestBatchedSerialTrsm(const ScalarType alpha, const ViewType &a, const ViewType &b) : _a(a), _b(b), _alpha(alpha) {} KOKKOS_INLINE_FUNCTION @@ -55,9 +53,8 @@ struct Functor_TestBatchedSerialTrsm { auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); - SerialTrsm::invoke(_alpha, aa, bb); + SerialTrsm::invoke(_alpha, aa, bb); } inline void run() { @@ -72,8 +69,7 @@ struct Functor_TestBatchedSerialTrsm { } }; -template +template void impl_test_batched_trsm(const int N, const int BlkSize, const int NumCols) { typedef typename ViewType::value_type value_type; typedef Kokkos::ArithTraits ats; @@ -81,15 +77,13 @@ void impl_test_batched_trsm(const int N, const int BlkSize, const int NumCols) { /// randomized input testing views ScalarType alpha(1.0); - const bool is_side_right = - std::is_same::value; - const int b_nrows = is_side_right ? NumCols : BlkSize; - const int b_ncols = is_side_right ? BlkSize : NumCols; - ViewType a0("a0", N, BlkSize, BlkSize), a1("a1", N, BlkSize, BlkSize), - b0("b0", N, b_nrows, b_ncols), b1("b1", N, b_nrows, b_ncols); + const bool is_side_right = std::is_same::value; + const int b_nrows = is_side_right ? NumCols : BlkSize; + const int b_ncols = is_side_right ? 
BlkSize : NumCols; + ViewType a0("a0", N, BlkSize, BlkSize), a1("a1", N, BlkSize, BlkSize), b0("b0", N, b_nrows, b_ncols), + b1("b1", N, b_nrows, b_ncols); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a0, random, value_type(1.0)); Kokkos::fill_random(b0, random, value_type(1.0)); @@ -98,12 +92,9 @@ void impl_test_batched_trsm(const int N, const int BlkSize, const int NumCols) { Kokkos::deep_copy(a1, a0); Kokkos::deep_copy(b1, b0); - Functor_TestBatchedSerialTrsm(alpha, a0, b0) - .run(); - Functor_TestBatchedSerialTrsm(alpha, a1, b1) + Functor_TestBatchedSerialTrsm(alpha, a0, b0) .run(); + Functor_TestBatchedSerialTrsm(alpha, a1, b1).run(); Kokkos::fence(); @@ -130,36 +121,27 @@ void impl_test_batched_trsm(const int N, const int BlkSize, const int NumCols) { } // namespace Trsm } // namespace Test -template +template int test_batched_trsm() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - ViewType; - Test::Trsm::impl_test_batched_trsm(0, 10, 4); + typedef Kokkos::View ViewType; + Test::Trsm::impl_test_batched_trsm(0, 10, 4); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::Trsm::impl_test_batched_trsm(1024, i, 4); - Test::Trsm::impl_test_batched_trsm(1024, i, 1); + Test::Trsm::impl_test_batched_trsm(1024, i, 4); + Test::Trsm::impl_test_batched_trsm(1024, i, 1); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - Test::Trsm::impl_test_batched_trsm(0, 10, 4); + typedef Kokkos::View ViewType; + Test::Trsm::impl_test_batched_trsm(0, 10, 4); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutRight, Blksize %d\n", i); - Test::Trsm::impl_test_batched_trsm(1024, i, 4); - Test::Trsm::impl_test_batched_trsm(1024, i, 1); + Test::Trsm::impl_test_batched_trsm(1024, i, 4); + Test::Trsm::impl_test_batched_trsm(1024, i, 1); } } #endif diff --git 
a/batched/dense/unit_test/Test_Batched_SerialTrsm_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialTrsm_Complex.hpp index be0005a74c..d034ba1a53 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrsm_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrsm_Complex.hpp @@ -16,28 +16,19 @@ #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_u_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_n_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_u_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_serial_trsm_l_u_nt_n_dcomplex_dcomplex ) // { @@ -47,45 +38,30 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_u_dcomplex_dcomplex) { // test_batched_trsm,Kokkos::complex,param_tag_type,algo_tag_type>(); // } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_u_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, - Kokkos::complex, 
param_tag_type, algo_tag_type>(); + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_n_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } // TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_u_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_n_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_u_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_serial_trsm_l_u_t_n_dcomplex_dcomplex ) // { @@ -96,28 +72,19 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_u_dcomplex_dcomplex) { // } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_u_dcomplex_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - 
test_batched_trsm, double, param_tag_type, - algo_tag_type>(); + test_batched_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_n_dcomplex_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, double, param_tag_type, - algo_tag_type>(); + test_batched_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_u_dcomplex_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, double, param_tag_type, - algo_tag_type>(); + test_batched_trsm, double, param_tag_type, algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_serial_trsm_l_u_nt_n_dcomplex_double ) { // typedef @@ -126,45 +93,30 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_u_dcomplex_double) { // test_batched_trsm,double,param_tag_type,algo_tag_type>(); // } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_u_dcomplex_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, double, param_tag_type, - algo_tag_type>(); + test_batched_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_n_dcomplex_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, double, param_tag_type, - algo_tag_type>(); + test_batched_trsm, double, param_tag_type, algo_tag_type>(); } // TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_u_dcomplex_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; 
- test_batched_trsm, double, param_tag_type, - algo_tag_type>(); + test_batched_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_n_dcomplex_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, double, param_tag_type, - algo_tag_type>(); + test_batched_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_u_dcomplex_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, double, param_tag_type, - algo_tag_type>(); + test_batched_trsm, double, param_tag_type, algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_serial_trsm_l_u_t_n_dcomplex_double ) { // typedef diff --git a/batched/dense/unit_test/Test_Batched_SerialTrsm_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialTrsm_Real.hpp index 18b10a81e6..44cb802263 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrsm_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrsm_Real.hpp @@ -16,73 +16,53 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_u_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_n_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_u_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; test_batched_trsm(); } 
TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_n_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_u_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_n_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; test_batched_trsm(); } // TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_u_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_n_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_u_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_n_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; test_batched_trsm(); } @@ -90,84 +70,54 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_n_float_float) { #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_u_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked 
algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_n_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_u_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_n_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_u_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_n_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } // TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_u_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_n_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_u_double_double) { - 
typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_n_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTrsv.hpp b/batched/dense/unit_test/Test_Batched_SerialTrsv.hpp index 512dce3bce..c0a7de9e99 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrsv.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrsv.hpp @@ -22,7 +22,7 @@ #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Trsv_Decl.hpp" -//#include "KokkosKernels_TestUtils.hpp" +// #include "KokkosKernels_TestUtils.hpp" using namespace KokkosBatched; @@ -36,8 +36,7 @@ struct ParamTag { typedef D diag; }; -template +template struct Functor_TestBatchedSerialTrsv { using execution_space = typename DeviceType::execution_space; ViewType _a, _b; @@ -45,8 +44,7 @@ struct Functor_TestBatchedSerialTrsv { ScalarType _alpha; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedSerialTrsv(const ScalarType alpha, const ViewType &a, - const ViewType &b) + Functor_TestBatchedSerialTrsv(const ScalarType alpha, const ViewType &a, const ViewType &b) : _a(a), _b(b), _alpha(alpha) {} KOKKOS_INLINE_FUNCTION @@ -54,9 +52,8 @@ struct Functor_TestBatchedSerialTrsv { auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), 0); - SerialTrsv::invoke(_alpha, aa, - bb); + SerialTrsv::invoke(_alpha, aa, bb); } inline void run() { @@ -71,8 +68,7 @@ struct Functor_TestBatchedSerialTrsv { } }; -template +template void impl_test_batched_trsv(const int N, const int BlkSize) { typedef typename ViewType::value_type value_type; typedef Kokkos::ArithTraits ats; @@ -80,11 +76,10 @@ 
void impl_test_batched_trsv(const int N, const int BlkSize) { /// randomized input testing views ScalarType alpha(1.5); - ViewType a0("a0", N, BlkSize, BlkSize), a1("a1", N, BlkSize, BlkSize), - b0("b0", N, BlkSize, 1), b1("b1", N, BlkSize, 1); + ViewType a0("a0", N, BlkSize, BlkSize), a1("a1", N, BlkSize, BlkSize), b0("b0", N, BlkSize, 1), + b1("b1", N, BlkSize, 1); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a0, random, value_type(1.0)); Kokkos::fill_random(b0, random, value_type(1.0)); @@ -95,12 +90,9 @@ void impl_test_batched_trsv(const int N, const int BlkSize) { Kokkos::deep_copy(a1, a0); Kokkos::deep_copy(b1, b0); - Functor_TestBatchedSerialTrsv(alpha, a0, b0) - .run(); - Functor_TestBatchedSerialTrsv(alpha, a1, b1) + Functor_TestBatchedSerialTrsv(alpha, a0, b0) .run(); + Functor_TestBatchedSerialTrsv(alpha, a1, b1).run(); Kokkos::fence(); @@ -120,16 +112,14 @@ void impl_test_batched_trsv(const int N, const int BlkSize) { /// check b0 and b1 are correct const value_type one(1); - const bool is_unit_diag = - std::is_same::value; + const bool is_unit_diag = std::is_same::value; for (int k = 0; k < N; ++k) { if (std::is_same::value) { if (std::is_same::value) { for (int i = 0; i < BlkSize; ++i) { value_type tmp(0); for (int j = 0; j <= i; ++j) { - const value_type aval = - (i == j && is_unit_diag ? one : a0_host(k, i, j)); + const value_type aval = (i == j && is_unit_diag ? one : a0_host(k, i, j)); const value_type bval = b0_host(k, j, 0); tmp += aval * bval; } @@ -138,20 +128,17 @@ void impl_test_batched_trsv(const int N, const int BlkSize) { for (int i = 0; i < BlkSize; ++i) { value_type tmp(0); for (int j = 0; j <= i; ++j) { - const value_type aval = - (i == j && is_unit_diag ? one : a0_host(k, i, j)); + const value_type aval = (i == j && is_unit_diag ? 
one : a0_host(k, i, j)); const value_type bval = b1_host(k, j, 0); tmp += aval * bval; } EXPECT_NEAR(ats::abs(tmp), ats::abs(alpha), eps); } - } else if (std::is_same::value) { + } else if (std::is_same::value) { for (int i = 0; i < BlkSize; ++i) { value_type tmp(0); for (int j = i; j < BlkSize; ++j) { - const value_type aval = - (i == j && is_unit_diag ? one : a0_host(k, i, j)); + const value_type aval = (i == j && is_unit_diag ? one : a0_host(k, i, j)); const value_type bval = b0_host(k, j, 0); tmp += aval * bval; } @@ -160,8 +147,7 @@ void impl_test_batched_trsv(const int N, const int BlkSize) { for (int i = 0; i < BlkSize; ++i) { value_type tmp(0); for (int j = i; j < BlkSize; ++j) { - const value_type aval = - (i == j && is_unit_diag ? one : a0_host(k, i, j)); + const value_type aval = (i == j && is_unit_diag ? one : a0_host(k, i, j)); const value_type bval = b1_host(k, j, 0); tmp += aval * bval; } @@ -183,15 +169,12 @@ void impl_test_batched_trsv(const int N, const int BlkSize) { } // namespace Trsv } // namespace Test -template +template int test_batched_trsv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - ViewType; - Test::Trsv::impl_test_batched_trsv(0, 10); + typedef Kokkos::View ViewType; + Test::Trsv::impl_test_batched_trsv(0, 10); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d, Uplo %d, Trans %d, Diag // %d\n", @@ -200,17 +183,14 @@ int test_batched_trsv() { // std::is_same::value, std::is_same::value); - Test::Trsv::impl_test_batched_trsv(1, i); + Test::Trsv::impl_test_batched_trsv(1, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - Test::Trsv::impl_test_batched_trsv(0, 10); + typedef Kokkos::View ViewType; + Test::Trsv::impl_test_batched_trsv(0, 10); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutRight, Blksize %d, Uplo %d, Trans %d, Diag // %d\n", @@ -219,8 +199,7 @@ int test_batched_trsv() { // std::is_same::value, std::is_same::value); - 
Test::Trsv::impl_test_batched_trsv(1, i); + Test::Trsv::impl_test_batched_trsv(1, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTrsv_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialTrsv_Complex.hpp index a524b9f97e..73f0e65ed9 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrsv_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrsv_Complex.hpp @@ -16,60 +16,44 @@ #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_trsv_l_nt_u_dcomplex_dcomplex) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_trsv, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsv_l_nt_n_dcomplex_dcomplex) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_trsv, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_u_dcomplex_dcomplex) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_trsv, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_n_dcomplex_dcomplex) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_trsv, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, 
batched_scalar_serial_trsv_l_nt_u_dcomplex_double) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv, double, param_tag_type, - algo_tag_type>(); + test_batched_trsv, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsv_l_nt_n_dcomplex_double) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv, double, param_tag_type, - algo_tag_type>(); + test_batched_trsv, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_u_dcomplex_double) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv, double, param_tag_type, - algo_tag_type>(); + test_batched_trsv, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_n_dcomplex_double) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv, double, param_tag_type, - algo_tag_type>(); + test_batched_trsv, double, param_tag_type, algo_tag_type>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTrsv_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialTrsv_Real.hpp index be1bf77b9e..5998232605 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrsv_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrsv_Real.hpp @@ -16,26 +16,22 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_serial_trsv_l_nt_u_float_float) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; test_batched_trsv(); } TEST_F(TestCategory, 
batched_scalar_serial_trsv_l_nt_n_float_float) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; test_batched_trsv(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_u_float_float) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; test_batched_trsv(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_n_float_float) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; test_batched_trsv(); } @@ -43,31 +39,23 @@ TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_n_float_float) { #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_trsv_l_nt_u_double_double) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv(); + test_batched_trsv(); } TEST_F(TestCategory, batched_scalar_serial_trsv_l_nt_n_double_double) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv(); + test_batched_trsv(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_u_double_double) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv(); + test_batched_trsv(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_n_double_double) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv(); + test_batched_trsv(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTrtri.hpp b/batched/dense/unit_test/Test_Batched_SerialTrtri.hpp 
index b09cadcb7e..c4acbbfafb 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrtri.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrtri.hpp @@ -51,8 +51,7 @@ struct NonUnitDiagTRTRI { KOKKOS_INLINE_FUNCTION void operator()(const int& i) const { A_(i, i) = A_(i, i) + 10; } }; -template +template struct VanillaGEMM { bool A_t, B_t, A_c, B_c; int N, K; @@ -69,12 +68,9 @@ struct VanillaGEMM { ScalarC beta; KOKKOS_INLINE_FUNCTION - void operator()( - const typename Kokkos::TeamPolicy::member_type& team) - const { + void operator()(const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && \ - !defined(__HIP_DEVICE_COMPILE__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int i = team.league_rank(); #else const int i = team.league_rank(); @@ -110,8 +106,7 @@ struct ParamTag { typedef D diag; }; -template +template struct Functor_TestBatchedSerialTrtri { using execution_space = typename DeviceType::execution_space; ViewType _a; @@ -123,8 +118,7 @@ struct Functor_TestBatchedSerialTrtri { void operator()(const ParamTagType&, const int k) const { auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); - SerialTrtri::invoke(aa); + SerialTrtri::invoke(aa); } inline void run() { @@ -139,8 +133,7 @@ struct Functor_TestBatchedSerialTrtri { } }; -template +template void impl_test_batched_trtri(const int N, const int K) { typedef typename ViewType::value_type value_type; typedef typename DeviceType::execution_space execution_space; @@ -155,8 +148,7 @@ void impl_test_batched_trtri(const int N, const int K) { bool fail_flag = false; ScalarType cur_check_val; // Either 1 or 0, to check A_I - const bool is_A_lower = - std::is_same::value; + const bool is_A_lower = std::is_same::value; ViewType A("A", N, K, K); ViewType A_original("A_original", N, K, K); ViewType A_I("A_I", N, K, K); @@ -164,39 +156,29 @@ void 
impl_test_batched_trtri(const int N, const int K) { typename ViewType::HostMirror I_host = Kokkos::create_mirror_view(A_I); typename ViewType::HostMirror A_host = Kokkos::create_mirror_view(A); - uint64_t seed = - std::chrono::high_resolution_clock::now().time_since_epoch().count(); + uint64_t seed = std::chrono::high_resolution_clock::now().time_since_epoch().count(); - using ViewTypeSubA = - decltype(Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL())); + using ViewTypeSubA = decltype(Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL())); Kokkos::Random_XorShift64_Pool rand_pool(seed); if (std::is_same::value) { // Initialize A with deterministic random numbers - Kokkos::fill_random(A, rand_pool, - Kokkos::rand, - ScalarType>::max()); + Kokkos::fill_random(A, rand_pool, Kokkos::rand, ScalarType>::max()); using functor_type = UnitDiagTRTRI; for (int k = 0; k < N; ++k) { functor_type udtrtri(Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL())); // Initialize As diag with 1s - Kokkos::parallel_for("KokkosBlas::Test::UnitDiagTRTRI", - Kokkos::RangePolicy(0, K), udtrtri); + Kokkos::parallel_for("KokkosBlas::Test::UnitDiagTRTRI", Kokkos::RangePolicy(0, K), udtrtri); } } else { //(diag[0]=='N')||(diag[0]=='n') // Initialize A with random numbers - Kokkos::fill_random(A, rand_pool, - Kokkos::rand, - ScalarType>::max()); + Kokkos::fill_random(A, rand_pool, Kokkos::rand, ScalarType>::max()); using functor_type = NonUnitDiagTRTRI; for (int k = 0; k < N; ++k) { - functor_type nudtrtri( - Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL())); + functor_type nudtrtri(Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL())); // Initialize As diag with A(i,i)+10 - Kokkos::parallel_for("KokkosBlas::Test::NonUnitDiagTRTRI", - Kokkos::RangePolicy(0, K), - nudtrtri); + Kokkos::parallel_for("KokkosBlas::Test::NonUnitDiagTRTRI", Kokkos::RangePolicy(0, K), nudtrtri); } } Kokkos::fence(); @@ -241,9 +223,7 @@ void impl_test_batched_trtri(const int N, const int K) { } #endif - 
Functor_TestBatchedSerialTrtri(A) - .run(); + Functor_TestBatchedSerialTrtri(A).run(); #if PRINT_MAT printf("A_original:\n"); @@ -271,8 +251,7 @@ void impl_test_batched_trtri(const int N, const int K) { Kokkos::fence(); - struct VanillaGEMM - vgemm; + struct VanillaGEMM vgemm; vgemm.A_t = false; vgemm.B_t = false; vgemm.A_c = false; @@ -287,9 +266,8 @@ void impl_test_batched_trtri(const int N, const int K) { ; vgemm.C = Kokkos::subview(A_I, i, Kokkos::ALL(), Kokkos::ALL()); ; - Kokkos::parallel_for( - "KokkosBlas::Test::VanillaGEMM", - Kokkos::TeamPolicy(K, Kokkos::AUTO, 16), vgemm); + Kokkos::parallel_for("KokkosBlas::Test::VanillaGEMM", Kokkos::TeamPolicy(K, Kokkos::AUTO, 16), + vgemm); } Kokkos::fence(); @@ -311,8 +289,7 @@ void impl_test_batched_trtri(const int N, const int K) { for (int k = 0; k < N; ++k) { for (int i = 0; i < K; ++i) { for (int j = 0; j < K; ++j) { - cur_check_val = - (i == j) ? ScalarType(1) : ScalarType(0); // ats::abs(host_A(i,j)); + cur_check_val = (i == j) ? ScalarType(1) : ScalarType(0); // ats::abs(host_A(i,j)); if (ats::abs(ats::abs(I_host(k, i, j)) - cur_check_val) > eps) { fail_flag = true; // printf(" Error: eps ( %g ), I_host ( %.15f ) != cur_check_val @@ -329,41 +306,29 @@ void impl_test_batched_trtri(const int N, const int K) { } // namespace Trtri } // namespace Test -template +template int test_batched_trtri(int batchSize = 512) { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { typedef Kokkos::View ViewType; - Test::Trtri::impl_test_batched_trtri(0, 10); + Test::Trtri::impl_test_batched_trtri(0, 10); // Test::impl_test_batched_trtri( // 1, 2); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::Trtri::impl_test_batched_trtri(batchSize, - i); - Test::Trtri::impl_test_batched_trtri(batchSize, - i); + Test::Trtri::impl_test_batched_trtri(batchSize, i); + Test::Trtri::impl_test_batched_trtri(batchSize, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; 
- Test::Trtri::impl_test_batched_trtri(0, 10); + typedef Kokkos::View ViewType; + Test::Trtri::impl_test_batched_trtri(0, 10); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutRight, Blksize %d\n", i); - Test::Trtri::impl_test_batched_trtri(batchSize, - i); - Test::Trtri::impl_test_batched_trtri(batchSize, - i); + Test::Trtri::impl_test_batched_trtri(batchSize, i); + Test::Trtri::impl_test_batched_trtri(batchSize, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTrtri_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialTrtri_Complex.hpp index 0d8f2c72a6..ca5575c99f 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrtri_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrtri_Complex.hpp @@ -20,29 +20,25 @@ TEST_F(TestCategory, batched_scalar_serial_trtri_u_n_scomplex_scomplex) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trtri, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trtri_u_u_scomplex_scomplex) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trtri, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trtri_l_n_scomplex_scomplex) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trtri, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trtri_l_u_scomplex_scomplex) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + 
test_batched_trtri, Kokkos::complex, param_tag_type, algo_tag_type>(128); } #endif @@ -52,32 +48,24 @@ TEST_F(TestCategory, batched_scalar_serial_trtri_u_n_dcomplex_dcomplex) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trtri, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trtri_u_u_dcomplex_dcomplex) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trtri, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trtri_l_n_dcomplex_dcomplex) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trtri, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trtri_l_u_dcomplex_dcomplex) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trtri, Kokkos::complex, param_tag_type, algo_tag_type>(128); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTrtri_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialTrtri_Real.hpp index 952994d207..66fcd162ab 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrtri_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrtri_Real.hpp @@ -48,28 +48,24 @@ TEST_F(TestCategory, batched_scalar_serial_trtri_u_n_double_double) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri(); + test_batched_trtri(); } TEST_F(TestCategory, 
batched_scalar_serial_trtri_u_u_double_double) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri(); + test_batched_trtri(); } TEST_F(TestCategory, batched_scalar_serial_trtri_l_n_double_double) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri(); + test_batched_trtri(); } TEST_F(TestCategory, batched_scalar_serial_trtri_l_u_double_double) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri(); + test_batched_trtri(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamAxpy.hpp b/batched/dense/unit_test/Test_Batched_TeamAxpy.hpp index b43b498607..d33f833146 100644 --- a/batched/dense/unit_test/Test_Batched_TeamAxpy.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamAxpy.hpp @@ -37,8 +37,7 @@ struct Functor_TestBatchedTeamAxpy { const int _N_team; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamAxpy(const alphaViewType &alpha, const ViewType &X, - const ViewType &Y, const int N_team) + Functor_TestBatchedTeamAxpy(const alphaViewType &alpha, const ViewType &X, const ViewType &Y, const int N_team) : _alpha(alpha), _X(X), _Y(Y), _N_team(N_team) {} template @@ -46,16 +45,12 @@ struct Functor_TestBatchedTeamAxpy { const int first_matrix = static_cast(member.league_rank()) * _N_team; const int N = _X.extent(0); const int last_matrix = - (static_cast(member.league_rank() + 1) * _N_team < N - ? static_cast(member.league_rank() + 1) * _N_team - : N); + (static_cast(member.league_rank() + 1) * _N_team < N ? 
static_cast(member.league_rank() + 1) * _N_team + : N); - auto alpha = - Kokkos::subview(_alpha, Kokkos::make_pair(first_matrix, last_matrix)); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto y = Kokkos::subview(_Y, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + auto alpha = Kokkos::subview(_alpha, Kokkos::make_pair(first_matrix, last_matrix)); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto y = Kokkos::subview(_Y, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); KokkosBatched::TeamAxpy::invoke(member, alpha, x, y); } @@ -66,8 +61,7 @@ struct Functor_TestBatchedTeamAxpy { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy(_X.extent(0) / _N_team, - Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_X.extent(0) / _N_team, Kokkos::AUTO(), Kokkos::AUTO()); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } @@ -80,13 +74,11 @@ void impl_test_batched_axpy(const int N, const int BlkSize, const int N_team) { typedef typename alphaViewType::const_value_type alpha_const_value_type; typedef Kokkos::ArithTraits ats; - ViewType X0("x0", N, BlkSize), X1("x1", N, BlkSize), Y0("y0", N, BlkSize), - Y1("y1", N, BlkSize); + ViewType X0("x0", N, BlkSize), X1("x1", N, BlkSize), Y0("y0", N, BlkSize), Y1("y1", N, BlkSize); alphaViewType alpha("alpha", N); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(X0, random, const_value_type(1.0)); Kokkos::fill_random(Y0, random, const_value_type(1.0)); Kokkos::fill_random(alpha, random, alpha_const_value_type(1.0)); @@ -106,12 +98,9 @@ void impl_test_batched_axpy(const int N, const int BlkSize, const int N_team) { Kokkos::deep_copy(Y0_host, Y0); for (int l = 0; l < N; ++l) - 
for (int i = 0; i < BlkSize; ++i) - Y0_host(l, i) += alpha_host(l) * X0_host(l, i); + for (int i = 0; i < BlkSize; ++i) Y0_host(l, i) += alpha_host(l) * X0_host(l, i); - Functor_TestBatchedTeamAxpy(alpha, X1, - Y1, N_team) - .run(); + Functor_TestBatchedTeamAxpy(alpha, X1, Y1, N_team).run(); Kokkos::fence(); @@ -140,25 +129,20 @@ int test_batched_team_axpy() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { typedef Kokkos::View ViewType; - typedef Kokkos::View - alphaViewType; + typedef Kokkos::View alphaViewType; for (int i = 3; i < 10; ++i) { - Test::TeamAxpy::impl_test_batched_axpy(1024, i, 2); + Test::TeamAxpy::impl_test_batched_axpy(1024, i, 2); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - typedef Kokkos::View - alphaViewType; + typedef Kokkos::View ViewType; + typedef Kokkos::View alphaViewType; for (int i = 3; i < 10; ++i) { - Test::TeamAxpy::impl_test_batched_axpy(1024, i, 2); + Test::TeamAxpy::impl_test_batched_axpy(1024, i, 2); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamAxpy_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamAxpy_Complex.hpp index b95b769fcc..ba47fe739a 100644 --- a/batched/dense/unit_test/Test_Batched_TeamAxpy_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamAxpy_Complex.hpp @@ -16,8 +16,7 @@ #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, batched_scalar_team_axpy_nt_dcomplex_dcomplex) { - test_batched_team_axpy, - Kokkos::complex>(); + test_batched_team_axpy, Kokkos::complex>(); } TEST_F(TestCategory, batched_scalar_team_axpy_nt_dcomplex_double) { diff --git a/batched/dense/unit_test/Test_Batched_TeamAxpy_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamAxpy_Real.hpp index ac458d4a55..1fcbae03d6 100644 --- a/batched/dense/unit_test/Test_Batched_TeamAxpy_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamAxpy_Real.hpp @@ -15,9 +15,7 @@ //@HEADER #if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, 
batched_scalar_team_axpy_nt_float_float) { - test_batched_team_axpy(); -} +TEST_F(TestCategory, batched_scalar_team_axpy_nt_float_float) { test_batched_team_axpy(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) diff --git a/batched/dense/unit_test/Test_Batched_TeamGemm.hpp b/batched/dense/unit_test/Test_Batched_TeamGemm.hpp index 2d952889c9..f283da2b68 100644 --- a/batched/dense/unit_test/Test_Batched_TeamGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamGemm.hpp @@ -19,7 +19,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -//#include "KokkosBatched_Vector.hpp" +// #include "KokkosBatched_Vector.hpp" #include "KokkosBatched_Gemm_Decl.hpp" #include "KokkosBatched_Gemm_Serial_Impl.hpp" @@ -38,8 +38,7 @@ struct ParamTag { typedef TB transB; }; -template +template struct Functor_TestBatchedTeamGemm { using execution_space = typename DeviceType::execution_space; ViewType _a, _b, _c; @@ -47,24 +46,20 @@ struct Functor_TestBatchedTeamGemm { ScalarType _alpha, _beta; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamGemm(const ScalarType alpha, const ViewType &a, - const ViewType &b, const ScalarType beta, + Functor_TestBatchedTeamGemm(const ScalarType alpha, const ViewType &a, const ViewType &b, const ScalarType beta, const ViewType &c) : _a(a), _b(b), _c(c), _alpha(alpha), _beta(beta) {} template - KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, const MemberType &member) const { const int k = member.league_rank(); auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); auto cc = Kokkos::subview(_c, k, Kokkos::ALL(), Kokkos::ALL()); - KokkosBatched::TeamGemm::invoke(member, _alpha, aa, bb, _beta, - cc); + KokkosBatched::TeamGemm::invoke(member, _alpha, aa, bb, _beta, cc); } inline void run() { @@ -74,19 +69,15 @@ struct Functor_TestBatchedTeamGemm { std::string name = 
name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _c.extent(0); - Kokkos::TeamPolicy policy(league_size, - Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } }; -template -void impl_test_batched_teamgemm(const int N, const int matAdim1, - const int matAdim2, const int matBdim1, - const int matBdim2, const int matCdim1, - const int matCdim2) { +template +void impl_test_batched_teamgemm(const int N, const int matAdim1, const int matAdim2, const int matBdim1, + const int matBdim2, const int matCdim1, const int matCdim2) { using transA = typename ParamTagType::transA; using transB = typename ParamTagType::transB; using execution_space = typename DeviceType::execution_space; @@ -96,15 +87,11 @@ void impl_test_batched_teamgemm(const int N, const int matAdim1, /// randomized input testing views ScalarType alpha = ScalarType(1.5), beta = ScalarType(3.0); - ViewType a_expected("a_expected", N, matAdim1, matAdim2), - a_actual("a_actual", N, matAdim1, matAdim2), - b_expected("b_expected", N, matBdim1, matBdim2), - b_actual("b_actual", N, matBdim1, matBdim2), - c_expected("c_expected", N, matCdim1, matCdim2), - c_actual("c_actual", N, matCdim1, matCdim2); + ViewType a_expected("a_expected", N, matAdim1, matAdim2), a_actual("a_actual", N, matAdim1, matAdim2), + b_expected("b_expected", N, matBdim1, matBdim2), b_actual("b_actual", N, matBdim1, matBdim2), + c_expected("c_expected", N, matCdim1, matCdim2), c_actual("c_actual", N, matCdim1, matCdim2); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a_expected, random, value_type(1.0)); Kokkos::fill_random(b_expected, random, value_type(1.0)); @@ -116,8 +103,7 @@ void impl_test_batched_teamgemm(const int N, const int matAdim1, Kokkos::deep_copy(b_actual, b_expected); Kokkos::deep_copy(c_actual, c_expected); - 
Functor_BatchedVanillaGEMM - vgemm; + Functor_BatchedVanillaGEMM vgemm; vgemm.A_t = std::is_same::value; vgemm.B_t = std::is_same::value; vgemm.A_c = vgemm.B_c = false; @@ -128,17 +114,14 @@ void impl_test_batched_teamgemm(const int N, const int matAdim1, vgemm.beta = beta; vgemm.run(); // Compute c_expected - Functor_TestBatchedTeamGemm(alpha, a_actual, b_actual, beta, - c_actual) + Functor_TestBatchedTeamGemm(alpha, a_actual, b_actual, + beta, c_actual) .run(); Kokkos::fence(); - typename ViewType::HostMirror c_expected_host = - Kokkos::create_mirror_view(c_expected); - typename ViewType::HostMirror c_actual_host = - Kokkos::create_mirror_view(c_actual); + typename ViewType::HostMirror c_expected_host = Kokkos::create_mirror_view(c_expected); + typename ViewType::HostMirror c_actual_host = Kokkos::create_mirror_view(c_actual); // Copy to host for comparision Kokkos::deep_copy(c_expected_host, c_expected); @@ -166,20 +149,16 @@ void impl_test_batched_teamgemm(const int N, const int matAdim1, // void (*impl_test)(const int, const int, const int, const int, const int, // const int, const int) -template +template int test_batched_teamgemm() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - ViewType; - Test::TeamGemm::impl_test_batched_teamgemm( + typedef Kokkos::View ViewType; + Test::TeamGemm::impl_test_batched_teamgemm( 0, 10, 10, 10, 10, 10, 10); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::TeamGemm::impl_test_batched_teamgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( + Test::TeamGemm::impl_test_batched_teamgemm( 1024, i, i, i, i, i, i); } for (int i = 0; i < 10; ++i) { @@ -187,36 +166,24 @@ int test_batched_teamgemm() { int dimM = i; int dimN = 2 * i; int dimK = 3 * i; - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamGemm::impl_test_batched_teamgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( + if ((std::is_same::value) && + 
(std::is_same::value)) { + Test::TeamGemm::impl_test_batched_teamgemm( 1024, dimM, dimK, dimK, dimN, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamGemm::impl_test_batched_teamgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( + if ((std::is_same::value) && + (std::is_same::value)) { + Test::TeamGemm::impl_test_batched_teamgemm( 1024, dimM, dimK, dimN, dimK, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamGemm::impl_test_batched_teamgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( + if ((std::is_same::value) && + (std::is_same::value)) { + Test::TeamGemm::impl_test_batched_teamgemm( 1024, dimK, dimM, dimK, dimN, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamGemm::impl_test_batched_teamgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( + if ((std::is_same::value) && + (std::is_same::value)) { + Test::TeamGemm::impl_test_batched_teamgemm( 1024, dimK, dimM, dimN, dimK, dimM, dimN); } } @@ -224,15 +191,12 @@ int test_batched_teamgemm() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - Test::TeamGemm::impl_test_batched_teamgemm( + typedef Kokkos::View ViewType; + Test::TeamGemm::impl_test_batched_teamgemm( 0, 10, 10, 10, 10, 10, 10); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutRight, Blksize %d\n", i); - Test::TeamGemm::impl_test_batched_teamgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( + Test::TeamGemm::impl_test_batched_teamgemm( 1024, i, i, i, i, i, i); } for (int i = 0; i < 10; ++i) { @@ -240,36 +204,24 @@ int test_batched_teamgemm() { int dimM = i; int dimN = 2 * i; int dimK = 3 * i; - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamGemm::impl_test_batched_teamgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( + if ((std::is_same::value) && + (std::is_same::value)) { + 
Test::TeamGemm::impl_test_batched_teamgemm( 1024, dimM, dimK, dimK, dimN, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamGemm::impl_test_batched_teamgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( + if ((std::is_same::value) && + (std::is_same::value)) { + Test::TeamGemm::impl_test_batched_teamgemm( 1024, dimM, dimK, dimN, dimK, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamGemm::impl_test_batched_teamgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( + if ((std::is_same::value) && + (std::is_same::value)) { + Test::TeamGemm::impl_test_batched_teamgemm( 1024, dimK, dimM, dimK, dimN, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamGemm::impl_test_batched_teamgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( + if ((std::is_same::value) && + (std::is_same::value)) { + Test::TeamGemm::impl_test_batched_teamgemm( 1024, dimK, dimM, dimN, dimK, dimM, dimN); } } diff --git a/batched/dense/unit_test/Test_Batched_TeamGemm_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamGemm_Complex.hpp index 09c7f3f2cc..a353513967 100644 --- a/batched/dense/unit_test/Test_Batched_TeamGemm_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamGemm_Complex.hpp @@ -19,36 +19,24 @@ /// dcomplex, dcomplex TEST_F(TestCategory, batched_scalar_team_gemm_nt_nt_dcomplex_dcomplex) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm, - Kokkos::complex, param_tag_type, - algo_tag_type>(); + test_batched_teamgemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_nt_dcomplex_dcomplex) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - 
test_batched_teamgemm, - Kokkos::complex, param_tag_type, - algo_tag_type>(); + test_batched_teamgemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_gemm_nt_t_dcomplex_dcomplex) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm, - Kokkos::complex, param_tag_type, - algo_tag_type>(); + test_batched_teamgemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_t_dcomplex_dcomplex) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm, - Kokkos::complex, param_tag_type, - algo_tag_type>(); + test_batched_teamgemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_team_gemm_ct_nt_dcomplex_dcomplex ) { // typedef ::Test::TeamGemm::ParamTag @@ -64,32 +52,24 @@ TEST_F(TestCategory, batched_scalar_team_gemm_t_t_dcomplex_dcomplex) { /// dcomplex, double TEST_F(TestCategory, batched_scalar_team_gemm_nt_nt_dcomplex_double) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm, double, - param_tag_type, algo_tag_type>(); + test_batched_teamgemm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_nt_dcomplex_double) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm, double, - param_tag_type, algo_tag_type>(); + test_batched_teamgemm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_gemm_nt_t_dcomplex_double) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef 
::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm, double, - param_tag_type, algo_tag_type>(); + test_batched_teamgemm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_t_dcomplex_double) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm, double, - param_tag_type, algo_tag_type>(); + test_batched_teamgemm, double, param_tag_type, algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_team_gemm_ct_nt_dcomplex_double ) { // typedef ::Test::TeamGemm::ParamTag diff --git a/batched/dense/unit_test/Test_Batched_TeamGemm_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamGemm_Real.hpp index b1a5135018..6f06638c2a 100644 --- a/batched/dense/unit_test/Test_Batched_TeamGemm_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamGemm_Real.hpp @@ -15,156 +15,116 @@ //@HEADER #if defined(KOKKOS_BHALF_T_IS_FLOAT) TEST_F(TestCategory, batched_scalar_team_gemm_nt_nt_bhalf_bhalf) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_nt_bhalf_bhalf) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_nt_t_bhalf_bhalf) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_t_bhalf_bhalf) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } #endif // 
KOKKOS_BHALF_T_IS_FLOAT #if defined(KOKKOS_HALF_T_IS_FLOAT) TEST_F(TestCategory, batched_scalar_team_gemm_nt_nt_half_half) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_nt_half_half) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_nt_t_half_half) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_t_half_half) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } #endif // KOKKOS_HALF_T_IS_FLOAT #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_team_gemm_nt_nt_float_float) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm(); + test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_nt_float_float) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm(); + test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_nt_t_float_float) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm(); + test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_t_float_float) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef 
::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm(); + test_batched_teamgemm(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_team_gemm_nt_nt_double_double) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm(); + test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_nt_double_double) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm(); + test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_nt_t_double_double) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm(); + test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_t_double_double) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm(); + test_batched_teamgemm(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamGesv.hpp b/batched/dense/unit_test/Test_Batched_TeamGesv.hpp index dc3b4e53fb..d119308862 100644 --- a/batched/dense/unit_test/Test_Batched_TeamGesv.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamGesv.hpp @@ -32,8 +32,7 @@ using namespace KokkosBatched; namespace Test { namespace TeamGesv { -template +template struct Functor_TestBatchedTeamGesv { using execution_space = typename DeviceType::execution_space; const MatrixType _A; @@ -41,16 +40,14 @@ struct Functor_TestBatchedTeamGesv { const VectorType _B; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamGesv(const MatrixType &A, const VectorType &X, - const VectorType &B) - : _A(A), _X(X), 
_B(B) {} + Functor_TestBatchedTeamGesv(const MatrixType &A, const VectorType &X, const VectorType &B) : _A(A), _X(X), _B(B) {} template KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { const int matrix_id = static_cast(member.league_rank()); - auto A = Kokkos::subview(_A, matrix_id, Kokkos::ALL, Kokkos::ALL); - auto x = Kokkos::subview(_X, matrix_id, Kokkos::ALL); - auto b = Kokkos::subview(_B, matrix_id, Kokkos::ALL); + auto A = Kokkos::subview(_A, matrix_id, Kokkos::ALL, Kokkos::ALL); + auto x = Kokkos::subview(_X, matrix_id, Kokkos::ALL); + auto b = Kokkos::subview(_B, matrix_id, Kokkos::ALL); member.team_barrier(); KokkosBatched::TeamGesv::invoke(member, A, x, b); @@ -63,13 +60,10 @@ struct Functor_TestBatchedTeamGesv { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy(_X.extent(0), Kokkos::AUTO(), - Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_X.extent(0), Kokkos::AUTO(), Kokkos::AUTO()); - using MatrixViewType = - Kokkos::View; + using MatrixViewType = Kokkos::View; const int n = _A.extent(1); size_t bytes_0 = MatrixViewType::shmem_size(n, n + 4); @@ -80,15 +74,13 @@ struct Functor_TestBatchedTeamGesv { } }; -template +template void impl_test_batched_gesv(const int N, const int BlkSize) { typedef typename MatrixType::value_type value_type; typedef Kokkos::ArithTraits ats; using MagnitudeType = typename Kokkos::ArithTraits::mag_type; - using NormViewType = - Kokkos::View; + using NormViewType = Kokkos::View; NormViewType sqr_norm_j("sqr_norm_j", N); auto sqr_norm_j_host = Kokkos::create_mirror_view(sqr_norm_j); @@ -109,23 +101,18 @@ void impl_test_batched_gesv(const int N, const int BlkSize) { Kokkos::fence(); - Functor_TestBatchedTeamGesv( - A, X, B) - .run(); + Functor_TestBatchedTeamGesv(A, X, B).run(); Kokkos::fence(); Kokkos::deep_copy(X_host, X); for (int l = 0; l < N; ++l) - 
KokkosBlas::SerialGemv:: - invoke(-1, Kokkos::subview(A_host, l, Kokkos::ALL, Kokkos::ALL), - Kokkos::subview(X_host, l, Kokkos::ALL), 1, - Kokkos::subview(B_host, l, Kokkos::ALL)); + KokkosBlas::SerialGemv::invoke( + -1, Kokkos::subview(A_host, l, Kokkos::ALL, Kokkos::ALL), Kokkos::subview(X_host, l, Kokkos::ALL), 1, + Kokkos::subview(B_host, l, Kokkos::ALL)); - KokkosBatched::SerialDot::invoke(B_host, B_host, - sqr_norm_j_host); + KokkosBatched::SerialDot::invoke(B_host, B_host, sqr_norm_j_host); const MagnitudeType eps = 1.0e3 * ats::epsilon(); @@ -138,27 +125,21 @@ template int test_batched_team_gesv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - MatrixType; - typedef Kokkos::View - VectorType; + typedef Kokkos::View MatrixType; + typedef Kokkos::View VectorType; for (int i = 3; i < 10; ++i) { - Test::TeamGesv::impl_test_batched_gesv(1024, i); + Test::TeamGesv::impl_test_batched_gesv(1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - MatrixType; - typedef Kokkos::View - VectorType; + typedef Kokkos::View MatrixType; + typedef Kokkos::View VectorType; for (int i = 3; i < 10; ++i) { - Test::TeamGesv::impl_test_batched_gesv(1024, i); + Test::TeamGesv::impl_test_batched_gesv(1024, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamGesv_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamGesv_Real.hpp index d0b04ea57c..6fd7241f0b 100644 --- a/batched/dense/unit_test/Test_Batched_TeamGesv_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamGesv_Real.hpp @@ -15,8 +15,7 @@ //@HEADER #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_team_gesv_static_pivoting_float) { - test_batched_team_gesv(); + test_batched_team_gesv(); } TEST_F(TestCategory, batched_scalar_team_gesv_no_pivoting_float) { test_batched_team_gesv(); @@ -25,8 +24,7 @@ TEST_F(TestCategory, batched_scalar_team_gesv_no_pivoting_float) { #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, 
batched_scalar_team_gesv_static_pivoting_double) { - test_batched_team_gesv(); + test_batched_team_gesv(); } TEST_F(TestCategory, batched_scalar_team_gesv_no_pivoting_double) { test_batched_team_gesv(); diff --git a/batched/dense/unit_test/Test_Batched_TeamInverseLU.hpp b/batched/dense/unit_test/Test_Batched_TeamInverseLU.hpp index a62e655d02..36d0aae738 100644 --- a/batched/dense/unit_test/Test_Batched_TeamInverseLU.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamInverseLU.hpp @@ -19,14 +19,14 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -//#include "KokkosBatched_Vector.hpp" +// #include "KokkosBatched_Vector.hpp" #include "KokkosBatched_Gemm_Decl.hpp" #include "KokkosBatched_Gemm_Team_Impl.hpp" #include "KokkosBatched_LU_Decl.hpp" #include "KokkosBatched_LU_Team_Impl.hpp" #include "KokkosBatched_InverseLU_Decl.hpp" -//#include "KokkosBatched_InverseLU_Team_Impl.hpp" +// #include "KokkosBatched_InverseLU_Team_Impl.hpp" #include "KokkosKernels_TestUtils.hpp" @@ -41,8 +41,7 @@ struct ParamTag { typedef TB transB; }; -template +template struct Functor_BatchedTeamGemm { using execution_space = typename DeviceType::execution_space; ViewType _a, _b, _c; @@ -50,14 +49,12 @@ struct Functor_BatchedTeamGemm { ScalarType _alpha, _beta; KOKKOS_INLINE_FUNCTION - Functor_BatchedTeamGemm(const ScalarType alpha, const ViewType &a, - const ViewType &b, const ScalarType beta, + Functor_BatchedTeamGemm(const ScalarType alpha, const ViewType &a, const ViewType &b, const ScalarType beta, const ViewType &c) : _a(a), _b(b), _c(c), _alpha(alpha), _beta(beta) {} template - KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, const MemberType &member) const { const int k = member.league_rank(); auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); @@ -69,10 +66,8 @@ struct Functor_BatchedTeamGemm { } member.team_barrier(); - 
KokkosBatched::TeamGemm::invoke(member, _alpha, aa, bb, _beta, - cc); + KokkosBatched::TeamGemm::invoke(member, _alpha, aa, bb, _beta, cc); } inline void run() { @@ -83,8 +78,7 @@ struct Functor_BatchedTeamGemm { Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _c.extent(0); - Kokkos::TeamPolicy policy(league_size, - Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); Kokkos::parallel_for((name + "::GemmFunctor").c_str(), policy, *this); Kokkos::Profiling::popRegion(); } @@ -124,15 +118,13 @@ struct Functor_BatchedTeamLU { } }; -template +template struct Functor_TestBatchedTeamInverseLU { AViewType _a; WViewType _w; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamInverseLU(const AViewType &a, const WViewType &w) - : _a(a), _w(w) {} + Functor_TestBatchedTeamInverseLU(const AViewType &a, const WViewType &w) : _a(a), _w(w) {} template KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { @@ -140,8 +132,7 @@ struct Functor_TestBatchedTeamInverseLU { auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); auto ww = Kokkos::subview(_w, k, Kokkos::ALL()); - KokkosBatched::TeamInverseLU::invoke(member, aa, - ww); + KokkosBatched::TeamInverseLU::invoke(member, aa, ww); } inline void run() { @@ -158,8 +149,7 @@ struct Functor_TestBatchedTeamInverseLU { } }; -template +template void impl_test_batched_inverselu(const int N, const int BlkSize) { typedef typename AViewType::value_type value_type; typedef Kokkos::ArithTraits ats; @@ -170,8 +160,7 @@ void impl_test_batched_inverselu(const int N, const int BlkSize) { WViewType w("w", N, BlkSize * BlkSize); AViewType c0("c0", N, BlkSize, BlkSize); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a0, random, value_type(1.0)); Kokkos::fence(); @@ -181,15 +170,12 @@ void impl_test_batched_inverselu(const int N, const int BlkSize) { Functor_BatchedTeamLU(a1).run(); - Functor_TestBatchedTeamInverseLU(a1, 
w) - .run(); + Functor_TestBatchedTeamInverseLU(a1, w).run(); value_type alpha = 1.0, beta = 0.0; typedef ParamTag param_tag_type; - Functor_BatchedTeamGemm(alpha, a0, a1, beta, c0) + Functor_BatchedTeamGemm(alpha, a0, a1, beta, c0) .run(); Kokkos::fence(); @@ -220,33 +206,21 @@ template int test_batched_team_inverselu() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - AViewType; - typedef Kokkos::View - WViewType; - Test::TeamInverseLU::impl_test_batched_inverselu( - 0, 10); + typedef Kokkos::View AViewType; + typedef Kokkos::View WViewType; + Test::TeamInverseLU::impl_test_batched_inverselu(0, 10); for (int i = 0; i < 10; ++i) { - Test::TeamInverseLU::impl_test_batched_inverselu( - 1024, i); + Test::TeamInverseLU::impl_test_batched_inverselu(1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - AViewType; - typedef Kokkos::View - WViewType; - Test::TeamInverseLU::impl_test_batched_inverselu( - 0, 10); + typedef Kokkos::View AViewType; + typedef Kokkos::View WViewType; + Test::TeamInverseLU::impl_test_batched_inverselu(0, 10); for (int i = 0; i < 10; ++i) { - Test::TeamInverseLU::impl_test_batched_inverselu( - 1024, i); + Test::TeamInverseLU::impl_test_batched_inverselu(1024, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamInverseLU_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamInverseLU_Complex.hpp index 7eb918beef..cf670f2fc9 100644 --- a/batched/dense/unit_test/Test_Batched_TeamInverseLU_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamInverseLU_Complex.hpp @@ -18,11 +18,9 @@ TEST_F(TestCategory, batched_scalar_team_inverselu_dcomplex) { // printf("Batched team inverse LU - double complex - algorithm type: // Unblocked\n"); - test_batched_inverselu, - Algo::InverseLU::Unblocked>(); + test_batched_inverselu, Algo::InverseLU::Unblocked>(); // printf("Batched team inverse LU - double complex - algorithm type: // Blocked\n"); - test_batched_inverselu, - 
Algo::InverseLU::Blocked>(); + test_batched_inverselu, Algo::InverseLU::Blocked>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamLU.hpp b/batched/dense/unit_test/Test_Batched_TeamLU.hpp index e20f3a7411..b662c4a365 100644 --- a/batched/dense/unit_test/Test_Batched_TeamLU.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamLU.hpp @@ -19,7 +19,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -//#include "KokkosBatched_Vector.hpp" +// #include "KokkosBatched_Vector.hpp" #include "KokkosBatched_LU_Decl.hpp" #include "KokkosBatched_LU_Serial_Impl.hpp" @@ -76,16 +76,14 @@ void impl_test_batched_lu(const int N, const int BlkSize) { /// randomized input testing views ViewType a0("a0", N, BlkSize, BlkSize), a1("a1", N, BlkSize, BlkSize); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a0, random, value_type(1.0)); Kokkos::fence(); Kokkos::deep_copy(a1, a0); - Functor_TestBatchedTeamLU(a0) - .run(); + Functor_TestBatchedTeamLU(a0).run(); Functor_TestBatchedTeamLU(a1).run(); Kokkos::fence(); @@ -117,27 +115,21 @@ template int test_batched_team_lu() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - ViewType; - Test::TeamLU::impl_test_batched_lu(0, - 10); + typedef Kokkos::View ViewType; + Test::TeamLU::impl_test_batched_lu(0, 10); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::TeamLU::impl_test_batched_lu( - 1024, i); + Test::TeamLU::impl_test_batched_lu(1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - Test::TeamLU::impl_test_batched_lu(0, - 10); + typedef Kokkos::View ViewType; + Test::TeamLU::impl_test_batched_lu(0, 10); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::TeamLU::impl_test_batched_lu( - 1024, i); + Test::TeamLU::impl_test_batched_lu(1024, i); } } #endif diff --git 
a/batched/dense/unit_test/Test_Batched_TeamSolveLU.hpp b/batched/dense/unit_test/Test_Batched_TeamSolveLU.hpp index 445e10132f..61a11e6be7 100644 --- a/batched/dense/unit_test/Test_Batched_TeamSolveLU.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamSolveLU.hpp @@ -19,14 +19,14 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -//#include "KokkosBatched_Vector.hpp" +// #include "KokkosBatched_Vector.hpp" #include "KokkosBatched_Gemm_Decl.hpp" #include "KokkosBatched_Gemm_Team_Impl.hpp" #include "KokkosBatched_LU_Decl.hpp" #include "KokkosBatched_LU_Team_Impl.hpp" #include "KokkosBatched_SolveLU_Decl.hpp" -//#include "KokkosBatched_SolveLU_Team_Impl.hpp" +// #include "KokkosBatched_SolveLU_Team_Impl.hpp" #include "KokkosKernels_TestUtils.hpp" @@ -41,8 +41,7 @@ struct ParamTag { typedef TB transB; }; -template +template struct Functor_BatchedTeamGemm { using execution_space = typename DeviceType::execution_space; ViewType _a, _b, _c; @@ -50,14 +49,12 @@ struct Functor_BatchedTeamGemm { ScalarType _alpha, _beta; KOKKOS_INLINE_FUNCTION - Functor_BatchedTeamGemm(const ScalarType alpha, const ViewType &a, - const ViewType &b, const ScalarType beta, + Functor_BatchedTeamGemm(const ScalarType alpha, const ViewType &a, const ViewType &b, const ScalarType beta, const ViewType &c) : _a(a), _b(b), _c(c), _alpha(alpha), _beta(beta) {} template - KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, const MemberType &member) const { const int k = member.league_rank(); auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); @@ -69,10 +66,8 @@ struct Functor_BatchedTeamGemm { } member.team_barrier(); - KokkosBatched::TeamGemm::invoke(member, _alpha, aa, bb, _beta, - cc); + KokkosBatched::TeamGemm::invoke(member, _alpha, aa, bb, _beta, cc); } inline void run() { @@ -82,8 +77,7 @@ struct Functor_BatchedTeamGemm { std::string name = name_region + 
name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _c.extent(0); - Kokkos::TeamPolicy policy(league_size, - Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); Kokkos::parallel_for((name + "::GemmFunctor").c_str(), policy, *this); Kokkos::Profiling::popRegion(); } @@ -120,16 +114,14 @@ struct Functor_BatchedTeamLU { Kokkos::Profiling::popRegion(); } }; -template +template struct Functor_TestBatchedTeamSolveLU { using execution_space = typename DeviceType::execution_space; ViewType _a; ViewType _b; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamSolveLU(const ViewType &a, const ViewType &b) - : _a(a), _b(b) {} + Functor_TestBatchedTeamSolveLU(const ViewType &a, const ViewType &b) : _a(a), _b(b) {} template KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { @@ -137,8 +129,7 @@ struct Functor_TestBatchedTeamSolveLU { auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); - KokkosBatched::TeamSolveLU::invoke( - member, aa, bb); + KokkosBatched::TeamSolveLU::invoke(member, aa, bb); } inline void run() { @@ -168,8 +159,7 @@ void impl_test_batched_solvelu(const int N, const int BlkSize) { // ViewType a0_T("a0_T", N, BlkSize, BlkSize); // ViewType b_T ("b_T", N, BlkSize, 5 ); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a0, random, value_type(1.0)); Kokkos::fill_random(x0, random, value_type(1.0)); @@ -181,15 +171,11 @@ void impl_test_batched_solvelu(const int N, const int BlkSize) { value_type alpha = 1.0, beta = 0.0; typedef ParamTag param_tag_type; - Functor_BatchedTeamGemm(alpha, a0, x0, beta, b) - .run(); + Functor_BatchedTeamGemm(alpha, a0, x0, beta, b).run(); Functor_BatchedTeamLU(a1).run(); - Functor_TestBatchedTeamSolveLU(a1, b) - .run(); + Functor_TestBatchedTeamSolveLU(a1, b).run(); Kokkos::fence(); @@ -246,25 +232,19 @@ template int 
test_batched_team_solvelu() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - ViewType; - Test::TeamSolveLU::impl_test_batched_solvelu(0, 10); + typedef Kokkos::View ViewType; + Test::TeamSolveLU::impl_test_batched_solvelu(0, 10); for (int i = 0; i < 10; ++i) { - Test::TeamSolveLU::impl_test_batched_solvelu(1024, i); + Test::TeamSolveLU::impl_test_batched_solvelu(1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - Test::TeamSolveLU::impl_test_batched_solvelu(0, 10); + typedef Kokkos::View ViewType; + Test::TeamSolveLU::impl_test_batched_solvelu(0, 10); for (int i = 0; i < 10; ++i) { - Test::TeamSolveLU::impl_test_batched_solvelu(1024, i); + Test::TeamSolveLU::impl_test_batched_solvelu(1024, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamSolveLU_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamSolveLU_Complex.hpp index 865f58ef43..f904983509 100644 --- a/batched/dense/unit_test/Test_Batched_TeamSolveLU_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamSolveLU_Complex.hpp @@ -18,11 +18,9 @@ TEST_F(TestCategory, batched_scalar_team_solvelu_dcomplex) { // printf("Batched team solveLU - double complex - algorithm type: // Unblocked\n"); - test_batched_team_solvelu, - Algo::SolveLU::Unblocked>(); + test_batched_team_solvelu, Algo::SolveLU::Unblocked>(); // printf("Batched team solveLU - double complex - algorithm type: // Blocked\n"); - test_batched_team_solvelu, - Algo::SolveLU::Blocked>(); + test_batched_team_solvelu, Algo::SolveLU::Blocked>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamTrsm.hpp b/batched/dense/unit_test/Test_Batched_TeamTrsm.hpp index 523bd02df4..5ae1e216d9 100644 --- a/batched/dense/unit_test/Test_Batched_TeamTrsm.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamTrsm.hpp @@ -19,7 +19,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -//#include "KokkosBatched_Vector.hpp" +// #include 
"KokkosBatched_Vector.hpp" #include "KokkosBatched_Trsm_Decl.hpp" #include "KokkosBatched_Trsm_Serial_Impl.hpp" @@ -40,8 +40,7 @@ struct ParamTag { typedef D diag; }; -template +template struct Functor_TestBatchedTeamTrsm { using execution_space = typename DeviceType::execution_space; ViewType _a, _b; @@ -49,22 +48,20 @@ struct Functor_TestBatchedTeamTrsm { ScalarType _alpha; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamTrsm(const ScalarType alpha, const ViewType &a, - const ViewType &b) + Functor_TestBatchedTeamTrsm(const ScalarType alpha, const ViewType &a, const ViewType &b) : _a(a), _b(b), _alpha(alpha) {} template - KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, const MemberType &member) const { const int k = member.league_rank(); auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); - KokkosBatched::TeamTrsm< - MemberType, typename ParamTagType::side, typename ParamTagType::uplo, - typename ParamTagType::trans, typename ParamTagType::diag, - AlgoTagType>::invoke(member, _alpha, aa, bb); + KokkosBatched::TeamTrsm::invoke(member, + _alpha, aa, + bb); } inline void run() { @@ -75,15 +72,13 @@ struct Functor_TestBatchedTeamTrsm { Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _b.extent(0); - Kokkos::TeamPolicy policy(league_size, - Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } }; -template +template void impl_test_batched_trsm(const int N, const int BlkSize, const int NumCols) { typedef typename ViewType::value_type value_type; typedef Kokkos::ArithTraits ats; @@ -91,15 +86,13 @@ void impl_test_batched_trsm(const int N, const int BlkSize, const int NumCols) { /// randomized input testing views ScalarType alpha(1.0); - const bool is_side_right = - 
std::is_same::value; - const int b_nrows = is_side_right ? NumCols : BlkSize; - const int b_ncols = is_side_right ? BlkSize : NumCols; - ViewType a0("a0", N, BlkSize, BlkSize), a1("a1", N, BlkSize, BlkSize), - b0("b0", N, b_nrows, b_ncols), b1("b1", N, b_nrows, b_ncols); + const bool is_side_right = std::is_same::value; + const int b_nrows = is_side_right ? NumCols : BlkSize; + const int b_ncols = is_side_right ? BlkSize : NumCols; + ViewType a0("a0", N, BlkSize, BlkSize), a1("a1", N, BlkSize, BlkSize), b0("b0", N, b_nrows, b_ncols), + b1("b1", N, b_nrows, b_ncols); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a0, random, value_type(1.0)); Kokkos::fill_random(b0, random, value_type(1.0)); @@ -108,12 +101,9 @@ void impl_test_batched_trsm(const int N, const int BlkSize, const int NumCols) { Kokkos::deep_copy(a1, a0); Kokkos::deep_copy(b1, b0); - Functor_TestBatchedTeamTrsm(alpha, a0, b0) - .run(); - Functor_TestBatchedTeamTrsm(alpha, a1, b1) + Functor_TestBatchedTeamTrsm(alpha, a0, b0) .run(); + Functor_TestBatchedTeamTrsm(alpha, a1, b1).run(); Kokkos::fence(); @@ -140,40 +130,27 @@ void impl_test_batched_trsm(const int N, const int BlkSize, const int NumCols) { } // namespace TeamTrsm } // namespace Test -template +template int test_batched_team_trsm() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - ViewType; - Test::TeamTrsm::impl_test_batched_trsm(0, 10, 4); + typedef Kokkos::View ViewType; + Test::TeamTrsm::impl_test_batched_trsm(0, 10, 4); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::TeamTrsm::impl_test_batched_trsm(1024, i, - 4); - Test::TeamTrsm::impl_test_batched_trsm(1024, i, - 1); + Test::TeamTrsm::impl_test_batched_trsm(1024, i, 4); + Test::TeamTrsm::impl_test_batched_trsm(1024, i, 1); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - 
Test::TeamTrsm::impl_test_batched_trsm(0, 10, 4); + typedef Kokkos::View ViewType; + Test::TeamTrsm::impl_test_batched_trsm(0, 10, 4); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutRight, Blksize %d\n", i); - Test::TeamTrsm::impl_test_batched_trsm(1024, i, - 4); - Test::TeamTrsm::impl_test_batched_trsm(1024, i, - 1); + Test::TeamTrsm::impl_test_batched_trsm(1024, i, 4); + Test::TeamTrsm::impl_test_batched_trsm(1024, i, 1); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamTrsm_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamTrsm_Complex.hpp index 0cf2761922..cf9cafeb9e 100644 --- a/batched/dense/unit_test/Test_Batched_TeamTrsm_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamTrsm_Complex.hpp @@ -16,176 +16,106 @@ #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_u_dcomplex_dcomplex) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, - Kokkos::complex, param_tag_type, - algo_tag_type>(); + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_n_dcomplex_dcomplex) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, - Kokkos::complex, param_tag_type, - algo_tag_type>(); + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_u_dcomplex_dcomplex) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, - Kokkos::complex, param_tag_type, - algo_tag_type>(); + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, 
batched_scalar_team_trsm_l_u_nt_n_dcomplex_dcomplex) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, - Kokkos::complex, param_tag_type, - algo_tag_type>(); + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_u_dcomplex_dcomplex) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, - Kokkos::complex, param_tag_type, - algo_tag_type>(); + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_n_dcomplex_dcomplex) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, - Kokkos::complex, param_tag_type, - algo_tag_type>(); + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } // TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_u_dcomplex_dcomplex) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, - Kokkos::complex, param_tag_type, - algo_tag_type>(); + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_n_dcomplex_dcomplex) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, - Kokkos::complex, param_tag_type, - algo_tag_type>(); + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_u_dcomplex_dcomplex) { - typedef 
::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, - Kokkos::complex, param_tag_type, - algo_tag_type>(); + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_n_dcomplex_dcomplex) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, - Kokkos::complex, param_tag_type, - algo_tag_type>(); + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_u_dcomplex_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_n_dcomplex_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_u_dcomplex_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_n_dcomplex_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked 
algo_tag_type; - test_batched_team_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_u_dcomplex_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_n_dcomplex_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } // TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_u_dcomplex_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_n_dcomplex_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_u_dcomplex_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } 
TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_n_dcomplex_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamTrsm_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamTrsm_Real.hpp index 6757617ddd..cd1d2a7211 100644 --- a/batched/dense/unit_test/Test_Batched_TeamTrsm_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamTrsm_Real.hpp @@ -16,168 +16,108 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_u_float_float) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_n_float_float) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_u_float_float) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_n_float_float) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_u_float_float) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag 
param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_n_float_float) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } // TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_u_float_float) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_n_float_float) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_u_float_float) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_n_float_float) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_u_double_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_n_double_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef 
::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_u_double_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_n_double_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_u_double_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_n_double_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } // TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_u_double_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_n_double_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_u_double_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag 
param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_n_double_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamTrsv.hpp b/batched/dense/unit_test/Test_Batched_TeamTrsv.hpp index 400e35deb8..37e8708bd2 100644 --- a/batched/dense/unit_test/Test_Batched_TeamTrsv.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamTrsv.hpp @@ -19,7 +19,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -//#include "KokkosBatched_Vector.hpp" +// #include "KokkosBatched_Vector.hpp" #include "KokkosBatched_Trsv_Decl.hpp" #include "KokkosBatched_Trsv_Serial_Impl.hpp" @@ -38,8 +38,7 @@ struct ParamTag { typedef D diag; }; -template +template struct Functor_TestBatchedTeamTrsv { using execution_space = typename DeviceType::execution_space; ViewType _a, _b; @@ -47,22 +46,18 @@ struct Functor_TestBatchedTeamTrsv { ScalarType _alpha; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamTrsv(const ScalarType alpha, const ViewType &a, - const ViewType &b) + Functor_TestBatchedTeamTrsv(const ScalarType alpha, const ViewType &a, const ViewType &b) : _a(a), _b(b), _alpha(alpha) {} template - KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, const MemberType &member) const { const int k = member.league_rank(); auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), 0); - KokkosBatched::TeamTrsv< - MemberType, typename ParamTagType::uplo, typename ParamTagType::trans, - typename ParamTagType::diag, AlgoTagType>::invoke(member, _alpha, aa, - bb); + KokkosBatched::TeamTrsv::invoke(member, _alpha, 
aa, bb); } inline void run() { @@ -73,15 +68,13 @@ struct Functor_TestBatchedTeamTrsv { Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _b.extent(0); - Kokkos::TeamPolicy policy(league_size, - Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } }; -template +template void impl_test_batched_trsv(const int N, const int BlkSize) { typedef typename ViewType::value_type value_type; typedef Kokkos::ArithTraits ats; @@ -89,11 +82,10 @@ void impl_test_batched_trsv(const int N, const int BlkSize) { /// randomized input testing views ScalarType alpha(1.5); - ViewType a0("a0", N, BlkSize, BlkSize), a1("a1", N, BlkSize, BlkSize), - b0("b0", N, BlkSize, 1), b1("b1", N, BlkSize, 1); + ViewType a0("a0", N, BlkSize, BlkSize), a1("a1", N, BlkSize, BlkSize), b0("b0", N, BlkSize, 1), + b1("b1", N, BlkSize, 1); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a0, random, value_type(1.0)); Kokkos::fill_random(b0, random, value_type(1.0)); @@ -104,12 +96,9 @@ void impl_test_batched_trsv(const int N, const int BlkSize) { Kokkos::deep_copy(a1, a0); Kokkos::deep_copy(b1, b0); - Functor_TestBatchedTeamTrsv(alpha, a0, b0) - .run(); - Functor_TestBatchedTeamTrsv(alpha, a1, b1) + Functor_TestBatchedTeamTrsv(alpha, a0, b0) .run(); + Functor_TestBatchedTeamTrsv(alpha, a1, b1).run(); Kokkos::fence(); @@ -136,34 +125,25 @@ void impl_test_batched_trsv(const int N, const int BlkSize) { } // namespace TeamTrsv } // namespace Test -template +template int test_batched_team_trsv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - ViewType; - Test::TeamTrsv::impl_test_batched_trsv(0, 10); + typedef Kokkos::View ViewType; + Test::TeamTrsv::impl_test_batched_trsv(0, 10); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); - 
Test::TeamTrsv::impl_test_batched_trsv(1024, - i); + Test::TeamTrsv::impl_test_batched_trsv(1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - Test::TeamTrsv::impl_test_batched_trsv(0, 10); + typedef Kokkos::View ViewType; + Test::TeamTrsv::impl_test_batched_trsv(0, 10); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutRight, Blksize %d\n", i); - Test::TeamTrsv::impl_test_batched_trsv(1024, - i); + Test::TeamTrsv::impl_test_batched_trsv(1024, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorAxpy.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorAxpy.hpp index fca0534b4b..cd378745ef 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorAxpy.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorAxpy.hpp @@ -37,9 +37,7 @@ struct Functor_TestBatchedTeamVectorAxpy { const int _N_team; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorAxpy(const alphaViewType &alpha, - const ViewType &X, const ViewType &Y, - const int N_team) + Functor_TestBatchedTeamVectorAxpy(const alphaViewType &alpha, const ViewType &X, const ViewType &Y, const int N_team) : _alpha(alpha), _X(X), _Y(Y), _N_team(N_team) {} template @@ -47,16 +45,12 @@ struct Functor_TestBatchedTeamVectorAxpy { const int first_matrix = static_cast(member.league_rank()) * _N_team; const int N = _X.extent(0); const int last_matrix = - (static_cast(member.league_rank() + 1) * _N_team < N - ? static_cast(member.league_rank() + 1) * _N_team - : N); + (static_cast(member.league_rank() + 1) * _N_team < N ? 
static_cast(member.league_rank() + 1) * _N_team + : N); - auto alpha = - Kokkos::subview(_alpha, Kokkos::make_pair(first_matrix, last_matrix)); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto y = Kokkos::subview(_Y, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + auto alpha = Kokkos::subview(_alpha, Kokkos::make_pair(first_matrix, last_matrix)); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto y = Kokkos::subview(_Y, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); KokkosBatched::TeamVectorAxpy::invoke(member, alpha, x, y); } @@ -67,8 +61,7 @@ struct Functor_TestBatchedTeamVectorAxpy { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy(_X.extent(0) / _N_team, - Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_X.extent(0) / _N_team, Kokkos::AUTO(), Kokkos::AUTO()); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } @@ -81,13 +74,11 @@ void impl_test_batched_axpy(const int N, const int BlkSize, const int N_team) { typedef typename alphaViewType::const_value_type alpha_const_value_type; typedef Kokkos::ArithTraits ats; - ViewType X0("x0", N, BlkSize), X1("x1", N, BlkSize), Y0("y0", N, BlkSize), - Y1("y1", N, BlkSize); + ViewType X0("x0", N, BlkSize), X1("x1", N, BlkSize), Y0("y0", N, BlkSize), Y1("y1", N, BlkSize); alphaViewType alpha("alpha", N); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(X0, random, const_value_type(1.0)); Kokkos::fill_random(Y0, random, const_value_type(1.0)); Kokkos::fill_random(alpha, random, alpha_const_value_type(1.0)); @@ -107,12 +98,9 @@ void impl_test_batched_axpy(const int N, const int BlkSize, const int N_team) { Kokkos::deep_copy(Y0_host, Y0); for (int l = 0; l 
< N; ++l) - for (int i = 0; i < BlkSize; ++i) - Y0_host(l, i) += alpha_host(l) * X0_host(l, i); + for (int i = 0; i < BlkSize; ++i) Y0_host(l, i) += alpha_host(l) * X0_host(l, i); - Functor_TestBatchedTeamVectorAxpy( - alpha, X1, Y1, N_team) - .run(); + Functor_TestBatchedTeamVectorAxpy(alpha, X1, Y1, N_team).run(); Kokkos::fence(); @@ -141,25 +129,20 @@ int test_batched_teamvector_axpy() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { typedef Kokkos::View ViewType; - typedef Kokkos::View - alphaViewType; + typedef Kokkos::View alphaViewType; for (int i = 3; i < 10; ++i) { - Test::TeamVectorAxpy::impl_test_batched_axpy(1024, i, 2); + Test::TeamVectorAxpy::impl_test_batched_axpy(1024, i, 2); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - typedef Kokkos::View - alphaViewType; + typedef Kokkos::View ViewType; + typedef Kokkos::View alphaViewType; for (int i = 3; i < 10; ++i) { - Test::TeamVectorAxpy::impl_test_batched_axpy(1024, i, 2); + Test::TeamVectorAxpy::impl_test_batched_axpy(1024, i, 2); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorAxpy_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorAxpy_Complex.hpp index b1f70a723e..0e8cb013f1 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorAxpy_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorAxpy_Complex.hpp @@ -16,8 +16,7 @@ #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, batched_scalar_teamvector_axpy_nt_dcomplex_dcomplex) { - test_batched_teamvector_axpy, - Kokkos::complex>(); + test_batched_teamvector_axpy, Kokkos::complex>(); } TEST_F(TestCategory, batched_scalar_teamvector_axpy_nt_dcomplex_double) { diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorGemm.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorGemm.hpp index f2f3bc217d..2ebc10f2e0 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorGemm.hpp @@ 
-33,8 +33,7 @@ struct ParamTag { typedef TB transB; }; -template +template struct Functor_TestBatchedTeamVector { using execution_space = typename DeviceType::execution_space; ViewType _a, _b, _c; @@ -42,24 +41,20 @@ struct Functor_TestBatchedTeamVector { ScalarType _alpha, _beta; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVector(const ScalarType alpha, const ViewType &a, - const ViewType &b, const ScalarType beta, + Functor_TestBatchedTeamVector(const ScalarType alpha, const ViewType &a, const ViewType &b, const ScalarType beta, const ViewType &c) : _a(a), _b(b), _c(c), _alpha(alpha), _beta(beta) {} template - KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, const MemberType &member) const { const int k = member.league_rank(); auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); auto cc = Kokkos::subview(_c, k, Kokkos::ALL(), Kokkos::ALL()); - KokkosBatched::TeamVectorGemm::invoke(member, _alpha, aa, bb, - _beta, cc); + KokkosBatched::TeamVectorGemm::invoke(member, _alpha, aa, bb, _beta, cc); } inline void run() { @@ -69,19 +64,15 @@ struct Functor_TestBatchedTeamVector { std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _c.extent(0); - Kokkos::TeamPolicy policy(league_size, - Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } }; -template -void impl_test_batched_teamvectorgemm(const int N, const int matAdim1, - const int matAdim2, const int matBdim1, - const int matBdim2, const int matCdim1, - const int matCdim2) { +template +void impl_test_batched_teamvectorgemm(const int N, const int matAdim1, const int matAdim2, const int matBdim1, + const int matBdim2, const int matCdim1, const int matCdim2) { using 
transA = typename ParamTagType::transA; using transB = typename ParamTagType::transB; using execution_space = typename DeviceType::execution_space; @@ -91,15 +82,11 @@ void impl_test_batched_teamvectorgemm(const int N, const int matAdim1, /// randomized input testing views ScalarType alpha = ScalarType(1.5), beta = ScalarType(3.0); - ViewType a_expected("a_expected", N, matAdim1, matAdim2), - a_actual("a_actual", N, matAdim1, matAdim2), - b_expected("b_expected", N, matBdim1, matBdim2), - b_actual("b_actual", N, matBdim1, matBdim2), - c_expected("c_expected", N, matCdim1, matCdim2), - c_actual("c_actual", N, matCdim1, matCdim2); + ViewType a_expected("a_expected", N, matAdim1, matAdim2), a_actual("a_actual", N, matAdim1, matAdim2), + b_expected("b_expected", N, matBdim1, matBdim2), b_actual("b_actual", N, matBdim1, matBdim2), + c_expected("c_expected", N, matCdim1, matCdim2), c_actual("c_actual", N, matCdim1, matCdim2); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a_expected, random, value_type(1.0)); Kokkos::fill_random(b_expected, random, value_type(1.0)); @@ -114,8 +101,7 @@ void impl_test_batched_teamvectorgemm(const int N, const int matAdim1, // Functor_TestBatchedTeamVector(alpha, a_expected, b_expected, // beta, c_expected).run(); - Functor_BatchedVanillaGEMM - vgemm; + Functor_BatchedVanillaGEMM vgemm; vgemm.A_t = std::is_same::value; vgemm.B_t = std::is_same::value; vgemm.A_c = vgemm.B_c = false; @@ -126,17 +112,14 @@ void impl_test_batched_teamvectorgemm(const int N, const int matAdim1, vgemm.beta = beta; vgemm.run(); // Compute c_expected - Functor_TestBatchedTeamVector(alpha, a_actual, b_actual, beta, - c_actual) + Functor_TestBatchedTeamVector(alpha, a_actual, b_actual, + beta, c_actual) .run(); Kokkos::fence(); - typename ViewType::HostMirror c_expected_host = - Kokkos::create_mirror_view(c_expected); - typename ViewType::HostMirror c_actual_host = - 
Kokkos::create_mirror_view(c_actual); + typename ViewType::HostMirror c_expected_host = Kokkos::create_mirror_view(c_expected); + typename ViewType::HostMirror c_actual_host = Kokkos::create_mirror_view(c_actual); // Copy to host for comparison Kokkos::deep_copy(c_expected_host, c_expected); @@ -165,111 +148,80 @@ void impl_test_batched_teamvectorgemm(const int N, const int matAdim1, // void (*impl_test)(const int, const int, const int, const int, const int, // const int, const int) -template +template int test_batched_teamvectorgemm() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - ViewType; - Test::TeamVectorGemm::impl_test_batched_teamvectorgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( + typedef Kokkos::View ViewType; + Test::TeamVectorGemm::impl_test_batched_teamvectorgemm( 0, 10, 10, 10, 10, 10, 10); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::TeamVectorGemm::impl_test_batched_teamvectorgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( - 1024, i, i, i, i, i, i); + Test::TeamVectorGemm::impl_test_batched_teamvectorgemm(1024, i, i, i, i, i, i); } for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); int dimM = i; int dimN = 2 * i; int dimK = 3 * i; - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamVectorGemm::impl_test_batched_teamvectorgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( - 1024, dimM, dimK, dimK, dimN, dimM, dimN); + if ((std::is_same::value) && + (std::is_same::value)) { + Test::TeamVectorGemm::impl_test_batched_teamvectorgemm(1024, dimM, dimK, dimK, dimN, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamVectorGemm::impl_test_batched_teamvectorgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( - 1024, dimM, dimK, dimN, dimK, dimM, dimN); + if ((std::is_same::value) && + (std::is_same::value)) { + 
Test::TeamVectorGemm::impl_test_batched_teamvectorgemm(1024, dimM, dimK, dimN, dimK, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamVectorGemm::impl_test_batched_teamvectorgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( - 1024, dimK, dimM, dimK, dimN, dimM, dimN); + if ((std::is_same::value) && + (std::is_same::value)) { + Test::TeamVectorGemm::impl_test_batched_teamvectorgemm(1024, dimK, dimM, dimK, dimN, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamVectorGemm::impl_test_batched_teamvectorgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( - 1024, dimK, dimM, dimN, dimK, dimM, dimN); + if ((std::is_same::value) && + (std::is_same::value)) { + Test::TeamVectorGemm::impl_test_batched_teamvectorgemm(1024, dimK, dimM, dimN, dimK, dimM, dimN); } } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - Test::TeamVectorGemm::impl_test_batched_teamvectorgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( + typedef Kokkos::View ViewType; + Test::TeamVectorGemm::impl_test_batched_teamvectorgemm( 0, 10, 10, 10, 10, 10, 10); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutRight, Blksize %d\n", i); - Test::TeamVectorGemm::impl_test_batched_teamvectorgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( - 1024, i, i, i, i, i, i); + Test::TeamVectorGemm::impl_test_batched_teamvectorgemm(1024, i, i, i, i, i, i); } for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); int dimM = i; int dimN = 2 * i; int dimK = 3 * i; - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamVectorGemm::impl_test_batched_teamvectorgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( - 1024, dimM, dimK, dimK, dimN, dimM, dimN); + if ((std::is_same::value) && + (std::is_same::value)) { + Test::TeamVectorGemm::impl_test_batched_teamvectorgemm(1024, 
dimM, dimK, dimK, dimN, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamVectorGemm::impl_test_batched_teamvectorgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( - 1024, dimM, dimK, dimN, dimK, dimM, dimN); + if ((std::is_same::value) && + (std::is_same::value)) { + Test::TeamVectorGemm::impl_test_batched_teamvectorgemm(1024, dimM, dimK, dimN, dimK, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamVectorGemm::impl_test_batched_teamvectorgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( - 1024, dimK, dimM, dimK, dimN, dimM, dimN); + if ((std::is_same::value) && + (std::is_same::value)) { + Test::TeamVectorGemm::impl_test_batched_teamvectorgemm(1024, dimK, dimM, dimK, dimN, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamVectorGemm::impl_test_batched_teamvectorgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( - 1024, dimK, dimM, dimN, dimK, dimM, dimN); + if ((std::is_same::value) && + (std::is_same::value)) { + Test::TeamVectorGemm::impl_test_batched_teamvectorgemm(1024, dimK, dimM, dimN, dimK, dimM, dimN); } } } diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Complex.hpp index cc6cbdd511..3d8bd949da 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Complex.hpp @@ -15,80 +15,62 @@ //@HEADER #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_nt_scomplex_scomplex) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, - Kokkos::complex, param_tag_type, + test_batched_teamvectorgemm, Kokkos::complex, 
param_tag_type, Algo::Gemm::Unblocked>(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_nt_scomplex_scomplex) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, - Kokkos::complex, param_tag_type, + test_batched_teamvectorgemm, Kokkos::complex, param_tag_type, Algo::Gemm::Unblocked>(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_t_scomplex_scomplex) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, - Kokkos::complex, param_tag_type, + test_batched_teamvectorgemm, Kokkos::complex, param_tag_type, Algo::Gemm::Unblocked>(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_t_scomplex_scomplex) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, - Kokkos::complex, param_tag_type, + test_batched_teamvectorgemm, Kokkos::complex, param_tag_type, Algo::Gemm::Unblocked>(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_nt_dcomplex_dcomplex) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, - Kokkos::complex, param_tag_type, + test_batched_teamvectorgemm, Kokkos::complex, param_tag_type, Algo::Gemm::Unblocked>(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_nt_dcomplex_dcomplex) { - typedef ::Test::TeamVectorGemm::ParamTag - 
param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, - Kokkos::complex, param_tag_type, + test_batched_teamvectorgemm, Kokkos::complex, param_tag_type, Algo::Gemm::Unblocked>(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_t_dcomplex_dcomplex) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, - Kokkos::complex, param_tag_type, + test_batched_teamvectorgemm, Kokkos::complex, param_tag_type, Algo::Gemm::Unblocked>(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_t_dcomplex_dcomplex) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, - Kokkos::complex, param_tag_type, + test_batched_teamvectorgemm, Kokkos::complex, param_tag_type, Algo::Gemm::Unblocked>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Real.hpp index e96bc1ac5c..74a32c13e9 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Real.hpp @@ -15,152 +15,116 @@ //@HEADER #if defined(KOKKOS_BHALF_T_IS_FLOAT) TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_nt_bhalf_bhalf) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_nt_bhalf_bhalf) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef 
::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_t_bhalf_bhalf) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_t_bhalf_bhalf) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); } #endif // KOKKOS_BHALF_T_IS_FLOAT #if defined(KOKKOS_HALF_T_IS_FLOAT) TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_nt_half_half) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_nt_half_half) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_t_half_half) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_t_half_half) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); } #endif // KOKKOS_HALF_T_IS_FLOAT #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_nt_float_float) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag 
param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_nt_float_float) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_t_float_float) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_t_float_float) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_nt_double_double) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_nt_double_double) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_t_double_double) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } TEST_F(TestCategory, 
batched_scalar_team_vector_gemm_t_t_double_double) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorGesv.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorGesv.hpp index ddb1a5c40d..dba452da53 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorGesv.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorGesv.hpp @@ -32,8 +32,7 @@ using namespace KokkosBatched; namespace Test { namespace TeamVectorGesv { -template +template struct Functor_TestBatchedTeamVectorGesv { using execution_space = typename DeviceType::execution_space; const MatrixType _A; @@ -41,20 +40,18 @@ struct Functor_TestBatchedTeamVectorGesv { const VectorType _B; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorGesv(const MatrixType &A, const VectorType &X, - const VectorType &B) + Functor_TestBatchedTeamVectorGesv(const MatrixType &A, const VectorType &X, const VectorType &B) : _A(A), _X(X), _B(B) {} template KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { const int matrix_id = static_cast(member.league_rank()); - auto A = Kokkos::subview(_A, matrix_id, Kokkos::ALL, Kokkos::ALL); - auto x = Kokkos::subview(_X, matrix_id, Kokkos::ALL); - auto b = Kokkos::subview(_B, matrix_id, Kokkos::ALL); + auto A = Kokkos::subview(_A, matrix_id, Kokkos::ALL, Kokkos::ALL); + auto x = Kokkos::subview(_X, matrix_id, Kokkos::ALL); + auto b = Kokkos::subview(_B, matrix_id, Kokkos::ALL); member.team_barrier(); - KokkosBatched::TeamVectorGesv::invoke(member, A, x, - b); + KokkosBatched::TeamVectorGesv::invoke(member, A, x, b); member.team_barrier(); } @@ -64,13 +61,10 @@ struct Functor_TestBatchedTeamVectorGesv { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; 
Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy(_X.extent(0), Kokkos::AUTO(), - Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_X.extent(0), Kokkos::AUTO(), Kokkos::AUTO()); - using MatrixViewType = - Kokkos::View; + using MatrixViewType = Kokkos::View; const int n = _A.extent(1); size_t bytes_0 = MatrixViewType::shmem_size(n, n + 4); @@ -81,15 +75,13 @@ struct Functor_TestBatchedTeamVectorGesv { } }; -template +template void impl_test_batched_gesv(const int N, const int BlkSize) { typedef typename MatrixType::value_type value_type; typedef Kokkos::ArithTraits ats; using MagnitudeType = typename Kokkos::ArithTraits::mag_type; - using NormViewType = - Kokkos::View; + using NormViewType = Kokkos::View; NormViewType sqr_norm_j("sqr_norm_j", N); auto sqr_norm_j_host = Kokkos::create_mirror_view(sqr_norm_j); @@ -110,23 +102,18 @@ void impl_test_batched_gesv(const int N, const int BlkSize) { Kokkos::fence(); - Functor_TestBatchedTeamVectorGesv(A, X, B) - .run(); + Functor_TestBatchedTeamVectorGesv(A, X, B).run(); Kokkos::fence(); Kokkos::deep_copy(X_host, X); for (int l = 0; l < N; ++l) - KokkosBlas::SerialGemv:: - invoke(-1, Kokkos::subview(A_host, l, Kokkos::ALL, Kokkos::ALL), - Kokkos::subview(X_host, l, Kokkos::ALL), 1, - Kokkos::subview(B_host, l, Kokkos::ALL)); + KokkosBlas::SerialGemv::invoke( + -1, Kokkos::subview(A_host, l, Kokkos::ALL, Kokkos::ALL), Kokkos::subview(X_host, l, Kokkos::ALL), 1, + Kokkos::subview(B_host, l, Kokkos::ALL)); - KokkosBatched::SerialDot::invoke(B_host, B_host, - sqr_norm_j_host); + KokkosBatched::SerialDot::invoke(B_host, B_host, sqr_norm_j_host); const MagnitudeType eps = 1.0e3 * ats::epsilon(); @@ -139,29 +126,21 @@ template int test_batched_teamvector_gesv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - MatrixType; - typedef Kokkos::View - VectorType; + typedef Kokkos::View MatrixType; + typedef Kokkos::View VectorType; for (int i = 3; i < 10; ++i) { - 
Test::TeamVectorGesv::impl_test_batched_gesv( - 1024, i); + Test::TeamVectorGesv::impl_test_batched_gesv(1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - MatrixType; - typedef Kokkos::View - VectorType; + typedef Kokkos::View MatrixType; + typedef Kokkos::View VectorType; for (int i = 3; i < 10; ++i) { - Test::TeamVectorGesv::impl_test_batched_gesv( - 1024, i); + Test::TeamVectorGesv::impl_test_batched_gesv(1024, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorGesv_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorGesv_Real.hpp index 66c6fb3691..73a6281fe5 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorGesv_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorGesv_Real.hpp @@ -15,22 +15,18 @@ //@HEADER #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_teamvector_gesv_static_pivoting_float) { - test_batched_teamvector_gesv(); + test_batched_teamvector_gesv(); } TEST_F(TestCategory, batched_scalar_teamvector_gesv_no_pivoting_float) { - test_batched_teamvector_gesv(); + test_batched_teamvector_gesv(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_teamvector_gesv_static_pivoting_double) { - test_batched_teamvector_gesv(); + test_batched_teamvector_gesv(); } TEST_F(TestCategory, batched_scalar_teamvector_gesv_no_pivoting_double) { - test_batched_teamvector_gesv(); + test_batched_teamvector_gesv(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorQR.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorQR.hpp index 84ccb39611..2f4812179a 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorQR.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorQR.hpp @@ -32,8 +32,8 @@ using namespace KokkosBatched; namespace Test { -template +template struct Functor_TestBatchedTeamVectorQR { using execution_space = typename DeviceType::execution_space; MatrixViewType _a; @@ -41,11 +41,8 @@ struct 
Functor_TestBatchedTeamVectorQR { WorkViewType _w; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorQR(const MatrixViewType &a, - const VectorViewType &x, - const VectorViewType &b, - const VectorViewType &t, - const WorkViewType &w) + Functor_TestBatchedTeamVectorQR(const MatrixViewType &a, const VectorViewType &x, const VectorViewType &b, + const VectorViewType &t, const WorkViewType &w) : _a(a), _x(x), _b(b), _t(t), _w(w) {} template @@ -61,17 +58,15 @@ struct Functor_TestBatchedTeamVectorQR { auto ww = Kokkos::subview(_w, k, Kokkos::ALL()); // make diagonal dominant - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, aa.extent(0)), - [&](const int &i) { aa(i, i) += add_this; }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, aa.extent(0)), [&](const int &i) { aa(i, i) += add_this; }); /// xx = 1 KokkosBlas::TeamVectorSet::invoke(member, one, xx); member.team_barrier(); /// bb = AA*xx - KokkosBlas::TeamVectorGemv::invoke(member, one, aa, - xx, zero, bb); + KokkosBlas::TeamVectorGemv::invoke(member, one, aa, xx, zero, + bb); member.team_barrier(); /// AA = QR @@ -83,13 +78,12 @@ struct Functor_TestBatchedTeamVectorQR { member.team_barrier(); /// xx = Q^{T}xx; - TeamVectorApplyQ::invoke(member, aa, tt, xx, ww); + TeamVectorApplyQ::invoke(member, aa, tt, xx, ww); member.team_barrier(); /// xx = R^{-1} xx - TeamVectorTrsv::invoke(member, one, aa, xx); + TeamVectorTrsv::invoke( + member, one, aa, xx); } inline void run() { @@ -107,8 +101,8 @@ struct Functor_TestBatchedTeamVectorQR { } }; -template +template void impl_test_batched_qr(const int N, const int BlkSize) { typedef typename MatrixViewType::non_const_value_type value_type; typedef Kokkos::ArithTraits ats; @@ -122,14 +116,12 @@ void impl_test_batched_qr(const int N, const int BlkSize) { Kokkos::fence(); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a, random, value_type(1.0)); Kokkos::fence(); - 
Functor_TestBatchedTeamVectorQR(a, x, b, t, w) + Functor_TestBatchedTeamVectorQR(a, x, b, t, w) .run(); Kokkos::fence(); @@ -157,35 +149,25 @@ template int test_batched_qr() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - MatrixViewType; - typedef Kokkos::View - VectorViewType; - typedef Kokkos::View - WorkViewType; - Test::impl_test_batched_qr(0, 10); + typedef Kokkos::View MatrixViewType; + typedef Kokkos::View VectorViewType; + typedef Kokkos::View WorkViewType; + Test::impl_test_batched_qr(0, 10); for (int i = 1; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::impl_test_batched_qr(1024, i); + Test::impl_test_batched_qr(1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - MatrixViewType; - typedef Kokkos::View - VectorViewType; - typedef Kokkos::View - WorkViewType; - Test::impl_test_batched_qr(0, 10); + typedef Kokkos::View MatrixViewType; + typedef Kokkos::View VectorViewType; + typedef Kokkos::View WorkViewType; + Test::impl_test_batched_qr(0, 10); for (int i = 1; i < 10; ++i) { // printf("Testing: LayoutRight, Blksize %d\n", i); - Test::impl_test_batched_qr(1024, i); + Test::impl_test_batched_qr(1024, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp index 09427aa25e..f66cebe07d 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp @@ -32,8 +32,8 @@ using namespace KokkosBatched; namespace Test { -template +template struct Functor_TestBatchedTeamVectorQR_WithColumnPivoting { using execution_space = typename DeviceType::execution_space; MatrixViewType _a; @@ -42,9 +42,9 @@ struct Functor_TestBatchedTeamVectorQR_WithColumnPivoting { WorkViewType _w; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorQR_WithColumnPivoting( - const 
MatrixViewType &a, const VectorViewType &x, const VectorViewType &b, - const VectorViewType &t, const PivotViewType &p, const WorkViewType &w) + Functor_TestBatchedTeamVectorQR_WithColumnPivoting(const MatrixViewType &a, const VectorViewType &x, + const VectorViewType &b, const VectorViewType &t, + const PivotViewType &p, const WorkViewType &w) : _a(a), _x(x), _b(b), _t(t), _p(p), _w(w) {} template @@ -69,15 +69,13 @@ struct Functor_TestBatchedTeamVectorQR_WithColumnPivoting { member.team_barrier(); /// bb = AA*xx - KokkosBlas::TeamVectorGemv::invoke(member, one, aa, - xx, zero, bb); + KokkosBlas::TeamVectorGemv::invoke(member, one, aa, xx, zero, + bb); member.team_barrier(); /// AA P^T = QR int matrix_rank(0); - TeamVectorQR_WithColumnPivoting::invoke( - member, aa, tt, pp, ww, matrix_rank); + TeamVectorQR_WithColumnPivoting::invoke(member, aa, tt, pp, ww, matrix_rank); member.team_barrier(); /// xx = bb; @@ -85,25 +83,22 @@ struct Functor_TestBatchedTeamVectorQR_WithColumnPivoting { member.team_barrier(); /// xx = Q^{T} xx; - TeamVectorApplyQ::invoke(member, aa, tt, xx, ww); + TeamVectorApplyQ::invoke(member, aa, tt, xx, ww); member.team_barrier(); /// xx = R^{-1} xx - TeamVectorTrsv::invoke(member, one, aa, xx); + TeamVectorTrsv::invoke( + member, one, aa, xx); member.team_barrier(); /// xx = P xx - TeamVectorApplyPivot::invoke( - member, pp, xx); + TeamVectorApplyPivot::invoke(member, pp, xx); member.team_barrier(); } inline void run() { typedef typename MatrixViewType::non_const_value_type value_type; - std::string name_region( - "KokkosBatched::Test::TeamVectorQR_WithColumnPivoting"); + std::string name_region("KokkosBatched::Test::TeamVectorQR_WithColumnPivoting"); const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); @@ -116,8 +111,8 @@ struct Functor_TestBatchedTeamVectorQR_WithColumnPivoting { } }; -template +template void 
impl_test_batched_qr_with_columnpivoting(const int N, const int BlkSize) { typedef typename MatrixViewType::non_const_value_type value_type; typedef Kokkos::ArithTraits ats; @@ -132,15 +127,13 @@ void impl_test_batched_qr_with_columnpivoting(const int N, const int BlkSize) { Kokkos::fence(); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a, random, value_type(1.0)); Kokkos::fence(); - Functor_TestBatchedTeamVectorQR_WithColumnPivoting< - DeviceType, MatrixViewType, VectorViewType, PivotViewType, WorkViewType, - AlgoTagType>(a, x, b, t, p, w) + Functor_TestBatchedTeamVectorQR_WithColumnPivoting(a, x, b, t, p, w) .run(); Kokkos::fence(); @@ -164,48 +157,35 @@ void impl_test_batched_qr_with_columnpivoting(const int N, const int BlkSize) { } } // namespace Test -template +template int test_batched_qr_with_columnpivoting() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - MatrixViewType; - typedef Kokkos::View - VectorViewType; - typedef Kokkos::View - PivotViewType; - typedef Kokkos::View - WorkViewType; - Test::impl_test_batched_qr_with_columnpivoting< - DeviceType, MatrixViewType, VectorViewType, PivotViewType, WorkViewType, - AlgoTagType>(0, 10); + typedef Kokkos::View MatrixViewType; + typedef Kokkos::View VectorViewType; + typedef Kokkos::View PivotViewType; + typedef Kokkos::View WorkViewType; + Test::impl_test_batched_qr_with_columnpivoting(0, 10); for (int i = 1; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::impl_test_batched_qr_with_columnpivoting< - DeviceType, MatrixViewType, VectorViewType, PivotViewType, - WorkViewType, AlgoTagType>(1024, i); + Test::impl_test_batched_qr_with_columnpivoting(1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - MatrixViewType; - typedef Kokkos::View - VectorViewType; - typedef Kokkos::View - PivotViewType; - typedef Kokkos::View - WorkViewType; - 
Test::impl_test_batched_qr_with_columnpivoting< - DeviceType, MatrixViewType, VectorViewType, PivotViewType, WorkViewType, - AlgoTagType>(0, 10); + typedef Kokkos::View MatrixViewType; + typedef Kokkos::View VectorViewType; + typedef Kokkos::View PivotViewType; + typedef Kokkos::View WorkViewType; + Test::impl_test_batched_qr_with_columnpivoting(0, 10); for (int i = 1; i < 10; ++i) { // printf("Testing: LayoutRight, Blksize %d\n", i); - Test::impl_test_batched_qr_with_columnpivoting< - DeviceType, MatrixViewType, VectorViewType, PivotViewType, - WorkViewType, AlgoTagType>(1024, i); + Test::impl_test_batched_qr_with_columnpivoting(1024, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV.hpp index 2f30c7d3c1..fdf482b4ab 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV.hpp @@ -32,8 +32,8 @@ using namespace KokkosBatched; namespace Test { -template +template struct Functor_TestBatchedTeamVectorSolveUTV { using execution_space = typename DeviceType::execution_space; MatrixViewType _r, _a, _acopy, _u, _v; @@ -42,11 +42,9 @@ struct Functor_TestBatchedTeamVectorSolveUTV { WorkViewType _w; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorSolveUTV( - const MatrixViewType &r, const MatrixViewType &a, - const MatrixViewType &acopy, const MatrixViewType &u, - const MatrixViewType &v, const PivViewType &p, const VectorViewType &x, - const VectorViewType &b, const WorkViewType &w) + Functor_TestBatchedTeamVectorSolveUTV(const MatrixViewType &r, const MatrixViewType &a, const MatrixViewType &acopy, + const MatrixViewType &u, const MatrixViewType &v, const PivViewType &p, + const VectorViewType &x, const VectorViewType &b, const WorkViewType &w) : _r(r), _a(a), _acopy(acopy), _u(u), _v(v), _p(p), _x(x), _b(b), _w(w) {} template @@ -72,22 +70,18 @@ struct Functor_TestBatchedTeamVectorSolveUTV { // 
make diagonal dominant and set xx = 1,2,3,4,5 const int m = aa.extent(0), r = rr.extent(1); if (m <= r) { - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), - [&](const int &i) { - aa(i, i) += add_this; - xx(i) = (i + 1); - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int &i) { + aa(i, i) += add_this; + xx(i) = (i + 1); + }); } else { - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m * m), - [=](const int &ij) { - const int i = ij / m, j = ij % m; - value_type tmp(0); - for (int l = 0; l < r; ++l) - tmp += rr(i, l) * rr(j, l); - aa(i, j) = tmp; - }); - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), - [&](const int &i) { xx(i) = (i + 1); }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m * m), [=](const int &ij) { + const int i = ij / m, j = ij % m; + value_type tmp(0); + for (int l = 0; l < r; ++l) tmp += rr(i, l) * rr(j, l); + aa(i, j) = tmp; + }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int &i) { xx(i) = (i + 1); }); } member.team_barrier(); // finish writing aa, xx @@ -95,9 +89,8 @@ struct Functor_TestBatchedTeamVectorSolveUTV { TeamVectorCopy::invoke(member, aa, ac); /// bb = AA*xx - KokkosBlas::TeamVectorGemv::invoke(member, one, aa, - xx, zero, bb); + KokkosBlas::TeamVectorGemv::invoke(member, one, aa, xx, zero, + bb); member.team_barrier(); /// Solving Ax = b using UTV transformation @@ -106,12 +99,10 @@ struct Functor_TestBatchedTeamVectorSolveUTV { /// UTV = A P^T int matrix_rank(0); - TeamVectorUTV::invoke(member, aa, pp, uu, vv, ww, - matrix_rank); + TeamVectorUTV::invoke(member, aa, pp, uu, vv, ww, matrix_rank); member.team_barrier(); - TeamVectorSolveUTV::invoke(member, matrix_rank, uu, - aa, vv, pp, xx, bb, ww); + TeamVectorSolveUTV::invoke(member, matrix_rank, uu, aa, vv, pp, xx, bb, ww); } inline void run() { @@ -129,8 +120,8 @@ struct Functor_TestBatchedTeamVectorSolveUTV { } }; -template +template void impl_test_batched_solve_utv(const int N, const int 
BlkSize) { typedef typename MatrixViewType::non_const_value_type value_type; typedef Kokkos::ArithTraits ats; @@ -148,8 +139,7 @@ void impl_test_batched_solve_utv(const int N, const int BlkSize) { Kokkos::fence(); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); if (BlkSize <= 3) Kokkos::fill_random(a, random, value_type(1.0)); else @@ -157,10 +147,8 @@ void impl_test_batched_solve_utv(const int N, const int BlkSize) { Kokkos::fence(); - Functor_TestBatchedTeamVectorSolveUTV( - r, a, acopy, u, v, p, x, b, w) + Functor_TestBatchedTeamVectorSolveUTV(r, a, acopy, u, v, p, x, b, w) .run(); Kokkos::fence(); @@ -203,48 +191,35 @@ void impl_test_batched_solve_utv(const int N, const int BlkSize) { } } // namespace Test -template +template int test_batched_solve_utv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - MatrixViewType; - typedef Kokkos::View - VectorViewType; - typedef Kokkos::View - PivViewType; - typedef Kokkos::View - WorkViewType; - Test::impl_test_batched_solve_utv MatrixViewType; + typedef Kokkos::View VectorViewType; + typedef Kokkos::View PivViewType; + typedef Kokkos::View WorkViewType; + Test::impl_test_batched_solve_utv(0, 10); for (int i = 1; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::impl_test_batched_solve_utv(1024, i); + Test::impl_test_batched_solve_utv(1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - MatrixViewType; - typedef Kokkos::View - VectorViewType; - typedef Kokkos::View - PivViewType; - typedef Kokkos::View - WorkViewType; - Test::impl_test_batched_solve_utv MatrixViewType; + typedef Kokkos::View VectorViewType; + typedef Kokkos::View PivViewType; + typedef Kokkos::View WorkViewType; + Test::impl_test_batched_solve_utv(0, 10); for (int i = 1; i < 10; ++i) { // printf("Testing: LayoutRight, Blksize %d\n", i); - Test::impl_test_batched_solve_utv(1024, i); + 
Test::impl_test_batched_solve_utv(1024, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2.hpp index cf7084a92c..b38fb318e6 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2.hpp @@ -32,8 +32,8 @@ using namespace KokkosBatched; namespace Test { -template +template struct Functor_TestBatchedTeamVectorSolveUTV2 { using execution_space = typename DeviceType::execution_space; MatrixViewType _r, _a, _acopy, _u, _v; @@ -42,11 +42,9 @@ struct Functor_TestBatchedTeamVectorSolveUTV2 { WorkViewType _w; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorSolveUTV2( - const MatrixViewType &r, const MatrixViewType &a, - const MatrixViewType &acopy, const MatrixViewType &u, - const MatrixViewType &v, const PivViewType &p, const VectorViewType &x, - const VectorViewType &b, const WorkViewType &w) + Functor_TestBatchedTeamVectorSolveUTV2(const MatrixViewType &r, const MatrixViewType &a, const MatrixViewType &acopy, + const MatrixViewType &u, const MatrixViewType &v, const PivViewType &p, + const VectorViewType &x, const VectorViewType &b, const WorkViewType &w) : _r(r), _a(a), _acopy(acopy), _u(u), _v(v), _p(p), _x(x), _b(b), _w(w) {} template @@ -72,24 +70,20 @@ struct Functor_TestBatchedTeamVectorSolveUTV2 { // make diagonal dominant and set xx = 1,2,3,4,5 const int m = aa.extent(0), r = rr.extent(1); if (m <= r) { - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), - [&](const int &i) { - aa(i, i) += add_this; - for (int j = 0; j < 2; ++j) xx(i, j) = (i + 1); - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int &i) { + aa(i, i) += add_this; + for (int j = 0; j < 2; ++j) xx(i, j) = (i + 1); + }); } else { - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m * m), - [=](const int &ij) { - const int i = ij / m, j = ij % m; - value_type tmp(0); - for (int l = 0; l 
< r; ++l) - tmp += rr(i, l) * rr(j, l); - aa(i, j) = tmp; - }); - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), - [&](const int &i) { - for (int j = 0; j < 2; ++j) xx(i, j) = (i + 1); - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m * m), [=](const int &ij) { + const int i = ij / m, j = ij % m; + value_type tmp(0); + for (int l = 0; l < r; ++l) tmp += rr(i, l) * rr(j, l); + aa(i, j) = tmp; + }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int &i) { + for (int j = 0; j < 2; ++j) xx(i, j) = (i + 1); + }); } member.team_barrier(); // finish writing aa, xx @@ -97,11 +91,8 @@ struct Functor_TestBatchedTeamVectorSolveUTV2 { TeamVectorCopy::invoke(member, aa, ac); /// bb = AA*xx - KokkosBatched::TeamVectorGemm::invoke(member, one, - aa, xx, zero, - bb); + KokkosBatched::TeamVectorGemm::invoke( + member, one, aa, xx, zero, bb); member.team_barrier(); /// Solving Ax = b using UTV transformation @@ -110,12 +101,10 @@ struct Functor_TestBatchedTeamVectorSolveUTV2 { /// UTV = A P^T int matrix_rank(0); - TeamVectorUTV::invoke(member, aa, pp, uu, vv, ww, - matrix_rank); + TeamVectorUTV::invoke(member, aa, pp, uu, vv, ww, matrix_rank); member.team_barrier(); - TeamVectorSolveUTV::invoke(member, matrix_rank, uu, - aa, vv, pp, xx, bb, ww); + TeamVectorSolveUTV::invoke(member, matrix_rank, uu, aa, vv, pp, xx, bb, ww); } inline void run() { @@ -133,8 +122,8 @@ struct Functor_TestBatchedTeamVectorSolveUTV2 { } }; -template +template void impl_test_batched_solve_utv2(const int N, const int BlkSize) { typedef typename MatrixViewType::non_const_value_type value_type; typedef Kokkos::ArithTraits ats; @@ -152,8 +141,7 @@ void impl_test_batched_solve_utv2(const int N, const int BlkSize) { Kokkos::fence(); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); if (BlkSize <= 3) Kokkos::fill_random(a, random, value_type(1.0)); else @@ -161,10 +149,8 @@ void impl_test_batched_solve_utv2(const int N, 
const int BlkSize) { Kokkos::fence(); - Functor_TestBatchedTeamVectorSolveUTV2( - r, a, acopy, u, v, p, x, b, w) + Functor_TestBatchedTeamVectorSolveUTV2(r, a, acopy, u, v, p, x, b, w) .run(); Kokkos::fence(); @@ -210,48 +196,35 @@ void impl_test_batched_solve_utv2(const int N, const int BlkSize) { } } // namespace Test -template +template int test_batched_solve_utv2() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - MatrixViewType; - typedef Kokkos::View - VectorViewType; - typedef Kokkos::View - PivViewType; - typedef Kokkos::View - WorkViewType; - Test::impl_test_batched_solve_utv2(0, 10); + typedef Kokkos::View MatrixViewType; + typedef Kokkos::View VectorViewType; + typedef Kokkos::View PivViewType; + typedef Kokkos::View WorkViewType; + Test::impl_test_batched_solve_utv2(0, 10); for (int i = 1; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::impl_test_batched_solve_utv2(1024, i); + Test::impl_test_batched_solve_utv2(1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - MatrixViewType; - typedef Kokkos::View - VectorViewType; - typedef Kokkos::View - PivViewType; - typedef Kokkos::View - WorkViewType; - Test::impl_test_batched_solve_utv2(0, 10); + typedef Kokkos::View MatrixViewType; + typedef Kokkos::View VectorViewType; + typedef Kokkos::View PivViewType; + typedef Kokkos::View WorkViewType; + Test::impl_test_batched_solve_utv2(0, 10); for (int i = 1; i < 10; ++i) { // printf("Testing: LayoutRight, Blksize %d\n", i); - Test::impl_test_batched_solve_utv2(1024, i); + Test::impl_test_batched_solve_utv2(1024, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorUTV.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorUTV.hpp index eb45a70c89..44f6ec394a 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorUTV.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorUTV.hpp @@ -31,8 +31,8 @@ using namespace KokkosBatched; namespace Test { -template 
+template struct Functor_TestBatchedTeamVectorUTV { using execution_space = typename DeviceType::execution_space; MatrixViewType _r, _a, _acopy, _u, _v; @@ -41,11 +41,9 @@ struct Functor_TestBatchedTeamVectorUTV { WorkViewType _w; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorUTV( - const MatrixViewType &r, const MatrixViewType &a, - const MatrixViewType &acopy, const MatrixViewType &u, - const MatrixViewType &v, const PivViewType &p, const VectorViewType &x, - const VectorViewType &b, const WorkViewType &w) + Functor_TestBatchedTeamVectorUTV(const MatrixViewType &r, const MatrixViewType &a, const MatrixViewType &acopy, + const MatrixViewType &u, const MatrixViewType &v, const PivViewType &p, + const VectorViewType &x, const VectorViewType &b, const WorkViewType &w) : _r(r), _a(a), _acopy(acopy), _u(u), _v(v), _p(p), _x(x), _b(b), _w(w) {} template @@ -71,22 +69,18 @@ struct Functor_TestBatchedTeamVectorUTV { // make diagonal dominant and set xx = 1,2,3,4,5 const int m = aa.extent(0), r = rr.extent(1); if (m <= r) { - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), - [&](const int &i) { - aa(i, i) += add_this; - xx(i) = (i + 1); - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int &i) { + aa(i, i) += add_this; + xx(i) = (i + 1); + }); } else { - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m * m), - [=](const int &ij) { - const int i = ij / m, j = ij % m; - value_type tmp(0); - for (int l = 0; l < r; ++l) - tmp += rr(i, l) * rr(j, l); - aa(i, j) = tmp; - }); - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), - [&](const int &i) { xx(i) = (i + 1); }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m * m), [=](const int &ij) { + const int i = ij / m, j = ij % m; + value_type tmp(0); + for (int l = 0; l < r; ++l) tmp += rr(i, l) * rr(j, l); + aa(i, j) = tmp; + }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int &i) { xx(i) = (i + 1); }); } member.team_barrier(); // finish 
writing aa, xx @@ -94,9 +88,8 @@ struct Functor_TestBatchedTeamVectorUTV { TeamVectorCopy::invoke(member, aa, ac); /// bb = AA*xx - KokkosBlas::TeamVectorGemv::invoke(member, one, aa, - xx, zero, bb); + KokkosBlas::TeamVectorGemv::invoke(member, one, aa, xx, zero, + bb); member.team_barrier(); /// Solving Ax = b using UTV transformation @@ -105,46 +98,41 @@ struct Functor_TestBatchedTeamVectorUTV { /// UTV = A P^T int matrix_rank(0); - TeamVectorUTV::invoke(member, aa, pp, uu, vv, ww, - matrix_rank); + TeamVectorUTV::invoke(member, aa, pp, uu, vv, ww, matrix_rank); member.team_barrier(); const auto range_upto_rank = Kokkos::pair(0, matrix_rank); - auto um = Kokkos::subview(uu, Kokkos::ALL(), range_upto_rank); - auto am = Kokkos::subview(aa, range_upto_rank, range_upto_rank); - auto vm = Kokkos::subview(vv, range_upto_rank, Kokkos::ALL()); + auto um = Kokkos::subview(uu, Kokkos::ALL(), range_upto_rank); + auto am = Kokkos::subview(aa, range_upto_rank, range_upto_rank); + auto vm = Kokkos::subview(vv, range_upto_rank, Kokkos::ALL()); if (matrix_rank < m) { /// w = U^T b - KokkosBlas::TeamVectorGemv::invoke(member, one, um, - bb, zero, ww); + KokkosBlas::TeamVectorGemv::invoke(member, one, um, bb, zero, + ww); member.team_barrier(); /// w = T^{-1} w - TeamVectorTrsv::invoke(member, one, am, ww); + TeamVectorTrsv::invoke( + member, one, am, ww); member.team_barrier(); /// x = V^T w - KokkosBlas::TeamVectorGemv::invoke(member, one, vm, - ww, zero, xx); + KokkosBlas::TeamVectorGemv::invoke(member, one, vm, ww, zero, + xx); member.team_barrier(); } else { /// x = U^T b - KokkosBlas::TeamVectorGemv::invoke(member, one, um, - bb, zero, xx); + KokkosBlas::TeamVectorGemv::invoke(member, one, um, bb, zero, + xx); member.team_barrier(); /// x = T^{-1} x - TeamVectorTrsv::invoke(member, one, am, xx); + TeamVectorTrsv::invoke( + member, one, am, xx); member.team_barrier(); } /// x = P^T x - TeamVectorApplyPivot::invoke( - member, pp, xx); + TeamVectorApplyPivot::invoke(member, 
pp, xx); member.team_barrier(); } @@ -163,8 +151,8 @@ struct Functor_TestBatchedTeamVectorUTV { } }; -template +template void impl_test_batched_utv(const int N, const int BlkSize) { typedef typename MatrixViewType::non_const_value_type value_type; typedef Kokkos::ArithTraits ats; @@ -182,8 +170,7 @@ void impl_test_batched_utv(const int N, const int BlkSize) { Kokkos::fence(); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); if (BlkSize <= 3) Kokkos::fill_random(a, random, value_type(1.0)); else @@ -191,8 +178,7 @@ void impl_test_batched_utv(const int N, const int BlkSize) { Kokkos::fence(); - Functor_TestBatchedTeamVectorUTV( + Functor_TestBatchedTeamVectorUTV( r, a, acopy, u, v, p, x, b, w) .run(); @@ -236,46 +222,35 @@ void impl_test_batched_utv(const int N, const int BlkSize) { } } // namespace Test -template +template int test_batched_utv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - MatrixViewType; - typedef Kokkos::View - VectorViewType; - typedef Kokkos::View - PivViewType; - typedef Kokkos::View - WorkViewType; - Test::impl_test_batched_utv(0, 10); + typedef Kokkos::View MatrixViewType; + typedef Kokkos::View VectorViewType; + typedef Kokkos::View PivViewType; + typedef Kokkos::View WorkViewType; + Test::impl_test_batched_utv(0, + 10); for (int i = 1; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::impl_test_batched_utv(1024, - i); + Test::impl_test_batched_utv( + 1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - MatrixViewType; - typedef Kokkos::View - VectorViewType; - typedef Kokkos::View - PivViewType; - typedef Kokkos::View - WorkViewType; - Test::impl_test_batched_utv(0, 10); + typedef Kokkos::View MatrixViewType; + typedef Kokkos::View VectorViewType; + typedef Kokkos::View PivViewType; + typedef Kokkos::View WorkViewType; + Test::impl_test_batched_utv(0, + 10); for (int i = 1; i < 10; ++i) { // 
printf("Testing: LayoutRight, Blksize %d\n", i); - Test::impl_test_batched_utv(1024, - i); + Test::impl_test_batched_utv( + 1024, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_VectorArithmatic.hpp b/batched/dense/unit_test/Test_Batched_VectorArithmatic.hpp index 9d1205717f..654d199117 100644 --- a/batched/dense/unit_test/Test_Batched_VectorArithmatic.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorArithmatic.hpp @@ -21,10 +21,8 @@ // to ensure it is not included in these // backends unit-test -#if !defined(TEST_CUDA_BATCHED_DENSE_CPP) && \ - !defined(TEST_HIP_BATCHED_DENSE_CPP) && \ - !defined(TEST_SYCL_BATCHED_DENSE_CPP) && \ - !defined(TEST_OPENMPTARGET_BATCHED_DENSE_CPP) +#if !defined(TEST_CUDA_BATCHED_DENSE_CPP) && !defined(TEST_HIP_BATCHED_DENSE_CPP) && \ + !defined(TEST_SYCL_BATCHED_DENSE_CPP) && !defined(TEST_OPENMPTARGET_BATCHED_DENSE_CPP) #include "gtest/gtest.h" #include "Kokkos_Core.hpp" @@ -95,132 +93,91 @@ void impl_test_batched_vector_arithmatic() { { /// test : vec + vec c = a + b; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] + b[k]), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] + b[k]), eps * ats::abs(c[k])); /// test : value + vec c = alpha + b; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(alpha + b[k]), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(alpha + b[k]), eps * ats::abs(c[k])); /// test : vec + value c = b + alpha; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(b[k] + alpha), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(b[k] + alpha), eps * ats::abs(c[k])); /// test : vec + mag c = a + beta; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] + beta), - eps * ats::abs(c[k])); + for (int k = 0; k 
< vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] + beta), eps * ats::abs(c[k])); /// test : mag + vec c = beta + a; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(beta + a[k]), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(beta + a[k]), eps * ats::abs(c[k])); } { /// test : vec - vec c = a - b; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] - b[k]), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] - b[k]), eps * ats::abs(c[k])); /// test : value - vec c = alpha - b; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(alpha - b[k]), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(alpha - b[k]), eps * ats::abs(c[k])); /// test : vec + value c = b - alpha; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(b[k] - alpha), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(b[k] - alpha), eps * ats::abs(c[k])); /// test : vec - mag c = a - beta; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] - beta), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] - beta), eps * ats::abs(c[k])); /// test : mag - vec c = beta - a; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(beta - a[k]), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(beta - a[k]), eps * ats::abs(c[k])); } { /// test : vec * vec c = a * b; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] * b[k]), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] * b[k]), eps * 
ats::abs(c[k])); /// test : value * vec c = alpha * b; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(alpha * b[k]), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(alpha * b[k]), eps * ats::abs(c[k])); /// test : vec + value c = b * alpha; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(b[k] * alpha), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(b[k] * alpha), eps * ats::abs(c[k])); /// test : vec * mag c = a * beta; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] * beta), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] * beta), eps * ats::abs(c[k])); /// test : mag * vec c = beta * a; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(beta * a[k]), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(beta * a[k]), eps * ats::abs(c[k])); } { /// test : vec / vec c = a / b; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] / b[k]), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] / b[k]), eps * ats::abs(c[k])); /// test : value / vec c = alpha / b; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(alpha / b[k]), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(alpha / b[k]), eps * ats::abs(c[k])); /// test : vec / value c = b / alpha; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(b[k] / alpha), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(b[k] / alpha), eps * ats::abs(c[k])); /// test : mag / vec c = beta / a; - for (int k = 0; k < 
vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(beta / a[k]), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(beta / a[k]), eps * ats::abs(c[k])); /// test : vec / value c = a / beta; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] / beta), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] / beta), eps * ats::abs(c[k])); } { /// test : vec -vec c = -a; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(-a[k]), eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(-a[k]), eps * ats::abs(c[k])); } #if defined(__DO_NOT_TEST__) { @@ -232,8 +189,7 @@ void impl_test_batched_vector_arithmatic() { c += vector_type(tiny) * vector_type(a >= 0); for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] < 0 ? -tiny : tiny), - eps * ats::abs(c[k])); + EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] < 0 ? 
-tiny : tiny), eps * ats::abs(c[k])); } #endif } @@ -242,18 +198,16 @@ void impl_test_batched_vector_arithmatic() { template int test_batched_vector_arithmatic() { - static_assert( - Kokkos::SpaceAccessibility::accessible, - "vector datatype is only tested on host space"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "vector datatype is only tested on host space"); Test::impl_test_batched_vector_arithmatic(); return 0; } template int test_batched_complex_real_imag_value() { - static_assert( - Kokkos::SpaceAccessibility::accessible, - "vector datatype is only tested on host space"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "vector datatype is only tested on host space"); Test::impl_test_complex_real_imag_value(); return 0; @@ -297,65 +251,53 @@ TEST_F(TestCategory, batched_vector_arithmatic_simd_double8) { #define __DO_NOT_TEST__ #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) TEST_F(TestCategory, batched_vector_arithmatic_simd_scomplex3) { - test_batched_vector_arithmatic >, - 3>(); + test_batched_vector_arithmatic >, 3>(); } // avx TEST_F(TestCategory, batched_vector_arithmatic_simd_scomplex4) { - test_batched_vector_arithmatic >, - 4>(); + test_batched_vector_arithmatic >, 4>(); } // avx 512 TEST_F(TestCategory, batched_vector_arithmatic_simd_scomplex8) { - test_batched_vector_arithmatic >, - 8>(); + test_batched_vector_arithmatic >, 8>(); } TEST_F(TestCategory, batched_vector_scomplex_real_imag_value3) { - test_batched_complex_real_imag_value >, 3>(); + test_batched_complex_real_imag_value >, 3>(); } // avx TEST_F(TestCategory, batched_vector_scomplex_real_imag_value2) { - test_batched_complex_real_imag_value >, 2>(); + test_batched_complex_real_imag_value >, 2>(); } // avx 512 TEST_F(TestCategory, batched_vector_scomplex_real_imag_value4) { - test_batched_complex_real_imag_value >, 4>(); + test_batched_complex_real_imag_value >, 4>(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, 
batched_vector_arithmatic_simd_dcomplex3) { - test_batched_vector_arithmatic >, - 3>(); + test_batched_vector_arithmatic >, 3>(); } // avx TEST_F(TestCategory, batched_vector_arithmatic_simd_dcomplex2) { - test_batched_vector_arithmatic >, - 2>(); + test_batched_vector_arithmatic >, 2>(); } // avx 512 TEST_F(TestCategory, batched_vector_arithmatic_simd_dcomplex4) { - test_batched_vector_arithmatic >, - 4>(); + test_batched_vector_arithmatic >, 4>(); } TEST_F(TestCategory, batched_vector_dcomplex_real_imag_value3) { - test_batched_complex_real_imag_value >, 3>(); + test_batched_complex_real_imag_value >, 3>(); } // avx TEST_F(TestCategory, batched_vector_dcomplex_real_imag_value2) { - test_batched_complex_real_imag_value >, 2>(); + test_batched_complex_real_imag_value >, 2>(); } // avx 512 TEST_F(TestCategory, batched_vector_dcomplex_real_imag_value4) { - test_batched_complex_real_imag_value >, 4>(); + test_batched_complex_real_imag_value >, 4>(); } #endif #undef __DO_NOT_TEST__ diff --git a/batched/dense/unit_test/Test_Batched_VectorLogical.hpp b/batched/dense/unit_test/Test_Batched_VectorLogical.hpp index 5ab10bb5bd..0427982a42 100644 --- a/batched/dense/unit_test/Test_Batched_VectorLogical.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorLogical.hpp @@ -21,10 +21,8 @@ // to ensure it is not included in these // backends unit-test -#if !defined(TEST_CUDA_BATCHED_DENSE_CPP) && \ - !defined(TEST_HIP_BATCHED_DENSE_CPP) && \ - !defined(TEST_SYCL_BATCHED_DENSE_CPP) && \ - !defined(TEST_OPENMPTARGET_BATCHED_DENSE_CPP) +#if !defined(TEST_CUDA_BATCHED_DENSE_CPP) && !defined(TEST_HIP_BATCHED_DENSE_CPP) && \ + !defined(TEST_SYCL_BATCHED_DENSE_CPP) && !defined(TEST_OPENMPTARGET_BATCHED_DENSE_CPP) #include "gtest/gtest.h" #include "Kokkos_Core.hpp" @@ -59,33 +57,30 @@ void impl_test_batched_vector_logical() { { #undef CHECK -#define CHECK(op) \ - { \ - const auto comparison = a op b; \ - for (int i = 0; i < vector_length; ++i) \ - EXPECT_EQ(comparison[i], a[i] op b[i]); \ 
+#define CHECK(op) \ + { \ + const auto comparison = a op b; \ + for (int i = 0; i < vector_length; ++i) EXPECT_EQ(comparison[i], a[i] op b[i]); \ } CHECK(||); CHECK(&&); #undef CHECK -#define CHECK(op) \ - { \ - const auto comparison = a op 0; \ - for (int i = 0; i < vector_length; ++i) \ - EXPECT_EQ(comparison[i], a[i] op 0); \ +#define CHECK(op) \ + { \ + const auto comparison = a op 0; \ + for (int i = 0; i < vector_length; ++i) EXPECT_EQ(comparison[i], a[i] op 0); \ } CHECK(||); CHECK(&&); #undef CHECK -#define CHECK(op) \ - { \ - const auto comparison = 0 op b; \ - for (int i = 0; i < vector_length; ++i) \ - EXPECT_EQ(comparison[i], 0 op b[i]); \ +#define CHECK(op) \ + { \ + const auto comparison = 0 op b; \ + for (int i = 0; i < vector_length; ++i) EXPECT_EQ(comparison[i], 0 op b[i]); \ } CHECK(||); @@ -100,9 +95,8 @@ void impl_test_batched_vector_logical() { template int test_batched_vector_logical() { - static_assert( - Kokkos::SpaceAccessibility::accessible, - "vector datatype is only tested on host space"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "vector datatype is only tested on host space"); Test::impl_test_batched_vector_logical(); return 0; @@ -113,21 +107,13 @@ int test_batched_vector_logical() { /// #if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, batched_vector_logical_simd_float3) { - test_batched_vector_logical(); -} -TEST_F(TestCategory, batched_vector_logical_simd_float8) { - test_batched_vector_logical(); -} +TEST_F(TestCategory, batched_vector_logical_simd_float3) { test_batched_vector_logical(); } +TEST_F(TestCategory, batched_vector_logical_simd_float8) { test_batched_vector_logical(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, batched_vector_logical_simd_double3) { - test_batched_vector_logical(); -} -TEST_F(TestCategory, batched_vector_logical_simd_double4) { - test_batched_vector_logical(); -} +TEST_F(TestCategory, batched_vector_logical_simd_double3) { 
test_batched_vector_logical(); } +TEST_F(TestCategory, batched_vector_logical_simd_double4) { test_batched_vector_logical(); } #endif // #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) diff --git a/batched/dense/unit_test/Test_Batched_VectorMath.hpp b/batched/dense/unit_test/Test_Batched_VectorMath.hpp index 02c943d587..2cd9f02a49 100644 --- a/batched/dense/unit_test/Test_Batched_VectorMath.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorMath.hpp @@ -21,10 +21,8 @@ // to ensure it is not included in these // backends unit-test -#if !defined(TEST_CUDA_BATCHED_DENSE_CPP) && \ - !defined(TEST_HIP_BATCHED_DENSE_CPP) && \ - !defined(TEST_SYCL_BATCHED_DENSE_CPP) && \ - !defined(TEST_OPENMPTARGET_BATCHED_DENSE_CPP) +#if !defined(TEST_CUDA_BATCHED_DENSE_CPP) && !defined(TEST_HIP_BATCHED_DENSE_CPP) && \ + !defined(TEST_SYCL_BATCHED_DENSE_CPP) && !defined(TEST_OPENMPTARGET_BATCHED_DENSE_CPP) #include "gtest/gtest.h" #include "Kokkos_Core.hpp" @@ -67,11 +65,10 @@ void impl_test_batched_vector_math() { { #undef CHECK -#define CHECK(op) \ - { \ - a = op(aref); \ - for (int i = 0; i < vector_length; ++i) \ - EXPECT_NEAR_KK(a[i], ats::op(aref[i]), eps* a[i]); \ +#define CHECK(op) \ + { \ + a = op(aref); \ + for (int i = 0; i < vector_length; ++i) EXPECT_NEAR_KK(a[i], ats::op(aref[i]), eps* a[i]); \ } CHECK(sqrt); @@ -89,32 +86,29 @@ void impl_test_batched_vector_math() { CHECK(atan); #undef CHECK -#define CHECK \ - { \ - a = pow(aref, bref); \ - for (int i = 0; i < vector_length; ++i) \ - EXPECT_NEAR_KK(a[i], ats::pow(aref[i], bref[i]), eps* a[i]); \ - } \ +#define CHECK \ + { \ + a = pow(aref, bref); \ + for (int i = 0; i < vector_length; ++i) EXPECT_NEAR_KK(a[i], ats::pow(aref[i], bref[i]), eps* a[i]); \ + } \ CHECK; #undef CHECK -#define CHECK(op) \ - { \ - mag_type beta = mag_type(3.2); \ - a = op(aref, beta); \ - for (int i = 0; i < vector_length; ++i) \ - EXPECT_NEAR_KK(a[i], ats::op(aref[i], beta), eps* a[i]); \ +#define CHECK(op) \ + { \ + mag_type beta = 
mag_type(3.2); \ + a = op(aref, beta); \ + for (int i = 0; i < vector_length; ++i) EXPECT_NEAR_KK(a[i], ats::op(aref[i], beta), eps* a[i]); \ } CHECK(pow); #undef CHECK -#define CHECK(op) \ - { \ - value_type alpha = random.value() + 2.0; \ - a = op(alpha, bref); \ - for (int i = 0; i < vector_length; ++i) \ - EXPECT_NEAR_KK(a[i], ats::op(alpha, bref[i]), eps* a[i]); \ +#define CHECK(op) \ + { \ + value_type alpha = random.value() + 2.0; \ + a = op(alpha, bref); \ + for (int i = 0; i < vector_length; ++i) EXPECT_NEAR_KK(a[i], ats::op(alpha, bref[i]), eps* a[i]); \ } CHECK(pow); @@ -126,9 +120,8 @@ void impl_test_batched_vector_math() { template int test_batched_vector_math() { - static_assert( - Kokkos::SpaceAccessibility::accessible, - "vector datatype is only tested on host space"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "vector datatype is only tested on host space"); Test::impl_test_batched_vector_math(); return 0; @@ -156,21 +149,13 @@ int test_batched_vector_math() { /// #if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, batched_vector_math_simd_float3) { - test_batched_vector_math, 3>(); -} -TEST_F(TestCategory, batched_vector_math_simd_float8) { - test_batched_vector_math, 8>(); -} +TEST_F(TestCategory, batched_vector_math_simd_float3) { test_batched_vector_math, 3>(); } +TEST_F(TestCategory, batched_vector_math_simd_float8) { test_batched_vector_math, 8>(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, batched_vector_math_simd_double3) { - test_batched_vector_math, 3>(); -} -TEST_F(TestCategory, batched_vector_math_simd_double4) { - test_batched_vector_math, 4>(); -} +TEST_F(TestCategory, batched_vector_math_simd_double3) { test_batched_vector_math, 3>(); } +TEST_F(TestCategory, batched_vector_math_simd_double4) { test_batched_vector_math, 4>(); } #endif // using namespace Test; diff --git a/batched/dense/unit_test/Test_Batched_VectorMisc.hpp b/batched/dense/unit_test/Test_Batched_VectorMisc.hpp index 
5f176ccba8..98d7f4e87e 100644 --- a/batched/dense/unit_test/Test_Batched_VectorMisc.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorMisc.hpp @@ -21,10 +21,8 @@ // to ensure it is not included in these // backends unit-test -#if !defined(TEST_CUDA_BATCHED_DENSE_CPP) && \ - !defined(TEST_HIP_BATCHED_DENSE_CPP) && \ - !defined(TEST_SYCL_BATCHED_DENSE_CPP) && \ - !defined(TEST_OPENMPTARGET_BATCHED_DENSE_CPP) +#if !defined(TEST_CUDA_BATCHED_DENSE_CPP) && !defined(TEST_HIP_BATCHED_DENSE_CPP) && \ + !defined(TEST_SYCL_BATCHED_DENSE_CPP) && !defined(TEST_OPENMPTARGET_BATCHED_DENSE_CPP) #include "gtest/gtest.h" #include "Kokkos_Core.hpp" @@ -159,9 +157,8 @@ void impl_test_batched_vector_misc() { template int test_batched_vector_misc() { - static_assert( - Kokkos::SpaceAccessibility::accessible, - "vector datatype is only tested on host space"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "vector datatype is only tested on host space"); Test::impl_test_batched_vector_misc(); return 0; @@ -172,21 +169,13 @@ int test_batched_vector_misc() { /// #if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, batched_vector_misc_simd_float3) { - test_batched_vector_misc, 3>(); -} -TEST_F(TestCategory, batched_vector_misc_simd_float8) { - test_batched_vector_misc, 8>(); -} +TEST_F(TestCategory, batched_vector_misc_simd_float3) { test_batched_vector_misc, 3>(); } +TEST_F(TestCategory, batched_vector_misc_simd_float8) { test_batched_vector_misc, 8>(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, batched_vector_misc_simd_double3) { - test_batched_vector_misc, 3>(); -} -TEST_F(TestCategory, batched_vector_misc_simd_double4) { - test_batched_vector_misc, 4>(); -} +TEST_F(TestCategory, batched_vector_misc_simd_double3) { test_batched_vector_misc, 3>(); } +TEST_F(TestCategory, batched_vector_misc_simd_double4) { test_batched_vector_misc, 4>(); } #endif // #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) diff --git 
a/batched/dense/unit_test/Test_Batched_VectorRelation.hpp b/batched/dense/unit_test/Test_Batched_VectorRelation.hpp index 1aff1b2d0f..e5c3139c5c 100644 --- a/batched/dense/unit_test/Test_Batched_VectorRelation.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorRelation.hpp @@ -21,10 +21,8 @@ // to ensure it is not included in these // backends unit-test -#if !defined(TEST_CUDA_BATCHED_DENSE_CPP) && \ - !defined(TEST_HIP_BATCHED_DENSE_CPP) && \ - !defined(TEST_SYCL_BATCHED_DENSE_CPP) && \ - !defined(TEST_OPENMPTARGET_BATCHED_DENSE_CPP) +#if !defined(TEST_CUDA_BATCHED_DENSE_CPP) && !defined(TEST_HIP_BATCHED_DENSE_CPP) && \ + !defined(TEST_SYCL_BATCHED_DENSE_CPP) && !defined(TEST_OPENMPTARGET_BATCHED_DENSE_CPP) #include "gtest/gtest.h" #include "Kokkos_Core.hpp" @@ -60,11 +58,10 @@ void impl_test_batched_vector_relation() { { #undef CHECK -#define CHECK(op) \ - { \ - const auto comparison = a op b; \ - for (int i = 0; i < vector_length; ++i) \ - EXPECT_EQ(comparison[i], a[i] op b[i]); \ +#define CHECK(op) \ + { \ + const auto comparison = a op b; \ + for (int i = 0; i < vector_length; ++i) EXPECT_EQ(comparison[i], a[i] op b[i]); \ } CHECK(<); @@ -75,11 +72,10 @@ void impl_test_batched_vector_relation() { CHECK(!=); #undef CHECK -#define CHECK(op) \ - { \ - const auto comparison = a op value_type(0); \ - for (int i = 0; i < vector_length; ++i) \ - EXPECT_EQ(comparison[i], a[i] op value_type(0)); \ +#define CHECK(op) \ + { \ + const auto comparison = a op value_type(0); \ + for (int i = 0; i < vector_length; ++i) EXPECT_EQ(comparison[i], a[i] op value_type(0)); \ } CHECK(<); @@ -90,11 +86,10 @@ void impl_test_batched_vector_relation() { CHECK(!=); #undef CHECK -#define CHECK(op) \ - { \ - const auto comparison = value_type(0) op b; \ - for (int i = 0; i < vector_length; ++i) \ - EXPECT_EQ(comparison[i], value_type(0) op b[i]); \ +#define CHECK(op) \ + { \ + const auto comparison = value_type(0) op b; \ + for (int i = 0; i < vector_length; ++i) EXPECT_EQ(comparison[i], 
value_type(0) op b[i]); \ } CHECK(<); @@ -113,9 +108,8 @@ void impl_test_batched_vector_relation() { template int test_batched_vector_relation() { - static_assert( - Kokkos::SpaceAccessibility::accessible, - "vector datatype is only tested on host space"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "vector datatype is only tested on host space"); Test::impl_test_batched_vector_relation(); return 0; diff --git a/batched/dense/unit_test/Test_Batched_VectorView.hpp b/batched/dense/unit_test/Test_Batched_VectorView.hpp index 74c7748cba..5d9047e57c 100644 --- a/batched/dense/unit_test/Test_Batched_VectorView.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorView.hpp @@ -21,10 +21,8 @@ // to ensure it is not included in these // backends unit-test -#if !defined(TEST_CUDA_BATCHED_DENSE_CPP) && \ - !defined(TEST_HIP_BATCHED_DENSE_CPP) && \ - !defined(TEST_SYCL_BATCHED_DENSE_CPP) && \ - !defined(TEST_OPENMPTARGET_BATCHED_DENSE_CPP) +#if !defined(TEST_CUDA_BATCHED_DENSE_CPP) && !defined(TEST_HIP_BATCHED_DENSE_CPP) && \ + !defined(TEST_SYCL_BATCHED_DENSE_CPP) && !defined(TEST_OPENMPTARGET_BATCHED_DENSE_CPP) #include "gtest/gtest.h" #include "Kokkos_Core.hpp" @@ -62,100 +60,76 @@ void impl_init_vector_view(const VectorViewType& a) { for (int i7 = 0, i7end = b.extent(7); i7 < i7end; ++i7) template -void impl_verify_vector_view( - const VectorViewType& a, - const SimdViewAccess >& b) { +void impl_verify_vector_view(const VectorViewType& a, const SimdViewAccess >& b) { typedef typename VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP - EXPECT_NEAR_KK(a.access(i0 / vl, i1, i2, i3, i4, i5, i6, i7)[i0 % vl], - b(i0, i1, i2, i3, i4, i5, i6, i7), eps); + EXPECT_NEAR_KK(a.access(i0 / vl, i1, i2, i3, i4, i5, i6, i7)[i0 % vl], b(i0, i1, i2, i3, i4, i5, i6, i7), eps); } template -void impl_verify_vector_view( - const VectorViewType& 
a, - const SimdViewAccess >& b) { +void impl_verify_vector_view(const VectorViewType& a, const SimdViewAccess >& b) { typedef typename VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP - EXPECT_NEAR_KK(a.access(i0, i1 / vl, i2, i3, i4, i5, i6, i7)[i1 % vl], - b(i0, i1, i2, i3, i4, i5, i6, i7), eps); + EXPECT_NEAR_KK(a.access(i0, i1 / vl, i2, i3, i4, i5, i6, i7)[i1 % vl], b(i0, i1, i2, i3, i4, i5, i6, i7), eps); } template -void impl_verify_vector_view( - const VectorViewType& a, - const SimdViewAccess >& b) { +void impl_verify_vector_view(const VectorViewType& a, const SimdViewAccess >& b) { typedef typename VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP - EXPECT_NEAR_KK(a.access(i0, i1, i2 / vl, i3, i4, i5, i6, i7)[i2 % vl], - b(i0, i1, i2, i3, i4, i5, i6, i7), eps); + EXPECT_NEAR_KK(a.access(i0, i1, i2 / vl, i3, i4, i5, i6, i7)[i2 % vl], b(i0, i1, i2, i3, i4, i5, i6, i7), eps); } template -void impl_verify_vector_view( - const VectorViewType& a, - const SimdViewAccess >& b) { +void impl_verify_vector_view(const VectorViewType& a, const SimdViewAccess >& b) { typedef typename VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP - EXPECT_NEAR_KK(a.access(i0, i1, i2, i3 / vl, i4, i5, i6, i7)[i3 % vl], - b(i0, i1, i2, i3, i4, i5, i6, i7), eps); + EXPECT_NEAR_KK(a.access(i0, i1, i2, i3 / vl, i4, i5, i6, i7)[i3 % vl], b(i0, i1, i2, i3, i4, i5, i6, i7), eps); } template -void impl_verify_vector_view( - const VectorViewType& a, - const SimdViewAccess >& b) { +void impl_verify_vector_view(const VectorViewType& a, const SimdViewAccess >& b) { typedef typename 
VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP - EXPECT_NEAR_KK(a.access(i0, i1, i2, i3, i4 / vl, i5, i6, i7)[i4 % vl], - b(i0, i1, i2, i3, i4, i5, i6, i7), eps); + EXPECT_NEAR_KK(a.access(i0, i1, i2, i3, i4 / vl, i5, i6, i7)[i4 % vl], b(i0, i1, i2, i3, i4, i5, i6, i7), eps); } template -void impl_verify_vector_view( - const VectorViewType& a, - const SimdViewAccess >& b) { +void impl_verify_vector_view(const VectorViewType& a, const SimdViewAccess >& b) { typedef typename VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP - EXPECT_NEAR_KK(a.access(i0, i1, i2, i3, i4, i5 / vl, i6, i7)[i5 % vl], - b(i0, i1, i2, i3, i4, i5, i6, i7), eps); + EXPECT_NEAR_KK(a.access(i0, i1, i2, i3, i4, i5 / vl, i6, i7)[i5 % vl], b(i0, i1, i2, i3, i4, i5, i6, i7), eps); } template -void impl_verify_vector_view( - const VectorViewType& a, - const SimdViewAccess >& b) { +void impl_verify_vector_view(const VectorViewType& a, const SimdViewAccess >& b) { typedef typename VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP - EXPECT_NEAR_KK(a.access(i0, i1, i2, i3, i4, i5, i6 / vl, i7)[i6 % vl], - b(i0, i1, i2, i3, i4, i5, i6, i7), eps); + EXPECT_NEAR_KK(a.access(i0, i1, i2, i3, i4, i5, i6 / vl, i7)[i6 % vl], b(i0, i1, i2, i3, i4, i5, i6, i7), eps); } template -void impl_verify_vector_view( - const VectorViewType& a, - const SimdViewAccess >& b) { +void impl_verify_vector_view(const VectorViewType& a, const SimdViewAccess >& b) { typedef typename VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; typedef Kokkos::ArithTraits ats; const typename 
ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP - EXPECT_NEAR_KK(a.access(i0, i1, i2, i3, i4, i5, i6, i7 / vl)[i7 % vl], - b(i0, i1, i2, i3, i4, i5, i6, i7), eps); + EXPECT_NEAR_KK(a.access(i0, i1, i2, i3, i4, i5, i6, i7 / vl)[i7 % vl], b(i0, i1, i2, i3, i4, i5, i6, i7), eps); } template @@ -169,183 +143,90 @@ void impl_test_batched_vector_view() { { /// rank 1 array Kokkos::View a("a", test_view_size); impl_init_vector_view(a); - impl_verify_vector_view( - a, - SimdViewAccess, PackDim<0> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<0> >(a)); } { /// rank 2 array - Kokkos::View a("a", test_view_size, - test_view_size); + Kokkos::View a("a", test_view_size, test_view_size); impl_init_vector_view(a); - impl_verify_vector_view( - a, SimdViewAccess, PackDim<0> >( - a)); - impl_verify_vector_view( - a, SimdViewAccess, PackDim<1> >( - a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<0> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<1> >(a)); } { /// rank 3 array - Kokkos::View a("a", test_view_size, - test_view_size, test_view_size); + Kokkos::View a("a", test_view_size, test_view_size, test_view_size); impl_init_vector_view(a); - impl_verify_vector_view( - a, - SimdViewAccess, PackDim<0> >( - a)); - impl_verify_vector_view( - a, - SimdViewAccess, PackDim<1> >( - a)); - impl_verify_vector_view( - a, - SimdViewAccess, PackDim<2> >( - a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<0> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<1> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<2> >(a)); } { /// rank 4 array - Kokkos::View a( - "a", test_view_size, test_view_size, test_view_size, test_view_size); + Kokkos::View a("a", test_view_size, test_view_size, test_view_size, test_view_size); impl_init_vector_view(a); - impl_verify_vector_view( - a, - SimdViewAccess, PackDim<0> >( - a)); - impl_verify_vector_view( - a, - SimdViewAccess, PackDim<1> >( - a)); - impl_verify_vector_view( - a, - 
SimdViewAccess, PackDim<2> >( - a)); - impl_verify_vector_view( - a, - SimdViewAccess, PackDim<3> >( - a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<0> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<1> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<2> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<3> >(a)); } { /// rank 5 array - Kokkos::View a( - "a", test_view_size, test_view_size, test_view_size, test_view_size, - test_view_size); + Kokkos::View a("a", test_view_size, test_view_size, test_view_size, test_view_size, + test_view_size); impl_init_vector_view(a); - impl_verify_vector_view( - a, - SimdViewAccess, PackDim<0> >( - a)); - impl_verify_vector_view( - a, - SimdViewAccess, PackDim<1> >( - a)); - impl_verify_vector_view( - a, - SimdViewAccess, PackDim<2> >( - a)); - impl_verify_vector_view( - a, - SimdViewAccess, PackDim<3> >( - a)); - impl_verify_vector_view( - a, - SimdViewAccess, PackDim<4> >( - a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<0> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<1> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<2> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<3> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<4> >(a)); } { /// rank 6 array - Kokkos::View a( - "a", test_view_size, test_view_size, test_view_size, test_view_size, - test_view_size, test_view_size); + Kokkos::View a("a", test_view_size, test_view_size, test_view_size, test_view_size, + test_view_size, test_view_size); impl_init_vector_view(a); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<0> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<1> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<2> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<3> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<4> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<5> >(a)); + 
impl_verify_vector_view(a, SimdViewAccess, PackDim<0> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<1> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<2> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<3> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<4> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<5> >(a)); } { /// rank 7 array - Kokkos::View a( - "a", test_view_size, test_view_size, test_view_size, test_view_size, - test_view_size, test_view_size, test_view_size); + Kokkos::View a("a", test_view_size, test_view_size, test_view_size, test_view_size, + test_view_size, test_view_size, test_view_size); impl_init_vector_view(a); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<0> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<1> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<2> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<3> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<4> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<5> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<6> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<0> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<1> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<2> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<3> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<4> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<5> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<6> >(a)); } { /// rank 8 array - Kokkos::View a( - "a", test_view_size, test_view_size, test_view_size, test_view_size, - test_view_size, test_view_size, test_view_size, test_view_size); + Kokkos::View a("a", test_view_size, test_view_size, test_view_size, test_view_size, + test_view_size, test_view_size, test_view_size, test_view_size); impl_init_vector_view(a); - impl_verify_vector_view( 
- a, SimdViewAccess, - PackDim<0> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<1> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<2> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<3> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<4> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<5> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<6> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<7> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<0> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<1> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<2> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<3> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<4> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<5> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<6> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<7> >(a)); } } } // namespace Test template int test_batched_vector_view() { - static_assert( - Kokkos::SpaceAccessibility::accessible, - "vector datatype is only tested on host space"); - Test::impl_test_batched_vector_view(); + static_assert(Kokkos::SpaceAccessibility::accessible, + "vector datatype is only tested on host space"); + Test::impl_test_batched_vector_view(); return 0; } @@ -355,18 +236,12 @@ int test_batched_vector_view() { /// #if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, batched_vector_view_simd_float8) { - test_batched_vector_view, 8>(); -} +TEST_F(TestCategory, batched_vector_view_simd_float8) { test_batched_vector_view, 8>(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, batched_vector_view_simd_double4) { - test_batched_vector_view, 4>(); -} -TEST_F(TestCategory, batched_vector_view_simd_double8) { - test_batched_vector_view, 8>(); -} +TEST_F(TestCategory, batched_vector_view_simd_double4) { test_batched_vector_view, 
4>(); } +TEST_F(TestCategory, batched_vector_view_simd_double8) { test_batched_vector_view, 8>(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) @@ -383,8 +258,7 @@ TEST_F(TestCategory, batched_vector_view_simd_dcomplex2) { test_batched_vector_view >, 2>(); } -#if defined(KOKKOS_COMPILER_INTEL) && \ - ((KOKKOS_COMPILER_INTEL > 1900) && (KOKKOS_COMPILER_INTEL <= 2021)) +#if defined(KOKKOS_COMPILER_INTEL) && ((KOKKOS_COMPILER_INTEL > 1900) && (KOKKOS_COMPILER_INTEL <= 2021)) TEST_F(TestCategory, batched_vector_view_simd_dcomplex4) { printf( "Skipped: intel compiler version > 19.0.05 && <= 2021\n" diff --git a/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp b/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp index c11ad96959..9aa4b95f2c 100644 --- a/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp @@ -35,16 +35,14 @@ namespace KokkosBatched { /// template -template -KOKKOS_INLINE_FUNCTION int TeamVectorCG::invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const KrylovHandleType& handle, - const TMPViewType& _TMPView, const TMPNormViewType& _TMPNormView) { +KOKKOS_INLINE_FUNCTION int TeamVectorCG::invoke(const MemberType& member, const OperatorType& A, + const VectorViewType& _B, const VectorViewType& _X, + const KrylovHandleType& handle, const TMPViewType& _TMPView, + const TMPNormViewType& _TMPNormView) { typedef int OrdinalType; - typedef typename Kokkos::ArithTraits< - typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; + typedef typename Kokkos::ArithTraits::mag_type MagnitudeType; const size_t maximum_iteration = handle.get_max_iteration(); const MagnitudeType tolerance = handle.get_tolerance(); @@ -59,14 +57,10 @@ KOKKOS_INLINE_FUNCTION int TeamVectorCG::invoke( int offset_R = offset_Q + numRows; int offset_X = offset_R + numRows; - auto P = Kokkos::subview(_TMPView, Kokkos::ALL, - 
Kokkos::make_pair(offset_P, offset_P + numRows)); - auto Q = Kokkos::subview(_TMPView, Kokkos::ALL, - Kokkos::make_pair(offset_Q, offset_Q + numRows)); - auto R = Kokkos::subview(_TMPView, Kokkos::ALL, - Kokkos::make_pair(offset_R, offset_R + numRows)); - auto X = Kokkos::subview(_TMPView, Kokkos::ALL, - Kokkos::make_pair(offset_X, offset_X + numRows)); + auto P = Kokkos::subview(_TMPView, Kokkos::ALL, Kokkos::make_pair(offset_P, offset_P + numRows)); + auto Q = Kokkos::subview(_TMPView, Kokkos::ALL, Kokkos::make_pair(offset_Q, offset_Q + numRows)); + auto R = Kokkos::subview(_TMPView, Kokkos::ALL, Kokkos::make_pair(offset_R, offset_R + numRows)); + auto X = Kokkos::subview(_TMPView, Kokkos::ALL, Kokkos::make_pair(offset_X, offset_X + numRows)); auto sqr_norm_0 = Kokkos::subview(_TMPNormView, Kokkos::ALL, 0); auto sqr_norm_j = Kokkos::subview(_TMPNormView, Kokkos::ALL, 1); @@ -90,10 +84,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorCG::invoke( member.team_barrier(); Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - mask(i) = - sqr_norm_0(i) > tolerance * tolerance ? 1. : 0; - }); + [&](const OrdinalType& i) { mask(i) = sqr_norm_0(i) > tolerance * tolerance ? 1. : 0; }); TeamVectorCopy1D::invoke(member, sqr_norm_0, sqr_norm_j); @@ -109,10 +100,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorCG::invoke( member.team_barrier(); Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - alpha(i) = - mask(i) != 0. ? sqr_norm_j(i) / tmp(i) : 0.; - }); + [&](const OrdinalType& i) { alpha(i) = mask(i) != 0. ? sqr_norm_j(i) / tmp(i) : 0.; }); member.team_barrier(); // x_{j+1} := alpha p_j + x_j @@ -131,10 +119,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorCG::invoke( member.team_barrier(); Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - alpha(i) = - mask(i) != 0. ? 
tmp(i) / sqr_norm_j(i) : 0.; - }); + [&](const OrdinalType& i) { alpha(i) = mask(i) != 0. ? tmp(i) / sqr_norm_j(i) : 0.; }); TeamVectorCopy1D::invoke(member, tmp, sqr_norm_j); @@ -167,55 +152,43 @@ KOKKOS_INLINE_FUNCTION int TeamVectorCG::invoke( } template -template -KOKKOS_INLINE_FUNCTION int TeamVectorCG::invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const KrylovHandleType& handle) { +template +KOKKOS_INLINE_FUNCTION int TeamVectorCG::invoke(const MemberType& member, const OperatorType& A, + const VectorViewType& _B, const VectorViewType& _X, + const KrylovHandleType& handle) { const int strategy = handle.get_memory_strategy(); if (strategy == 0) { - using ScratchPadVectorViewType = Kokkos::View< - typename VectorViewType::non_const_value_type**, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; - using ScratchPadNormViewType = Kokkos::View< - typename Kokkos::ArithTraits< - typename VectorViewType::non_const_value_type>::mag_type**, - typename VectorViewType::execution_space::scratch_memory_space>; + using ScratchPadVectorViewType = + Kokkos::View; + using ScratchPadNormViewType = + Kokkos::View::mag_type**, + typename VectorViewType::execution_space::scratch_memory_space>; const int numMatrices = _X.extent(0); const int numRows = _X.extent(1); - ScratchPadVectorViewType _TMPView( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - 4 * numRows); + ScratchPadVectorViewType _TMPView(member.team_scratch(handle.get_scratch_pad_level()), numMatrices, 4 * numRows); - ScratchPadNormViewType _TMPNormView( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, 5); + ScratchPadNormViewType _TMPNormView(member.team_scratch(handle.get_scratch_pad_level()), numMatrices, 5); - return invoke( - member, A, _B, _X, handle, _TMPView, _TMPNormView); + return invoke(member, A, _B, _X, handle, _TMPView, _TMPNormView); } if 
(strategy == 1) { const int first_matrix = handle.first_index(member.league_rank()); const int last_matrix = handle.last_index(member.league_rank()); - using ScratchPadNormViewType = Kokkos::View< - typename Kokkos::ArithTraits< - typename VectorViewType::non_const_value_type>::mag_type**, - typename VectorViewType::execution_space::scratch_memory_space>; + using ScratchPadNormViewType = + Kokkos::View::mag_type**, + typename VectorViewType::execution_space::scratch_memory_space>; const int numMatrices = _X.extent(0); - auto _TMPView = Kokkos::subview( - handle.tmp_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + auto _TMPView = Kokkos::subview(handle.tmp_view, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); - ScratchPadNormViewType _TMPNormView( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, 5); + ScratchPadNormViewType _TMPNormView(member.team_scratch(handle.get_scratch_pad_level()), numMatrices, 5); - return invoke( - member, A, _B, _X, handle, _TMPView, _TMPNormView); + return invoke(member, A, _B, _X, handle, _TMPView, _TMPNormView); } return 0; } diff --git a/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp b/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp index bf2f1d2e86..82c62624c1 100644 --- a/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp @@ -34,15 +34,14 @@ namespace KokkosBatched { /// template -template -KOKKOS_INLINE_FUNCTION int TeamCG::invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const KrylovHandle& handle, - const TMPViewType& _TMPView, const TMPNormViewType& _TMPNormView) { +template +KOKKOS_INLINE_FUNCTION int TeamCG::invoke(const MemberType& member, const OperatorType& A, + const VectorViewType& _B, const VectorViewType& _X, + const KrylovHandle& handle, const TMPViewType& _TMPView, + const TMPNormViewType& _TMPNormView) { typedef int OrdinalType; - typedef 
typename Kokkos::ArithTraits< - typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; + typedef typename Kokkos::ArithTraits::mag_type MagnitudeType; size_t maximum_iteration = handle.get_max_iteration(); const MagnitudeType tolerance = handle.get_tolerance(); @@ -57,14 +56,10 @@ KOKKOS_INLINE_FUNCTION int TeamCG::invoke( int offset_R = offset_Q + numRows; int offset_X = offset_R + numRows; - auto P = Kokkos::subview(_TMPView, Kokkos::ALL, - Kokkos::make_pair(offset_P, offset_P + numRows)); - auto Q = Kokkos::subview(_TMPView, Kokkos::ALL, - Kokkos::make_pair(offset_Q, offset_Q + numRows)); - auto R = Kokkos::subview(_TMPView, Kokkos::ALL, - Kokkos::make_pair(offset_R, offset_R + numRows)); - auto X = Kokkos::subview(_TMPView, Kokkos::ALL, - Kokkos::make_pair(offset_X, offset_X + numRows)); + auto P = Kokkos::subview(_TMPView, Kokkos::ALL, Kokkos::make_pair(offset_P, offset_P + numRows)); + auto Q = Kokkos::subview(_TMPView, Kokkos::ALL, Kokkos::make_pair(offset_Q, offset_Q + numRows)); + auto R = Kokkos::subview(_TMPView, Kokkos::ALL, Kokkos::make_pair(offset_R, offset_R + numRows)); + auto X = Kokkos::subview(_TMPView, Kokkos::ALL, Kokkos::make_pair(offset_X, offset_X + numRows)); auto sqr_norm_0 = Kokkos::subview(_TMPNormView, Kokkos::ALL, 0); auto sqr_norm_j = Kokkos::subview(_TMPNormView, Kokkos::ALL, 1); @@ -88,10 +83,7 @@ KOKKOS_INLINE_FUNCTION int TeamCG::invoke( member.team_barrier(); Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - mask(i) = - sqr_norm_0(i) > tolerance * tolerance ? 1. : 0; - }); + [&](const OrdinalType& i) { mask(i) = sqr_norm_0(i) > tolerance * tolerance ? 1. : 0; }); TeamCopy1D::invoke(member, sqr_norm_0, sqr_norm_j); @@ -107,10 +99,7 @@ KOKKOS_INLINE_FUNCTION int TeamCG::invoke( member.team_barrier(); Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - alpha(i) = - mask(i) != 0. ? 
sqr_norm_j(i) / tmp(i) : 0.; - }); + [&](const OrdinalType& i) { alpha(i) = mask(i) != 0. ? sqr_norm_j(i) / tmp(i) : 0.; }); member.team_barrier(); // x_{j+1} := alpha p_j + x_j @@ -129,10 +118,7 @@ KOKKOS_INLINE_FUNCTION int TeamCG::invoke( member.team_barrier(); Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - alpha(i) = - mask(i) != 0. ? tmp(i) / sqr_norm_j(i) : 0.; - }); + [&](const OrdinalType& i) { alpha(i) = mask(i) != 0. ? tmp(i) / sqr_norm_j(i) : 0.; }); TeamCopy1D::invoke(member, tmp, sqr_norm_j); @@ -165,55 +151,43 @@ KOKKOS_INLINE_FUNCTION int TeamCG::invoke( } template -template -KOKKOS_INLINE_FUNCTION int TeamCG::invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const KrylovHandleType& handle) { +template +KOKKOS_INLINE_FUNCTION int TeamCG::invoke(const MemberType& member, const OperatorType& A, + const VectorViewType& _B, const VectorViewType& _X, + const KrylovHandleType& handle) { const int strategy = handle.get_memory_strategy(); if (strategy == 0) { - using ScratchPadVectorViewType = Kokkos::View< - typename VectorViewType::non_const_value_type**, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; - using ScratchPadNormViewType = Kokkos::View< - typename Kokkos::ArithTraits< - typename VectorViewType::non_const_value_type>::mag_type**, - typename VectorViewType::execution_space::scratch_memory_space>; + using ScratchPadVectorViewType = + Kokkos::View; + using ScratchPadNormViewType = + Kokkos::View::mag_type**, + typename VectorViewType::execution_space::scratch_memory_space>; const int numMatrices = _X.extent(0); const int numRows = _X.extent(1); - ScratchPadVectorViewType _TMPView( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - 4 * numRows); + ScratchPadVectorViewType _TMPView(member.team_scratch(handle.get_scratch_pad_level()), numMatrices, 4 * 
numRows); - ScratchPadNormViewType _TMPNormView( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, 5); + ScratchPadNormViewType _TMPNormView(member.team_scratch(handle.get_scratch_pad_level()), numMatrices, 5); - return invoke( - member, A, _B, _X, handle, _TMPView, _TMPNormView); + return invoke(member, A, _B, _X, handle, _TMPView, _TMPNormView); } if (strategy == 1) { const int first_matrix = handle.first_index(member.league_rank()); const int last_matrix = handle.last_index(member.league_rank()); - using ScratchPadNormViewType = Kokkos::View< - typename Kokkos::ArithTraits< - typename VectorViewType::non_const_value_type>::mag_type**, - typename VectorViewType::execution_space::scratch_memory_space>; + using ScratchPadNormViewType = + Kokkos::View::mag_type**, + typename VectorViewType::execution_space::scratch_memory_space>; const int numMatrices = _X.extent(0); - auto _TMPView = Kokkos::subview( - handle.tmp_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + auto _TMPView = Kokkos::subview(handle.tmp_view, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); - ScratchPadNormViewType _TMPNormView( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, 5); + ScratchPadNormViewType _TMPNormView(member.team_scratch(handle.get_scratch_pad_level()), numMatrices, 5); - return invoke( - member, A, _B, _X, handle, _TMPView, _TMPNormView); + return invoke(member, A, _B, _X, handle, _TMPView, _TMPNormView); } return 0; } diff --git a/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp b/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp index 923b67c105..2d8c0cae00 100644 --- a/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp @@ -36,17 +36,12 @@ namespace KokkosBatched { /// Serial GMRES /// -template -KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, - const VectorViewType& _B, - const VectorViewType& _X, - const 
PrecOperatorType& P, - const KrylovHandleType& handle, - const int GMRES_id) { +template +KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const PrecOperatorType& P, + const KrylovHandleType& handle, const int GMRES_id) { typedef int OrdinalType; - typedef typename Kokkos::ArithTraits< - typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; + typedef typename Kokkos::ArithTraits::mag_type MagnitudeType; typedef Kokkos::ArithTraits ATM; using SerialCopy1D = SerialCopy; @@ -55,9 +50,7 @@ KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, const OrdinalType numMatrices = _X.extent(0); const OrdinalType numRows = _X.extent(1); - size_t maximum_iteration = handle.get_max_iteration() < numRows - ? handle.get_max_iteration() - : numRows; + size_t maximum_iteration = handle.get_max_iteration() < numRows ? handle.get_max_iteration() : numRows; const MagnitudeType tolerance = handle.get_tolerance(); const MagnitudeType max_tolerance = handle.get_max_tolerance(); @@ -72,15 +65,12 @@ KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, const int first_matrix = handle.first_index(GMRES_id); const int last_matrix = handle.last_index(GMRES_id); - auto V_view = Kokkos::subview( - handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL, Kokkos::make_pair(offset_V, offset_V + n_V)); - auto H_view = Kokkos::subview( - handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL, Kokkos::make_pair(offset_H, offset_H + n_H)); - auto Givens_view = Kokkos::subview( - handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL, Kokkos::make_pair(offset_Givens, offset_Givens + n_Givens)); + auto V_view = Kokkos::subview(handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL, + Kokkos::make_pair(offset_V, offset_V + n_V)); + auto H_view = Kokkos::subview(handle.Arnoldi_view, 
Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL, + Kokkos::make_pair(offset_H, offset_H + n_H)); + auto Givens_view = Kokkos::subview(handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL, + Kokkos::make_pair(offset_Givens, offset_Givens + n_Givens)); int n_G = maximum_iteration + 1; int n_W = numRows; @@ -91,18 +81,12 @@ KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, int offset_mask = offset_W + n_W; int offset_tmp = offset_mask + n_mask; - auto G = Kokkos::subview(handle.tmp_view, - Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::make_pair(offset_G, offset_G + n_G)); - auto W = Kokkos::subview(handle.tmp_view, - Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::make_pair(offset_W, offset_W + n_W)); - auto mask = Kokkos::subview(handle.tmp_view, - Kokkos::make_pair(first_matrix, last_matrix), - offset_mask); - auto tmp = - Kokkos::subview(handle.tmp_view, - Kokkos::make_pair(first_matrix, last_matrix), offset_tmp); + auto G = Kokkos::subview(handle.tmp_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::make_pair(offset_G, offset_G + n_G)); + auto W = Kokkos::subview(handle.tmp_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::make_pair(offset_W, offset_W + n_W)); + auto mask = Kokkos::subview(handle.tmp_view, Kokkos::make_pair(first_matrix, last_matrix), offset_mask); + auto tmp = Kokkos::subview(handle.tmp_view, Kokkos::make_pair(first_matrix, last_matrix), offset_tmp); // Deep copy of b into r_0: SerialCopy2D::invoke(_B, W); @@ -149,19 +133,14 @@ KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, if (handle.get_ortho_strategy() == 0) { for (OrdinalType l = 0; l < numMatrices; ++l) { auto W_l = Kokkos::subview(W, l, Kokkos::ALL); - auto V_old = Kokkos::subview( - V_view, l, Kokkos::make_pair(0, (int)j + 1), Kokkos::ALL); - auto H_old = - Kokkos::subview(H_view, l, j, Kokkos::make_pair(0, (int)j + 1)); + auto V_old = Kokkos::subview(V_view, l, 
Kokkos::make_pair(0, (int)j + 1), Kokkos::ALL); + auto H_old = Kokkos::subview(H_view, l, j, Kokkos::make_pair(0, (int)j + 1)); // Inner products - KokkosBlas::SerialGemv::invoke(1, V_old, W_l, 0, - H_old); + KokkosBlas::SerialGemv::invoke(1, V_old, W_l, 0, H_old); // Update - KokkosBlas::SerialGemv::invoke( - -1, V_old, H_old, 1, W_l); + KokkosBlas::SerialGemv::invoke(-1, V_old, H_old, 1, W_l); } } if (handle.get_ortho_strategy() == 1) { @@ -179,8 +158,7 @@ KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, for (OrdinalType i = 0; i < numMatrices; ++i) { H_view(i, j, j + 1) = ATM::sqrt(tmp(i)); - tmp(i) = - H_view(i, j, j + 1) > max_tolerance ? 1. / H_view(i, j, j + 1) : 0.; + tmp(i) = H_view(i, j, j + 1) > max_tolerance ? 1. / H_view(i, j, j + 1) : 0.; } if (j + 1 < maximum_iteration) { @@ -207,8 +185,7 @@ KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, } // Compute the new Givens rotation: - Kokkos::pair + Kokkos::pair G_new(1, 0); typename VectorViewType::non_const_value_type alpha = 0; SerialGivensInternal::invoke(H_j(j), H_j(j + 1), &G_new, &alpha); @@ -241,8 +218,7 @@ KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, } bool all_converged = true; - for (OrdinalType l = 0; l < numMatrices; ++l) - all_converged = (all_converged && mask(l) == 0.); + for (OrdinalType l = 0; l < numMatrices; ++l) all_converged = (all_converged && mask(l) == 0.); if (all_converged) { maximum_iteration = j + 1; break; @@ -255,23 +231,19 @@ KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, auto A_l = Kokkos::subview(H_view, l, first_indices, first_indices); auto B_l = Kokkos::subview(G, l, first_indices); - SerialTrsm::invoke(1, A_l, B_l); + SerialTrsm::invoke(1, A_l, B_l); } if (handle.get_ortho_strategy() == 0) { for (OrdinalType l = 0; l < numMatrices; ++l) { KokkosBlas::SerialGemv::invoke( - 1, Kokkos::subview(V_view, l, first_indices, Kokkos::ALL), - Kokkos::subview(G, l, first_indices), 1, + 1, 
Kokkos::subview(V_view, l, first_indices, Kokkos::ALL), Kokkos::subview(G, l, first_indices), 1, Kokkos::subview(_X, l, Kokkos::ALL)); } } if (handle.get_ortho_strategy() == 1) { for (size_t j = 0; j < maximum_iteration; ++j) { - SerialAxpy::invoke(Kokkos::subview(G, Kokkos::ALL, j), - Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL), - _X); + SerialAxpy::invoke(Kokkos::subview(G, Kokkos::ALL, j), Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL), _X); } } @@ -289,12 +261,9 @@ KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, return status; } -template -KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, - const VectorViewType& _B, - const VectorViewType& _X, - const KrylovHandleType& handle) { +template +KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const KrylovHandleType& handle) { Identity P; return invoke(A, _B, _X, P, handle); } diff --git a/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp b/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp index a7219ecc91..8d37b2ac5e 100644 --- a/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp @@ -39,17 +39,16 @@ namespace KokkosBatched { /// template -template -KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const PrecOperatorType& P, - const KrylovHandleType& handle, const ArnoldiViewType& _ArnoldiView, - const TMPViewType& _TMPView) { +KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke(const MemberType& member, const OperatorType& A, + const VectorViewType& _B, const VectorViewType& _X, + const PrecOperatorType& P, + const KrylovHandleType& handle, + const ArnoldiViewType& _ArnoldiView, + const TMPViewType& _TMPView) { typedef int OrdinalType; - typedef typename Kokkos::ArithTraits< - typename 
VectorViewType::non_const_value_type>::mag_type MagnitudeType; + typedef typename Kokkos::ArithTraits::mag_type MagnitudeType; typedef Kokkos::ArithTraits ATM; using TeamVectorCopy1D = TeamVectorCopy; @@ -57,9 +56,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( const OrdinalType numMatrices = _X.extent(0); const OrdinalType numRows = _X.extent(1); - size_t maximum_iteration = handle.get_max_iteration() < numRows - ? handle.get_max_iteration() - : numRows; + size_t maximum_iteration = handle.get_max_iteration() < numRows ? handle.get_max_iteration() : numRows; const MagnitudeType tolerance = handle.get_tolerance(); const MagnitudeType max_tolerance = handle.get_max_tolerance(); @@ -71,13 +68,10 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( int offset_H = offset_V + n_V; int offset_Givens = offset_H + n_H; - auto V_view = Kokkos::subview(_ArnoldiView, Kokkos::ALL, Kokkos::ALL, - Kokkos::make_pair(offset_V, offset_V + n_V)); - auto H_view = Kokkos::subview(_ArnoldiView, Kokkos::ALL, Kokkos::ALL, - Kokkos::make_pair(offset_H, offset_H + n_H)); - auto Givens_view = Kokkos::subview( - _ArnoldiView, Kokkos::ALL, Kokkos::ALL, - Kokkos::make_pair(offset_Givens, offset_Givens + n_Givens)); + auto V_view = Kokkos::subview(_ArnoldiView, Kokkos::ALL, Kokkos::ALL, Kokkos::make_pair(offset_V, offset_V + n_V)); + auto H_view = Kokkos::subview(_ArnoldiView, Kokkos::ALL, Kokkos::ALL, Kokkos::make_pair(offset_H, offset_H + n_H)); + auto Givens_view = Kokkos::subview(_ArnoldiView, Kokkos::ALL, Kokkos::ALL, + Kokkos::make_pair(offset_Givens, offset_Givens + n_Givens)); int n_G = maximum_iteration + 1; int n_W = numRows; @@ -88,10 +82,8 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( int offset_mask = offset_W + n_W; int offset_tmp = offset_mask + n_mask; - auto G = Kokkos::subview(_TMPView, Kokkos::ALL, - Kokkos::make_pair(offset_G, offset_G + n_G)); - auto W = Kokkos::subview(_TMPView, Kokkos::ALL, - Kokkos::make_pair(offset_W, offset_W + n_W)); + auto G = 
Kokkos::subview(_TMPView, Kokkos::ALL, Kokkos::make_pair(offset_G, offset_G + n_G)); + auto W = Kokkos::subview(_TMPView, Kokkos::ALL, Kokkos::make_pair(offset_W, offset_W + n_W)); auto mask = Kokkos::subview(_TMPView, Kokkos::ALL, offset_mask); auto tmp = Kokkos::subview(_TMPView, Kokkos::ALL, offset_tmp); @@ -109,33 +101,29 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( TeamVectorDot::invoke(member, W, W, tmp); member.team_barrier(); - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - tmp(i) = ATM::sqrt(tmp(i)); - handle.set_norm(member.league_rank(), i, 0, tmp(i)); - if (tmp(i) > max_tolerance) { - mask(i) = 1; - G(i, 0) = tmp(i); - tmp(i) = 1. / tmp(i); - } else { - handle.set_iteration(member.league_rank(), i, 0); - mask(i) = 0; - G(i, 0) = 0.; - tmp(i) = 0.; - } - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), [&](const OrdinalType& i) { + tmp(i) = ATM::sqrt(tmp(i)); + handle.set_norm(member.league_rank(), i, 0, tmp(i)); + if (tmp(i) > max_tolerance) { + mask(i) = 1; + G(i, 0) = tmp(i); + tmp(i) = 1. 
/ tmp(i); + } else { + handle.set_iteration(member.league_rank(), i, 0); + mask(i) = 0; + G(i, 0) = 0.; + tmp(i) = 0.; + } + }); member.team_barrier(); // Finish writing to tmp auto V_0 = Kokkos::subview(V_view, Kokkos::ALL, 0, Kokkos::ALL); - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, numMatrices * numRows), - [&](const OrdinalType& iTemp) { - OrdinalType iRow, iMatrix; - getIndices( - iTemp, numRows, numMatrices, iRow, iMatrix); - V_0(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices * numRows), [&](const OrdinalType& iTemp) { + OrdinalType iRow, iMatrix; + getIndices(iTemp, numRows, numMatrices, iRow, iMatrix); + V_0(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); + }); int status = 1; // int number_not_converged = 0; @@ -151,20 +139,14 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( member.team_barrier(); if (handle.get_ortho_strategy() == 0) { - auto V_old = Kokkos::subview( - V_view, Kokkos::ALL, Kokkos::make_pair(0, (int)j + 1), Kokkos::ALL); - auto H_old = Kokkos::subview(H_view, Kokkos::ALL, j, - Kokkos::make_pair(0, (int)j + 1)); + auto V_old = Kokkos::subview(V_view, Kokkos::ALL, Kokkos::make_pair(0, (int)j + 1), Kokkos::ALL); + auto H_old = Kokkos::subview(H_view, Kokkos::ALL, j, Kokkos::make_pair(0, (int)j + 1)); // Inner products - TeamVectorGemv::invoke(member, 1, V_old, W, 0, - H_old); + TeamVectorGemv::invoke(member, 1, V_old, W, 0, H_old); member.team_barrier(); // Update - TeamVectorGemv::invoke(member, -1, V_old, H_old, 1, - W); + TeamVectorGemv::invoke(member, -1, V_old, H_old, 1, W); member.team_barrier(); // Finish writing to W } if (handle.get_ortho_strategy() == 1) { @@ -172,12 +154,10 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( auto V_i = Kokkos::subview(V_view, Kokkos::ALL, i, Kokkos::ALL); TeamVectorDot::invoke(member, W, V_i, tmp); member.team_barrier(); - TeamVectorCopy1D::invoke(member, tmp, - Kokkos::subview(H_view, 
Kokkos::ALL, j, i)); + TeamVectorCopy1D::invoke(member, tmp, Kokkos::subview(H_view, Kokkos::ALL, j, i)); member.team_barrier(); - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& ii) { tmp(ii) = -tmp(ii); }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), + [&](const OrdinalType& ii) { tmp(ii) = -tmp(ii); }); member.team_barrier(); // Finish writing to tmp @@ -188,82 +168,71 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( TeamVectorDot::invoke(member, W, W, tmp); member.team_barrier(); - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - H_view(i, j, j + 1) = ATM::sqrt(tmp(i)); - tmp(i) = H_view(i, j, j + 1) > max_tolerance - ? 1. / H_view(i, j, j + 1) - : 0.; - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), [&](const OrdinalType& i) { + H_view(i, j, j + 1) = ATM::sqrt(tmp(i)); + tmp(i) = H_view(i, j, j + 1) > max_tolerance ? 1. 
/ H_view(i, j, j + 1) : 0.; + }); member.team_barrier(); if (j + 1 < maximum_iteration) { auto V_n = Kokkos::subview(V_view, Kokkos::ALL, j + 1, Kokkos::ALL); - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, numMatrices * numRows), - [&](const OrdinalType& iTemp) { - OrdinalType iRow, iMatrix; - getIndices( - iTemp, numRows, numMatrices, iRow, iMatrix); - V_n(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices * numRows), [&](const OrdinalType& iTemp) { + OrdinalType iRow, iMatrix; + getIndices(iTemp, numRows, numMatrices, iRow, iMatrix); + V_n(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); + }); member.team_barrier(); } - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& l) { - // Apply the previous Givens rotations: - auto H_j = Kokkos::subview(H_view, l, j, Kokkos::ALL); - auto Givens_0_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 0); - auto Givens_1_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 1); - - if (mask(l) == 1.) 
{ - for (size_t i = 0; i < j; ++i) { - auto tmp1 = Givens_0_l(i) * H_j(i) + Givens_1_l(i) * H_j(i + 1); - auto tmp2 = -Givens_1_l(i) * H_j(i) + Givens_0_l(i) * H_j(i + 1); - H_j(i) = tmp1; - H_j(i + 1) = tmp2; - } - - // Compute the new Givens rotation: - Kokkos::pair - G_new(1, 0); - typename VectorViewType::non_const_value_type alpha = 0; - SerialGivensInternal::invoke(H_j(j), H_j(j + 1), &G_new, &alpha); - - Givens_0_l(j) = G_new.first; - Givens_1_l(j) = G_new.second; - - // Apply the new Givens rotation: - auto tmp1 = Givens_0_l(j) * H_j(j) + Givens_1_l(j) * H_j(j + 1); - auto tmp2 = -Givens_1_l(j) * H_j(j) + Givens_0_l(j) * H_j(j + 1); - H_j(j) = tmp1; - H_j(j + 1) = tmp2; - - G(l, j + 1) = -Givens_1_l(j) * G(l, j); - G(l, j) *= Givens_0_l(j); - } else { - H_j(j) = 1.; - G(l, j + 1) = 0.; - } - - auto res_norm = - Kokkos::ArithTraits::abs(G(l, j + 1)) / G(l, 0); - - handle.set_norm(member.league_rank(), l, j + 1, res_norm); - - if (mask(l) == 1. && res_norm < tolerance) { - mask(l) = 0.; - G(l, j + 1) = 0.; - handle.set_iteration(member.league_rank(), l, j + 1); - } - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), [&](const OrdinalType& l) { + // Apply the previous Givens rotations: + auto H_j = Kokkos::subview(H_view, l, j, Kokkos::ALL); + auto Givens_0_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 0); + auto Givens_1_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 1); + + if (mask(l) == 1.) 
{ + for (size_t i = 0; i < j; ++i) { + auto tmp1 = Givens_0_l(i) * H_j(i) + Givens_1_l(i) * H_j(i + 1); + auto tmp2 = -Givens_1_l(i) * H_j(i) + Givens_0_l(i) * H_j(i + 1); + H_j(i) = tmp1; + H_j(i + 1) = tmp2; + } + + // Compute the new Givens rotation: + Kokkos::pair + G_new(1, 0); + typename VectorViewType::non_const_value_type alpha = 0; + SerialGivensInternal::invoke(H_j(j), H_j(j + 1), &G_new, &alpha); + + Givens_0_l(j) = G_new.first; + Givens_1_l(j) = G_new.second; + + // Apply the new Givens rotation: + auto tmp1 = Givens_0_l(j) * H_j(j) + Givens_1_l(j) * H_j(j + 1); + auto tmp2 = -Givens_1_l(j) * H_j(j) + Givens_0_l(j) * H_j(j + 1); + H_j(j) = tmp1; + H_j(j + 1) = tmp2; + + G(l, j + 1) = -Givens_1_l(j) * G(l, j); + G(l, j) *= Givens_0_l(j); + } else { + H_j(j) = 1.; + G(l, j + 1) = 0.; + } + + auto res_norm = Kokkos::ArithTraits::abs(G(l, j + 1)) / G(l, 0); + + handle.set_norm(member.league_rank(), l, j + 1, res_norm); + + if (mask(l) == 1. && res_norm < tolerance) { + mask(l) = 0.; + G(l, j + 1) = 0.; + handle.set_iteration(member.league_rank(), l, j + 1); + } + }); member.team_barrier(); bool all_converged = true; - for (OrdinalType l = 0; l < numMatrices; ++l) - all_converged = (all_converged && mask(l) == 0.); + for (OrdinalType l = 0; l < numMatrices; ++l) all_converged = (all_converged && mask(l) == 0.); if (all_converged) { maximum_iteration = j + 1; break; @@ -274,30 +243,25 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( auto first_indices = Kokkos::make_pair(0, (int)maximum_iteration); - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& l) { - auto A_l = Kokkos::subview(H_view, l, first_indices, first_indices); - auto B_l = Kokkos::subview(G, l, first_indices); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), [&](const OrdinalType& l) { + auto A_l = Kokkos::subview(H_view, l, first_indices, first_indices); + auto B_l = Kokkos::subview(G, l, first_indices); - 
SerialTrsm::invoke(1, A_l, B_l); - }); + SerialTrsm::invoke(1, A_l, B_l); + }); member.team_barrier(); // Finish writing to G if (handle.get_ortho_strategy() == 0) { TeamVectorGemv::invoke( - member, 1, - Kokkos::subview(V_view, Kokkos::ALL, first_indices, Kokkos::ALL), + member, 1, Kokkos::subview(V_view, Kokkos::ALL, first_indices, Kokkos::ALL), Kokkos::subview(G, Kokkos::ALL, first_indices), 1, _X); member.team_barrier(); // Finish writing to _X } if (handle.get_ortho_strategy() == 1) { for (size_t j = 0; j < maximum_iteration; ++j) { - TeamVectorAxpy::invoke( - member, Kokkos::subview(G, Kokkos::ALL, j), - Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL), _X); + TeamVectorAxpy::invoke(member, Kokkos::subview(G, Kokkos::ALL, j), + Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL), _X); member.team_barrier(); // Finish writing to _X } } @@ -305,128 +269,105 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( if (handle.get_compute_last_residual()) { TeamVectorCopy::invoke(member, _B, W); member.team_barrier(); - A.template apply(member, _X, W, -1, - 1); + A.template apply(member, _X, W, -1, 1); member.team_barrier(); P.template apply(member, W, W); member.team_barrier(); TeamVectorDot::invoke(member, W, W, tmp); member.team_barrier(); - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - tmp(i) = ATM::sqrt(tmp(i)); - handle.set_last_norm(member.league_rank(), i, - tmp(i)); - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), [&](const OrdinalType& i) { + tmp(i) = ATM::sqrt(tmp(i)); + handle.set_last_norm(member.league_rank(), i, tmp(i)); + }); } return status; } template -template -KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const PrecOperatorType& P, - const KrylovHandleType& handle) { +template +KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke(const MemberType& 
member, const OperatorType& A, + const VectorViewType& _B, const VectorViewType& _X, + const PrecOperatorType& P, + const KrylovHandleType& handle) { const int strategy = handle.get_memory_strategy(); if (strategy == 0) { const int first_matrix = handle.first_index(member.league_rank()); const int last_matrix = handle.last_index(member.league_rank()); - auto _ArnoldiView = Kokkos::subview( - handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL, Kokkos::ALL); + auto _ArnoldiView = + Kokkos::subview(handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL, Kokkos::ALL); const int numMatrices = _X.extent(0); const int numRows = _X.extent(1); - size_t maximum_iteration = handle.get_max_iteration() < numRows - ? handle.get_max_iteration() - : numRows; + size_t maximum_iteration = handle.get_max_iteration() < numRows ? handle.get_max_iteration() : numRows; int n_G = maximum_iteration + 1; int n_W = numRows; int n_mask = 1; int n_tmp = 1; - using ScratchPadVectorViewType = Kokkos::View< - typename VectorViewType::non_const_value_type**, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; + using ScratchPadVectorViewType = + Kokkos::View; - ScratchPadVectorViewType _TMPView( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - n_G + n_W + n_mask + n_tmp); + ScratchPadVectorViewType _TMPView(member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + n_G + n_W + n_mask + n_tmp); - return invoke(member, A, _B, _X, P, handle, _ArnoldiView, - _TMPView); + return invoke(member, A, _B, _X, P, handle, + _ArnoldiView, _TMPView); } if (strategy == 1) { const int first_matrix = handle.first_index(member.league_rank()); const int last_matrix = handle.last_index(member.league_rank()); - auto _ArnoldiView = Kokkos::subview( - handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL, Kokkos::ALL); + auto _ArnoldiView = + 
Kokkos::subview(handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL, Kokkos::ALL); - auto _TMPView = Kokkos::subview( - handle.tmp_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + auto _TMPView = Kokkos::subview(handle.tmp_view, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); - return invoke(member, A, _B, _X, P, handle, _ArnoldiView, - _TMPView); + return invoke(member, A, _B, _X, P, handle, + _ArnoldiView, _TMPView); } if (strategy == 2) { - using ScratchPadArnoldiViewType = Kokkos::View< - typename VectorViewType::non_const_value_type***, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; + using ScratchPadArnoldiViewType = + Kokkos::View; - using ScratchPadVectorViewType = Kokkos::View< - typename VectorViewType::non_const_value_type**, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; + using ScratchPadVectorViewType = + Kokkos::View; const int numMatrices = _X.extent(0); const int numRows = _X.extent(1); - size_t maximum_iteration = handle.get_max_iteration() < numRows - ? handle.get_max_iteration() - : numRows; + size_t maximum_iteration = handle.get_max_iteration() < numRows ? 
handle.get_max_iteration() : numRows; int n_G = maximum_iteration + 1; int n_W = numRows; int n_mask = 1; int n_tmp = 1; - ScratchPadArnoldiViewType _ArnoldiView( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - maximum_iteration, numRows + maximum_iteration + 3); + ScratchPadArnoldiViewType _ArnoldiView(member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + maximum_iteration, numRows + maximum_iteration + 3); - ScratchPadVectorViewType _TMPView( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - n_G + n_W + n_mask + n_tmp); + ScratchPadVectorViewType _TMPView(member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + n_G + n_W + n_mask + n_tmp); - return invoke(member, A, _B, _X, P, handle, _ArnoldiView, - _TMPView); + return invoke(member, A, _B, _X, P, handle, + _ArnoldiView, _TMPView); } return 0; } template -template -KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const KrylovHandleType& handle) { +template +KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke(const MemberType& member, const OperatorType& A, + const VectorViewType& _B, const VectorViewType& _X, + const KrylovHandleType& handle) { Identity P; - return invoke(member, A, _B, _X, P, - handle); + return invoke(member, A, _B, _X, P, handle); } } // namespace KokkosBatched diff --git a/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp b/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp index bb8f446f07..9fd9e09bd9 100644 --- a/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp @@ -38,17 +38,15 @@ namespace KokkosBatched { /// template -template -KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const PrecOperatorType& P, - const KrylovHandleType& handle, const 
ArnoldiViewType& _ArnoldiView, - const TMPViewType& _TMPView) { +KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke(const MemberType& member, const OperatorType& A, + const VectorViewType& _B, const VectorViewType& _X, + const PrecOperatorType& P, const KrylovHandleType& handle, + const ArnoldiViewType& _ArnoldiView, + const TMPViewType& _TMPView) { typedef int OrdinalType; - typedef typename Kokkos::ArithTraits< - typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; + typedef typename Kokkos::ArithTraits::mag_type MagnitudeType; typedef Kokkos::ArithTraits ATM; using TeamCopy1D = TeamCopy; @@ -56,9 +54,7 @@ KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( const OrdinalType numMatrices = _X.extent(0); const OrdinalType numRows = _X.extent(1); - size_t maximum_iteration = handle.get_max_iteration() < numRows - ? handle.get_max_iteration() - : numRows; + size_t maximum_iteration = handle.get_max_iteration() < numRows ? handle.get_max_iteration() : numRows; const MagnitudeType tolerance = handle.get_tolerance(); const MagnitudeType max_tolerance = handle.get_max_tolerance(); @@ -70,13 +66,10 @@ KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( int offset_H = offset_V + n_V; int offset_Givens = offset_H + n_H; - auto V_view = Kokkos::subview(_ArnoldiView, Kokkos::ALL, Kokkos::ALL, - Kokkos::make_pair(offset_V, offset_V + n_V)); - auto H_view = Kokkos::subview(_ArnoldiView, Kokkos::ALL, Kokkos::ALL, - Kokkos::make_pair(offset_H, offset_H + n_H)); - auto Givens_view = Kokkos::subview( - _ArnoldiView, Kokkos::ALL, Kokkos::ALL, - Kokkos::make_pair(offset_Givens, offset_Givens + n_Givens)); + auto V_view = Kokkos::subview(_ArnoldiView, Kokkos::ALL, Kokkos::ALL, Kokkos::make_pair(offset_V, offset_V + n_V)); + auto H_view = Kokkos::subview(_ArnoldiView, Kokkos::ALL, Kokkos::ALL, Kokkos::make_pair(offset_H, offset_H + n_H)); + auto Givens_view = Kokkos::subview(_ArnoldiView, Kokkos::ALL, Kokkos::ALL, + Kokkos::make_pair(offset_Givens, offset_Givens + n_Givens)); int n_G 
= maximum_iteration + 1; int n_W = numRows; @@ -87,10 +80,8 @@ KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( int offset_mask = offset_W + n_W; int offset_tmp = offset_mask + n_mask; - auto G = Kokkos::subview(_TMPView, Kokkos::ALL, - Kokkos::make_pair(offset_G, offset_G + n_G)); - auto W = Kokkos::subview(_TMPView, Kokkos::ALL, - Kokkos::make_pair(offset_W, offset_W + n_W)); + auto G = Kokkos::subview(_TMPView, Kokkos::ALL, Kokkos::make_pair(offset_G, offset_G + n_G)); + auto W = Kokkos::subview(_TMPView, Kokkos::ALL, Kokkos::make_pair(offset_W, offset_W + n_W)); auto mask = Kokkos::subview(_TMPView, Kokkos::ALL, offset_mask); auto tmp = Kokkos::subview(_TMPView, Kokkos::ALL, offset_tmp); @@ -108,33 +99,29 @@ KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( TeamDot::invoke(member, W, W, tmp); member.team_barrier(); - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - tmp(i) = ATM::sqrt(tmp(i)); - handle.set_norm(member.league_rank(), i, 0, tmp(i)); - if (tmp(i) > max_tolerance) { - mask(i) = 1; - G(i, 0) = tmp(i); - tmp(i) = 1. / tmp(i); - } else { - handle.set_iteration(member.league_rank(), i, 0); - mask(i) = 0; - G(i, 0) = 0.; - tmp(i) = 0.; - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), [&](const OrdinalType& i) { + tmp(i) = ATM::sqrt(tmp(i)); + handle.set_norm(member.league_rank(), i, 0, tmp(i)); + if (tmp(i) > max_tolerance) { + mask(i) = 1; + G(i, 0) = tmp(i); + tmp(i) = 1. 
/ tmp(i); + } else { + handle.set_iteration(member.league_rank(), i, 0); + mask(i) = 0; + G(i, 0) = 0.; + tmp(i) = 0.; + } + }); member.team_barrier(); // Finish writing to tmp auto V_0 = Kokkos::subview(V_view, Kokkos::ALL, 0, Kokkos::ALL); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, numMatrices * numRows), - [&](const OrdinalType& iTemp) { - OrdinalType iRow, iMatrix; - getIndices( - iTemp, numRows, numMatrices, iRow, iMatrix); - V_0(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices * numRows), [&](const OrdinalType& iTemp) { + OrdinalType iRow, iMatrix; + getIndices(iTemp, numRows, numMatrices, iRow, iMatrix); + V_0(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); + }); int status = 1; // int number_not_converged = 0; @@ -150,18 +137,14 @@ KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( member.team_barrier(); if (handle.get_ortho_strategy() == 0) { - auto V_old = Kokkos::subview( - V_view, Kokkos::ALL, Kokkos::make_pair(0, (int)j + 1), Kokkos::ALL); - auto H_old = Kokkos::subview(H_view, Kokkos::ALL, j, - Kokkos::make_pair(0, (int)j + 1)); + auto V_old = Kokkos::subview(V_view, Kokkos::ALL, Kokkos::make_pair(0, (int)j + 1), Kokkos::ALL); + auto H_old = Kokkos::subview(H_view, Kokkos::ALL, j, Kokkos::make_pair(0, (int)j + 1)); // Inner products - TeamGemv::invoke( - member, 1, V_old, W, 0, H_old); + TeamGemv::invoke(member, 1, V_old, W, 0, H_old); member.team_barrier(); // Update - TeamGemv::invoke( - member, -1, V_old, H_old, 1, W); + TeamGemv::invoke(member, -1, V_old, H_old, 1, W); member.team_barrier(); // Finish writing to W } if (handle.get_ortho_strategy() == 1) { @@ -169,12 +152,10 @@ KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( auto V_i = Kokkos::subview(V_view, Kokkos::ALL, i, Kokkos::ALL); TeamDot::invoke(member, W, V_i, tmp); member.team_barrier(); - TeamCopy1D::invoke(member, tmp, - Kokkos::subview(H_view, Kokkos::ALL, j, i)); + TeamCopy1D::invoke(member, 
tmp, Kokkos::subview(H_view, Kokkos::ALL, j, i)); member.team_barrier(); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& ii) { tmp(ii) = -tmp(ii); }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), + [&](const OrdinalType& ii) { tmp(ii) = -tmp(ii); }); member.team_barrier(); // Finish writing to tmp @@ -185,82 +166,71 @@ KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( TeamDot::invoke(member, W, W, tmp); member.team_barrier(); - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - H_view(i, j, j + 1) = ATM::sqrt(tmp(i)); - tmp(i) = H_view(i, j, j + 1) > max_tolerance - ? 1. / H_view(i, j, j + 1) - : 0.; - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), [&](const OrdinalType& i) { + H_view(i, j, j + 1) = ATM::sqrt(tmp(i)); + tmp(i) = H_view(i, j, j + 1) > max_tolerance ? 1. / H_view(i, j, j + 1) : 0.; + }); member.team_barrier(); if (j + 1 < maximum_iteration) { auto V_n = Kokkos::subview(V_view, Kokkos::ALL, j + 1, Kokkos::ALL); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, numMatrices * numRows), - [&](const OrdinalType& iTemp) { - OrdinalType iRow, iMatrix; - getIndices( - iTemp, numRows, numMatrices, iRow, iMatrix); - V_n(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices * numRows), [&](const OrdinalType& iTemp) { + OrdinalType iRow, iMatrix; + getIndices(iTemp, numRows, numMatrices, iRow, iMatrix); + V_n(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); + }); member.team_barrier(); } - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& l) { - // Apply the previous Givens rotations: - auto H_j = Kokkos::subview(H_view, l, j, Kokkos::ALL); - auto Givens_0_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 0); - auto Givens_1_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 1); - - 
if (mask(l) == 1.) { - for (size_t i = 0; i < j; ++i) { - auto tmp1 = Givens_0_l(i) * H_j(i) + Givens_1_l(i) * H_j(i + 1); - auto tmp2 = -Givens_1_l(i) * H_j(i) + Givens_0_l(i) * H_j(i + 1); - H_j(i) = tmp1; - H_j(i + 1) = tmp2; - } - - // Compute the new Givens rotation: - Kokkos::pair - G_new(1, 0); - typename VectorViewType::non_const_value_type alpha = 0; - SerialGivensInternal::invoke(H_j(j), H_j(j + 1), &G_new, &alpha); - - Givens_0_l(j) = G_new.first; - Givens_1_l(j) = G_new.second; - - // Apply the new Givens rotation: - auto tmp1 = Givens_0_l(j) * H_j(j) + Givens_1_l(j) * H_j(j + 1); - auto tmp2 = -Givens_1_l(j) * H_j(j) + Givens_0_l(j) * H_j(j + 1); - H_j(j) = tmp1; - H_j(j + 1) = tmp2; - - G(l, j + 1) = -Givens_1_l(j) * G(l, j); - G(l, j) *= Givens_0_l(j); - } else { - H_j(j) = 1.; - G(l, j + 1) = 0.; - } - - auto res_norm = - Kokkos::ArithTraits::abs(G(l, j + 1)) / G(l, 0); - - handle.set_norm(member.league_rank(), l, j + 1, res_norm); - - if (mask(l) == 1. && res_norm < tolerance) { - mask(l) = 0.; - G(l, j + 1) = 0.; - handle.set_iteration(member.league_rank(), l, j + 1); - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), [&](const OrdinalType& l) { + // Apply the previous Givens rotations: + auto H_j = Kokkos::subview(H_view, l, j, Kokkos::ALL); + auto Givens_0_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 0); + auto Givens_1_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 1); + + if (mask(l) == 1.) 
{ + for (size_t i = 0; i < j; ++i) { + auto tmp1 = Givens_0_l(i) * H_j(i) + Givens_1_l(i) * H_j(i + 1); + auto tmp2 = -Givens_1_l(i) * H_j(i) + Givens_0_l(i) * H_j(i + 1); + H_j(i) = tmp1; + H_j(i + 1) = tmp2; + } + + // Compute the new Givens rotation: + Kokkos::pair + G_new(1, 0); + typename VectorViewType::non_const_value_type alpha = 0; + SerialGivensInternal::invoke(H_j(j), H_j(j + 1), &G_new, &alpha); + + Givens_0_l(j) = G_new.first; + Givens_1_l(j) = G_new.second; + + // Apply the new Givens rotation: + auto tmp1 = Givens_0_l(j) * H_j(j) + Givens_1_l(j) * H_j(j + 1); + auto tmp2 = -Givens_1_l(j) * H_j(j) + Givens_0_l(j) * H_j(j + 1); + H_j(j) = tmp1; + H_j(j + 1) = tmp2; + + G(l, j + 1) = -Givens_1_l(j) * G(l, j); + G(l, j) *= Givens_0_l(j); + } else { + H_j(j) = 1.; + G(l, j + 1) = 0.; + } + + auto res_norm = Kokkos::ArithTraits::abs(G(l, j + 1)) / G(l, 0); + + handle.set_norm(member.league_rank(), l, j + 1, res_norm); + + if (mask(l) == 1. && res_norm < tolerance) { + mask(l) = 0.; + G(l, j + 1) = 0.; + handle.set_iteration(member.league_rank(), l, j + 1); + } + }); member.team_barrier(); bool all_converged = true; - for (OrdinalType l = 0; l < numMatrices; ++l) - all_converged = (all_converged && mask(l) == 0.); + for (OrdinalType l = 0; l < numMatrices; ++l) all_converged = (all_converged && mask(l) == 0.); if (all_converged) { maximum_iteration = j + 1; break; @@ -271,30 +241,25 @@ KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( auto first_indices = Kokkos::make_pair(0, (int)maximum_iteration); - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& l) { - auto A_l = Kokkos::subview(H_view, l, first_indices, first_indices); - auto B_l = Kokkos::subview(G, l, first_indices); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), [&](const OrdinalType& l) { + auto A_l = Kokkos::subview(H_view, l, first_indices, first_indices); + auto B_l = Kokkos::subview(G, l, first_indices); - 
SerialTrsm::invoke(1, A_l, B_l); - }); + SerialTrsm::invoke(1, A_l, B_l); + }); member.team_barrier(); // Finish writing to G if (handle.get_ortho_strategy() == 0) { TeamGemv::invoke( - member, 1, - Kokkos::subview(V_view, Kokkos::ALL, first_indices, Kokkos::ALL), + member, 1, Kokkos::subview(V_view, Kokkos::ALL, first_indices, Kokkos::ALL), Kokkos::subview(G, Kokkos::ALL, first_indices), 1, _X); member.team_barrier(); // Finish writing to _X } if (handle.get_ortho_strategy() == 1) { for (size_t j = 0; j < maximum_iteration; ++j) { - TeamAxpy::invoke( - member, Kokkos::subview(G, Kokkos::ALL, j), - Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL), _X); + TeamAxpy::invoke(member, Kokkos::subview(G, Kokkos::ALL, j), + Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL), _X); member.team_barrier(); // Finish writing to _X } } @@ -309,120 +274,97 @@ KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( TeamDot::invoke(member, W, W, tmp); member.team_barrier(); - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - tmp(i) = ATM::sqrt(tmp(i)); - handle.set_last_norm(member.league_rank(), i, - tmp(i)); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), [&](const OrdinalType& i) { + tmp(i) = ATM::sqrt(tmp(i)); + handle.set_last_norm(member.league_rank(), i, tmp(i)); + }); } return status; } template -template -KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const PrecOperatorType& P, - const KrylovHandleType& handle) { +template +KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke(const MemberType& member, const OperatorType& A, + const VectorViewType& _B, const VectorViewType& _X, + const PrecOperatorType& P, const KrylovHandleType& handle) { const int strategy = handle.get_memory_strategy(); if (strategy == 0) { const int first_matrix = handle.first_index(member.league_rank()); const int last_matrix = 
handle.last_index(member.league_rank()); - auto _ArnoldiView = Kokkos::subview( - handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL, Kokkos::ALL); + auto _ArnoldiView = + Kokkos::subview(handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL, Kokkos::ALL); const int numMatrices = _X.extent(0); const int numRows = _X.extent(1); - size_t maximum_iteration = handle.get_max_iteration() < numRows - ? handle.get_max_iteration() - : numRows; + size_t maximum_iteration = handle.get_max_iteration() < numRows ? handle.get_max_iteration() : numRows; int n_G = maximum_iteration + 1; int n_W = numRows; int n_mask = 1; int n_tmp = 1; - using ScratchPadVectorViewType = Kokkos::View< - typename VectorViewType::non_const_value_type**, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; + using ScratchPadVectorViewType = + Kokkos::View; - ScratchPadVectorViewType _TMPView( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - n_G + n_W + n_mask + n_tmp); + ScratchPadVectorViewType _TMPView(member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + n_G + n_W + n_mask + n_tmp); - return invoke(member, A, _B, _X, P, handle, _ArnoldiView, - _TMPView); + return invoke(member, A, _B, _X, P, handle, + _ArnoldiView, _TMPView); } if (strategy == 1) { const int first_matrix = handle.first_index(member.league_rank()); const int last_matrix = handle.last_index(member.league_rank()); - auto _ArnoldiView = Kokkos::subview( - handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL, Kokkos::ALL); + auto _ArnoldiView = + Kokkos::subview(handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL, Kokkos::ALL); - auto _TMPView = Kokkos::subview( - handle.tmp_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + auto _TMPView = Kokkos::subview(handle.tmp_view, Kokkos::make_pair(first_matrix, last_matrix), 
Kokkos::ALL); - return invoke(member, A, _B, _X, P, handle, _ArnoldiView, - _TMPView); + return invoke(member, A, _B, _X, P, handle, + _ArnoldiView, _TMPView); } if (strategy == 2) { - using ScratchPadArnoldiViewType = Kokkos::View< - typename VectorViewType::non_const_value_type***, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; + using ScratchPadArnoldiViewType = + Kokkos::View; - using ScratchPadVectorViewType = Kokkos::View< - typename VectorViewType::non_const_value_type**, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; + using ScratchPadVectorViewType = + Kokkos::View; const int numMatrices = _X.extent(0); const int numRows = _X.extent(1); - size_t maximum_iteration = handle.get_max_iteration() < numRows - ? handle.get_max_iteration() - : numRows; + size_t maximum_iteration = handle.get_max_iteration() < numRows ? handle.get_max_iteration() : numRows; int n_G = maximum_iteration + 1; int n_W = numRows; int n_mask = 1; int n_tmp = 1; - ScratchPadArnoldiViewType _ArnoldiView( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - maximum_iteration, numRows + maximum_iteration + 3); + ScratchPadArnoldiViewType _ArnoldiView(member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + maximum_iteration, numRows + maximum_iteration + 3); - ScratchPadVectorViewType _TMPView( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - n_G + n_W + n_mask + n_tmp); + ScratchPadVectorViewType _TMPView(member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + n_G + n_W + n_mask + n_tmp); - return invoke(member, A, _B, _X, P, handle, _ArnoldiView, - _TMPView); + return invoke(member, A, _B, _X, P, handle, + _ArnoldiView, _TMPView); } return 0; } template -template -KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& 
_X, const KrylovHandleType& handle) { +template +KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke(const MemberType& member, const OperatorType& A, + const VectorViewType& _B, const VectorViewType& _X, + const KrylovHandleType& handle) { Identity P; - return invoke(member, A, _B, _X, P, - handle); + return invoke(member, A, _B, _X, P, handle); } } // namespace KokkosBatched diff --git a/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp index 0f1e5feb39..3f76ee3d9f 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp @@ -26,35 +26,24 @@ namespace KokkosBatched { /// Serial Internal Impl /// ==================== struct SerialSpmvInternal { - template + template KOKKOS_INLINE_FUNCTION static int invoke( - const OrdinalType numMatrices, const OrdinalType numRows, - const ScalarType* KOKKOS_RESTRICT alpha, const OrdinalType alphas0, - const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, - const OrdinalType valuess1, const OrdinalType* KOKKOS_RESTRICT row_ptr, - const OrdinalType row_ptrs0, - const OrdinalType* KOKKOS_RESTRICT colIndices, - const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT X, - const OrdinalType xs0, const OrdinalType xs1, - const ScalarType* KOKKOS_RESTRICT beta, const OrdinalType betas0, - /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, - const OrdinalType ys1) { + const OrdinalType numMatrices, const OrdinalType numRows, const ScalarType* KOKKOS_RESTRICT alpha, + const OrdinalType alphas0, const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, + const OrdinalType valuess1, const OrdinalType* KOKKOS_RESTRICT row_ptr, const OrdinalType row_ptrs0, + const OrdinalType* KOKKOS_RESTRICT colIndices, const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT X, + const OrdinalType xs0, const OrdinalType xs1, const ScalarType* KOKKOS_RESTRICT beta, const OrdinalType 
betas0, + /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, const OrdinalType ys1) { for (OrdinalType iMatrix = 0; iMatrix < numMatrices; ++iMatrix) { for (OrdinalType iRow = 0; iRow < numRows; ++iRow) { - const OrdinalType rowLength = - row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; - ValueType sum = 0; + const OrdinalType rowLength = row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; + ValueType sum = 0; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { - sum += values[iMatrix * valuess0 + - (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * - X[iMatrix * xs0 + - colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * - colIndicess0] * - xs1]; + sum += values[iMatrix * valuess0 + (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * + X[iMatrix * xs0 + colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * colIndicess0] * xs1]; } sum *= alpha[iMatrix * alphas0]; @@ -62,8 +51,7 @@ struct SerialSpmvInternal { if (dobeta == 0) { Y[iMatrix * ys0 + iRow * ys1] = sum; } else { - Y[iMatrix * ys0 + iRow * ys1] = - beta[iMatrix * betas0] * Y[iMatrix * ys0 + iRow * ys1] + sum; + Y[iMatrix * ys0 + iRow * ys1] = beta[iMatrix * betas0] * Y[iMatrix * ys0 + iRow * ys1] + sum; } } } @@ -71,33 +59,26 @@ struct SerialSpmvInternal { return 0; } - template - KOKKOS_INLINE_FUNCTION static int invoke( - const OrdinalType numMatrices, const OrdinalType numRows, - const ScalarType alpha, const ValueType* KOKKOS_RESTRICT values, - const OrdinalType valuess0, const OrdinalType valuess1, - const OrdinalType* KOKKOS_RESTRICT row_ptr, const OrdinalType row_ptrs0, - const OrdinalType* KOKKOS_RESTRICT colIndices, - const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT X, - const OrdinalType xs0, const OrdinalType xs1, const ScalarType beta, - /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, - const OrdinalType ys1) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const 
OrdinalType numMatrices, const OrdinalType numRows, + const ScalarType alpha, const ValueType* KOKKOS_RESTRICT values, + const OrdinalType valuess0, const OrdinalType valuess1, + const OrdinalType* KOKKOS_RESTRICT row_ptr, const OrdinalType row_ptrs0, + const OrdinalType* KOKKOS_RESTRICT colIndices, + const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT X, + const OrdinalType xs0, const OrdinalType xs1, const ScalarType beta, + /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, + const OrdinalType ys1) { for (OrdinalType iMatrix = 0; iMatrix < numMatrices; ++iMatrix) { for (OrdinalType iRow = 0; iRow < numRows; ++iRow) { - const OrdinalType rowLength = - row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; - ValueType sum = 0; + const OrdinalType rowLength = row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; + ValueType sum = 0; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { - sum += values[iMatrix * valuess0 + - (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * - X[iMatrix * xs0 + - colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * - colIndicess0] * - xs1]; + sum += values[iMatrix * valuess0 + (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * + X[iMatrix * xs0 + colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * colIndicess0] * xs1]; } sum *= alpha; @@ -105,8 +86,7 @@ struct SerialSpmvInternal { if (dobeta == 0) { Y[iMatrix * ys0 + iRow * ys1] = sum; } else { - Y[iMatrix * ys0 + iRow * ys1] = - beta * Y[iMatrix * ys0 + iRow * ys1] + sum; + Y[iMatrix * ys0 + iRow * ys1] = beta * Y[iMatrix * ys0 + iRow * ys1] + sum; } } } @@ -117,47 +97,32 @@ struct SerialSpmvInternal { template <> struct SerialSpmv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const alphaViewType& alpha, const ValuesViewType& values, - const IntView& row_ptr, const IntView& colIndices, const xViewType& X, - const betaViewType& beta, const yViewType& Y) { + template + 
KOKKOS_INLINE_FUNCTION static int invoke(const alphaViewType& alpha, const ValuesViewType& values, + const IntView& row_ptr, const IntView& colIndices, const xViewType& X, + const betaViewType& beta, const yViewType& Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: ValuesViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: IntView is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: xViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: yViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: alphaViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: betaViewType is not a Kokkos::View."); - - static_assert(ValuesViewType::rank == 2, - "KokkosBatched::spmv: ValuesViewType must have rank 2."); - static_assert(IntView::rank == 1, - "KokkosBatched::spmv: IntView must have rank 2."); - static_assert(xViewType::rank == 2, - "KokkosBatched::spmv: xViewType must have rank 2."); - static_assert(yViewType::rank == 2, - "KokkosBatched::spmv: yViewType must have rank 2."); - static_assert(alphaViewType::rank == 1, - "KokkosBatched::spmv: alphaViewType must have rank 1."); - static_assert(betaViewType::rank == 1, - "KokkosBatched::spmv: betaViewType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: ValuesViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: IntView is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: xViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: yViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: alphaViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, 
"KokkosBatched::spmv: betaViewType is not a Kokkos::View."); + + static_assert(ValuesViewType::rank == 2, "KokkosBatched::spmv: ValuesViewType must have rank 2."); + static_assert(IntView::rank == 1, "KokkosBatched::spmv: IntView must have rank 2."); + static_assert(xViewType::rank == 2, "KokkosBatched::spmv: xViewType must have rank 2."); + static_assert(yViewType::rank == 2, "KokkosBatched::spmv: yViewType must have rank 2."); + static_assert(alphaViewType::rank == 1, "KokkosBatched::spmv: alphaViewType must have rank 1."); + static_assert(betaViewType::rank == 1, "KokkosBatched::spmv: betaViewType must have rank 1."); // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { Kokkos::printf( "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " "%d, Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); return 1; } if (X.extent(0) != alpha.extent(0)) { @@ -178,8 +143,7 @@ struct SerialSpmv { Kokkos::printf( "KokkosBatched::spmv: First dimension of X and the first dimension " "of values do not match: X: %d x %d, values: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), - (int)values.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), (int)values.extent(1)); return 1; } if (colIndices.extent(0) != values.extent(1)) { @@ -187,8 +151,7 @@ struct SerialSpmv { "KokkosBatched::spmv: Dimension of colIndices and the second " "dimension of values do not match: colIndices: %d , values: %d x " "%d\n", - (int)colIndices.extent(0), (int)values.extent(0), - (int)values.extent(1)); + (int)colIndices.extent(0), (int)values.extent(0), (int)values.extent(1)); return 1; } if (row_ptr.extent(0) - 1 != X.extent(1)) { @@ -201,61 +164,43 @@ struct SerialSpmv { #endif return SerialSpmvInternal::template invoke< - typename alphaViewType::non_const_value_type, - 
typename ValuesViewType::non_const_value_type, - typename IntView::non_const_value_type, - typename ValuesViewType::array_layout, dobeta>( - X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), values.data(), - values.stride_0(), values.stride_1(), row_ptr.data(), - row_ptr.stride_0(), colIndices.data(), colIndices.stride_0(), X.data(), - X.stride_0(), X.stride_1(), beta.data(), beta.stride_0(), Y.data(), - Y.stride_0(), Y.stride_1()); + typename alphaViewType::non_const_value_type, typename ValuesViewType::non_const_value_type, + typename IntView::non_const_value_type, typename ValuesViewType::array_layout, dobeta>( + X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), values.data(), values.stride_0(), values.stride_1(), + row_ptr.data(), row_ptr.stride_0(), colIndices.data(), colIndices.stride_0(), X.data(), X.stride_0(), + X.stride_1(), beta.data(), beta.stride_0(), Y.data(), Y.stride_0(), Y.stride_1()); } - template + template KOKKOS_INLINE_FUNCTION static int invoke( - const typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type& alpha, - const ValuesViewType& values, const IntView& row_ptr, - const IntView& colIndices, const xViewType& X, - const typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type& beta, + const typename Kokkos::ArithTraits::mag_type& alpha, + const ValuesViewType& values, const IntView& row_ptr, const IntView& colIndices, const xViewType& X, + const typename Kokkos::ArithTraits::mag_type& beta, const yViewType& Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: ValuesViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: IntView is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: xViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: yViewType is not a Kokkos::View."); + 
static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: ValuesViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: IntView is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: xViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: yViewType is not a Kokkos::View."); - static_assert(ValuesViewType::rank == 2, - "KokkosBatched::spmv: ValuesViewType must have rank 2."); - static_assert(IntView::rank == 1, - "KokkosBatched::spmv: IntView must have rank 2."); - static_assert(xViewType::rank == 2, - "KokkosBatched::spmv: xViewType must have rank 2."); - static_assert(yViewType::rank == 2, - "KokkosBatched::spmv: yViewType must have rank 2."); + static_assert(ValuesViewType::rank == 2, "KokkosBatched::spmv: ValuesViewType must have rank 2."); + static_assert(IntView::rank == 1, "KokkosBatched::spmv: IntView must have rank 2."); + static_assert(xViewType::rank == 2, "KokkosBatched::spmv: xViewType must have rank 2."); + static_assert(yViewType::rank == 2, "KokkosBatched::spmv: yViewType must have rank 2."); // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { Kokkos::printf( "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " "%d, Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); return 1; } if (X.extent(0) != values.extent(0)) { Kokkos::printf( "KokkosBatched::spmv: First dimension of X and the first dimension " "of values do not match: X: %d x %d, values: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), - (int)values.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), (int)values.extent(1)); return 1; } if (colIndices.extent(0) != values.extent(1)) { @@ -263,8 +208,7 @@ struct SerialSpmv { "KokkosBatched::spmv: Dimension of colIndices and the second " "dimension of values do not match: colIndices: %d , values: %d x " "%d\n", - (int)colIndices.extent(0), (int)values.extent(0), - (int)values.extent(1)); + (int)colIndices.extent(0), (int)values.extent(0), (int)values.extent(1)); return 1; } if (row_ptr.extent(0) - 1 != X.extent(1)) { @@ -277,15 +221,12 @@ struct SerialSpmv { #endif return SerialSpmvInternal::template invoke< - typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type, - typename ValuesViewType::non_const_value_type, - typename IntView::non_const_value_type, + typename Kokkos::ArithTraits::mag_type, + typename ValuesViewType::non_const_value_type, typename IntView::non_const_value_type, typename ValuesViewType::array_layout, dobeta>( - X.extent(0), X.extent(1), alpha, values.data(), values.stride_0(), - values.stride_1(), row_ptr.data(), row_ptr.stride_0(), - colIndices.data(), colIndices.stride_0(), X.data(), X.stride_0(), - X.stride_1(), beta, Y.data(), Y.stride_0(), Y.stride_1()); + X.extent(0), X.extent(1), alpha, values.data(), values.stride_0(), values.stride_1(), row_ptr.data(), + row_ptr.stride_0(), colIndices.data(), 
colIndices.stride_0(), X.data(), X.stride_0(), X.stride_1(), beta, + Y.data(), Y.stride_0(), Y.stride_1()); } }; diff --git a/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp index dd510b2d0e..4df4b95e2c 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp @@ -27,50 +27,40 @@ namespace KokkosBatched { /// TeamVector Internal Impl /// ==================== struct TeamVectorSpmvInternal { - template + template KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const OrdinalType numMatrices, - const OrdinalType numRows, const ScalarType* KOKKOS_RESTRICT alpha, - const OrdinalType alphas0, const ValueType* KOKKOS_RESTRICT values, - const OrdinalType valuess0, const OrdinalType valuess1, - const OrdinalType* KOKKOS_RESTRICT row_ptr, const OrdinalType row_ptrs0, - const OrdinalType* KOKKOS_RESTRICT colIndices, - const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT X, - const OrdinalType xs0, const OrdinalType xs1, + const MemberType& member, const OrdinalType numMatrices, const OrdinalType numRows, + const ScalarType* KOKKOS_RESTRICT alpha, const OrdinalType alphas0, const ValueType* KOKKOS_RESTRICT values, + const OrdinalType valuess0, const OrdinalType valuess1, const OrdinalType* KOKKOS_RESTRICT row_ptr, + const OrdinalType row_ptrs0, const OrdinalType* KOKKOS_RESTRICT colIndices, const OrdinalType colIndicess0, + const ValueType* KOKKOS_RESTRICT X, const OrdinalType xs0, const OrdinalType xs1, const ScalarType* KOKKOS_RESTRICT beta, const OrdinalType betas0, - /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, - const OrdinalType ys1); - - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const OrdinalType numMatrices, - const OrdinalType numRows, const ScalarType alpha, - const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, - const 
OrdinalType valuess1, const OrdinalType* KOKKOS_RESTRICT row_ptr, - const OrdinalType row_ptrs0, - const OrdinalType* KOKKOS_RESTRICT colIndices, - const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT X, - const OrdinalType xs0, const OrdinalType xs1, const ScalarType beta, - /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, - const OrdinalType ys1); + /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, const OrdinalType ys1); + + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const OrdinalType numMatrices, + const OrdinalType numRows, const ScalarType alpha, + const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, + const OrdinalType valuess1, const OrdinalType* KOKKOS_RESTRICT row_ptr, + const OrdinalType row_ptrs0, const OrdinalType* KOKKOS_RESTRICT colIndices, + const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT X, + const OrdinalType xs0, const OrdinalType xs1, const ScalarType beta, + /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, + const OrdinalType ys1); }; -template +template KOKKOS_INLINE_FUNCTION int TeamVectorSpmvInternal::invoke( - const MemberType& member, const OrdinalType numMatrices, - const OrdinalType numRows, const ScalarType* KOKKOS_RESTRICT alpha, - const OrdinalType alphas0, const ValueType* KOKKOS_RESTRICT values, - const OrdinalType valuess0, const OrdinalType valuess1, - const OrdinalType* KOKKOS_RESTRICT row_ptr, const OrdinalType row_ptrs0, - const OrdinalType* KOKKOS_RESTRICT colIndices, - const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT X, - const OrdinalType xs0, const OrdinalType xs1, + const MemberType& member, const OrdinalType numMatrices, const OrdinalType numRows, + const ScalarType* KOKKOS_RESTRICT alpha, const OrdinalType alphas0, const ValueType* KOKKOS_RESTRICT values, + const OrdinalType valuess0, const OrdinalType valuess1, const OrdinalType* KOKKOS_RESTRICT row_ptr, + const OrdinalType row_ptrs0, const 
OrdinalType* KOKKOS_RESTRICT colIndices, const OrdinalType colIndicess0, + const ValueType* KOKKOS_RESTRICT X, const OrdinalType xs0, const OrdinalType xs1, const ScalarType* KOKKOS_RESTRICT beta, const OrdinalType betas0, - /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, - const OrdinalType ys1) { + /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, const OrdinalType ys1) { #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) if (member.team_size() == 1) { if (N_team > 1 && valuess0 == 1) { @@ -87,8 +77,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorSpmvInternal::invoke( beta_v.loadAligned(beta); for (OrdinalType iRow = 0; iRow < numRows; ++iRow) { - const OrdinalType rowLength = - row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; + const OrdinalType rowLength = row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; VectorType sum_v(0); @@ -96,11 +85,8 @@ KOKKOS_INLINE_FUNCTION int TeamVectorSpmvInternal::invoke( #pragma unroll #endif for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { - values_v.loadAligned( - &values[(row_ptr[iRow * row_ptrs0] + iEntry) * valuess1]); - x_v.loadAligned(&X[colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * - colIndicess0] * - xs1]); + values_v.loadAligned(&values[(row_ptr[iRow * row_ptrs0] + iEntry) * valuess1]); + x_v.loadAligned(&X[colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * colIndicess0] * xs1]); sum_v += values_v * x_v; } sum_v *= alpha_v; @@ -113,20 +99,14 @@ KOKKOS_INLINE_FUNCTION int TeamVectorSpmvInternal::invoke( } else { for (unsigned iMatrix = 0; iMatrix < unsigned(numMatrices); ++iMatrix) { for (OrdinalType iRow = 0; iRow < numRows; ++iRow) { - const OrdinalType rowLength = - row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; + const OrdinalType rowLength = row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; ValueType sum = 0; Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(member, rowLength), [&](const OrdinalType& iEntry, ValueType& lsum) { - 
lsum += - values[iMatrix * valuess0 + - (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * - X[iMatrix * xs0 + - colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * - colIndicess0] * - xs1]; + lsum += values[iMatrix * valuess0 + (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * + X[iMatrix * xs0 + colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * colIndicess0] * xs1]; }, sum); @@ -135,63 +115,50 @@ KOKKOS_INLINE_FUNCTION int TeamVectorSpmvInternal::invoke( if (dobeta == 0) { Y[iMatrix * ys0 + iRow * ys1] = sum; } else { - Y[iMatrix * ys0 + iRow * ys1] = - beta[iMatrix * betas0] * Y[iMatrix * ys0 + iRow * ys1] + sum; + Y[iMatrix * ys0 + iRow * ys1] = beta[iMatrix * betas0] * Y[iMatrix * ys0 + iRow * ys1] + sum; } } } } } else { #endif - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, numMatrices * numRows), - [&](const OrdinalType& iTemp) { - OrdinalType iRow, iMatrix; - getIndices(iTemp, numRows, numMatrices, iRow, - iMatrix); - - const OrdinalType rowLength = - row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; - ValueType sum = 0; + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices * numRows), [&](const OrdinalType& iTemp) { + OrdinalType iRow, iMatrix; + getIndices(iTemp, numRows, numMatrices, iRow, iMatrix); + + const OrdinalType rowLength = row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; + ValueType sum = 0; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif - for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { - sum += values[iMatrix * valuess0 + - (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * - X[iMatrix * xs0 + - colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * - colIndicess0] * - xs1]; - } + for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { + sum += values[iMatrix * valuess0 + (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * + X[iMatrix * xs0 + colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * colIndicess0] * xs1]; + } - sum *= alpha[iMatrix * alphas0]; + sum *= 
alpha[iMatrix * alphas0]; - if (dobeta == 0) { - Y[iMatrix * ys0 + iRow * ys1] = sum; - } else { - Y[iMatrix * ys0 + iRow * ys1] = - beta[iMatrix * betas0] * Y[iMatrix * ys0 + iRow * ys1] + sum; - } - }); + if (dobeta == 0) { + Y[iMatrix * ys0 + iRow * ys1] = sum; + } else { + Y[iMatrix * ys0 + iRow * ys1] = beta[iMatrix * betas0] * Y[iMatrix * ys0 + iRow * ys1] + sum; + } + }); #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) } #endif return 0; } -template +template KOKKOS_INLINE_FUNCTION int TeamVectorSpmvInternal::invoke( - const MemberType& member, const OrdinalType numMatrices, - const OrdinalType numRows, const ScalarType alpha, - const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, - const OrdinalType valuess1, const OrdinalType* KOKKOS_RESTRICT row_ptr, - const OrdinalType row_ptrs0, const OrdinalType* KOKKOS_RESTRICT colIndices, - const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT X, + const MemberType& member, const OrdinalType numMatrices, const OrdinalType numRows, const ScalarType alpha, + const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, const OrdinalType valuess1, + const OrdinalType* KOKKOS_RESTRICT row_ptr, const OrdinalType row_ptrs0, + const OrdinalType* KOKKOS_RESTRICT colIndices, const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT X, const OrdinalType xs0, const OrdinalType xs1, const ScalarType beta, - /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, - const OrdinalType ys1) { + /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, const OrdinalType ys1) { #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) if (member.team_size() == 1) { if (N_team > 1 && valuess0 == 1 && valuess1 % N_team == 0) { @@ -205,8 +172,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorSpmvInternal::invoke( VectorType alpha_v(alpha), beta_v(beta), values_v, y_v, x_v; for (OrdinalType iRow = 0; iRow < numRows; ++iRow) { - const OrdinalType rowLength = - row_ptr[(iRow + 1) * 
row_ptrs0] - row_ptr[iRow * row_ptrs0]; + const OrdinalType rowLength = row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; VectorType sum_v(0); @@ -214,11 +180,8 @@ KOKKOS_INLINE_FUNCTION int TeamVectorSpmvInternal::invoke( #pragma unroll #endif for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { - values_v.loadAligned( - &values[(row_ptr[iRow * row_ptrs0] + iEntry) * valuess1]); - x_v.loadAligned(&X[colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * - colIndicess0] * - xs1]); + values_v.loadAligned(&values[(row_ptr[iRow * row_ptrs0] + iEntry) * valuess1]); + x_v.loadAligned(&X[colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * colIndicess0] * xs1]); sum_v += values_v * x_v; } sum_v *= alpha_v; @@ -231,20 +194,14 @@ KOKKOS_INLINE_FUNCTION int TeamVectorSpmvInternal::invoke( } else { for (unsigned iMatrix = 0; iMatrix < unsigned(numMatrices); ++iMatrix) { for (OrdinalType iRow = 0; iRow < numRows; ++iRow) { - const OrdinalType rowLength = - row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; + const OrdinalType rowLength = row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; ValueType sum = 0; Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(member, rowLength), [&](const OrdinalType& iEntry, ValueType& lsum) { - lsum += - values[iMatrix * valuess0 + - (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * - X[iMatrix * xs0 + - colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * - colIndicess0] * - xs1]; + lsum += values[iMatrix * valuess0 + (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * + X[iMatrix * xs0 + colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * colIndicess0] * xs1]; }, sum); @@ -253,45 +210,35 @@ KOKKOS_INLINE_FUNCTION int TeamVectorSpmvInternal::invoke( if (dobeta == 0) { Y[iMatrix * ys0 + iRow * ys1] = sum; } else { - Y[iMatrix * ys0 + iRow * ys1] = - beta * Y[iMatrix * ys0 + iRow * ys1] + sum; + Y[iMatrix * ys0 + iRow * ys1] = beta * Y[iMatrix * ys0 + iRow * ys1] + sum; } } } } } else { #endif - Kokkos::parallel_for( - 
Kokkos::TeamVectorRange(member, 0, numMatrices * numRows), - [&](const OrdinalType& iTemp) { - OrdinalType iRow, iMatrix; - getIndices(iTemp, numRows, numMatrices, iRow, - iMatrix); - - const OrdinalType rowLength = - row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; - ValueType sum = 0; + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices * numRows), [&](const OrdinalType& iTemp) { + OrdinalType iRow, iMatrix; + getIndices(iTemp, numRows, numMatrices, iRow, iMatrix); + + const OrdinalType rowLength = row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; + ValueType sum = 0; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif - for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { - sum += values[iMatrix * valuess0 + - (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * - X[iMatrix * xs0 + - colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * - colIndicess0] * - xs1]; - } + for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { + sum += values[iMatrix * valuess0 + (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * + X[iMatrix * xs0 + colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * colIndicess0] * xs1]; + } - sum *= alpha; + sum *= alpha; - if (dobeta == 0) { - Y[iMatrix * ys0 + iRow * ys1] = sum; - } else { - Y[iMatrix * ys0 + iRow * ys1] = - beta * Y[iMatrix * ys0 + iRow * ys1] + sum; - } - }); + if (dobeta == 0) { + Y[iMatrix * ys0 + iRow * ys1] = sum; + } else { + Y[iMatrix * ys0 + iRow * ys1] = beta * Y[iMatrix * ys0 + iRow * ys1] + sum; + } + }); #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) } #endif @@ -300,52 +247,35 @@ KOKKOS_INLINE_FUNCTION int TeamVectorSpmvInternal::invoke( template struct TeamVectorSpmv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const alphaViewType& alpha, - const ValuesViewType& values, const IntView& row_ptr, - const IntView& colIndices, const xViewType& X, const betaViewType& beta, - const yViewType& Y) { + template + 
KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const alphaViewType& alpha, + const ValuesViewType& values, const IntView& row_ptr, + const IntView& colIndices, const xViewType& X, const betaViewType& beta, + const yViewType& Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: ValuesViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: IntView is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: xViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: yViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: alphaViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: betaViewType is not a Kokkos::View."); - - static_assert(ValuesViewType::rank == 2, - "KokkosBatched::spmv: ValuesViewType must have rank 2."); - static_assert(IntView::rank == 1, - "KokkosBatched::spmv: IntView must have rank 2."); - static_assert(xViewType::rank == 2, - "KokkosBatched::spmv: xViewType must have rank 2."); - static_assert(yViewType::rank == 2, - "KokkosBatched::spmv: yViewType must have rank 2."); - static_assert(alphaViewType::rank == 1, - "KokkosBatched::spmv: alphaViewType must have rank 1."); - static_assert(betaViewType::rank == 1, - "KokkosBatched::spmv: betaViewType must have rank 1."); - static_assert(alphaViewType::rank == 1, - "KokkosBatched::spmv: alphaViewType must have rank 1."); - static_assert(betaViewType::rank == 1, - "KokkosBatched::spmv: betaViewType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: ValuesViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: IntView is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: xViewType is not a Kokkos::View."); + 
static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: yViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: alphaViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: betaViewType is not a Kokkos::View."); + + static_assert(ValuesViewType::rank == 2, "KokkosBatched::spmv: ValuesViewType must have rank 2."); + static_assert(IntView::rank == 1, "KokkosBatched::spmv: IntView must have rank 2."); + static_assert(xViewType::rank == 2, "KokkosBatched::spmv: xViewType must have rank 2."); + static_assert(yViewType::rank == 2, "KokkosBatched::spmv: yViewType must have rank 2."); + static_assert(alphaViewType::rank == 1, "KokkosBatched::spmv: alphaViewType must have rank 1."); + static_assert(betaViewType::rank == 1, "KokkosBatched::spmv: betaViewType must have rank 1."); + static_assert(alphaViewType::rank == 1, "KokkosBatched::spmv: alphaViewType must have rank 1."); + static_assert(betaViewType::rank == 1, "KokkosBatched::spmv: betaViewType must have rank 1."); // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { Kokkos::printf( "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " "%d, Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); return 1; } if (X.extent(0) != alpha.extent(0)) { @@ -366,8 +296,7 @@ struct TeamVectorSpmv { Kokkos::printf( "KokkosBatched::spmv: First dimension of X and the first dimension " "of values do not match: X: %d x %d, values: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), - (int)values.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), (int)values.extent(1)); return 1; } if (colIndices.extent(0) != values.extent(1)) { @@ -375,8 +304,7 @@ struct TeamVectorSpmv { "KokkosBatched::spmv: Dimension of colIndices and the second " "dimension of values do not match: colIndices: %d , values: %d x " "%d\n", - (int)colIndices.extent(0), (int)values.extent(0), - (int)values.extent(1)); + (int)colIndices.extent(0), (int)values.extent(0), (int)values.extent(1)); return 1; } if (row_ptr.extent(0) - 1 != X.extent(1)) { @@ -389,68 +317,49 @@ struct TeamVectorSpmv { #endif if (values.extent(0) == 1) { return KokkosSparse::Experimental::team_vector_spmv( - member, alpha.data()[0], Kokkos::subview(values, 0, Kokkos::ALL), - row_ptr, colIndices, Kokkos::subview(X, 0, Kokkos::ALL), - beta.data()[0], Kokkos::subview(Y, 0, Kokkos::ALL), dobeta); + member, alpha.data()[0], Kokkos::subview(values, 0, Kokkos::ALL), row_ptr, colIndices, + Kokkos::subview(X, 0, Kokkos::ALL), beta.data()[0], Kokkos::subview(Y, 0, Kokkos::ALL), dobeta); } return TeamVectorSpmvInternal::template invoke< - MemberType, typename alphaViewType::non_const_value_type, - typename ValuesViewType::non_const_value_type, - typename IntView::non_const_value_type, - typename ValuesViewType::array_layout, dobeta, N_team>( - member, X.extent(0), X.extent(1), 
alpha.data(), alpha.stride_0(), - values.data(), values.stride_0(), values.stride_1(), row_ptr.data(), - row_ptr.stride_0(), colIndices.data(), colIndices.stride_0(), X.data(), - X.stride_0(), X.stride_1(), beta.data(), beta.stride_0(), Y.data(), - Y.stride_0(), Y.stride_1()); + MemberType, typename alphaViewType::non_const_value_type, typename ValuesViewType::non_const_value_type, + typename IntView::non_const_value_type, typename ValuesViewType::array_layout, dobeta, N_team>( + member, X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), values.data(), values.stride_0(), + values.stride_1(), row_ptr.data(), row_ptr.stride_0(), colIndices.data(), colIndices.stride_0(), X.data(), + X.stride_0(), X.stride_1(), beta.data(), beta.stride_0(), Y.data(), Y.stride_0(), Y.stride_1()); } - template + template KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, - const typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type& alpha, - const ValuesViewType& values, const IntView& row_ptr, - const IntView& colIndices, const xViewType& X, - const typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type& beta, + const typename Kokkos::ArithTraits::mag_type& alpha, + const ValuesViewType& values, const IntView& row_ptr, const IntView& colIndices, const xViewType& X, + const typename Kokkos::ArithTraits::mag_type& beta, const yViewType& Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: ValuesViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: IntView is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: xViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: yViewType is not a Kokkos::View."); - - static_assert(ValuesViewType::rank == 2, - "KokkosBatched::spmv: ValuesViewType must have rank 2."); - 
static_assert(IntView::rank == 1, - "KokkosBatched::spmv: IntView must have rank 2."); - static_assert(xViewType::rank == 2, - "KokkosBatched::spmv: xViewType must have rank 2."); - static_assert(yViewType::rank == 2, - "KokkosBatched::spmv: yViewType must have rank 2."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: ValuesViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: IntView is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: xViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: yViewType is not a Kokkos::View."); + + static_assert(ValuesViewType::rank == 2, "KokkosBatched::spmv: ValuesViewType must have rank 2."); + static_assert(IntView::rank == 1, "KokkosBatched::spmv: IntView must have rank 2."); + static_assert(xViewType::rank == 2, "KokkosBatched::spmv: xViewType must have rank 2."); + static_assert(yViewType::rank == 2, "KokkosBatched::spmv: yViewType must have rank 2."); // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { Kokkos::printf( "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " "%d, Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); return 1; } if (X.extent(0) != values.extent(0)) { Kokkos::printf( "KokkosBatched::spmv: First dimension of X and the first dimension " "of values do not match: X: %d x %d, values: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), - (int)values.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), (int)values.extent(1)); return 1; } if (colIndices.extent(0) != values.extent(1)) { @@ -458,8 +367,7 @@ struct TeamVectorSpmv { "KokkosBatched::spmv: Dimension of colIndices and the second " "dimension of values do not match: colIndices: %d , values: %d x " "%d\n", - (int)colIndices.extent(0), (int)values.extent(0), - (int)values.extent(1)); + (int)colIndices.extent(0), (int)values.extent(0), (int)values.extent(1)); return 1; } if (row_ptr.extent(0) - 1 != X.extent(1)) { @@ -472,22 +380,17 @@ struct TeamVectorSpmv { #endif if (values.extent(0) == 1) { return KokkosSparse::Experimental::team_vector_spmv( - member, alpha, Kokkos::subview(values, 0, Kokkos::ALL), row_ptr, - colIndices, Kokkos::subview(X, 0, Kokkos::ALL), beta, - Kokkos::subview(Y, 0, Kokkos::ALL), dobeta); + member, alpha, Kokkos::subview(values, 0, Kokkos::ALL), row_ptr, colIndices, + Kokkos::subview(X, 0, Kokkos::ALL), beta, Kokkos::subview(Y, 0, Kokkos::ALL), dobeta); } return TeamVectorSpmvInternal::template invoke< - MemberType, - typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type, - typename ValuesViewType::non_const_value_type, - typename IntView::non_const_value_type, + MemberType, typename Kokkos::ArithTraits::mag_type, + typename ValuesViewType::non_const_value_type, typename IntView::non_const_value_type, 
typename ValuesViewType::array_layout, dobeta, N_team>( - member, X.extent(0), X.extent(1), alpha, values.data(), - values.stride_0(), values.stride_1(), row_ptr.data(), - row_ptr.stride_0(), colIndices.data(), colIndices.stride_0(), X.data(), - X.stride_0(), X.stride_1(), beta, Y.data(), Y.stride_0(), Y.stride_1()); + member, X.extent(0), X.extent(1), alpha, values.data(), values.stride_0(), values.stride_1(), row_ptr.data(), + row_ptr.stride_0(), colIndices.data(), colIndices.stride_0(), X.data(), X.stride_0(), X.stride_1(), beta, + Y.data(), Y.stride_0(), Y.stride_1()); } }; diff --git a/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp index 41128744a3..9e32861612 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp @@ -27,176 +27,130 @@ namespace KokkosBatched { /// Team Internal Impl /// ==================== struct TeamSpmvInternal { - template + template KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const OrdinalType numMatrices, - const OrdinalType numRows, const ScalarType* KOKKOS_RESTRICT alpha, - const OrdinalType alphas0, const ValueType* KOKKOS_RESTRICT values, - const OrdinalType valuess0, const OrdinalType valuess1, - const OrdinalType* KOKKOS_RESTRICT row_ptr, const OrdinalType row_ptrs0, - const OrdinalType* KOKKOS_RESTRICT colIndices, - const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT X, - const OrdinalType xs0, const OrdinalType xs1, + const MemberType& member, const OrdinalType numMatrices, const OrdinalType numRows, + const ScalarType* KOKKOS_RESTRICT alpha, const OrdinalType alphas0, const ValueType* KOKKOS_RESTRICT values, + const OrdinalType valuess0, const OrdinalType valuess1, const OrdinalType* KOKKOS_RESTRICT row_ptr, + const OrdinalType row_ptrs0, const OrdinalType* KOKKOS_RESTRICT colIndices, const OrdinalType colIndicess0, + const ValueType* KOKKOS_RESTRICT X, const 
OrdinalType xs0, const OrdinalType xs1, const ScalarType* KOKKOS_RESTRICT beta, const OrdinalType betas0, - /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, - const OrdinalType ys1); + /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, const OrdinalType ys1); - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const OrdinalType numMatrices, - const OrdinalType numRows, const ScalarType alpha, - const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, - const OrdinalType valuess1, const OrdinalType* KOKKOS_RESTRICT row_ptr, - const OrdinalType row_ptrs0, - const OrdinalType* KOKKOS_RESTRICT colIndices, - const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT X, - const OrdinalType xs0, const OrdinalType xs1, const ScalarType beta, - /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, - const OrdinalType ys1); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const OrdinalType numMatrices, + const OrdinalType numRows, const ScalarType alpha, + const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, + const OrdinalType valuess1, const OrdinalType* KOKKOS_RESTRICT row_ptr, + const OrdinalType row_ptrs0, const OrdinalType* KOKKOS_RESTRICT colIndices, + const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT X, + const OrdinalType xs0, const OrdinalType xs1, const ScalarType beta, + /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, + const OrdinalType ys1); }; -template +template KOKKOS_INLINE_FUNCTION int TeamSpmvInternal::invoke( - const MemberType& member, const OrdinalType numMatrices, - const OrdinalType numRows, const ScalarType* KOKKOS_RESTRICT alpha, - const OrdinalType alphas0, const ValueType* KOKKOS_RESTRICT values, - const OrdinalType valuess0, const OrdinalType valuess1, - const OrdinalType* KOKKOS_RESTRICT row_ptr, const OrdinalType row_ptrs0, - const OrdinalType* KOKKOS_RESTRICT colIndices, - const OrdinalType colIndicess0, 
const ValueType* KOKKOS_RESTRICT X, - const OrdinalType xs0, const OrdinalType xs1, + const MemberType& member, const OrdinalType numMatrices, const OrdinalType numRows, + const ScalarType* KOKKOS_RESTRICT alpha, const OrdinalType alphas0, const ValueType* KOKKOS_RESTRICT values, + const OrdinalType valuess0, const OrdinalType valuess1, const OrdinalType* KOKKOS_RESTRICT row_ptr, + const OrdinalType row_ptrs0, const OrdinalType* KOKKOS_RESTRICT colIndices, const OrdinalType colIndicess0, + const ValueType* KOKKOS_RESTRICT X, const OrdinalType xs0, const OrdinalType xs1, const ScalarType* KOKKOS_RESTRICT beta, const OrdinalType betas0, - /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, - const OrdinalType ys1) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, numMatrices * numRows), - [&](const OrdinalType& iTemp) { - OrdinalType iRow, iMatrix; - getIndices(iTemp, numRows, numMatrices, iRow, - iMatrix); + /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, const OrdinalType ys1) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices * numRows), [&](const OrdinalType& iTemp) { + OrdinalType iRow, iMatrix; + getIndices(iTemp, numRows, numMatrices, iRow, iMatrix); - const OrdinalType rowLength = - row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; - ValueType sum = 0; + const OrdinalType rowLength = row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; + ValueType sum = 0; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif - for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { - sum += values[iMatrix * valuess0 + - (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * - X[iMatrix * xs0 + - colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * - colIndicess0] * - xs1]; - } + for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { + sum += values[iMatrix * valuess0 + (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * + X[iMatrix * xs0 + colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * 
colIndicess0] * xs1]; + } - sum *= alpha[iMatrix * alphas0]; + sum *= alpha[iMatrix * alphas0]; - if (dobeta == 0) { - Y[iMatrix * ys0 + iRow * ys1] = sum; - } else { - Y[iMatrix * ys0 + iRow * ys1] = - beta[iMatrix * betas0] * Y[iMatrix * ys0 + iRow * ys1] + sum; - } - }); + if (dobeta == 0) { + Y[iMatrix * ys0 + iRow * ys1] = sum; + } else { + Y[iMatrix * ys0 + iRow * ys1] = beta[iMatrix * betas0] * Y[iMatrix * ys0 + iRow * ys1] + sum; + } + }); return 0; } -template +template KOKKOS_INLINE_FUNCTION int TeamSpmvInternal::invoke( - const MemberType& member, const OrdinalType numMatrices, - const OrdinalType numRows, const ScalarType alpha, - const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, - const OrdinalType valuess1, const OrdinalType* KOKKOS_RESTRICT row_ptr, - const OrdinalType row_ptrs0, const OrdinalType* KOKKOS_RESTRICT colIndices, - const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT X, + const MemberType& member, const OrdinalType numMatrices, const OrdinalType numRows, const ScalarType alpha, + const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, const OrdinalType valuess1, + const OrdinalType* KOKKOS_RESTRICT row_ptr, const OrdinalType row_ptrs0, + const OrdinalType* KOKKOS_RESTRICT colIndices, const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT X, const OrdinalType xs0, const OrdinalType xs1, const ScalarType beta, - /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, - const OrdinalType ys1) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, numMatrices * numRows), - [&](const OrdinalType& iTemp) { - OrdinalType iRow, iMatrix; - getIndices(iTemp, numRows, numMatrices, iRow, - iMatrix); + /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, const OrdinalType ys1) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices * numRows), [&](const OrdinalType& iTemp) { + OrdinalType iRow, iMatrix; + getIndices(iTemp, numRows, numMatrices, iRow, iMatrix); - 
const OrdinalType rowLength = - row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; - ValueType sum = 0; + const OrdinalType rowLength = row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; + ValueType sum = 0; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif - for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { - sum += values[iMatrix * valuess0 + - (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * - X[iMatrix * xs0 + - colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * - colIndicess0] * - xs1]; - } + for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { + sum += values[iMatrix * valuess0 + (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * + X[iMatrix * xs0 + colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * colIndicess0] * xs1]; + } - sum *= alpha; + sum *= alpha; - if (dobeta == 0) { - Y[iMatrix * ys0 + iRow * ys1] = sum; - } else { - Y[iMatrix * ys0 + iRow * ys1] = - beta * Y[iMatrix * ys0 + iRow * ys1] + sum; - } - }); + if (dobeta == 0) { + Y[iMatrix * ys0 + iRow * ys1] = sum; + } else { + Y[iMatrix * ys0 + iRow * ys1] = beta * Y[iMatrix * ys0 + iRow * ys1] + sum; + } + }); return 0; } template struct TeamSpmv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const alphaViewType& alpha, - const ValuesViewType& values, const IntView& row_ptr, - const IntView& colIndices, const xViewType& X, const betaViewType& beta, - const yViewType& Y) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const alphaViewType& alpha, + const ValuesViewType& values, const IntView& row_ptr, + const IntView& colIndices, const xViewType& X, const betaViewType& beta, + const yViewType& Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: ValuesViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: IntView is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - 
"KokkosBatched::spmv: xViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: yViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: alphaViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: betaViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: ValuesViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: IntView is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: xViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: yViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: alphaViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: betaViewType is not a Kokkos::View."); - static_assert(ValuesViewType::rank == 2, - "KokkosBatched::spmv: ValuesViewType must have rank 2."); - static_assert(IntView::rank == 1, - "KokkosBatched::spmv: IntView must have rank 2."); - static_assert(xViewType::rank == 2, - "KokkosBatched::spmv: xViewType must have rank 2."); - static_assert(yViewType::rank == 2, - "KokkosBatched::spmv: yViewType must have rank 2."); - static_assert(alphaViewType::rank == 1, - "KokkosBatched::spmv: alphaViewType must have rank 1."); - static_assert(betaViewType::rank == 1, - "KokkosBatched::spmv: betaViewType must have rank 1."); + static_assert(ValuesViewType::rank == 2, "KokkosBatched::spmv: ValuesViewType must have rank 2."); + static_assert(IntView::rank == 1, "KokkosBatched::spmv: IntView must have rank 2."); + static_assert(xViewType::rank == 2, "KokkosBatched::spmv: xViewType must have rank 2."); + static_assert(yViewType::rank == 2, "KokkosBatched::spmv: yViewType must have rank 2."); + static_assert(alphaViewType::rank == 1, "KokkosBatched::spmv: alphaViewType must have 
rank 1."); + static_assert(betaViewType::rank == 1, "KokkosBatched::spmv: betaViewType must have rank 1."); // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { Kokkos::printf( "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " "%d, Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); return 1; } if (X.extent(0) != alpha.extent(0)) { @@ -217,8 +171,7 @@ struct TeamSpmv { Kokkos::printf( "KokkosBatched::spmv: First dimension of X and the first dimension " "of values do not match: X: %d x %d, values: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), - (int)values.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), (int)values.extent(1)); return 1; } if (colIndices.extent(0) != values.extent(1)) { @@ -226,8 +179,7 @@ struct TeamSpmv { "KokkosBatched::spmv: Dimension of colIndices and the second " "dimension of values do not match: colIndices: %d , values: %d x " "%d\n", - (int)colIndices.extent(0), (int)values.extent(0), - (int)values.extent(1)); + (int)colIndices.extent(0), (int)values.extent(0), (int)values.extent(1)); return 1; } if (row_ptr.extent(0) - 1 != X.extent(1)) { @@ -240,68 +192,49 @@ struct TeamSpmv { #endif if (values.extent(0) == 1) { return KokkosSparse::Experimental::team_spmv( - member, alpha.data()[0], Kokkos::subview(values, 0, Kokkos::ALL), - row_ptr, colIndices, Kokkos::subview(X, 0, Kokkos::ALL), - beta.data()[0], Kokkos::subview(Y, 0, Kokkos::ALL), dobeta); + member, alpha.data()[0], Kokkos::subview(values, 0, Kokkos::ALL), row_ptr, colIndices, + Kokkos::subview(X, 0, Kokkos::ALL), beta.data()[0], Kokkos::subview(Y, 0, Kokkos::ALL), dobeta); } return TeamSpmvInternal::template invoke< - MemberType, typename alphaViewType::non_const_value_type, - typename ValuesViewType::non_const_value_type, - typename 
IntView::non_const_value_type, - typename ValuesViewType::array_layout, dobeta>( - member, X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), - values.data(), values.stride_0(), values.stride_1(), row_ptr.data(), - row_ptr.stride_0(), colIndices.data(), colIndices.stride_0(), X.data(), - X.stride_0(), X.stride_1(), beta.data(), beta.stride_0(), Y.data(), - Y.stride_0(), Y.stride_1()); + MemberType, typename alphaViewType::non_const_value_type, typename ValuesViewType::non_const_value_type, + typename IntView::non_const_value_type, typename ValuesViewType::array_layout, dobeta>( + member, X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), values.data(), values.stride_0(), + values.stride_1(), row_ptr.data(), row_ptr.stride_0(), colIndices.data(), colIndices.stride_0(), X.data(), + X.stride_0(), X.stride_1(), beta.data(), beta.stride_0(), Y.data(), Y.stride_0(), Y.stride_1()); } - template + template KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, - const typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type& alpha, - const ValuesViewType& values, const IntView& row_ptr, - const IntView& colIndices, const xViewType& X, - const typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type& beta, + const typename Kokkos::ArithTraits::mag_type& alpha, + const ValuesViewType& values, const IntView& row_ptr, const IntView& colIndices, const xViewType& X, + const typename Kokkos::ArithTraits::mag_type& beta, const yViewType& Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: ValuesViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: IntView is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: xViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: yViewType is not a Kokkos::View."); + 
static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: ValuesViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: IntView is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: xViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: yViewType is not a Kokkos::View."); - static_assert(ValuesViewType::rank == 2, - "KokkosBatched::spmv: ValuesViewType must have rank 2."); - static_assert(IntView::rank == 1, - "KokkosBatched::spmv: IntView must have rank 2."); - static_assert(xViewType::rank == 2, - "KokkosBatched::spmv: xViewType must have rank 2."); - static_assert(yViewType::rank == 2, - "KokkosBatched::spmv: yViewType must have rank 2."); + static_assert(ValuesViewType::rank == 2, "KokkosBatched::spmv: ValuesViewType must have rank 2."); + static_assert(IntView::rank == 1, "KokkosBatched::spmv: IntView must have rank 2."); + static_assert(xViewType::rank == 2, "KokkosBatched::spmv: xViewType must have rank 2."); + static_assert(yViewType::rank == 2, "KokkosBatched::spmv: yViewType must have rank 2."); // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { Kokkos::printf( "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " "%d, Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); return 1; } if (X.extent(0) != values.extent(0)) { Kokkos::printf( "KokkosBatched::spmv: First dimension of X and the first dimension " "of values do not match: X: %d x %d, values: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), - (int)values.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), (int)values.extent(1)); return 1; } if (colIndices.extent(0) != values.extent(1)) { @@ -309,8 +242,7 @@ struct TeamSpmv { "KokkosBatched::spmv: Dimension of colIndices and the second " "dimension of values do not match: colIndices: %d , values: %d x " "%d\n", - (int)colIndices.extent(0), (int)values.extent(0), - (int)values.extent(1)); + (int)colIndices.extent(0), (int)values.extent(0), (int)values.extent(1)); return 1; } if (row_ptr.extent(0) - 1 != X.extent(1)) { @@ -322,23 +254,18 @@ struct TeamSpmv { } #endif if (values.extent(0) == 1) { - return KokkosSparse::Experimental::team_spmv( - member, alpha, Kokkos::subview(values, 0, Kokkos::ALL), row_ptr, - colIndices, Kokkos::subview(X, 0, Kokkos::ALL), beta, - Kokkos::subview(Y, 0, Kokkos::ALL), dobeta); + return KokkosSparse::Experimental::team_spmv(member, alpha, Kokkos::subview(values, 0, Kokkos::ALL), + row_ptr, colIndices, Kokkos::subview(X, 0, Kokkos::ALL), + beta, Kokkos::subview(Y, 0, Kokkos::ALL), dobeta); } return TeamSpmvInternal::template invoke< - MemberType, - typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type, - typename ValuesViewType::non_const_value_type, - typename IntView::non_const_value_type, + MemberType, typename Kokkos::ArithTraits::mag_type, + typename ValuesViewType::non_const_value_type, typename 
IntView::non_const_value_type, typename ValuesViewType::array_layout, dobeta>( - member, X.extent(0), X.extent(1), alpha, values.data(), - values.stride_0(), values.stride_1(), row_ptr.data(), - row_ptr.stride_0(), colIndices.data(), colIndices.stride_0(), X.data(), - X.stride_0(), X.stride_1(), beta, Y.data(), Y.stride_0(), Y.stride_1()); + member, X.extent(0), X.extent(1), alpha, values.data(), values.stride_0(), values.stride_1(), row_ptr.data(), + row_ptr.stride_0(), colIndices.data(), colIndices.stride_0(), X.data(), X.stride_0(), X.stride_1(), beta, + Y.data(), Y.stride_0(), Y.stride_1()); } }; diff --git a/batched/sparse/src/KokkosBatched_CG.hpp b/batched/sparse/src/KokkosBatched_CG.hpp index baa6dca42e..cabf2eae98 100644 --- a/batched/sparse/src/KokkosBatched_CG.hpp +++ b/batched/sparse/src/KokkosBatched_CG.hpp @@ -42,22 +42,14 @@ namespace KokkosBatched { template struct CG { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const OperatorType &A, - const VectorViewType &B, - const VectorViewType &X, - const KrylovHandleType &handle) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const OperatorType &A, const VectorViewType &B, + const VectorViewType &X, const KrylovHandleType &handle) { int status = 0; if (std::is_same::value) { - status = - TeamCG::template invoke( - member, A, B, X, handle); + status = TeamCG::template invoke(member, A, B, X, handle); } else if (std::is_same::value) { - status = TeamVectorCG::template invoke( - member, A, B, X, handle); + status = TeamVectorCG::template invoke(member, A, B, X, handle); } return status; } diff --git a/batched/sparse/src/KokkosBatched_CrsMatrix.hpp b/batched/sparse/src/KokkosBatched_CrsMatrix.hpp index 92acc91a9e..0d880cd880 100644 --- a/batched/sparse/src/KokkosBatched_CrsMatrix.hpp +++ b/batched/sparse/src/KokkosBatched_CrsMatrix.hpp @@ -42,8 +42,7 @@ class CrsMatrix { public: KOKKOS_INLINE_FUNCTION - CrsMatrix(const ValuesViewType 
&_values, const IntViewType &_row_ptr, - const IntViewType &_colIndices) + CrsMatrix(const ValuesViewType &_values, const IntViewType &_row_ptr, const IntViewType &_colIndices) : values(_values), row_ptr(_row_ptr), colIndices(_colIndices) { n_operators = _values.extent(0); n_rows = _row_ptr.extent(0) - 1; @@ -77,45 +76,40 @@ class CrsMatrix { /// \param beta [in]: input coefficient for Y (default value 0.) /// \param Y [in/out]: Output vector Y, a rank 2 view - template - KOKKOS_INLINE_FUNCTION void apply( - const MemberType &member, const XViewType &X, const YViewType &Y, - MagnitudeType alpha = Kokkos::ArithTraits::one(), - MagnitudeType beta = Kokkos::ArithTraits::zero()) const { + template + KOKKOS_INLINE_FUNCTION void apply(const MemberType &member, const XViewType &X, const YViewType &Y, + MagnitudeType alpha = Kokkos::ArithTraits::one(), + MagnitudeType beta = Kokkos::ArithTraits::zero()) const { if (beta == Kokkos::ArithTraits::zero()) { if (member.team_size() == 1 && n_operators == 8) - KokkosBatched::TeamVectorSpmv::template invoke< - ValuesViewType, IntViewType, XViewType, YViewType, 0>( + KokkosBatched::TeamVectorSpmv::template invoke( member, alpha, values, row_ptr, colIndices, X, beta, Y); else - KokkosBatched::TeamVectorSpmv::template invoke< - ValuesViewType, IntViewType, XViewType, YViewType, 0>( + KokkosBatched::TeamVectorSpmv::template invoke( member, alpha, values, row_ptr, colIndices, X, beta, Y); } else { if (member.team_size() == 1 && n_operators == 8) - KokkosBatched::TeamVectorSpmv::template invoke< - ValuesViewType, IntViewType, XViewType, YViewType, 1>( + KokkosBatched::TeamVectorSpmv::template invoke( member, alpha, values, row_ptr, colIndices, X, beta, Y); else - KokkosBatched::TeamVectorSpmv::template invoke< - ValuesViewType, IntViewType, XViewType, YViewType, 1>( + KokkosBatched::TeamVectorSpmv::template invoke( member, alpha, values, row_ptr, colIndices, X, beta, Y); } } template - KOKKOS_INLINE_FUNCTION void apply( - const 
XViewType &X, const YViewType &Y, - MagnitudeType alpha = Kokkos::ArithTraits::one(), - MagnitudeType beta = Kokkos::ArithTraits::zero()) const { + KOKKOS_INLINE_FUNCTION void apply(const XViewType &X, const YViewType &Y, + MagnitudeType alpha = Kokkos::ArithTraits::one(), + MagnitudeType beta = Kokkos::ArithTraits::zero()) const { if (beta == Kokkos::ArithTraits::zero()) - KokkosBatched::SerialSpmv::template invoke< - ValuesViewType, IntViewType, XViewType, YViewType, 0>( + KokkosBatched::SerialSpmv::template invoke( alpha, values, row_ptr, colIndices, X, beta, Y); else - KokkosBatched::SerialSpmv::template invoke< - ValuesViewType, IntViewType, XViewType, YViewType, 1>( + KokkosBatched::SerialSpmv::template invoke( alpha, values, row_ptr, colIndices, X, beta, Y); } }; diff --git a/batched/sparse/src/KokkosBatched_GMRES.hpp b/batched/sparse/src/KokkosBatched_GMRES.hpp index 0d27bcd6fb..a3f4eda8d3 100644 --- a/batched/sparse/src/KokkosBatched_GMRES.hpp +++ b/batched/sparse/src/KokkosBatched_GMRES.hpp @@ -44,25 +44,16 @@ namespace KokkosBatched { template struct GMRES { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const OperatorType &A, - const VectorViewType &B, - const VectorViewType &X, - const KrylovHandleType &handle) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const OperatorType &A, const VectorViewType &B, + const VectorViewType &X, const KrylovHandleType &handle) { int status = 0; if (std::is_same::value) { - status = SerialGMRES::template invoke( - A, B, X, handle); + status = SerialGMRES::template invoke(A, B, X, handle); } else if (std::is_same::value) { - status = - TeamGMRES::template invoke( - member, A, B, X, handle); + status = TeamGMRES::template invoke(member, A, B, X, handle); } else if (std::is_same::value) { - status = TeamVectorGMRES::template invoke( - member, A, B, X, handle); + status = TeamVectorGMRES::template invoke(member, A, B, X, handle); } return status; } diff 
--git a/batched/sparse/src/KokkosBatched_Identity.hpp b/batched/sparse/src/KokkosBatched_Identity.hpp index 4e8e7c4308..311ec09d5c 100644 --- a/batched/sparse/src/KokkosBatched_Identity.hpp +++ b/batched/sparse/src/KokkosBatched_Identity.hpp @@ -34,26 +34,21 @@ class Identity { KOKKOS_INLINE_FUNCTION ~Identity() {} - template - KOKKOS_INLINE_FUNCTION void apply(const MemberType &member, - const XViewType &X, - const YViewType &Y) const { + template + KOKKOS_INLINE_FUNCTION void apply(const MemberType &member, const XViewType &X, const YViewType &Y) const { if (sameXY == 0) { if (std::is_same::value) { SerialCopy::invoke(X, Y); } else if (std::is_same::value) { TeamCopy::invoke(member, X, Y); - } else if (std::is_same::value) { + } else if (std::is_same::value) { TeamVectorCopy::invoke(member, X, Y); } } } - template - KOKKOS_INLINE_FUNCTION void apply(const XViewType &X, - const YViewType &Y) const { + template + KOKKOS_INLINE_FUNCTION void apply(const XViewType &X, const YViewType &Y) const { if (sameXY == 0) { SerialCopy::invoke(X, Y); } diff --git a/batched/sparse/src/KokkosBatched_JacobiPrec.hpp b/batched/sparse/src/KokkosBatched_JacobiPrec.hpp index eacb859636..580f85158b 100644 --- a/batched/sparse/src/KokkosBatched_JacobiPrec.hpp +++ b/batched/sparse/src/KokkosBatched_JacobiPrec.hpp @@ -75,15 +75,12 @@ class JacobiPrec { Kokkos::TeamThreadRange(member, 0, n_operators * n_rows), [&](const int &iTemp, int <ooSmall) { int i, j; - getIndices( - iTemp, n_rows, n_operators, j, i); - if (Kokkos::abs(diag_values_array[i * vs0 + j * vs1]) <= - epsilon) { + getIndices(iTemp, n_rows, n_operators, j, i); + if (Kokkos::abs(diag_values_array[i * vs0 + j * vs1]) <= epsilon) { ltooSmall++; diag_values_array[i * vs0 + j * vs1] = one; } else - diag_values_array[i * vs0 + j * vs1] = - one / diag_values_array[i * vs0 + j * vs1]; + diag_values_array[i * vs0 + j * vs1] = one / diag_values_array[i * vs0 + j * vs1]; }, tooSmall); } else if (std::is_same::value) { @@ -95,15 +92,12 
@@ class JacobiPrec { Kokkos::TeamVectorRange(member, 0, n_operators * n_rows), [&](const int &iTemp, int <ooSmall) { int i, j; - getIndices( - iTemp, n_rows, n_operators, j, i); - if (Kokkos::abs(diag_values_array[i * vs0 + j * vs1]) <= - epsilon) { + getIndices(iTemp, n_rows, n_operators, j, i); + if (Kokkos::abs(diag_values_array[i * vs0 + j * vs1]) <= epsilon) { ltooSmall++; diag_values_array[i * vs0 + j * vs1] = one; } else - diag_values_array[i * vs0 + j * vs1] = - one / diag_values_array[i * vs0 + j * vs1]; + diag_values_array[i * vs0 + j * vs1] = one / diag_values_array[i * vs0 + j * vs1]; }, tooSmall); } @@ -138,31 +132,25 @@ class JacobiPrec { computed_inverse = true; } - template - KOKKOS_INLINE_FUNCTION void apply(const MemberType &member, - const XViewType &X, - const YViewType &Y) const { + template + KOKKOS_INLINE_FUNCTION void apply(const MemberType &member, const XViewType &X, const YViewType &Y) const { if (!computed_inverse) { this->computeInverse(member); member.team_barrier(); // Finish writing to this->diag_values } - KokkosBatched::HadamardProduct::template invoke< - ValuesViewType, XViewType, YViewType>(member, diag_values, X, Y); + KokkosBatched::HadamardProduct::template invoke( + member, diag_values, X, Y); } - template - KOKKOS_INLINE_FUNCTION void apply(const XViewType &X, - const YViewType &Y) const { + template + KOKKOS_INLINE_FUNCTION void apply(const XViewType &X, const YViewType &Y) const { if (!computed_inverse) { this->computeInverse(); } - KokkosBatched::SerialHadamardProduct::template invoke( - diag_values, X, Y); + KokkosBatched::SerialHadamardProduct::template invoke(diag_values, X, Y); } }; diff --git a/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp b/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp index 9992742dd8..c8e8392e11 100644 --- a/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp +++ b/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp @@ -51,8 +51,7 @@ class KrylovHandle { using norm_type = typename 
NormViewType::non_const_value_type; typedef ViewType3D ArnoldiViewType; - typedef Kokkos::View TemporaryViewType; @@ -81,8 +80,7 @@ class KrylovHandle { bool host_synchronised; public: - KrylovHandle(int _batched_size, int _N_team, int _max_iteration = 200, - bool _monitor_residual = false) + KrylovHandle(int _batched_size, int _N_team, int _max_iteration = 200, bool _monitor_residual = false) : max_iteration(_max_iteration), batched_size(_batched_size), N_team(_N_team), @@ -192,9 +190,7 @@ class KrylovHandle { /// \param batched_id [in]: Global batched ID KOKKOS_INLINE_FUNCTION - bool is_converged(int batched_id) const { - return (iteration_numbers(batched_id) != -1); - } + bool is_converged(int batched_id) const { return (iteration_numbers(batched_id) != -1); } /// \brief is_converged /// Test if one particular system has converged (host). @@ -226,9 +222,7 @@ class KrylovHandle { /// \param _max_tolerance [in]: New tolerance KOKKOS_INLINE_FUNCTION - void set_max_tolerance(norm_type _max_tolerance) { - max_tolerance = _max_tolerance; - } + void set_max_tolerance(norm_type _max_tolerance) { max_tolerance = _max_tolerance; } /// \brief get_max_tolerance /// Get the maximal tolerance of the batched Krylov solver @@ -310,9 +304,7 @@ class KrylovHandle { /// \param batched_id [in]: Global batched ID KOKKOS_INLINE_FUNCTION - int get_iteration(int batched_id) const { - return iteration_numbers(batched_id); - } + int get_iteration(int batched_id) const { return iteration_numbers(batched_id); } /// \brief get_iteration_host /// Get the number of iteration after convergence for one system (host) @@ -332,9 +324,7 @@ class KrylovHandle { /// \param _ortho_strategy [in]: used orthogonalization strategy KOKKOS_INLINE_FUNCTION - void set_ortho_strategy(int _ortho_strategy) { - ortho_strategy = _ortho_strategy; - } + void set_ortho_strategy(int _ortho_strategy) { ortho_strategy = _ortho_strategy; } /// \brief get_ortho_strategy /// Get the used orthogonalization strategy. 
@@ -350,9 +340,7 @@ class KrylovHandle { /// \param _scratch_pad_level [in]: used level KOKKOS_INLINE_FUNCTION - void set_scratch_pad_level(int _scratch_pad_level) { - scratch_pad_level = _scratch_pad_level; - } + void set_scratch_pad_level(int _scratch_pad_level) { scratch_pad_level = _scratch_pad_level; } /// \brief get_scratch_pad_level /// Get the scratch pad level used to store temporary variables. @@ -386,9 +374,7 @@ class KrylovHandle { } KOKKOS_INLINE_FUNCTION - void set_memory_strategy(int _memory_strategy) { - memory_strategy = _memory_strategy; - } + void set_memory_strategy(int _memory_strategy) { memory_strategy = _memory_strategy; } KOKKOS_INLINE_FUNCTION int get_memory_strategy() const { return memory_strategy; } @@ -415,10 +401,8 @@ class KrylovHandle { /// \param norm_i [in]: Norm to store KOKKOS_INLINE_FUNCTION - void set_norm(int team_id, int batched_id, int iteration_id, - norm_type norm_i) const { - if (monitor_residual) - residual_norms(team_id * N_team + batched_id, iteration_id) = norm_i; + void set_norm(int team_id, int batched_id, int iteration_id, norm_type norm_i) const { + if (monitor_residual) residual_norms(team_id * N_team + batched_id, iteration_id) = norm_i; } /// \brief set_last_norm @@ -429,8 +413,7 @@ class KrylovHandle { KOKKOS_INLINE_FUNCTION void set_last_norm(int batched_id, norm_type norm_i) const { - if (monitor_residual) - residual_norms(batched_id, max_iteration + 1) = norm_i; + if (monitor_residual) residual_norms(batched_id, max_iteration + 1) = norm_i; } /// \brief set_last_norm @@ -442,8 +425,7 @@ class KrylovHandle { KOKKOS_INLINE_FUNCTION void set_last_norm(int team_id, int batched_id, norm_type norm_i) const { - if (monitor_residual) - residual_norms(team_id * N_team + batched_id, max_iteration + 1) = norm_i; + if (monitor_residual) residual_norms(team_id * N_team + batched_id, max_iteration + 1) = norm_i; } /// \brief set_iteration @@ -453,9 +435,7 @@ class KrylovHandle { /// \param iteration_id [in]: Iteration 
ID KOKKOS_INLINE_FUNCTION - void set_iteration(int batched_id, int iteration_id) const { - iteration_numbers(batched_id) = iteration_id; - } + void set_iteration(int batched_id, int iteration_id) const { iteration_numbers(batched_id) = iteration_id; } /// \brief set_iteration /// Store the number of iteration after convergence for one system diff --git a/batched/sparse/src/KokkosBatched_Krylov_Solvers.hpp b/batched/sparse/src/KokkosBatched_Krylov_Solvers.hpp index 262167ee64..b07ed2b973 100644 --- a/batched/sparse/src/KokkosBatched_Krylov_Solvers.hpp +++ b/batched/sparse/src/KokkosBatched_Krylov_Solvers.hpp @@ -20,110 +20,71 @@ namespace KokkosBatched { struct SerialGMRES { - template - KOKKOS_INLINE_FUNCTION static int invoke(const OperatorType& A, - const VectorViewType& _B, - const VectorViewType& _X, - const PrecOperatorType& P, - const KrylovHandleType& handle, + template + KOKKOS_INLINE_FUNCTION static int invoke(const OperatorType& A, const VectorViewType& _B, const VectorViewType& _X, + const PrecOperatorType& P, const KrylovHandleType& handle, const int GMRES_id); - template - KOKKOS_INLINE_FUNCTION static int invoke(const OperatorType& A, - const VectorViewType& _B, - const VectorViewType& _X, + template + KOKKOS_INLINE_FUNCTION static int invoke(const OperatorType& A, const VectorViewType& _B, const VectorViewType& _X, const KrylovHandleType& handle); }; template struct TeamGMRES { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const PrecOperatorType& P, - const KrylovHandleType& handle, const ArnoldiViewType& _ArnoldiView, - const TMPViewType& _TMPView); - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, - const OperatorType& A, - const VectorViewType& _B, - const VectorViewType& _X, - const PrecOperatorType& P, - const KrylovHandleType& handle); - template - KOKKOS_INLINE_FUNCTION static int invoke(const 
MemberType& member, - const OperatorType& A, - const VectorViewType& _B, - const VectorViewType& _X, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const PrecOperatorType& P, + const KrylovHandleType& handle, const ArnoldiViewType& _ArnoldiView, + const TMPViewType& _TMPView); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const PrecOperatorType& P, const KrylovHandleType& handle); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const KrylovHandleType& handle); }; template struct TeamVectorGMRES { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const PrecOperatorType& P, - const KrylovHandleType& handle, const ArnoldiViewType& _ArnoldiView, - const TMPViewType& _TMPView); - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, - const OperatorType& A, - const VectorViewType& _B, - const VectorViewType& _X, - const PrecOperatorType& P, - const KrylovHandleType& handle); - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, - const OperatorType& A, - const VectorViewType& _B, - const VectorViewType& _X, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const PrecOperatorType& P, + const KrylovHandleType& handle, const ArnoldiViewType& _ArnoldiView, + const TMPViewType& _TMPView); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const PrecOperatorType& P, const KrylovHandleType& handle); + 
template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const KrylovHandleType& handle); }; template struct TeamCG { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const KrylovHandleType& handle, - const TMPViewType& _TMPView, const TMPNormViewType& _TMPNormView); - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, - const OperatorType& A, - const VectorViewType& _B, - const VectorViewType& _X, - const KrylovHandleType& handle); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const KrylovHandleType& handle, + const TMPViewType& _TMPView, const TMPNormViewType& _TMPNormView); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const KrylovHandleType& handle); }; template struct TeamVectorCG { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const KrylovHandleType& handle, - const TMPViewType& _TMPView, const TMPNormViewType& _TMPNormView); - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, - const OperatorType& A, - const VectorViewType& _B, - const VectorViewType& _X, - const KrylovHandleType& handle); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const KrylovHandleType& handle, + const TMPViewType& _TMPView, const TMPNormViewType& _TMPNormView); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const 
VectorViewType& _X, const KrylovHandleType& handle); }; } // namespace KokkosBatched diff --git a/batched/sparse/src/KokkosBatched_Spmv.hpp b/batched/sparse/src/KokkosBatched_Spmv.hpp index da70acb6bb..a93d0775be 100644 --- a/batched/sparse/src/KokkosBatched_Spmv.hpp +++ b/batched/sparse/src/KokkosBatched_Spmv.hpp @@ -64,23 +64,17 @@ namespace KokkosBatched { template struct SerialSpmv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const alphaViewType &alpha, const ValuesViewType &values, - const IntView &row_ptr, const IntView &colIndices, const xViewType &x, - const betaViewType &beta, const yViewType &Y); + template + KOKKOS_INLINE_FUNCTION static int invoke(const alphaViewType &alpha, const ValuesViewType &values, + const IntView &row_ptr, const IntView &colIndices, const xViewType &x, + const betaViewType &beta, const yViewType &Y); - template + template KOKKOS_INLINE_FUNCTION static int invoke( - const typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type &alpha, - const ValuesViewType &values, const IntView &row_ptr, - const IntView &colIndices, const xViewType &X, - const typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type &beta, + const typename Kokkos::ArithTraits::mag_type &alpha, + const ValuesViewType &values, const IntView &row_ptr, const IntView &colIndices, const xViewType &X, + const typename Kokkos::ArithTraits::mag_type &beta, const yViewType &Y); }; @@ -126,25 +120,19 @@ struct SerialSpmv { template struct TeamSpmv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const alphaViewType &alpha, - const ValuesViewType &values, const IntView &row_ptr, - const IntView &colIndices, const xViewType &x, const betaViewType &beta, - const yViewType &y); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const alphaViewType &alpha, + const ValuesViewType &values, const IntView &row_ptr, + const IntView 
&colIndices, const xViewType &x, const betaViewType &beta, + const yViewType &y); - template + template KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, - const typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type &alpha, - const ValuesViewType &values, const IntView &row_ptr, - const IntView &colIndices, const xViewType &x, - const typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type &beta, + const typename Kokkos::ArithTraits::mag_type &alpha, + const ValuesViewType &values, const IntView &row_ptr, const IntView &colIndices, const xViewType &x, + const typename Kokkos::ArithTraits::mag_type &beta, const yViewType &y); }; @@ -189,28 +177,21 @@ struct TeamSpmv { /// (or one with TeamVectorRange) are used inside. /// -template +template struct TeamVectorSpmv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const alphaViewType &alpha, - const ValuesViewType &values, const IntView &row_ptr, - const IntView &colIndices, const xViewType &x, const betaViewType &beta, - const yViewType &y); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const alphaViewType &alpha, + const ValuesViewType &values, const IntView &row_ptr, + const IntView &colIndices, const xViewType &x, const betaViewType &beta, + const yViewType &y); - template + template KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, - const typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type &alpha, - const ValuesViewType &values, const IntView &row_ptr, - const IntView &colIndices, const xViewType &x, - const typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type &beta, + const typename Kokkos::ArithTraits::mag_type &alpha, + const ValuesViewType &values, const IntView &row_ptr, const IntView &colIndices, const xViewType &x, + const typename Kokkos::ArithTraits::mag_type 
&beta, const yViewType &y); }; @@ -245,58 +226,47 @@ struct TeamVectorSpmv { template struct Spmv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const alphaViewType &alpha, - const ValuesViewType &values, const IntView &row_ptr, - const IntView &colIndices, const xViewType &x, const betaViewType &beta, - const yViewType &y) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const alphaViewType &alpha, + const ValuesViewType &values, const IntView &row_ptr, + const IntView &colIndices, const xViewType &x, const betaViewType &beta, + const yViewType &y) { int r_val = 0; if (std::is_same::value) { - r_val = SerialSpmv::template invoke< - ValuesViewType, IntView, xViewType, yViewType, alphaViewType, - betaViewType, dobeta>(alpha, values, row_ptr, colIndices, x, beta, y); + r_val = + SerialSpmv::template invoke(alpha, values, row_ptr, colIndices, x, beta, y); } else if (std::is_same::value) { - r_val = TeamSpmv::template invoke< - ValuesViewType, IntView, xViewType, yViewType, alphaViewType, - betaViewType, dobeta>(member, alpha, values, row_ptr, colIndices, x, - beta, y); + r_val = TeamSpmv::template invoke( + member, alpha, values, row_ptr, colIndices, x, beta, y); } else if (std::is_same::value) { - r_val = TeamVectorSpmv::template invoke< - ValuesViewType, IntView, xViewType, yViewType, alphaViewType, - betaViewType, dobeta>(member, alpha, values, row_ptr, colIndices, x, - beta, y); + r_val = TeamVectorSpmv::template invoke( + member, alpha, values, row_ptr, colIndices, x, beta, y); } return r_val; } - template + template KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, - const typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type &alpha, - const ValuesViewType &values, const IntView &row_ptr, - const IntView &colIndices, const xViewType &x, - const typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type &beta, + 
const typename Kokkos::ArithTraits::mag_type &alpha, + const ValuesViewType &values, const IntView &row_ptr, const IntView &colIndices, const xViewType &x, + const typename Kokkos::ArithTraits::mag_type &beta, const yViewType &y) { int r_val = 0; if (std::is_same::value) { - r_val = - SerialSpmv::template invoke( - alpha, values, row_ptr, colIndices, x, beta, y); + r_val = SerialSpmv::template invoke( + alpha, values, row_ptr, colIndices, x, beta, y); } else if (std::is_same::value) { - r_val = TeamSpmv::template invoke< - ValuesViewType, IntView, xViewType, yViewType, dobeta>( + r_val = TeamSpmv::template invoke( member, alpha, values, row_ptr, colIndices, x, beta, y); } else if (std::is_same::value) { - r_val = TeamVectorSpmv::template invoke< - ValuesViewType, IntView, xViewType, yViewType, dobeta>( - member, alpha, values, row_ptr, colIndices, x, beta, y); + r_val = + TeamVectorSpmv::template invoke( + member, alpha, values, row_ptr, colIndices, x, beta, y); } return r_val; } diff --git a/batched/sparse/unit_test/Test_Batched_SerialGMRES.hpp b/batched/sparse/unit_test/Test_Batched_SerialGMRES.hpp index e28efb9b82..3147caefae 100644 --- a/batched/sparse/unit_test/Test_Batched_SerialGMRES.hpp +++ b/batched/sparse/unit_test/Test_Batched_SerialGMRES.hpp @@ -29,8 +29,8 @@ using namespace KokkosBatched; namespace Test { namespace GMRES { -template +template struct Functor_TestBatchedSerialGMRES { using execution_space = typename DeviceType::execution_space; const ValuesViewType _D; @@ -42,32 +42,19 @@ struct Functor_TestBatchedSerialGMRES { const int _N_team; KrylovHandleType _handle; - Functor_TestBatchedSerialGMRES(const ValuesViewType &D, const IntView &r, - const IntView &c, const VectorViewType &X, - const VectorViewType &B, - const VectorViewType &diag, const int N_team, + Functor_TestBatchedSerialGMRES(const ValuesViewType &D, const IntView &r, const IntView &c, const VectorViewType &X, + const VectorViewType &B, const VectorViewType &diag, const int N_team, 
KrylovHandleType &handle) - : _D(D), - _r(r), - _c(c), - _X(X), - _B(B), - _Diag(diag), - _N_team(N_team), - _handle(handle) {} + : _D(D), _r(r), _c(c), _X(X), _B(B), _Diag(diag), _N_team(N_team), _handle(handle) {} KOKKOS_INLINE_FUNCTION void operator()(const int k) const { const int first_matrix = _handle.first_index(k); const int last_matrix = _handle.last_index(k); - auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto diag = Kokkos::subview( - _Diag, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto diag = Kokkos::subview(_Diag, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); using Operator = KokkosBatched::CrsMatrix; using PrecOperator = KokkosBatched::JacobiPrec; @@ -76,8 +63,7 @@ struct Functor_TestBatchedSerialGMRES { PrecOperator P(diag); P.setComputedInverse(); - KokkosBatched::SerialGMRES::template invoke( - A, b, x, P, _handle, k); + KokkosBatched::SerialGMRES::template invoke(A, b, x, P, _handle, k); } inline void run() { @@ -96,18 +82,16 @@ struct Functor_TestBatchedSerialGMRES { _handle.set_compute_last_residual(false); _handle.set_tolerance(1e-8); - _handle.Arnoldi_view = typename KrylovHandleType::ArnoldiViewType( - "", N, maximum_iteration, n + maximum_iteration + 3); - _handle.tmp_view = typename KrylovHandleType::TemporaryViewType( - "", N, n + maximum_iteration + 3); + _handle.Arnoldi_view = + typename KrylovHandleType::ArnoldiViewType("", N, maximum_iteration, n + maximum_iteration + 3); + _handle.tmp_view = typename 
KrylovHandleType::TemporaryViewType("", N, n + maximum_iteration + 3); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } }; -template +template void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { typedef typename ValuesViewType::value_type value_type; typedef Kokkos::ArithTraits ats; @@ -133,8 +117,7 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { using Scalar3DViewType = Kokkos::View; using IntViewType = Kokkos::View; - using KrylovHandleType = - KrylovHandle; + using KrylovHandleType = KrylovHandle; NormViewType sqr_norm_0("sqr_norm_0", N); NormViewType sqr_norm_j("sqr_norm_j", N); @@ -153,12 +136,10 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { int current_index; for (int i = 0; i < BlkSize; ++i) { - for (current_index = row_ptr_host(i); current_index < row_ptr_host(i + 1); - ++current_index) { + for (current_index = row_ptr_host(i); current_index < row_ptr_host(i + 1); ++current_index) { if (colIndices_host(current_index) == i) break; } - for (int j = 0; j < N; ++j) - diag_values_host(j, i) = values_host(j, current_index); + for (int j = 0; j < N; ++j) diag_values_host(j, i) = values_host(j, current_index); } Kokkos::deep_copy(Diag, diag_values_host); @@ -188,13 +169,10 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { KrylovHandleType handle(N, N_team, n_iterations); KokkosBatched::SerialSpmv::template invoke< - typename ValuesViewType::HostMirror, typename IntView::HostMirror, - typename VectorViewType::HostMirror, typename VectorViewType::HostMirror, - 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); - KokkosBatched::SerialDot::invoke(R_host, R_host, - sqr_norm_0_host); - Functor_TestBatchedSerialGMRES( + typename ValuesViewType::HostMirror, typename IntView::HostMirror, typename VectorViewType::HostMirror, + typename VectorViewType::HostMirror, 1>(-1, D_host, r_host, c_host, X_host, 1, 
R_host); + KokkosBatched::SerialDot::invoke(R_host, R_host, sqr_norm_0_host); + Functor_TestBatchedSerialGMRES( D, r, c, X, B, Diag, N_team, handle) .run(); @@ -205,17 +183,13 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { Kokkos::deep_copy(X_host, X); KokkosBatched::SerialSpmv::template invoke< - typename ValuesViewType::HostMirror, typename IntView::HostMirror, - typename VectorViewType::HostMirror, typename VectorViewType::HostMirror, - 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); - KokkosBatched::SerialDot::invoke(R_host, R_host, - sqr_norm_j_host); + typename ValuesViewType::HostMirror, typename IntView::HostMirror, typename VectorViewType::HostMirror, + typename VectorViewType::HostMirror, 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); + KokkosBatched::SerialDot::invoke(R_host, R_host, sqr_norm_j_host); const MagnitudeType eps = 1.0e5 * ats::epsilon(); - for (int l = 0; l < N; ++l) - EXPECT_NEAR_KK( - std::sqrt(sqr_norm_j_host(l)) / std::sqrt(sqr_norm_0_host(l)), 0, eps); + for (int l = 0; l < N; ++l) EXPECT_NEAR_KK(std::sqrt(sqr_norm_j_host(l)) / std::sqrt(sqr_norm_0_host(l)), 0, eps); } } // namespace GMRES } // namespace Test @@ -226,26 +200,21 @@ int test_batched_serial_GMRES() { { typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - VectorViewType; + typedef Kokkos::View VectorViewType; for (int i = 3; i < 10; ++i) { - Test::GMRES::impl_test_batched_GMRES(1024, i, 2); + Test::GMRES::impl_test_batched_GMRES(1024, i, 2); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; + typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - VectorViewType; + typedef Kokkos::View VectorViewType; for (int i = 3; i < 10; ++i) { - Test::GMRES::impl_test_batched_GMRES(1024, i, 2); + Test::GMRES::impl_test_batched_GMRES(1024, i, 2); } } #endif diff --git a/batched/sparse/unit_test/Test_Batched_SerialGMRES_Real.hpp 
b/batched/sparse/unit_test/Test_Batched_SerialGMRES_Real.hpp index ccfe3c37d5..2756e11a1f 100644 --- a/batched/sparse/unit_test/Test_Batched_SerialGMRES_Real.hpp +++ b/batched/sparse/unit_test/Test_Batched_SerialGMRES_Real.hpp @@ -15,13 +15,9 @@ //@HEADER #if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, batched_scalar_serial_GMRES_float) { - test_batched_serial_GMRES(); -} +TEST_F(TestCategory, batched_scalar_serial_GMRES_float) { test_batched_serial_GMRES(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, batched_scalar_serial_GMRES_double) { - test_batched_serial_GMRES(); -} +TEST_F(TestCategory, batched_scalar_serial_GMRES_double) { test_batched_serial_GMRES(); } #endif diff --git a/batched/sparse/unit_test/Test_Batched_SerialSpmv.hpp b/batched/sparse/unit_test/Test_Batched_SerialSpmv.hpp index 05f2724c5b..2f32b6294a 100644 --- a/batched/sparse/unit_test/Test_Batched_SerialSpmv.hpp +++ b/batched/sparse/unit_test/Test_Batched_SerialSpmv.hpp @@ -19,7 +19,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -//#include "KokkosBatched_Vector.hpp" +// #include "KokkosBatched_Vector.hpp" #include "KokkosBatched_Spmv.hpp" #include "KokkosBatched_Spmv_Serial_Impl.hpp" @@ -37,9 +37,8 @@ struct ParamTag { typedef T trans; }; -template +template struct Functor_TestBatchedSerialSpmv { using execution_space = typename DeviceType::execution_space; const alphaViewType _alpha; @@ -51,10 +50,8 @@ struct Functor_TestBatchedSerialSpmv { const yViewType _Y; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedSerialSpmv(const alphaViewType &alpha, - const ValuesViewType &D, const IntView &r, - const IntView &c, const xViewType &X, - const betaViewType &beta, const yViewType &Y) + Functor_TestBatchedSerialSpmv(const alphaViewType &alpha, const ValuesViewType &D, const IntView &r, const IntView &c, + const xViewType &X, const betaViewType &beta, const yViewType &Y) : _alpha(alpha), _D(D), _r(r), _c(c), _X(X), _beta(beta), _Y(Y) {} 
KOKKOS_INLINE_FUNCTION @@ -66,8 +63,8 @@ struct Functor_TestBatchedSerialSpmv { auto y = Kokkos::subview(_Y, Kokkos::make_pair(k, k + 1), Kokkos::ALL); KokkosBatched::SerialSpmv::template invoke< - ValuesViewType, IntView, xViewType, yViewType, alphaViewType, - betaViewType, dobeta>(alpha, d, _r, _c, x, beta, y); + ValuesViewType, IntView, xViewType, yViewType, alphaViewType, betaViewType, dobeta>(alpha, d, _r, _c, x, beta, + y); } inline void run() { @@ -82,9 +79,8 @@ struct Functor_TestBatchedSerialSpmv { } }; -template +template void impl_test_batched_spmv(const int N, const int BlkSize) { typedef typename ValuesViewType::value_type value_type; typedef Kokkos::ArithTraits ats; @@ -126,21 +122,15 @@ void impl_test_batched_spmv(const int N, const int BlkSize) { else Y0_host(l, i) *= beta_host(l); if (i != 0 && i != (BlkSize - 1)) - Y0_host(l, i) += - alpha_host(l) * - (2 * X0_host(l, i) - X0_host(l, i - 1) - X0_host(l, i + 1)); + Y0_host(l, i) += alpha_host(l) * (2 * X0_host(l, i) - X0_host(l, i - 1) - X0_host(l, i + 1)); else if (i == 0) - Y0_host(l, i) += - alpha_host(l) * (2 * X0_host(l, i) - X0_host(l, i + 1)); + Y0_host(l, i) += alpha_host(l) * (2 * X0_host(l, i) - X0_host(l, i + 1)); else - Y0_host(l, i) += - alpha_host(l) * (2 * X0_host(l, i) - X0_host(l, i - 1)); + Y0_host(l, i) += alpha_host(l) * (2 * X0_host(l, i) - X0_host(l, i - 1)); } - Functor_TestBatchedSerialSpmv(alpha, D, r, c, X1, beta, - Y1) + Functor_TestBatchedSerialSpmv(alpha, D, r, c, X1, beta, Y1) .run(); Kokkos::fence(); @@ -165,49 +155,37 @@ void impl_test_batched_spmv(const int N, const int BlkSize) { } // namespace Spmv } // namespace Test -template +template int test_batched_spmv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - alphaViewType; + typedef Kokkos::View alphaViewType; for (int i = 3; i < 10; ++i) { - Test::Spmv::impl_test_batched_spmv(1024, - i); + Test::Spmv::impl_test_batched_spmv(1024, 
i); } for (int i = 3; i < 10; ++i) { - Test::Spmv::impl_test_batched_spmv(1024, - i); + Test::Spmv::impl_test_batched_spmv(1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; + typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - alphaViewType; + typedef Kokkos::View alphaViewType; for (int i = 3; i < 10; ++i) { - Test::Spmv::impl_test_batched_spmv(1024, - i); + Test::Spmv::impl_test_batched_spmv(1024, i); } for (int i = 3; i < 10; ++i) { - Test::Spmv::impl_test_batched_spmv(1024, - i); + Test::Spmv::impl_test_batched_spmv(1024, i); } } #endif diff --git a/batched/sparse/unit_test/Test_Batched_SparseUtils.hpp b/batched/sparse/unit_test/Test_Batched_SparseUtils.hpp index 98bc25894f..808f95a9a7 100644 --- a/batched/sparse/unit_test/Test_Batched_SparseUtils.hpp +++ b/batched/sparse/unit_test/Test_Batched_SparseUtils.hpp @@ -18,21 +18,12 @@ namespace KokkosBatched { template -void create_tridiagonal_batched_matrices(const int nnz, const int BlkSize, - const int N, const IntView &r, - const IntView &c, - const VectorViewType &D, - const VectorViewType &X, +void create_tridiagonal_batched_matrices(const int nnz, const int BlkSize, const int N, const IntView &r, + const IntView &c, const VectorViewType &D, const VectorViewType &X, const VectorViewType &B) { - Kokkos::Random_XorShift64_Pool< - typename VectorViewType::device_type::execution_space> - random(13718); - Kokkos::fill_random( - X, random, - Kokkos::reduction_identity::prod()); - Kokkos::fill_random( - B, random, - Kokkos::reduction_identity::prod()); + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(X, random, Kokkos::reduction_identity::prod()); + Kokkos::fill_random(B, random, Kokkos::reduction_identity::prod()); auto D_host = Kokkos::create_mirror_view(D); auto r_host = Kokkos::create_mirror_view(r); diff --git a/batched/sparse/unit_test/Test_Batched_TeamCG.hpp 
b/batched/sparse/unit_test/Test_Batched_TeamCG.hpp index b05f3db61f..3c0b194faf 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamCG.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamCG.hpp @@ -28,8 +28,8 @@ using namespace KokkosBatched; namespace Test { namespace TeamCG { -template +template struct Functor_TestBatchedTeamCG { using execution_space = typename DeviceType::execution_space; const ValuesViewType _D; @@ -40,32 +40,21 @@ struct Functor_TestBatchedTeamCG { const int _N_team; KrylovHandleType handle; - Functor_TestBatchedTeamCG(const ValuesViewType &D, const IntView &r, - const IntView &c, const VectorViewType &X, + Functor_TestBatchedTeamCG(const ValuesViewType &D, const IntView &r, const IntView &c, const VectorViewType &X, const VectorViewType &B, const int N_team) - : _D(D), - _r(r), - _c(c), - _X(X), - _B(B), - _N_team(N_team), - handle(KrylovHandleType(_D.extent(0), _N_team)) {} + : _D(D), _r(r), _c(c), _X(X), _B(B), _N_team(N_team), handle(KrylovHandleType(_D.extent(0), _N_team)) {} template KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { const int first_matrix = static_cast(member.league_rank()) * _N_team; const int N = _D.extent(0); const int last_matrix = - (static_cast(member.league_rank() + 1) * _N_team < N - ? static_cast(member.league_rank() + 1) * _N_team - : N); + (static_cast(member.league_rank() + 1) * _N_team < N ? 
static_cast(member.league_rank() + 1) * _N_team + : N); - auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); using Operator = KokkosBatched::CrsMatrix; @@ -80,8 +69,7 @@ struct Functor_TestBatchedTeamCG { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, - Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, Kokkos::AUTO(), Kokkos::AUTO()); size_t bytes_0 = ValuesViewType::shmem_size(_N_team, _X.extent(1)); size_t bytes_1 = ValuesViewType::shmem_size(_N_team, 1); @@ -92,8 +80,7 @@ struct Functor_TestBatchedTeamCG { } }; -template +template void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) { typedef typename ValuesViewType::value_type value_type; typedef Kokkos::ArithTraits ats; @@ -118,8 +105,7 @@ void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) { using Scalar3DViewType = Kokkos::View; using IntViewType = Kokkos::View; - using KrylovHandleType = - KrylovHandle; + using KrylovHandleType = KrylovHandle; NormViewType sqr_norm_0("sqr_norm_0", N); NormViewType sqr_norm_j("sqr_norm_j", N); @@ -147,13 +133,11 @@ void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) { Kokkos::deep_copy(D_host, D); KokkosBatched::SerialSpmv::template invoke< - typename ValuesViewType::HostMirror, typename IntView::HostMirror, - typename 
VectorViewType::HostMirror, typename VectorViewType::HostMirror, - 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); - KokkosBatched::SerialDot::invoke(R_host, R_host, - sqr_norm_0_host); - Functor_TestBatchedTeamCG(D, r, c, X, B, N_team) + typename ValuesViewType::HostMirror, typename IntView::HostMirror, typename VectorViewType::HostMirror, + typename VectorViewType::HostMirror, 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); + KokkosBatched::SerialDot::invoke(R_host, R_host, sqr_norm_0_host); + Functor_TestBatchedTeamCG(D, r, c, X, B, + N_team) .run(); Kokkos::fence(); @@ -163,16 +147,13 @@ void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) { Kokkos::deep_copy(X_host, X); KokkosBatched::SerialSpmv::template invoke< - typename ValuesViewType::HostMirror, typename IntView::HostMirror, - typename VectorViewType::HostMirror, typename VectorViewType::HostMirror, - 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); - KokkosBatched::SerialDot::invoke(R_host, R_host, - sqr_norm_j_host); + typename ValuesViewType::HostMirror, typename IntView::HostMirror, typename VectorViewType::HostMirror, + typename VectorViewType::HostMirror, 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); + KokkosBatched::SerialDot::invoke(R_host, R_host, sqr_norm_j_host); const MagnitudeType eps = 1.0e3 * ats::epsilon(); - for (int l = 0; l < N; ++l) - EXPECT_NEAR_KK(sqr_norm_j_host(l) / sqr_norm_0_host(l), 0, eps); + for (int l = 0; l < N; ++l) EXPECT_NEAR_KK(sqr_norm_j_host(l) / sqr_norm_0_host(l), 0, eps); } } // namespace TeamCG } // namespace Test @@ -183,26 +164,21 @@ int test_batched_team_CG() { { typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - VectorViewType; + typedef Kokkos::View VectorViewType; for (int i = 3; i < 10; ++i) { - Test::TeamCG::impl_test_batched_CG(1024, i, 2); + Test::TeamCG::impl_test_batched_CG(1024, i, 2); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; + 
typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - VectorViewType; + typedef Kokkos::View VectorViewType; for (int i = 3; i < 10; ++i) { - Test::TeamCG::impl_test_batched_CG(1024, i, 2); + Test::TeamCG::impl_test_batched_CG(1024, i, 2); } } #endif diff --git a/batched/sparse/unit_test/Test_Batched_TeamCG_Real.hpp b/batched/sparse/unit_test/Test_Batched_TeamCG_Real.hpp index 1bdb6bc95a..9d51be581b 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamCG_Real.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamCG_Real.hpp @@ -15,13 +15,9 @@ //@HEADER #if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, batched_scalar_team_CG_float) { - test_batched_team_CG(); -} +TEST_F(TestCategory, batched_scalar_team_CG_float) { test_batched_team_CG(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, batched_scalar_team_CG_double) { - test_batched_team_CG(); -} +TEST_F(TestCategory, batched_scalar_team_CG_double) { test_batched_team_CG(); } #endif diff --git a/batched/sparse/unit_test/Test_Batched_TeamGMRES.hpp b/batched/sparse/unit_test/Test_Batched_TeamGMRES.hpp index de1a7f4fc2..e2250bab95 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamGMRES.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamGMRES.hpp @@ -29,8 +29,8 @@ using namespace KokkosBatched; namespace Test { namespace TeamGMRES { -template +template struct Functor_TestBatchedTeamGMRES { using execution_space = typename DeviceType::execution_space; const ValuesViewType _D; @@ -42,37 +42,23 @@ struct Functor_TestBatchedTeamGMRES { const int _N_team; KrylovHandleType _handle; - Functor_TestBatchedTeamGMRES(const ValuesViewType &D, const IntView &r, - const IntView &c, const VectorViewType &X, - const VectorViewType &B, - const VectorViewType &diag, const int N_team, + Functor_TestBatchedTeamGMRES(const ValuesViewType &D, const IntView &r, const IntView &c, const VectorViewType &X, + const VectorViewType &B, const VectorViewType &diag, const int 
N_team, KrylovHandleType &handle) - : _D(D), - _r(r), - _c(c), - _X(X), - _B(B), - _Diag(diag), - _N_team(N_team), - _handle(handle) {} + : _D(D), _r(r), _c(c), _X(X), _B(B), _Diag(diag), _N_team(N_team), _handle(handle) {} template KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { const int first_matrix = static_cast(member.league_rank()) * _N_team; const int N = _D.extent(0); const int last_matrix = - (static_cast(member.league_rank() + 1) * _N_team < N - ? static_cast(member.league_rank() + 1) * _N_team - : N); - - auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto diag = Kokkos::subview( - _Diag, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + (static_cast(member.league_rank() + 1) * _N_team < N ? static_cast(member.league_rank() + 1) * _N_team + : N); + + auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto diag = Kokkos::subview(_Diag, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); using Operator = KokkosBatched::CrsMatrix; using PrecOperator = KokkosBatched::JacobiPrec; @@ -81,9 +67,7 @@ struct Functor_TestBatchedTeamGMRES { PrecOperator P(diag); P.setComputedInverse(); - KokkosBatched::TeamGMRES::template invoke( - member, A, b, x, P, _handle); + KokkosBatched::TeamGMRES::template invoke(member, A, b, x, P, _handle); } inline void run() { @@ -92,8 +76,7 @@ struct Functor_TestBatchedTeamGMRES { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - 
Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, - Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, Kokkos::AUTO(), Kokkos::AUTO()); const int N = _D.extent(0); const int n = _X.extent(1); @@ -103,8 +86,8 @@ struct Functor_TestBatchedTeamGMRES { _handle.set_compute_last_residual(false); _handle.set_tolerance(1e-8); - _handle.Arnoldi_view = typename KrylovHandleType::ArnoldiViewType( - "", N, maximum_iteration, n + maximum_iteration + 3); + _handle.Arnoldi_view = + typename KrylovHandleType::ArnoldiViewType("", N, maximum_iteration, n + maximum_iteration + 3); using ScalarType = typename ValuesViewType::non_const_value_type; using Layout = typename ValuesViewType::array_layout; @@ -122,16 +105,14 @@ struct Functor_TestBatchedTeamGMRES { size_t bytes_int = bytes_row_ptr + bytes_col_idc; size_t bytes_diag = bytes_2D_1; size_t bytes_tmp = 2 * bytes_2D_1 + 2 * bytes_1D + bytes_2D_2; - policy.set_scratch_size( - 0, Kokkos::PerTeam(bytes_tmp + bytes_diag + bytes_int)); + policy.set_scratch_size(0, Kokkos::PerTeam(bytes_tmp + bytes_diag + bytes_int)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } }; -template +template void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { typedef typename ValuesViewType::value_type value_type; typedef Kokkos::ArithTraits ats; @@ -157,8 +138,7 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { using Scalar3DViewType = Kokkos::View; using IntViewType = Kokkos::View; - using KrylovHandleType = - KrylovHandle; + using KrylovHandleType = KrylovHandle; NormViewType sqr_norm_0("sqr_norm_0", N); NormViewType sqr_norm_j("sqr_norm_j", N); @@ -177,12 +157,10 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { int current_index; for (int i = 0; i < BlkSize; ++i) { - for (current_index = row_ptr_host(i); current_index < row_ptr_host(i + 1); - ++current_index) { + for (current_index = 
row_ptr_host(i); current_index < row_ptr_host(i + 1); ++current_index) { if (colIndices_host(current_index) == i) break; } - for (int j = 0; j < N; ++j) - diag_values_host(j, i) = values_host(j, current_index); + for (int j = 0; j < N; ++j) diag_values_host(j, i) = values_host(j, current_index); } Kokkos::deep_copy(Diag, diag_values_host); @@ -212,13 +190,10 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { KrylovHandleType handle(N, N_team, n_iterations); KokkosBatched::SerialSpmv::template invoke< - typename ValuesViewType::HostMirror, typename IntView::HostMirror, - typename VectorViewType::HostMirror, typename VectorViewType::HostMirror, - 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); - KokkosBatched::SerialDot::invoke(R_host, R_host, - sqr_norm_0_host); - Functor_TestBatchedTeamGMRES( + typename ValuesViewType::HostMirror, typename IntView::HostMirror, typename VectorViewType::HostMirror, + typename VectorViewType::HostMirror, 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); + KokkosBatched::SerialDot::invoke(R_host, R_host, sqr_norm_0_host); + Functor_TestBatchedTeamGMRES( D, r, c, X, B, Diag, N_team, handle) .run(); @@ -229,17 +204,13 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { Kokkos::deep_copy(X_host, X); KokkosBatched::SerialSpmv::template invoke< - typename ValuesViewType::HostMirror, typename IntView::HostMirror, - typename VectorViewType::HostMirror, typename VectorViewType::HostMirror, - 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); - KokkosBatched::SerialDot::invoke(R_host, R_host, - sqr_norm_j_host); + typename ValuesViewType::HostMirror, typename IntView::HostMirror, typename VectorViewType::HostMirror, + typename VectorViewType::HostMirror, 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); + KokkosBatched::SerialDot::invoke(R_host, R_host, sqr_norm_j_host); const MagnitudeType eps = 1.0e5 * ats::epsilon(); - for (int l = 0; l < N; ++l) - EXPECT_NEAR_KK( - 
std::sqrt(sqr_norm_j_host(l)) / std::sqrt(sqr_norm_0_host(l)), 0, eps); + for (int l = 0; l < N; ++l) EXPECT_NEAR_KK(std::sqrt(sqr_norm_j_host(l)) / std::sqrt(sqr_norm_0_host(l)), 0, eps); } } // namespace TeamGMRES } // namespace Test @@ -250,26 +221,21 @@ int test_batched_team_GMRES() { { typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - VectorViewType; + typedef Kokkos::View VectorViewType; for (int i = 3; i < 10; ++i) { - Test::TeamGMRES::impl_test_batched_GMRES(1024, i, 2); + Test::TeamGMRES::impl_test_batched_GMRES(1024, i, 2); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; + typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - VectorViewType; + typedef Kokkos::View VectorViewType; for (int i = 3; i < 10; ++i) { - Test::TeamGMRES::impl_test_batched_GMRES(1024, i, 2); + Test::TeamGMRES::impl_test_batched_GMRES(1024, i, 2); } } #endif diff --git a/batched/sparse/unit_test/Test_Batched_TeamGMRES_Real.hpp b/batched/sparse/unit_test/Test_Batched_TeamGMRES_Real.hpp index f8aab13eec..3ca0466630 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamGMRES_Real.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamGMRES_Real.hpp @@ -15,13 +15,9 @@ //@HEADER #if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, batched_scalar_team_GMRES_float) { - test_batched_team_GMRES(); -} +TEST_F(TestCategory, batched_scalar_team_GMRES_float) { test_batched_team_GMRES(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, batched_scalar_team_GMRES_double) { - test_batched_team_GMRES(); -} +TEST_F(TestCategory, batched_scalar_team_GMRES_double) { test_batched_team_GMRES(); } #endif diff --git a/batched/sparse/unit_test/Test_Batched_TeamSpmv.hpp b/batched/sparse/unit_test/Test_Batched_TeamSpmv.hpp index a6c9ac7ea8..228bd01afa 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamSpmv.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamSpmv.hpp @@ 
-19,7 +19,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -//#include "KokkosBatched_Vector.hpp" +// #include "KokkosBatched_Vector.hpp" #include "KokkosBatched_Spmv.hpp" #include "KokkosBatched_Spmv_Team_Impl.hpp" @@ -38,9 +38,8 @@ struct ParamTag { typedef T trans; }; -template +template struct Functor_TestBatchedTeamSpmv { using execution_space = typename DeviceType::execution_space; const alphaViewType _alpha; @@ -53,45 +52,27 @@ struct Functor_TestBatchedTeamSpmv { const int _N_team; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamSpmv(const alphaViewType &alpha, - const ValuesViewType &D, const IntView &r, - const IntView &c, const xViewType &X, - const betaViewType &beta, const yViewType &Y, - const int N_team) - : _alpha(alpha), - _D(D), - _r(r), - _c(c), - _X(X), - _beta(beta), - _Y(Y), - _N_team(N_team) {} + Functor_TestBatchedTeamSpmv(const alphaViewType &alpha, const ValuesViewType &D, const IntView &r, const IntView &c, + const xViewType &X, const betaViewType &beta, const yViewType &Y, const int N_team) + : _alpha(alpha), _D(D), _r(r), _c(c), _X(X), _beta(beta), _Y(Y), _N_team(N_team) {} template - KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, const MemberType &member) const { const int first_matrix = static_cast(member.league_rank()) * _N_team; const int N = _D.extent(0); const int last_matrix = - (static_cast(member.league_rank() + 1) * _N_team < N - ? 
static_cast(member.league_rank() + 1) * _N_team - : N); - - auto alpha = - Kokkos::subview(_alpha, Kokkos::make_pair(first_matrix, last_matrix)); - auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto beta = - Kokkos::subview(_beta, Kokkos::make_pair(first_matrix, last_matrix)); - auto y = Kokkos::subview(_Y, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - - KokkosBatched::TeamSpmv:: - template invoke( - member, alpha, d, _r, _c, x, beta, y); + (static_cast(member.league_rank() + 1) * _N_team < N ? static_cast(member.league_rank() + 1) * _N_team + : N); + + auto alpha = Kokkos::subview(_alpha, Kokkos::make_pair(first_matrix, last_matrix)); + auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto beta = Kokkos::subview(_beta, Kokkos::make_pair(first_matrix, last_matrix)); + auto y = Kokkos::subview(_Y, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + + KokkosBatched::TeamSpmv::template invoke< + ValuesViewType, IntView, xViewType, yViewType, alphaViewType, betaViewType, dobeta>(member, alpha, d, _r, _c, x, + beta, y); } inline void run() { @@ -100,16 +81,14 @@ struct Functor_TestBatchedTeamSpmv { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy( - _D.extent(0) / _N_team, Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, Kokkos::AUTO(), Kokkos::AUTO()); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } }; -template +template void impl_test_batched_spmv(const int N, const int BlkSize, const int N_team) { typedef typename ValuesViewType::value_type value_type; typedef 
Kokkos::ArithTraits ats; @@ -151,20 +130,15 @@ void impl_test_batched_spmv(const int N, const int BlkSize, const int N_team) { else Y0_host(l, i) *= beta_host(l); if (i != 0 && i != (BlkSize - 1)) - Y0_host(l, i) += - alpha_host(l) * - (2 * X0_host(l, i) - X0_host(l, i - 1) - X0_host(l, i + 1)); + Y0_host(l, i) += alpha_host(l) * (2 * X0_host(l, i) - X0_host(l, i - 1) - X0_host(l, i + 1)); else if (i == 0) - Y0_host(l, i) += - alpha_host(l) * (2 * X0_host(l, i) - X0_host(l, i + 1)); + Y0_host(l, i) += alpha_host(l) * (2 * X0_host(l, i) - X0_host(l, i + 1)); else - Y0_host(l, i) += - alpha_host(l) * (2 * X0_host(l, i) - X0_host(l, i - 1)); + Y0_host(l, i) += alpha_host(l) * (2 * X0_host(l, i) - X0_host(l, i - 1)); } - Functor_TestBatchedTeamSpmv(alpha, D, r, c, X1, beta, Y1, N_team) + Functor_TestBatchedTeamSpmv(alpha, D, r, c, X1, beta, Y1, N_team) .run(); Kokkos::fence(); @@ -189,50 +163,38 @@ void impl_test_batched_spmv(const int N, const int BlkSize, const int N_team) { } // namespace TeamSpmv } // namespace Test -template +template int test_batched_team_spmv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - alphaViewType; + typedef Kokkos::View alphaViewType; for (int i = 3; i < 10; ++i) { - Test::TeamSpmv::impl_test_batched_spmv( - 1024, i, 2); + Test::TeamSpmv::impl_test_batched_spmv(1024, i, 2); } for (int i = 3; i < 10; ++i) { - Test::TeamSpmv::impl_test_batched_spmv( - 1024, i, 2); + Test::TeamSpmv::impl_test_batched_spmv(1024, i, 2); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; + typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - alphaViewType; + typedef Kokkos::View alphaViewType; for (int i = 3; i < 10; ++i) { - Test::TeamSpmv::impl_test_batched_spmv( - 1024, i, 2); + Test::TeamSpmv::impl_test_batched_spmv(1024, i, 2); } for (int i = 3; i < 10; ++i) { - 
Test::TeamSpmv::impl_test_batched_spmv( - 1024, i, 2); + Test::TeamSpmv::impl_test_batched_spmv(1024, i, 2); } } #endif diff --git a/batched/sparse/unit_test/Test_Batched_TeamVectorCG.hpp b/batched/sparse/unit_test/Test_Batched_TeamVectorCG.hpp index 3ffd68209b..9ca4405b89 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamVectorCG.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamVectorCG.hpp @@ -28,8 +28,8 @@ using namespace KokkosBatched; namespace Test { namespace TeamVectorCG { -template +template struct Functor_TestBatchedTeamVectorCG { using execution_space = typename DeviceType::execution_space; const ValuesViewType _D; @@ -40,40 +40,27 @@ struct Functor_TestBatchedTeamVectorCG { const int _N_team; KrylovHandleType handle; - Functor_TestBatchedTeamVectorCG(const ValuesViewType &D, const IntView &r, - const IntView &c, const VectorViewType &X, + Functor_TestBatchedTeamVectorCG(const ValuesViewType &D, const IntView &r, const IntView &c, const VectorViewType &X, const VectorViewType &B, const int N_team) - : _D(D), - _r(r), - _c(c), - _X(X), - _B(B), - _N_team(N_team), - handle(KrylovHandleType(_D.extent(0), _N_team)) {} + : _D(D), _r(r), _c(c), _X(X), _B(B), _N_team(N_team), handle(KrylovHandleType(_D.extent(0), _N_team)) {} template KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { const int first_matrix = static_cast(member.league_rank()) * _N_team; const int N = _D.extent(0); const int last_matrix = - (static_cast(member.league_rank() + 1) * _N_team < N - ? static_cast(member.league_rank() + 1) * _N_team - : N); + (static_cast(member.league_rank() + 1) * _N_team < N ? 
static_cast(member.league_rank() + 1) * _N_team + : N); - auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); using Operator = KokkosBatched::CrsMatrix; Operator A(d, _r, _c); - KokkosBatched::TeamVectorCG::template invoke( - member, A, b, x, handle); + KokkosBatched::TeamVectorCG::template invoke(member, A, b, x, handle); } inline void run() { @@ -82,8 +69,7 @@ struct Functor_TestBatchedTeamVectorCG { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, - Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, Kokkos::AUTO(), Kokkos::AUTO()); size_t bytes_0 = ValuesViewType::shmem_size(_N_team, _X.extent(1)); size_t bytes_1 = ValuesViewType::shmem_size(_N_team, 1); @@ -94,8 +80,7 @@ struct Functor_TestBatchedTeamVectorCG { } }; -template +template void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) { typedef typename ValuesViewType::value_type value_type; typedef Kokkos::ArithTraits ats; @@ -120,8 +105,7 @@ void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) { using Scalar3DViewType = Kokkos::View; using IntViewType = Kokkos::View; - using KrylovHandleType = - KrylovHandle; + using KrylovHandleType = KrylovHandle; NormViewType sqr_norm_0("sqr_norm_0", N); NormViewType sqr_norm_j("sqr_norm_j", N); @@ -149,14 +133,11 @@ void 
impl_test_batched_CG(const int N, const int BlkSize, const int N_team) { Kokkos::deep_copy(D_host, D); KokkosBatched::SerialSpmv::template invoke< - typename ValuesViewType::HostMirror, typename IntView::HostMirror, - typename VectorViewType::HostMirror, typename VectorViewType::HostMirror, - 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); - KokkosBatched::SerialDot::invoke(R_host, R_host, - sqr_norm_0_host); - Functor_TestBatchedTeamVectorCG(D, r, c, X, - B, N_team) + typename ValuesViewType::HostMirror, typename IntView::HostMirror, typename VectorViewType::HostMirror, + typename VectorViewType::HostMirror, 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); + KokkosBatched::SerialDot::invoke(R_host, R_host, sqr_norm_0_host); + Functor_TestBatchedTeamVectorCG(D, r, c, X, B, + N_team) .run(); Kokkos::fence(); @@ -166,16 +147,13 @@ void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) { Kokkos::deep_copy(X_host, X); KokkosBatched::SerialSpmv::template invoke< - typename ValuesViewType::HostMirror, typename IntView::HostMirror, - typename VectorViewType::HostMirror, typename VectorViewType::HostMirror, - 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); - KokkosBatched::SerialDot::invoke(R_host, R_host, - sqr_norm_j_host); + typename ValuesViewType::HostMirror, typename IntView::HostMirror, typename VectorViewType::HostMirror, + typename VectorViewType::HostMirror, 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); + KokkosBatched::SerialDot::invoke(R_host, R_host, sqr_norm_j_host); const MagnitudeType eps = 1.0e3 * ats::epsilon(); - for (int l = 0; l < N; ++l) - EXPECT_NEAR_KK(sqr_norm_j_host(l) / sqr_norm_0_host(l), 0, eps); + for (int l = 0; l < N; ++l) EXPECT_NEAR_KK(sqr_norm_j_host(l) / sqr_norm_0_host(l), 0, eps); } } // namespace TeamVectorCG } // namespace Test @@ -186,26 +164,21 @@ int test_batched_teamvector_CG() { { typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - VectorViewType; + typedef 
Kokkos::View VectorViewType; for (int i = 3; i < 10; ++i) { - Test::TeamVectorCG::impl_test_batched_CG(1024, i, 2); + Test::TeamVectorCG::impl_test_batched_CG(1024, i, 2); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; + typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - VectorViewType; + typedef Kokkos::View VectorViewType; for (int i = 3; i < 10; ++i) { - Test::TeamVectorCG::impl_test_batched_CG(1024, i, 2); + Test::TeamVectorCG::impl_test_batched_CG(1024, i, 2); } } #endif diff --git a/batched/sparse/unit_test/Test_Batched_TeamVectorCG_Real.hpp b/batched/sparse/unit_test/Test_Batched_TeamVectorCG_Real.hpp index 859a1a885c..85935e07f3 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamVectorCG_Real.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamVectorCG_Real.hpp @@ -15,13 +15,9 @@ //@HEADER #if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, batched_scalar_teamvector_CG_float) { - test_batched_teamvector_CG(); -} +TEST_F(TestCategory, batched_scalar_teamvector_CG_float) { test_batched_teamvector_CG(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, batched_scalar_teamvector_CG_double) { - test_batched_teamvector_CG(); -} +TEST_F(TestCategory, batched_scalar_teamvector_CG_double) { test_batched_teamvector_CG(); } #endif diff --git a/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES.hpp b/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES.hpp index 084b623aa2..a14077f014 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES.hpp @@ -29,8 +29,8 @@ using namespace KokkosBatched; namespace Test { namespace TeamVectorGMRES { -template +template struct Functor_TestBatchedTeamVectorGMRES { using execution_space = typename DeviceType::execution_space; const ValuesViewType _D; @@ -42,37 +42,23 @@ struct Functor_TestBatchedTeamVectorGMRES { const int _N_team; KrylovHandleType 
_handle; - Functor_TestBatchedTeamVectorGMRES(const ValuesViewType &D, const IntView &r, - const IntView &c, const VectorViewType &X, - const VectorViewType &B, - const VectorViewType &diag, + Functor_TestBatchedTeamVectorGMRES(const ValuesViewType &D, const IntView &r, const IntView &c, + const VectorViewType &X, const VectorViewType &B, const VectorViewType &diag, const int N_team, KrylovHandleType &handle) - : _D(D), - _r(r), - _c(c), - _X(X), - _B(B), - _Diag(diag), - _N_team(N_team), - _handle(handle) {} + : _D(D), _r(r), _c(c), _X(X), _B(B), _Diag(diag), _N_team(N_team), _handle(handle) {} template KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { const int first_matrix = static_cast(member.league_rank()) * _N_team; const int N = _D.extent(0); const int last_matrix = - (static_cast(member.league_rank() + 1) * _N_team < N - ? static_cast(member.league_rank() + 1) * _N_team - : N); - - auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto diag = Kokkos::subview( - _Diag, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + (static_cast(member.league_rank() + 1) * _N_team < N ? 
static_cast(member.league_rank() + 1) * _N_team + : N); + + auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto diag = Kokkos::subview(_Diag, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); using Operator = KokkosBatched::CrsMatrix; using PrecOperator = KokkosBatched::JacobiPrec; @@ -81,9 +67,7 @@ struct Functor_TestBatchedTeamVectorGMRES { PrecOperator P(diag); P.setComputedInverse(); - KokkosBatched::TeamVectorGMRES::template invoke( - member, A, b, x, P, _handle); + KokkosBatched::TeamVectorGMRES::template invoke(member, A, b, x, P, _handle); } inline void run() { @@ -92,8 +76,7 @@ struct Functor_TestBatchedTeamVectorGMRES { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, - Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, Kokkos::AUTO(), Kokkos::AUTO()); const int N = _D.extent(0); const int n = _X.extent(1); @@ -103,8 +86,8 @@ struct Functor_TestBatchedTeamVectorGMRES { _handle.set_compute_last_residual(false); _handle.set_tolerance(1e-8); - _handle.Arnoldi_view = typename KrylovHandleType::ArnoldiViewType( - "", N, maximum_iteration, n + maximum_iteration + 3); + _handle.Arnoldi_view = + typename KrylovHandleType::ArnoldiViewType("", N, maximum_iteration, n + maximum_iteration + 3); using ScalarType = typename ValuesViewType::non_const_value_type; using Layout = typename ValuesViewType::array_layout; @@ -122,16 +105,14 @@ struct Functor_TestBatchedTeamVectorGMRES { size_t bytes_int = bytes_row_ptr + bytes_col_idc; size_t bytes_diag = bytes_2D_1; size_t bytes_tmp = 2 * bytes_2D_1 + 2 * bytes_1D + bytes_2D_2; - policy.set_scratch_size( 
- 0, Kokkos::PerTeam(bytes_tmp + bytes_diag + bytes_int)); + policy.set_scratch_size(0, Kokkos::PerTeam(bytes_tmp + bytes_diag + bytes_int)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } }; -template +template void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { typedef typename ValuesViewType::value_type value_type; typedef Kokkos::ArithTraits ats; @@ -157,8 +138,7 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { using Scalar3DViewType = Kokkos::View; using IntViewType = Kokkos::View; - using KrylovHandleType = - KrylovHandle; + using KrylovHandleType = KrylovHandle; NormViewType sqr_norm_0("sqr_norm_0", N); NormViewType sqr_norm_j("sqr_norm_j", N); @@ -177,12 +157,10 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { int current_index; for (int i = 0; i < BlkSize; ++i) { - for (current_index = row_ptr_host(i); current_index < row_ptr_host(i + 1); - ++current_index) { + for (current_index = row_ptr_host(i); current_index < row_ptr_host(i + 1); ++current_index) { if (colIndices_host(current_index) == i) break; } - for (int j = 0; j < N; ++j) - diag_values_host(j, i) = values_host(j, current_index); + for (int j = 0; j < N; ++j) diag_values_host(j, i) = values_host(j, current_index); } Kokkos::deep_copy(Diag, diag_values_host); @@ -212,13 +190,10 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { KrylovHandleType handle(N, N_team, n_iterations); KokkosBatched::SerialSpmv::template invoke< - typename ValuesViewType::HostMirror, typename IntView::HostMirror, - typename VectorViewType::HostMirror, typename VectorViewType::HostMirror, - 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); - KokkosBatched::SerialDot::invoke(R_host, R_host, - sqr_norm_0_host); - Functor_TestBatchedTeamVectorGMRES( + typename ValuesViewType::HostMirror, typename IntView::HostMirror, typename VectorViewType::HostMirror, + 
typename VectorViewType::HostMirror, 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); + KokkosBatched::SerialDot::invoke(R_host, R_host, sqr_norm_0_host); + Functor_TestBatchedTeamVectorGMRES( D, r, c, X, B, Diag, N_team, handle) .run(); @@ -229,17 +204,13 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { Kokkos::deep_copy(X_host, X); KokkosBatched::SerialSpmv::template invoke< - typename ValuesViewType::HostMirror, typename IntView::HostMirror, - typename VectorViewType::HostMirror, typename VectorViewType::HostMirror, - 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); - KokkosBatched::SerialDot::invoke(R_host, R_host, - sqr_norm_j_host); + typename ValuesViewType::HostMirror, typename IntView::HostMirror, typename VectorViewType::HostMirror, + typename VectorViewType::HostMirror, 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); + KokkosBatched::SerialDot::invoke(R_host, R_host, sqr_norm_j_host); const MagnitudeType eps = 1.0e5 * ats::epsilon(); - for (int l = 0; l < N; ++l) - EXPECT_NEAR_KK( - std::sqrt(sqr_norm_j_host(l)) / std::sqrt(sqr_norm_0_host(l)), 0, eps); + for (int l = 0; l < N; ++l) EXPECT_NEAR_KK(std::sqrt(sqr_norm_j_host(l)) / std::sqrt(sqr_norm_0_host(l)), 0, eps); } } // namespace TeamVectorGMRES } // namespace Test @@ -250,28 +221,21 @@ int test_batched_teamvector_GMRES() { { typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - VectorViewType; + typedef Kokkos::View VectorViewType; for (int i = 3; i < 10; ++i) { - Test::TeamVectorGMRES::impl_test_batched_GMRES( - 1024, i, 2); + Test::TeamVectorGMRES::impl_test_batched_GMRES(1024, i, 2); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; + typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - VectorViewType; + typedef Kokkos::View VectorViewType; for (int i = 3; i < 10; ++i) { - Test::TeamVectorGMRES::impl_test_batched_GMRES( - 1024, i, 2); + 
Test::TeamVectorGMRES::impl_test_batched_GMRES(1024, i, 2); } } #endif diff --git a/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES_Real.hpp b/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES_Real.hpp index 53b740deaa..ab889844a9 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES_Real.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES_Real.hpp @@ -15,13 +15,9 @@ //@HEADER #if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, batched_scalar_teamvector_GMRES_float) { - test_batched_teamvector_GMRES(); -} +TEST_F(TestCategory, batched_scalar_teamvector_GMRES_float) { test_batched_teamvector_GMRES(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, batched_scalar_teamvector_GMRES_double) { - test_batched_teamvector_GMRES(); -} +TEST_F(TestCategory, batched_scalar_teamvector_GMRES_double) { test_batched_teamvector_GMRES(); } #endif diff --git a/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv.hpp b/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv.hpp index 9cbba56370..83a78228b3 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv.hpp @@ -19,7 +19,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -//#include "KokkosBatched_Vector.hpp" +// #include "KokkosBatched_Vector.hpp" #include "KokkosBatched_Spmv.hpp" #include "KokkosBatched_Spmv_TeamVector_Impl.hpp" @@ -38,9 +38,8 @@ struct ParamTag { typedef T trans; }; -template +template struct Functor_TestBatchedTeamVectorSpmv { using execution_space = typename DeviceType::execution_space; const alphaViewType _alpha; @@ -53,52 +52,33 @@ struct Functor_TestBatchedTeamVectorSpmv { const int _N_team; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorSpmv(const alphaViewType &alpha, - const ValuesViewType &D, const IntView &r, - const IntView &c, const xViewType &X, - const betaViewType &beta, - const yViewType &Y, const int N_team) - : _alpha(alpha), - _D(D), - 
_r(r), - _c(c), - _X(X), - _beta(beta), - _Y(Y), - _N_team(N_team) {} + Functor_TestBatchedTeamVectorSpmv(const alphaViewType &alpha, const ValuesViewType &D, const IntView &r, + const IntView &c, const xViewType &X, const betaViewType &beta, const yViewType &Y, + const int N_team) + : _alpha(alpha), _D(D), _r(r), _c(c), _X(X), _beta(beta), _Y(Y), _N_team(N_team) {} template - KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, const MemberType &member) const { const int first_matrix = static_cast(member.league_rank()) * _N_team; const int N = _D.extent(0); const int last_matrix = - (static_cast(member.league_rank() + 1) * _N_team < N - ? static_cast(member.league_rank() + 1) * _N_team - : N); - - auto alpha = - Kokkos::subview(_alpha, Kokkos::make_pair(first_matrix, last_matrix)); - auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto beta = - Kokkos::subview(_beta, Kokkos::make_pair(first_matrix, last_matrix)); - auto y = Kokkos::subview(_Y, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + (static_cast(member.league_rank() + 1) * _N_team < N ? 
static_cast(member.league_rank() + 1) * _N_team + : N); + + auto alpha = Kokkos::subview(_alpha, Kokkos::make_pair(first_matrix, last_matrix)); + auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto beta = Kokkos::subview(_beta, Kokkos::make_pair(first_matrix, last_matrix)); + auto y = Kokkos::subview(_Y, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); if (last_matrix != N) - KokkosBatched::TeamVectorSpmv< - MemberType, typename ParamTagType::trans, - 2>::template invoke( - member, alpha, d, _r, _c, x, beta, y); + KokkosBatched::TeamVectorSpmv::template invoke< + ValuesViewType, IntView, xViewType, yViewType, alphaViewType, betaViewType, dobeta>(member, alpha, d, _r, _c, + x, beta, y); else - KokkosBatched::TeamVectorSpmv:: - template invoke( - member, alpha, d, _r, _c, x, beta, y); + KokkosBatched::TeamVectorSpmv::template invoke< + ValuesViewType, IntView, xViewType, yViewType, alphaViewType, betaViewType, dobeta>(member, alpha, d, _r, _c, + x, beta, y); } inline void run() { @@ -107,17 +87,15 @@ struct Functor_TestBatchedTeamVectorSpmv { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy( - ceil(static_cast(_D.extent(0)) / _N_team), Kokkos::AUTO(), - Kokkos::AUTO()); + Kokkos::TeamPolicy policy(ceil(static_cast(_D.extent(0)) / _N_team), + Kokkos::AUTO(), Kokkos::AUTO()); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } }; -template +template void impl_test_batched_spmv(const int N, const int BlkSize, const int N_team) { typedef typename ValuesViewType::value_type value_type; typedef Kokkos::ArithTraits ats; @@ -159,21 +137,15 @@ void impl_test_batched_spmv(const int N, const int BlkSize, const int N_team) { else Y0_host(l, i) *= beta_host(l); if (i != 0 
&& i != (BlkSize - 1)) - Y0_host(l, i) += - alpha_host(l) * - (2 * X0_host(l, i) - X0_host(l, i - 1) - X0_host(l, i + 1)); + Y0_host(l, i) += alpha_host(l) * (2 * X0_host(l, i) - X0_host(l, i - 1) - X0_host(l, i + 1)); else if (i == 0) - Y0_host(l, i) += - alpha_host(l) * (2 * X0_host(l, i) - X0_host(l, i + 1)); + Y0_host(l, i) += alpha_host(l) * (2 * X0_host(l, i) - X0_host(l, i + 1)); else - Y0_host(l, i) += - alpha_host(l) * (2 * X0_host(l, i) - X0_host(l, i - 1)); + Y0_host(l, i) += alpha_host(l) * (2 * X0_host(l, i) - X0_host(l, i - 1)); } - Functor_TestBatchedTeamVectorSpmv( - alpha, D, r, c, X1, beta, Y1, N_team) + Functor_TestBatchedTeamVectorSpmv(alpha, D, r, c, X1, beta, Y1, N_team) .run(); Kokkos::fence(); @@ -198,45 +170,37 @@ void impl_test_batched_spmv(const int N, const int BlkSize, const int N_team) { } // namespace TeamVectorSpmv } // namespace Test -template +template int test_batched_teamvector_spmv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - alphaViewType; + typedef Kokkos::View alphaViewType; for (int i = 3; i < 10; ++i) { - Test::TeamVectorSpmv::impl_test_batched_spmv< - DeviceType, ParamTagType, ViewType, IntView, ViewType, ViewType, - alphaViewType, alphaViewType, 0>(1025, i, 2); + Test::TeamVectorSpmv::impl_test_batched_spmv(1025, i, 2); } for (int i = 3; i < 10; ++i) { - Test::TeamVectorSpmv::impl_test_batched_spmv< - DeviceType, ParamTagType, ViewType, IntView, ViewType, ViewType, - alphaViewType, alphaViewType, 1>(1025, i, 2); + Test::TeamVectorSpmv::impl_test_batched_spmv(1025, i, 2); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; + typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - alphaViewType; + typedef Kokkos::View alphaViewType; for (int i = 3; i < 10; ++i) { - Test::TeamVectorSpmv::impl_test_batched_spmv< - DeviceType, ParamTagType, ViewType, IntView, ViewType, 
ViewType, - alphaViewType, alphaViewType, 0>(1025, i, 2); + Test::TeamVectorSpmv::impl_test_batched_spmv(1025, i, 2); } for (int i = 3; i < 10; ++i) { - Test::TeamVectorSpmv::impl_test_batched_spmv< - DeviceType, ParamTagType, ViewType, IntView, ViewType, ViewType, - alphaViewType, alphaViewType, 1>(1025, i, 2); + Test::TeamVectorSpmv::impl_test_batched_spmv(1025, i, 2); } } #endif diff --git a/blas/impl/KokkosBlas1_abs_impl.hpp b/blas/impl/KokkosBlas1_abs_impl.hpp index 0334adbafe..0c674f25f5 100644 --- a/blas/impl/KokkosBlas1_abs_impl.hpp +++ b/blas/impl/KokkosBlas1_abs_impl.hpp @@ -37,8 +37,7 @@ struct MV_Abs_Functor { RMV R_; XMV X_; - MV_Abs_Functor(const RMV& R, const XMV& X) - : numCols(X.extent(1)), R_(R), X_(X) { + MV_Abs_Functor(const RMV& R, const XMV& X) : numCols(X.extent(1)), R_(R), X_(X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "MV_Abs_Functor: RMV is not a Kokkos::View."); @@ -163,8 +162,7 @@ void MV_Abs_Generic(const execution_space& space, const RMV& R, const XMV& X) { const SizeType numRows = X.extent(0); Kokkos::RangePolicy policy(space, 0, numRows); - if ((void*)(R.data()) == - (void*)(X.data())) { // if R and X are the same (alias one another) + if ((void*)(R.data()) == (void*)(X.data())) { // if R and X are the same (alias one another) MV_AbsSelf_Functor op(R); Kokkos::parallel_for("KokkosBlas::Abs::S0", policy, op); } else { @@ -192,8 +190,7 @@ void V_Abs_Generic(const execution_space& space, const RV& R, const XV& X) { const SizeType numRows = X.extent(0); Kokkos::RangePolicy policy(space, 0, numRows); - if ((void*)(R.data()) == - (void*)(X.data())) { // if R and X are the same (alias one another) + if ((void*)(R.data()) == (void*)(X.data())) { // if R and X are the same (alias one another) V_AbsSelf_Functor op(R); Kokkos::parallel_for("KokkosBlas::Abs::S2", policy, op); } else { diff --git a/blas/impl/KokkosBlas1_abs_spec.hpp b/blas/impl/KokkosBlas1_abs_spec.hpp index a4695bd505..fb6357b38e 100644 --- 
a/blas/impl/KokkosBlas1_abs_spec.hpp +++ b/blas/impl/KokkosBlas1_abs_spec.hpp @@ -42,17 +42,15 @@ struct abs_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_ABS_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct abs_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ABS_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct abs_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; // @@ -62,18 +60,15 @@ struct abs_eti_spec_avail { // We may spread out definitions (see _DEF macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_ABS_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct abs_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ABS_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct abs_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -85,10 +80,9 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template < - class execution_space, class RMV, class XMV, int rank = RMV::rank, - bool tpl_spec_avail = abs_tpl_spec_avail::value, - bool eti_spec_avail = abs_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = abs_eti_spec_avail::value> struct Abs { static void abs(const execution_space& space, const RMV& R, const XMV& X); }; @@ -96,8 +90,7 @@ struct Abs { #if 
!defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Abs for single vectors (1-D Views). template -struct Abs { +struct Abs { using size_type = typename XMV::size_type; static void abs(const execution_space& space, const RMV& R, const XMV& X) { @@ -113,16 +106,13 @@ struct Abs: " "XMV is not rank 1."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::abs[ETI]" - : "KokkosBlas::abs[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::abs[ETI]" + : "KokkosBlas::abs[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::abs<> ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::abs<> ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); else { - printf("KokkosBlas1::abs<> non-ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::abs<> non-ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); } #endif const size_type numRows = X.extent(0); @@ -139,8 +129,7 @@ struct Abs -struct Abs { +struct Abs { using size_type = typename XMV::size_type; static void abs(const execution_space& space, const RMV& R, const XMV& X) { @@ -156,23 +145,19 @@ struct Abs: " "XMV is not rank 2."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::abs[ETI]" - : "KokkosBlas::abs[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? 
"KokkosBlas::abs[ETI]" + : "KokkosBlas::abs[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::abs<> ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::abs<> ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); else { - printf("KokkosBlas1::asb<> non-ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::asb<> non-ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); } #endif const size_type numRows = X.extent(0); const size_type numCols = X.extent(1); - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { + if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { typedef int index_type; MV_Abs_Generic(space, R, X); } else { @@ -194,14 +179,12 @@ struct Abs, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_ABS_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Abs< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 1, false, true>; // @@ -209,14 +192,12 @@ struct Abs, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_ABS_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Abs< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 1, false, true>; // @@ -226,15 +207,12 @@ struct Abs, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_ABS_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Abs< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 2, false, true>; // @@ -242,15 +220,12 @@ struct Abs, \ - 
Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_ABS_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Abs< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 2, false, true>; #include diff --git a/blas/impl/KokkosBlas1_axpby_impl.hpp b/blas/impl/KokkosBlas1_axpby_impl.hpp index b919d76a94..6baed662cf 100644 --- a/blas/impl/KokkosBlas1_axpby_impl.hpp +++ b/blas/impl/KokkosBlas1_axpby_impl.hpp @@ -25,14 +25,12 @@ namespace KokkosBlas { namespace Impl { template -constexpr typename std::enable_if, int>::type -axpbyVarExtent(T& v) { +constexpr typename std::enable_if, int>::type axpbyVarExtent(T& v) { return v.extent(0); } template -constexpr typename std::enable_if, int>::type -axpbyVarExtent(T&) { +constexpr typename std::enable_if, int>::type axpbyVarExtent(T&) { return 0; } @@ -58,8 +56,7 @@ axpbyVarExtent(T&) { // coefficients. Any literal coefficient of zero has BLAS semantics // of ignoring the corresponding (multi)vector entry. This does not // apply to coefficients in the a and b vectors, if they are used. 
-template +template struct Axpby_Functor { typedef typename YV::execution_space execution_space; typedef SizeType size_type; @@ -70,8 +67,7 @@ struct Axpby_Functor { AV m_a; BV m_b; - Axpby_Functor(const XV& x, const YV& y, const AV& av, const BV& bv, - const SizeType startingColumn) + Axpby_Functor(const XV& x, const YV& y, const AV& av, const BV& bv, const SizeType startingColumn) : m_x(x), m_y(y), m_a(av), m_b(bv) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby_Functor(ABgeneric)" @@ -79,8 +75,7 @@ struct Axpby_Functor { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby_Functor(ABgeneric)" ": Y is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Axpby_Functor(ABgeneric)" ": Y must be nonconst, since it is an output argument" " and we have to be able to write to its entries."); @@ -90,18 +85,15 @@ struct Axpby_Functor { static_assert(YV::rank == 1, "KokkosBlas::Impl::Axpby_Functor(ABgeneric)" ": XV and YV must have rank 1."); - static_assert((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && - (scalar_y <= 2), + static_assert((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && (scalar_y <= 2), "KokkosBlas::Impl::Axpby_Functor(ABgeneric)" ": scalar_x and/or scalar_y are out of range."); if (startingColumn != 0) { if (axpbyVarExtent(m_a) > 1) { - m_a = Kokkos::subview( - av, std::make_pair(startingColumn, SizeType(av.extent(0)))); + m_a = Kokkos::subview(av, std::make_pair(startingColumn, SizeType(av.extent(0)))); } if (axpbyVarExtent(m_b) > 1) { - m_b = Kokkos::subview( - bv, std::make_pair(startingColumn, SizeType(bv.extent(0)))); + m_b = Kokkos::subview(bv, std::make_pair(startingColumn, SizeType(bv.extent(0)))); } } } @@ -123,10 +115,8 @@ struct Axpby_Functor { } else if constexpr (scalar_y == 1) { // Nothing to do: m_y(i) = m_y(i); } else if constexpr (scalar_y == 2) { - if (m_b(0) == - Kokkos::ArithTraits::zero()) { - m_y(i) = - 
Kokkos::ArithTraits::zero(); + if (m_b(0) == Kokkos::ArithTraits::zero()) { + m_y(i) = Kokkos::ArithTraits::zero(); } else { m_y(i) = m_b(0) * m_y(i); } @@ -143,8 +133,7 @@ struct Axpby_Functor { } else if constexpr (scalar_y == 1) { m_y(i) = -m_x(i) + m_y(i); } else if constexpr (scalar_y == 2) { - if (m_b(0) == - Kokkos::ArithTraits::zero()) { + if (m_b(0) == Kokkos::ArithTraits::zero()) { m_y(i) = -m_x(i); } else { m_y(i) = -m_x(i) + m_b(0) * m_y(i); @@ -162,8 +151,7 @@ struct Axpby_Functor { } else if constexpr (scalar_y == 1) { m_y(i) = m_x(i) + m_y(i); } else if constexpr (scalar_y == 2) { - if (m_b(0) == - Kokkos::ArithTraits::zero()) { + if (m_b(0) == Kokkos::ArithTraits::zero()) { m_y(i) = m_x(i); } else { m_y(i) = m_x(i) + m_b(0) * m_y(i); @@ -181,8 +169,7 @@ struct Axpby_Functor { } else if constexpr (scalar_y == 1) { m_y(i) = m_a(0) * m_x(i) + m_y(i); } else if constexpr (scalar_y == 2) { - if (m_b(0) == - Kokkos::ArithTraits::zero()) { + if (m_b(0) == Kokkos::ArithTraits::zero()) { m_y(i) = m_a(0) * m_x(i); } else { m_y(i) = m_a(0) * m_x(i) + m_b(0) * m_y(i); @@ -209,8 +196,7 @@ struct Axpby_Functor { // of ignoring the corresponding (multi)vector entry. This does not // apply to coefficients in the a and b vectors, if they are used. 
template -struct Axpby_Functor { typedef typename YV::execution_space execution_space; typedef SizeType size_type; @@ -221,10 +207,8 @@ struct Axpby_Functor::value, "KokkosBlas::Impl::Axpby_Functor(ABscalars)" @@ -232,8 +216,7 @@ struct Axpby_Functor::value, "KokkosBlas::Impl::Axpby_Functor(ABscalars)" ": Y is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Axpby_Functor(ABscalars)" ": Y must be nonconst, since it is an output argument" " and we have to be able to write to its entries."); @@ -243,8 +226,7 @@ struct Axpby_Functor -void Axpby_Generic(const execution_space& space, const AV& av, const XV& x, - const BV& bv, const YV& y, const SizeType startingColumn, - int scalar_x = 2, int scalar_y = 2) { +template +void Axpby_Generic(const execution_space& space, const AV& av, const XV& x, const BV& bv, const YV& y, + const SizeType startingColumn, int scalar_x = 2, int scalar_y = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Axpby_Generic: X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Axpby_Generic: Y is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Axpby_Generic: Y is const. 
" "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -344,8 +323,7 @@ void Axpby_Generic(const execution_space& space, const AV& av, const XV& x, "KokkosBlas::Impl::Axpby_Generic: " "XV and YV must have rank 1."); - if ((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && - (scalar_y <= 2)) { + if ((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && (scalar_y <= 2)) { // Ok } else { KokkosKernels::Impl::throw_runtime_exception( @@ -361,20 +339,16 @@ void Axpby_Generic(const execution_space& space, const AV& av, const XV& x, // **************************************************************** if (scalar_x == 0) { if (scalar_y == 0) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S0", policy, op); } else if (scalar_y == -1) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S1", policy, op); } else if (scalar_y == 1) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S2", policy, op); } else if (scalar_y == 2) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S3", policy, op); } } @@ -383,20 +357,16 @@ void Axpby_Generic(const execution_space& space, const AV& av, const XV& x, // **************************************************************** else if (scalar_x == -1) { if (scalar_y == 0) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S4", policy, op); } else if (scalar_y == -1) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S5", 
policy, op); } else if (scalar_y == 1) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S6", policy, op); } else if (scalar_y == 2) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S7", policy, op); } } @@ -405,20 +375,16 @@ void Axpby_Generic(const execution_space& space, const AV& av, const XV& x, // **************************************************************** else if (scalar_x == 1) { if (scalar_y == 0) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S8", policy, op); } else if (scalar_y == -1) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S9", policy, op); } else if (scalar_y == 1) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S10", policy, op); } else if (scalar_y == 2) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S11", policy, op); } } @@ -427,20 +393,16 @@ void Axpby_Generic(const execution_space& space, const AV& av, const XV& x, // **************************************************************** else if (scalar_x == 2) { if (scalar_y == 0) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S12", policy, op); } else if (scalar_y == -1) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S13", policy, op); } else if (scalar_y == 1) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + 
Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S14", policy, op); } else if (scalar_y == 2) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S15", policy, op); } } diff --git a/blas/impl/KokkosBlas1_axpby_mv_impl.hpp b/blas/impl/KokkosBlas1_axpby_mv_impl.hpp index 7db7b0abe3..81c05fe7df 100644 --- a/blas/impl/KokkosBlas1_axpby_mv_impl.hpp +++ b/blas/impl/KokkosBlas1_axpby_mv_impl.hpp @@ -66,8 +66,7 @@ struct Axpby_MV_Functor { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby_MV_Functor(ABgeneric)" ": Y is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Axpby_MV_Functor(ABgeneric)" ": Y must be nonconst, since it is an output argument" " and we have to be able to write to its entries."); @@ -83,8 +82,7 @@ struct Axpby_MV_Functor { static_assert(BV::rank == 1, "KokkosBlas::Impl::Axpby_MV_Functor(ABgeneric)" ": BV must have rank 1."); - static_assert((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && - (scalar_y <= 2), + static_assert((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && (scalar_y <= 2), "KokkosBlas::Impl::Axpby_MV_Functor(ABgeneric)" ": scalar_x and/or scalar_y are out of range."); } @@ -123,8 +121,7 @@ struct Axpby_MV_Functor { // Nothing to do: Y(i,j) := Y(i,j) } else if constexpr (scalar_y == 2) { if (m_b.extent(0) == 1) { - if (m_b(0) == - Kokkos::ArithTraits::zero()) { + if (m_b(0) == Kokkos::ArithTraits::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif @@ -132,8 +129,7 @@ struct Axpby_MV_Functor { #pragma vector always #endif for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = Kokkos::ArithTraits< - typename YMV::non_const_value_type>::zero(); + m_y(i, k) = Kokkos::ArithTraits::zero(); } } else { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP @@ -195,8 +191,7 @@ struct Axpby_MV_Functor { } } else if constexpr 
(scalar_y == 2) { if (m_b.extent(0) == 1) { - if (m_b(0) == - Kokkos::ArithTraits::zero()) { + if (m_b(0) == Kokkos::ArithTraits::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif @@ -266,8 +261,7 @@ struct Axpby_MV_Functor { } } else if constexpr (scalar_y == 2) { if (m_b.extent(0) == 1) { - if (m_b(0) == - Kokkos::ArithTraits::zero()) { + if (m_b(0) == Kokkos::ArithTraits::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif @@ -374,8 +368,7 @@ struct Axpby_MV_Functor { } else if constexpr (scalar_y == 2) { if (m_a.extent(0) == 1) { if (m_b.extent(0) == 1) { - if (m_b(0) == Kokkos::ArithTraits< - typename BV::non_const_value_type>::zero()) { + if (m_b(0) == Kokkos::ArithTraits::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif @@ -409,8 +402,7 @@ struct Axpby_MV_Functor { } } else { if (m_b.extent(0) == 1) { - if (m_b(0) == Kokkos::ArithTraits< - typename BV::non_const_value_type>::zero()) { + if (m_b(0) == Kokkos::ArithTraits::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif @@ -467,8 +459,7 @@ struct Axpby_MV_Functor { // This version works by partial specialization on AV and BV. // In this partial specialization, both AV and BV are scalars. 
template -struct Axpby_MV_Functor { typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -479,8 +470,7 @@ struct Axpby_MV_Functor::value, @@ -489,8 +479,7 @@ struct Axpby_MV_Functor::value, "KokkosBlas::Impl::Axpby_MV_Functor(ABscalars)" ": Y is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Axpby_MV_Functor(ABscalars)" ": Y must be nonconst, since it is an output argument" " and we have to be able to write to its entries."); @@ -500,8 +489,7 @@ struct Axpby_MV_Functor +template struct Axpby_MV_Unroll_Functor { typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -704,8 +691,7 @@ struct Axpby_MV_Unroll_Functor { AV m_a; BV m_b; - Axpby_MV_Unroll_Functor(const XMV& x, const YMV& y, const AV& av, - const BV& bv, const SizeType startingColumn) + Axpby_MV_Unroll_Functor(const XMV& x, const YMV& y, const AV& av, const BV& bv, const SizeType startingColumn) : m_x(x), m_y(y), m_a(av), m_b(bv) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABgeneric)" @@ -719,8 +705,7 @@ struct Axpby_MV_Unroll_Functor { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABgeneric)" ": Y is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABgeneric)" ": Y must be nonconst, since it is an output argument" " and we have to be able to write to its entries."); @@ -736,19 +721,16 @@ struct Axpby_MV_Unroll_Functor { static_assert(BV::rank == 1, "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABgeneric)" ": BV must have rank 1."); - static_assert((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && - (scalar_y <= 2), + static_assert((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && (scalar_y <= 2), "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABgeneric)" ": scalar_x and/or scalar_y are out of range."); if (startingColumn != 0) { if 
(axpbyVarExtent(m_a) > 1) { - m_a = Kokkos::subview( - av, std::make_pair(startingColumn, SizeType(av.extent(0)))); + m_a = Kokkos::subview(av, std::make_pair(startingColumn, SizeType(av.extent(0)))); } if (axpbyVarExtent(m_b) > 1) { - m_b = Kokkos::subview( - bv, std::make_pair(startingColumn, SizeType(bv.extent(0)))); + m_b = Kokkos::subview(bv, std::make_pair(startingColumn, SizeType(bv.extent(0)))); } } } @@ -781,14 +763,12 @@ struct Axpby_MV_Unroll_Functor { // Nothing to do: Y(i,j) := Y(i,j) } else if constexpr (scalar_y == 2) { if (m_b.extent(0) == 1) { - if (m_b(0) == - Kokkos::ArithTraits::zero()) { + if (m_b(0) == Kokkos::ArithTraits::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = Kokkos::ArithTraits< - typename YMV::non_const_value_type>::zero(); + m_y(i, k) = Kokkos::ArithTraits::zero(); } } else { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL @@ -835,8 +815,7 @@ struct Axpby_MV_Unroll_Functor { } } else if constexpr (scalar_y == 2) { if (m_b.extent(0) == 1) { - if (m_b(0) == - Kokkos::ArithTraits::zero()) { + if (m_b(0) == Kokkos::ArithTraits::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif @@ -888,8 +867,7 @@ struct Axpby_MV_Unroll_Functor { } } else if constexpr (scalar_y == 2) { if (m_b.extent(0) == 1) { - if (m_b(0) == - Kokkos::ArithTraits::zero()) { + if (m_b(0) == Kokkos::ArithTraits::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif @@ -969,8 +947,7 @@ struct Axpby_MV_Unroll_Functor { } else if constexpr (scalar_y == 2) { if (m_a.extent(0) == 1) { if (m_b.extent(0) == 1) { - if (m_b(0) == Kokkos::ArithTraits< - typename BV::non_const_value_type>::zero()) { + if (m_b(0) == Kokkos::ArithTraits::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif @@ -995,8 +972,7 @@ struct Axpby_MV_Unroll_Functor { } } else { if (m_b.extent(0) == 1) { - if (m_b(0) == Kokkos::ArithTraits< - typename BV::non_const_value_type>::zero()) { + if (m_b(0) == 
Kokkos::ArithTraits::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif @@ -1028,10 +1004,8 @@ struct Axpby_MV_Unroll_Functor { // Variant of Axpby_MV_Unroll_Functor for single coefficients (rather // than vectors of coefficients) a and b. The number of columns in X // and Y, UNROLL, is a compile-time constant. -template -struct Axpby_MV_Unroll_Functor +struct Axpby_MV_Unroll_Functor { typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -1041,10 +1015,8 @@ struct Axpby_MV_Unroll_Functor::value, "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABscalars)" @@ -1052,8 +1024,7 @@ struct Axpby_MV_Unroll_Functor::value, "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABscalars)" ": Y is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABscalars)" ": Y must be nonconst, since it is an output argument" " and we have to be able to write to its entries."); @@ -1063,8 +1034,7 @@ struct Axpby_MV_Unroll_Functor -void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, - const BV& bv, const YMV& y, - const SizeType startingColumn, int scalar_x = 2, - int scalar_y = 2) { +template +void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, const BV& bv, const YMV& y, + const SizeType startingColumn, int scalar_x = 2, int scalar_y = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby_MV_Unrolled()" ": X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby_MV_Unrolled()" ": Y is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Axpby_MV_Unrolled()" ": Y must be nonconst, since it is an output argument" " and we have to be able to write to its entries."); @@ -1251,8 +1217,7 @@ void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, static_assert(YMV::rank == 2, 
"KokkosBlas::Impl::Axpby_MV_Unrolled()" ": XMV and YMV must have rank 2."); - if ((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && - (scalar_y <= 2)) { + if ((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && (scalar_y <= 2)) { // Ok } else { KokkosKernels::Impl::throw_runtime_exception( @@ -1268,20 +1233,16 @@ void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, // **************************************************************** if (scalar_x == 0) { if (scalar_y == 0) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S0", policy, op); } else if (scalar_y == -1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S1", policy, op); } else if (scalar_y == 1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S2", policy, op); } else if (scalar_y == 2) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S3", policy, op); } } @@ -1290,20 +1251,16 @@ void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, // **************************************************************** else if (scalar_x == -1) { if (scalar_y == 0) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S4", policy, op); } else if (scalar_y == -1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S5", policy, op); } else if (scalar_y == 
1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S6", policy, op); } else if (scalar_y == 2) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S7", policy, op); } } @@ -1312,20 +1269,16 @@ void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, // **************************************************************** else if (scalar_x == 1) { if (scalar_y == 0) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S8", policy, op); } else if (scalar_y == -1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S9", policy, op); } else if (scalar_y == 1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S10", policy, op); } else if (scalar_y == 2) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S11", policy, op); } } @@ -1334,20 +1287,16 @@ void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, // **************************************************************** else if (scalar_x == 2) { if (scalar_y == 0) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S12", policy, op); } else if (scalar_y == -1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, 
av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S13", policy, op); } else if (scalar_y == 1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S14", policy, op); } else if (scalar_y == 2) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S15", policy, op); } } @@ -1372,19 +1321,16 @@ void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, // coefficients in av and bv vectors, if they are used. // // Either av and bv are both 1-D Views, or av and bv are both scalars. -template -void Axpby_MV_Generic(const execution_space& space, const AV& av, const XMV& x, - const BV& bv, const YMV& y, int scalar_x = 2, - int scalar_y = 2) { +template +void Axpby_MV_Generic(const execution_space& space, const AV& av, const XMV& x, const BV& bv, const YMV& y, + int scalar_x = 2, int scalar_y = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby_MV_Generic()" ": X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby_MV_Generic()" ": Y is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Axpby_MV_Generic()" ": Y must be nonconst, since it is an output argument" " and we have to be able to write to its entries."); @@ -1394,8 +1340,7 @@ void Axpby_MV_Generic(const execution_space& space, const AV& av, const XMV& x, static_assert(YMV::rank == 2, "KokkosBlas::Impl::Axpby_MV_Generic()" ": XMV and YMV must have rank 2."); - if ((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && - (scalar_y <= 2)) { + if ((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && (scalar_y <= 2)) { // Ok } else { KokkosKernels::Impl::throw_runtime_exception( @@ -1499,20 +1444,17 @@ void 
Axpby_MV_Generic(const execution_space& space, const AV& av, const XMV& x, // coefficients in av and bv vectors, if they are used. // // Either av and bv are both 1-D Views, or av and bv are both scalars. -template +template struct Axpby_MV_Invoke_Left { - static void run(const execution_space& space, const AV& av, const XMV& x, - const BV& bv, const YMV& y, int scalar_x = 2, - int scalar_y = 2) { + static void run(const execution_space& space, const AV& av, const XMV& x, const BV& bv, const YMV& y, + int scalar_x = 2, int scalar_y = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby_MV_Invoke_Left::run()" ": X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby_MV_Invoke_Left::run()" ": Y is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Axpby_MV_Invoke_Left::run()" ": Y must be nonconst, since it is an output argument" " and we have to be able to write to its entries."); @@ -1522,8 +1464,7 @@ struct Axpby_MV_Invoke_Left { static_assert(YMV::rank == 2, "KokkosBlas::Impl::Axpby_MV_Invoke_Left::run()" ": X and Y must have rank 2."); - if ((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && - (scalar_y <= 2)) { + if ((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && (scalar_y <= 2)) { // Ok } else { KokkosKernels::Impl::throw_runtime_exception( @@ -1544,8 +1485,8 @@ struct Axpby_MV_Invoke_Left { // Passing in the starting column index lets the functor take // subviews of av and bv, if they are Views. If they are scalars, // the functor doesn't have to do anything to them. 
- Axpby_MV_Unrolled( - space, av, X_cur, bv, Y_cur, j, scalar_x, scalar_y); + Axpby_MV_Unrolled(space, av, X_cur, bv, Y_cur, j, scalar_x, + scalar_y); } for (; j + 4 <= numCols; j += 4) { XMV X_cur = Kokkos::subview(x, Kokkos::ALL(), std::make_pair(j, j + 4)); @@ -1554,8 +1495,8 @@ struct Axpby_MV_Invoke_Left { // Passing in the starting column index lets the functor take // subviews of av and bv, if they are Views. If they are scalars, // the functor doesn't have to do anything to them. - Axpby_MV_Unrolled( - space, av, X_cur, bv, Y_cur, j, scalar_x, scalar_y); + Axpby_MV_Unrolled(space, av, X_cur, bv, Y_cur, j, scalar_x, + scalar_y); } for (; j < numCols; ++j) { auto x_cur = Kokkos::subview(x, Kokkos::ALL(), j); @@ -1566,8 +1507,7 @@ struct Axpby_MV_Invoke_Left { // the functor doesn't have to do anything to them. typedef decltype(x_cur) XV; typedef decltype(y_cur) YV; - Axpby_Generic( - space, av, x_cur, bv, y_cur, j, scalar_x, scalar_y); + Axpby_Generic(space, av, x_cur, bv, y_cur, j, scalar_x, scalar_y); } } }; @@ -1591,20 +1531,17 @@ struct Axpby_MV_Invoke_Left { // coefficients in av and bv vectors, if they are used. // // Either av and bv are both 1-D Views, or av and bv are both scalars. 
-template +template struct Axpby_MV_Invoke_Right { - static void run(const execution_space& space, const AV& av, const XMV& x, - const BV& bv, const YMV& y, int scalar_x = 2, - int scalar_y = 2) { + static void run(const execution_space& space, const AV& av, const XMV& x, const BV& bv, const YMV& y, + int scalar_x = 2, int scalar_y = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby_MV_Invoke_Right::run()" ": X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby_MV_Invoke_Right::run()" ": Y is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Axpby_MV_Invoke_Right::run()" ": Y must be nonconst, since it is an output argument" " and we have to be able to write to its entries."); @@ -1614,8 +1551,7 @@ struct Axpby_MV_Invoke_Right { static_assert(YMV::rank == 2, "KokkosBlas::Impl::Axpby_MV_Invoke_Right::run()" ": X and Y must have rank 2."); - if ((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && - (scalar_y <= 2)) { + if ((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && (scalar_y <= 2)) { // Ok } else { KokkosKernels::Impl::throw_runtime_exception( @@ -1629,11 +1565,9 @@ struct Axpby_MV_Invoke_Right { auto y_0 = Kokkos::subview(y, Kokkos::ALL(), 0); typedef decltype(x_0) XV; typedef decltype(y_0) YV; - Axpby_Generic( - space, av, x_0, bv, y_0, 0, scalar_x, scalar_y); + Axpby_Generic(space, av, x_0, bv, y_0, 0, scalar_x, scalar_y); } else { - Axpby_MV_Generic( - space, av, x, bv, y, scalar_x, scalar_y); + Axpby_MV_Generic(space, av, x, bv, y, scalar_x, scalar_y); } } }; diff --git a/blas/impl/KokkosBlas1_axpby_spec.hpp b/blas/impl/KokkosBlas1_axpby_spec.hpp index 3aff21e0be..f4f85c8f6b 100644 --- a/blas/impl/KokkosBlas1_axpby_spec.hpp +++ b/blas/impl/KokkosBlas1_axpby_spec.hpp @@ -28,8 +28,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct 
axpby_eti_spec_avail { enum : bool { value = false }; }; @@ -43,36 +42,29 @@ struct axpby_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_AXPBY_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct axpby_eti_spec_avail< \ - EXEC_SPACE, SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ - }; \ - template <> \ - struct axpby_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_AXPBY_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct axpby_eti_spec_avail< \ + EXEC_SPACE, SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ + }; \ + template <> \ + struct axpby_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; // @@ -82,36 +74,29 @@ struct axpby_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. 
// -#define KOKKOSBLAS1_AXPBY_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct axpby_eti_spec_avail< \ - EXEC_SPACE, SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2> { \ - enum : bool { value = true }; \ - }; \ - template <> \ - struct axpby_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_AXPBY_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct axpby_eti_spec_avail< \ + EXEC_SPACE, SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + 2> { \ + enum : bool { value = true }; \ + }; \ + template <> \ + struct axpby_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + 2> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -146,21 +131,16 @@ namespace Impl { /// Any scalar coefficient of zero has BLAS semantics of /// ignoring the corresponding (multi)vector entry. This does NOT /// apply to coefficients in av and bv vectors, if they are used. 
-template ::value, - bool eti_spec_avail = - axpby_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = axpby_eti_spec_avail::value> struct Axpby { - static void axpby(const execution_space& space, const AV& av, const XMV& X, - const BV& bv, const YMV& Y); + static void axpby(const execution_space& space, const AV& av, const XMV& X, const BV& bv, const YMV& Y); }; template struct Axpby { - static void axpby(const execution_space& /*space*/, const AV& /* av */, - const XMV& /* X */, const BV& /* bv */, + static void axpby(const execution_space& /*space*/, const AV& /* av */, const XMV& /* X */, const BV& /* bv */, const YMV& /* Y */) { static_assert(YMV::rank == 0, "Oh My God"); } @@ -175,20 +155,17 @@ struct Axpby { // the unification process forces AV = view and BV = view // ********************************************************************** template -struct Axpby { +struct Axpby { using size_type = typename YMV::size_type; - static void axpby(const execution_space& space, const AV& av, const XMV& X, - const BV& bv, const YMV& Y) { + static void axpby(const execution_space& space, const AV& av, const XMV& X, const BV& bv, const YMV& Y) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Axpby::axpby: X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Axpby::axpby: Y is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Axpby::axpby: Y is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -198,21 +175,17 @@ struct Axpby::axpby: " "X and Y must have rank 2."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::axpby[ETI]" - : "KokkosBlas::axpby[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? 
"KokkosBlas::axpby[ETI]" + : "KokkosBlas::axpby[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf( - "KokkosBlas1::axpby<> ETI specialization for < %s , %s , %s , %s >\n", - typeid(AV).name(), typeid(XMV).name(), typeid(BV).name(), - typeid(YMV).name()); + printf("KokkosBlas1::axpby<> ETI specialization for < %s , %s , %s , %s >\n", typeid(AV).name(), + typeid(XMV).name(), typeid(BV).name(), typeid(YMV).name()); else { printf( "KokkosBlas1::axpby<> non-ETI specialization for < %s , %s , %s , %s " ">\n", - typeid(AV).name(), typeid(XMV).name(), typeid(BV).name(), - typeid(YMV).name()); + typeid(AV).name(), typeid(XMV).name(), typeid(BV).name(), typeid(YMV).name()); } #endif @@ -255,22 +228,19 @@ struct Axpby(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { - using index_type = int; - using Axpby_MV_Invoke_Layout = typename std::conditional< - std::is_same::value, - Axpby_MV_Invoke_Left, - Axpby_MV_Invoke_Right >::type; + if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { + using index_type = int; + using Axpby_MV_Invoke_Layout = + typename std::conditional::value, + Axpby_MV_Invoke_Left, + Axpby_MV_Invoke_Right >::type; Axpby_MV_Invoke_Layout::run(space, av, X, bv, Y, scalar_x, scalar_y); } else { - using index_type = typename XMV::size_type; - using Axpby_MV_Invoke_Layout = typename std::conditional< - std::is_same::value, - Axpby_MV_Invoke_Left, - Axpby_MV_Invoke_Right >::type; + using index_type = typename XMV::size_type; + using Axpby_MV_Invoke_Layout = + typename std::conditional::value, + Axpby_MV_Invoke_Left, + Axpby_MV_Invoke_Right >::type; Axpby_MV_Invoke_Layout::run(space, av, X, bv, Y, scalar_x, scalar_y); } Kokkos::Profiling::popRegion(); @@ -285,25 +255,22 @@ struct Axpby -struct Axpby { +struct Axpby { using AV = typename XMV::non_const_value_type; using BV = typename YMV::non_const_value_type; using size_type = typename YMV::size_type; using ATA = 
Kokkos::ArithTraits; using ATB = Kokkos::ArithTraits; - static void axpby(const execution_space& space, const AV& alpha, const XMV& X, - const BV& beta, const YMV& Y) { + static void axpby(const execution_space& space, const AV& alpha, const XMV& X, const BV& beta, const YMV& Y) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby::axpby (MV): " "X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby::axpby (MV): " "Y is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Axpby::axpby (MV): Y is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -313,22 +280,18 @@ struct Axpby ETI specialization for < %s , %s , %s , %s >\n", - typeid(AV).name(), typeid(XMV).name(), typeid(BV).name(), - typeid(YMV).name()); + printf("KokkosBlas1::axpby<> ETI specialization for < %s , %s , %s , %s >\n", typeid(AV).name(), + typeid(XMV).name(), typeid(BV).name(), typeid(YMV).name()); else { printf( "KokkosBlas1::axpby<> non-ETI specialization for < %s , %s , %s , %s " ">\n", - typeid(AV).name(), typeid(XMV).name(), typeid(BV).name(), - typeid(YMV).name()); + typeid(AV).name(), typeid(XMV).name(), typeid(BV).name(), typeid(YMV).name()); } #endif @@ -353,22 +316,19 @@ struct Axpby(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { - using index_type = int; - using Axpby_MV_Invoke_Layout = typename std::conditional< - std::is_same::value, - Axpby_MV_Invoke_Left, - Axpby_MV_Invoke_Right >::type; + if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { + using index_type = int; + using Axpby_MV_Invoke_Layout = + typename std::conditional::value, + Axpby_MV_Invoke_Left, + Axpby_MV_Invoke_Right >::type; Axpby_MV_Invoke_Layout::run(space, alpha, X, beta, Y, scalar_x, scalar_y); } else { - using index_type = typename XMV::size_type; - using Axpby_MV_Invoke_Layout = typename 
std::conditional< - std::is_same::value, - Axpby_MV_Invoke_Left, - Axpby_MV_Invoke_Right >::type; + using index_type = typename XMV::size_type; + using Axpby_MV_Invoke_Layout = + typename std::conditional::value, + Axpby_MV_Invoke_Left, + Axpby_MV_Invoke_Right >::type; Axpby_MV_Invoke_Layout::run(space, alpha, X, beta, Y, scalar_x, scalar_y); } Kokkos::Profiling::popRegion(); @@ -383,15 +343,12 @@ struct Axpby -struct Axpby { +struct Axpby { using size_type = typename YV::size_type; - static void axpby(const execution_space& space, const AV& av, const XV& X, - const BV& bv, const YV& Y) { - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::axpby[ETI]" - : "KokkosBlas::axpby[noETI]"); + static void axpby(const execution_space& space, const AV& av, const XV& X, const BV& bv, const YV& Y) { + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::axpby[ETI]" + : "KokkosBlas::axpby[noETI]"); size_type const numRows = X.extent(0); @@ -433,12 +390,10 @@ struct Axpby(INT_MAX)) { using index_type = int; - Axpby_Generic( - space, av, X, bv, Y, 0, scalar_x, scalar_y); + Axpby_Generic(space, av, X, bv, Y, 0, scalar_x, scalar_y); } else { using index_type = typename XV::size_type; - Axpby_Generic( - space, av, X, bv, Y, 0, scalar_x, scalar_y); + Axpby_Generic(space, av, X, bv, Y, 0, scalar_x, scalar_y); } Kokkos::Profiling::popRegion(); @@ -453,8 +408,7 @@ struct Axpby -struct Axpby { using AV = typename XV::non_const_value_type; using BV = typename YV::non_const_value_type; @@ -462,16 +416,14 @@ struct Axpby; using ATB = Kokkos::ArithTraits; - static void axpby(const execution_space& space, const AV& alpha, const XV& X, - const BV& beta, const YV& Y) { + static void axpby(const execution_space& space, const AV& alpha, const XV& X, const BV& beta, const YV& Y) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Axpby::axpby: X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" 
"Axpby::axpby: Y is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Axpby::axpby: Y is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -482,21 +434,17 @@ struct Axpby::axpby: " "X and Y must have rank 1."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::axpby[ETI]" - : "KokkosBlas::axpby[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::axpby[ETI]" + : "KokkosBlas::axpby[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf( - "KokkosBlas1::axpby<> ETI specialization for < %s , %s , %s , %s >\n", - typeid(AV).name(), typeid(XV).name(), typeid(BV).name(), - typeid(YV).name()); + printf("KokkosBlas1::axpby<> ETI specialization for < %s , %s , %s , %s >\n", typeid(AV).name(), + typeid(XV).name(), typeid(BV).name(), typeid(YV).name()); else { printf( "KokkosBlas1::axpby<> non-ETI specialization for < %s , %s , %s , %s " ">\n", - typeid(AV).name(), typeid(XV).name(), typeid(BV).name(), - typeid(YV).name()); + typeid(AV).name(), typeid(XV).name(), typeid(BV).name(), typeid(YV).name()); } #endif @@ -522,14 +470,12 @@ struct Axpby(INT_MAX)) { using index_type = int; - Axpby_Generic( - space, alpha, X, beta, Y, 0, scalar_x, scalar_y); + Axpby_Generic(space, alpha, X, beta, Y, 0, scalar_x, scalar_y); } else { using index_type = typename XV::size_type; - Axpby_Generic( - space, alpha, X, beta, Y, 0, scalar_x, scalar_y); + Axpby_Generic(space, alpha, X, beta, Y, 0, scalar_x, scalar_y); } Kokkos::Profiling::popRegion(); } @@ -548,54 +494,42 @@ struct Axpby, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, false, true>; \ - extern template struct Axpby< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 
Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_AXPBY_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Axpby< \ + EXEC_SPACE, SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + 1, false, true>; \ + extern template struct Axpby< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ 1, false, true>; -#define KOKKOSBLAS1_AXPBY_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct Axpby< \ - EXEC_SPACE, SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, false, true>; \ - template struct Axpby< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_AXPBY_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Axpby< \ + EXEC_SPACE, SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + 1, false, true>; \ + template struct Axpby< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ 1, false, true>; // @@ -606,56 +540,42 @@ struct Axpby, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2, false, true>; \ - extern template struct Axpby< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define 
KOKKOSBLAS1_AXPBY_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Axpby< \ + EXEC_SPACE, SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + 2, false, true>; \ + extern template struct Axpby< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ 2, false, true>; -#define KOKKOSBLAS1_AXPBY_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template struct Axpby< \ - EXEC_SPACE, SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2, false, true>; \ - template struct Axpby< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_AXPBY_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Axpby< \ + EXEC_SPACE, SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + 2, false, true>; \ + template struct Axpby< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ 2, false, true>; #include diff --git a/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp b/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp index 9d200e892d..0a03007801 100644 --- a/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp +++ b/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp @@ -53,8 +53,7 @@ constexpr typename std::enable_if, bool>::type Tr1s_val() { } template -constexpr typename std::enable_if, bool>::type -Tr1s_val() { +constexpr typename std::enable_if, bool>::type 
Tr1s_val() { return false; } @@ -66,8 +65,7 @@ constexpr typename std::enable_if, bool>::type Tr1d_val() { } template -constexpr typename std::enable_if, bool>::type -Tr1d_val() { +constexpr typename std::enable_if, bool>::type Tr1d_val() { return false; } @@ -105,8 +103,7 @@ struct AxpbyUnificationAttemptTraits { // - type names begin with upper case letters // ******************************************************************** public: - static constexpr bool onDevice = - KokkosKernels::Impl::kk_is_gpu_exec_space(); + static constexpr bool onDevice = KokkosKernels::Impl::kk_is_gpu_exec_space(); private: static constexpr bool onHost = !onDevice; @@ -139,23 +136,15 @@ struct AxpbyUnificationAttemptTraits { // ******************************************************************** // Declare 'AtInputScalarTypeA_nonConst' // ******************************************************************** - using ScalarTypeA2_onDevice = - typename getScalarTypeFromView::type; - using ScalarTypeA1_onDevice = - std::conditional_t; + using ScalarTypeA2_onDevice = typename getScalarTypeFromView::type; + using ScalarTypeA1_onDevice = std::conditional_t; - using ScalarTypeA2_onHost = - typename getScalarTypeFromView::type; - using ScalarTypeA1_onHost = - std::conditional_t; + using ScalarTypeA2_onHost = typename getScalarTypeFromView::type; + using ScalarTypeA1_onHost = std::conditional_t; - using AtInputScalarTypeA = - std::conditional_t; + using AtInputScalarTypeA = std::conditional_t; - using AtInputScalarTypeA_nonConst = - typename std::remove_const::type; + using AtInputScalarTypeA_nonConst = typename std::remove_const::type; // ******************************************************************** // Declare 'AtInputScalarTypeX_nonConst' @@ -167,23 +156,15 @@ struct AxpbyUnificationAttemptTraits { // ******************************************************************** // Declare 'AtInputScalarTypeB_nonConst' // ******************************************************************** - 
using ScalarTypeB2_onDevice = - typename getScalarTypeFromView::type; - using ScalarTypeB1_onDevice = - std::conditional_t; + using ScalarTypeB2_onDevice = typename getScalarTypeFromView::type; + using ScalarTypeB1_onDevice = std::conditional_t; - using ScalarTypeB2_onHost = - typename getScalarTypeFromView::type; - using ScalarTypeB1_onHost = - std::conditional_t; + using ScalarTypeB2_onHost = typename getScalarTypeFromView::type; + using ScalarTypeB1_onHost = std::conditional_t; - using AtInputScalarTypeB = - std::conditional_t; + using AtInputScalarTypeB = std::conditional_t; - using AtInputScalarTypeB_nonConst = - typename std::remove_const::type; + using AtInputScalarTypeB_nonConst = typename std::remove_const::type; // ******************************************************************** // Declare 'AtInputScalarTypeY_nonConst' @@ -195,138 +176,115 @@ struct AxpbyUnificationAttemptTraits { // ******************************************************************** // Declare 'InternalLayoutX' and 'InternalLayoutY' // ******************************************************************** - using InternalLayoutX = - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; - using InternalLayoutY = - typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< - YMV, InternalLayoutX>::array_layout; + using InternalLayoutX = typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using InternalLayoutY = typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout; // ******************************************************************** // Declare 'InternalTypeA_tmp' // ******************************************************************** - using AtInputLayoutA = - typename getLayoutFromView::type; + using AtInputLayoutA = typename getLayoutFromView::type; public: - static constexpr bool atInputLayoutA_isStride = - std::is_same_v; + static constexpr bool atInputLayoutA_isStride = std::is_same_v; private: using InternalLayoutA = - 
std::conditional_t<(a_is_r1d || a_is_r1s) && atInputLayoutA_isStride, - AtInputLayoutA, InternalLayoutX>; - - static constexpr bool atInputScalarTypeA_mustRemain = - Kokkos::ArithTraits::is_complex && - !Kokkos::ArithTraits::is_complex; - - using InternalScalarTypeA = std::conditional_t< - atInputScalarTypeA_mustRemain || ((a_is_r1d || a_is_r1s) && xyRank2Case), - AtInputScalarTypeA_nonConst // Yes, keep the input scalar type - , - AtInputScalarTypeX_nonConst // Yes, instead of - // 'AtInputScalarTypeA_nonConst' - >; - - using InternalTypeA_onDevice = std::conditional_t< - a_is_scalar && b_is_scalar && onDevice, // Keep 'a' as scalar - InternalScalarTypeA, - Kokkos::View>>; - - using InternalTypeA_onHost = std::conditional_t< - (a_is_r1d || a_is_r1s) && xyRank2Case && onHost, - Kokkos::View>, - InternalScalarTypeA>; - - using InternalTypeA_tmp = - std::conditional_t; + std::conditional_t<(a_is_r1d || a_is_r1s) && atInputLayoutA_isStride, AtInputLayoutA, InternalLayoutX>; + + static constexpr bool atInputScalarTypeA_mustRemain = Kokkos::ArithTraits::is_complex && + !Kokkos::ArithTraits::is_complex; + + using InternalScalarTypeA = + std::conditional_t; + + using InternalTypeA_onDevice = + std::conditional_t>>; + + using InternalTypeA_onHost = + std::conditional_t<(a_is_r1d || a_is_r1s) && xyRank2Case && onHost, + Kokkos::View>, + InternalScalarTypeA>; + + using InternalTypeA_tmp = std::conditional_t; // ******************************************************************** // Declare 'InternalTypeX' // ******************************************************************** public: - using InternalTypeX = std::conditional_t< - x_is_r2, - Kokkos::View>, - Kokkos::View>>; + using InternalTypeX = + std::conditional_t>, + Kokkos::View>>; // ******************************************************************** // Declare 'InternalTypeB_tmp' // ******************************************************************** private: - using AtInputLayoutB = - typename 
getLayoutFromView::type; + using AtInputLayoutB = typename getLayoutFromView::type; public: - static constexpr bool atInputLayoutB_isStride = - std::is_same_v; + static constexpr bool atInputLayoutB_isStride = std::is_same_v; private: using InternalLayoutB = - std::conditional_t<(b_is_r1d || b_is_r1s) && atInputLayoutB_isStride, - AtInputLayoutB, InternalLayoutY>; - - static constexpr bool atInputScalarTypeB_mustRemain = - Kokkos::ArithTraits::is_complex && - !Kokkos::ArithTraits::is_complex; - - using InternalScalarTypeB = std::conditional_t< - atInputScalarTypeB_mustRemain || ((b_is_r1d || b_is_r1s) && xyRank2Case), - AtInputScalarTypeB_nonConst // Yes, keep the input scalar type - , - AtInputScalarTypeY_nonConst // Yes, instead of - // 'AtInputScalarTypeB_nonConst' - >; - - using InternalTypeB_onDevice = std::conditional_t< - a_is_scalar && b_is_scalar && onDevice, // Keep 'b' as scalar - InternalScalarTypeB, - Kokkos::View>>; - - using InternalTypeB_onHost = std::conditional_t< - (b_is_r1d || b_is_r1s) && xyRank2Case && onHost, - Kokkos::View>, - InternalScalarTypeB>; - - using InternalTypeB_tmp = - std::conditional_t; + std::conditional_t<(b_is_r1d || b_is_r1s) && atInputLayoutB_isStride, AtInputLayoutB, InternalLayoutY>; + + static constexpr bool atInputScalarTypeB_mustRemain = Kokkos::ArithTraits::is_complex && + !Kokkos::ArithTraits::is_complex; + + using InternalScalarTypeB = + std::conditional_t; + + using InternalTypeB_onDevice = + std::conditional_t>>; + + using InternalTypeB_onHost = + std::conditional_t<(b_is_r1d || b_is_r1s) && xyRank2Case && onHost, + Kokkos::View>, + InternalScalarTypeB>; + + using InternalTypeB_tmp = std::conditional_t; // ******************************************************************** // Declare 'InternalTypeY' // ******************************************************************** public: - using InternalTypeY = std::conditional_t< - y_is_r2, - Kokkos::View>, - Kokkos::View>>; + using InternalTypeY = + std::conditional_t>, 
+ Kokkos::View>>; // ******************************************************************** // Declare 'InternalTypeA': if 'InternalTypeB_tmp' is a view then // make sure 'InternalTypeA' is a view as well // ******************************************************************** - using InternalTypeA = std::conditional_t< - !Kokkos::is_view_v && - Kokkos::is_view_v, - Kokkos::View>, - InternalTypeA_tmp>; + using InternalTypeA = + std::conditional_t && Kokkos::is_view_v, + Kokkos::View>, + InternalTypeA_tmp>; // ******************************************************************** // Declare 'InternalTypeA_managed' with the same scalar type in @@ -336,23 +294,19 @@ struct AxpbyUnificationAttemptTraits { using InternalLayoutA_managed = InternalLayoutA; public: - using InternalTypeA_managed = std::conditional_t< - Kokkos::is_view_v, - Kokkos::View, - void>; + using InternalTypeA_managed = + std::conditional_t, + Kokkos::View, void>; // ******************************************************************** // Declare 'InternalTypeB' if 'InternalTypeA_tmp' is a view then // make sure 'InternalTypeB' is a view as well // ******************************************************************** - using InternalTypeB = std::conditional_t< - Kokkos::is_view_v && - !Kokkos::is_view_v, - Kokkos::View>, - InternalTypeB_tmp>; + using InternalTypeB = + std::conditional_t && !Kokkos::is_view_v, + Kokkos::View>, + InternalTypeB_tmp>; // ******************************************************************** // Declare 'InternalTypeB_managed' with the same scalar type in @@ -362,91 +316,72 @@ struct AxpbyUnificationAttemptTraits { using InternalLayoutB_managed = InternalLayoutB; public: - using InternalTypeB_managed = std::conditional_t< - Kokkos::is_view_v, - Kokkos::View, - void>; + using InternalTypeB_managed = + std::conditional_t, + Kokkos::View, void>; // ******************************************************************** // Auxiliary Boolean results on internal types // 
******************************************************************** private: - static constexpr bool internalTypeA_is_scalar = - !Kokkos::is_view_v; - static constexpr bool internalTypeA_is_r1d = Tr1d_val(); + static constexpr bool internalTypeA_is_scalar = !Kokkos::is_view_v; + static constexpr bool internalTypeA_is_r1d = Tr1d_val(); - static constexpr bool internalTypeB_is_scalar = - !Kokkos::is_view_v; - static constexpr bool internalTypeB_is_r1d = Tr1d_val(); + static constexpr bool internalTypeB_is_scalar = !Kokkos::is_view_v; + static constexpr bool internalTypeB_is_r1d = Tr1d_val(); public: - static constexpr bool internalTypesAB_bothScalars = - (internalTypeA_is_scalar && internalTypeB_is_scalar); - static constexpr bool internalTypesAB_bothViews = - (internalTypeA_is_r1d && internalTypeB_is_r1d); + static constexpr bool internalTypesAB_bothScalars = (internalTypeA_is_scalar && internalTypeB_is_scalar); + static constexpr bool internalTypesAB_bothViews = (internalTypeA_is_r1d && internalTypeB_is_r1d); // ******************************************************************** // Routine to perform checks (both compile time and run time) // ******************************************************************** - static void performChecks(const AV& a, const XMV& X, const BV& b, - const YMV& Y) { + static void performChecks(const AV& a, const XMV& X, const BV& b, const YMV& Y) { // ****************************************************************** // Check 1/6: General checks // ****************************************************************** - static_assert( - Kokkos::is_execution_space_v, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ": tExecSpace must be a valid Kokkos execution space."); - - static_assert( - (xyRank1Case && !xyRank2Case) || (!xyRank1Case && xyRank2Case), - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ": one must have either both X and Y as rank 1, or both X and Y as " - "rank 2"); - - if 
constexpr (!Kokkos::ArithTraits< - AtInputScalarTypeY_nonConst>::is_complex) { - static_assert( - (!Kokkos::ArithTraits::is_complex) && - (!Kokkos::ArithTraits::is_complex) && - (!Kokkos::ArithTraits::is_complex), - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ": if Y is not complex, then A, X and B cannot be complex"); + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": tExecSpace must be a valid Kokkos execution space."); + + static_assert((xyRank1Case && !xyRank2Case) || (!xyRank1Case && xyRank2Case), + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": one must have either both X and Y as rank 1, or both X and Y as " + "rank 2"); + + if constexpr (!Kokkos::ArithTraits::is_complex) { + static_assert((!Kokkos::ArithTraits::is_complex) && + (!Kokkos::ArithTraits::is_complex) && + (!Kokkos::ArithTraits::is_complex), + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": if Y is not complex, then A, X and B cannot be complex"); } // ****************************************************************** // Check 2/6: YMV is valid // ****************************************************************** - static_assert( - Kokkos::is_view::value, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ": Y is not a Kokkos::View."); - static_assert( - std::is_same::value, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ": Y is const. 
It must be nonconst, " - "because it is an output argument " - "(we must be able to write to its entries)."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ": XMV must be accessible from tExecSpace"); + static_assert(Kokkos::is_view::value, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": Y is not a Kokkos::View."); + static_assert(std::is_same::value, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": Y is const. It must be nonconst, " + "because it is an output argument " + "(we must be able to write to its entries)."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": XMV must be accessible from tExecSpace"); // ****************************************************************** // Check 3/6: XMV is valid // ****************************************************************** - static_assert( - Kokkos::is_view::value, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ": X is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ": XMV must be accessible from tExecSpace"); + static_assert(Kokkos::is_view::value, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": X is not a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": XMV must be accessible from tExecSpace"); if constexpr (xyRank1Case) { if (X.extent(0) != Y.extent(0)) { @@ -454,8 +389,7 @@ struct AxpbyUnificationAttemptTraits { msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks(" ")" << ", invalid rank-1 X extent" - << ": X.extent(0) = " << X.extent(0) - << ", Y.extent(0) = " << Y.extent(0); + << ": X.extent(0) = " << X.extent(0) << ", Y.extent(0) 
= " << Y.extent(0); KokkosKernels::Impl::throw_runtime_exception(msg.str()); } } else { @@ -464,10 +398,8 @@ struct AxpbyUnificationAttemptTraits { msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks(" ")" << ", invalid rank-2 X extents" - << ": X.extent(0) = " << X.extent(0) - << ", X.extent(1) = " << X.extent(1) - << ", Y.extent(0) = " << Y.extent(0) - << ", Y.extent(1) = " << Y.extent(1); + << ": X.extent(0) = " << X.extent(0) << ", X.extent(1) = " << X.extent(1) + << ", Y.extent(0) = " << Y.extent(0) << ", Y.extent(1) = " << Y.extent(1); KokkosKernels::Impl::throw_runtime_exception(msg.str()); } } @@ -476,10 +408,8 @@ struct AxpbyUnificationAttemptTraits { // Check 4/6: AV is valid // ****************************************************************** static_assert( - (a_is_scalar && !a_is_r0 && !a_is_r1s && !a_is_r1d) || - (!a_is_scalar && a_is_r0 && !a_is_r1s && !a_is_r1d) || - (!a_is_scalar && !a_is_r0 && a_is_r1s && !a_is_r1d) || - (!a_is_scalar && !a_is_r0 && !a_is_r1s && a_is_r1d), + (a_is_scalar && !a_is_r0 && !a_is_r1s && !a_is_r1d) || (!a_is_scalar && a_is_r0 && !a_is_r1s && !a_is_r1d) || + (!a_is_scalar && !a_is_r0 && a_is_r1s && !a_is_r1d) || (!a_is_scalar && !a_is_r0 && !a_is_r1s && a_is_r1d), "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" ": 'a' must be either scalar or rank 0 or rank 1 static or rank 1 " "dynamic"); @@ -495,8 +425,7 @@ struct AxpbyUnificationAttemptTraits { KokkosKernels::Impl::throw_runtime_exception(msg.str()); } } else { - if ((a.extent(0) == 1) || - (a.extent(0) == Y.extent(1))) { // Yes, 'Y' is the reference + if ((a.extent(0) == 1) || (a.extent(0) == Y.extent(1))) { // Yes, 'Y' is the reference // Ok } else { std::ostringstream msg; @@ -504,8 +433,7 @@ struct AxpbyUnificationAttemptTraits { "performChecks()" << ": view 'a' must have extent(0) == 1 or Y.extent(1) for " "xyRank2Case" - << ", a.extent(0) = " << a.extent(0) - << ", Y.extent(0) = " << Y.extent(0) + << ", a.extent(0) = " << 
a.extent(0) << ", Y.extent(0) = " << Y.extent(0) << ", Y.extent(1) = " << Y.extent(1); KokkosKernels::Impl::throw_runtime_exception(msg.str()); } @@ -516,10 +444,8 @@ struct AxpbyUnificationAttemptTraits { // Check 5/6: BV is valid // ****************************************************************** static_assert( - (b_is_scalar && !b_is_r0 && !b_is_r1s && !b_is_r1d) || - (!b_is_scalar && b_is_r0 && !b_is_r1s && !b_is_r1d) || - (!b_is_scalar && !b_is_r0 && b_is_r1s && !b_is_r1d) || - (!b_is_scalar && !b_is_r0 && !b_is_r1s && b_is_r1d), + (b_is_scalar && !b_is_r0 && !b_is_r1s && !b_is_r1d) || (!b_is_scalar && b_is_r0 && !b_is_r1s && !b_is_r1d) || + (!b_is_scalar && !b_is_r0 && b_is_r1s && !b_is_r1d) || (!b_is_scalar && !b_is_r0 && !b_is_r1s && b_is_r1d), "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" ": 'b' must be either scalar or rank 0 or rank 1 static or rank 1 " "dynamic"); @@ -543,8 +469,7 @@ struct AxpbyUnificationAttemptTraits { "performChecks()" << ": view 'b' must have extent(0) == 1 or Y.extent(1) for " "xyRank2Case" - << ", b.extent(0) = " << b.extent(0) - << ", Y.extent(0) = " << Y.extent(0) + << ", b.extent(0) = " << b.extent(0) << ", Y.extent(0) = " << Y.extent(0) << ", Y.extent(1) = " << Y.extent(1); KokkosKernels::Impl::throw_runtime_exception(msg.str()); } @@ -556,147 +481,115 @@ struct AxpbyUnificationAttemptTraits { // ****************************************************************** if constexpr (onHost) { if constexpr (xyRank1Case) { - constexpr bool internalTypeA_isOk = - (internalTypeA_is_scalar || internalTypeA_is_r1d); - static_assert( - internalTypeA_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onHost, xyRank1Case: InternalTypeA is wrong"); - - constexpr bool internalTypeX_isOk = std::is_same_v< - InternalTypeX, - Kokkos::View>>; - static_assert( - internalTypeX_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onHost, xyRank1Case: InternalTypeX is 
wrong"); - - constexpr bool internalTypeB_isOk = - (internalTypeB_is_scalar || internalTypeB_is_r1d); - static_assert( - internalTypeB_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onHost, xyRank1Case: InternalTypeB is wrong"); - - constexpr bool internalTypeY_isOk = std::is_same_v< - InternalTypeY, - Kokkos::View>>; - static_assert( - internalTypeY_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onHost, xyRank1Case: InternalTypeY is wrong"); + constexpr bool internalTypeA_isOk = (internalTypeA_is_scalar || internalTypeA_is_r1d); + static_assert(internalTypeA_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank1Case: InternalTypeA is wrong"); + + constexpr bool internalTypeX_isOk = + std::is_same_v>>; + static_assert(internalTypeX_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank1Case: InternalTypeX is wrong"); + + constexpr bool internalTypeB_isOk = (internalTypeB_is_scalar || internalTypeB_is_r1d); + static_assert(internalTypeB_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank1Case: InternalTypeB is wrong"); + + constexpr bool internalTypeY_isOk = + std::is_same_v>>; + static_assert(internalTypeY_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank1Case: InternalTypeY is wrong"); } else { - constexpr bool internalTypeA_isOk = - (internalTypeA_is_scalar || internalTypeA_is_r1d); - static_assert( - internalTypeA_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onHost, xyRank2Case: InternalTypeA is wrong"); - - constexpr bool internalTypeX_isOk = std::is_same_v< - InternalTypeX, - Kokkos::View>>; - static_assert( - internalTypeX_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onHost, xyRank2Case: InternalTypeX is wrong"); - - constexpr bool internalTypeB_isOk = - 
(internalTypeB_is_scalar || internalTypeB_is_r1d); - static_assert( - internalTypeB_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onHost, xyRank2Case: InternalTypeB is wrong"); - - constexpr bool internalTypeY_isOk = std::is_same_v< - InternalTypeY, - Kokkos::View>>; - static_assert( - internalTypeY_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onHost, xyRank2Case: InternalTypeY is wrong"); + constexpr bool internalTypeA_isOk = (internalTypeA_is_scalar || internalTypeA_is_r1d); + static_assert(internalTypeA_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank2Case: InternalTypeA is wrong"); + + constexpr bool internalTypeX_isOk = + std::is_same_v>>; + static_assert(internalTypeX_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank2Case: InternalTypeX is wrong"); + + constexpr bool internalTypeB_isOk = (internalTypeB_is_scalar || internalTypeB_is_r1d); + static_assert(internalTypeB_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank2Case: InternalTypeB is wrong"); + + constexpr bool internalTypeY_isOk = + std::is_same_v>>; + static_assert(internalTypeY_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank2Case: InternalTypeY is wrong"); } } else { if constexpr (xyRank1Case) { constexpr bool internalTypeA_isOk = - internalTypeA_is_r1d || - (a_is_scalar && b_is_scalar && internalTypeA_is_scalar); - static_assert( - internalTypeA_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onDevice, xyRank1Case: InternalTypeA is wrong"); - - constexpr bool internalTypeX_isOk = std::is_same_v< - InternalTypeX, - Kokkos::View>>; - static_assert( - internalTypeX_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onDevice, xyRank1Case: InternalTypeX is wrong"); + internalTypeA_is_r1d || 
(a_is_scalar && b_is_scalar && internalTypeA_is_scalar); + static_assert(internalTypeA_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank1Case: InternalTypeA is wrong"); + + constexpr bool internalTypeX_isOk = + std::is_same_v>>; + static_assert(internalTypeX_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank1Case: InternalTypeX is wrong"); constexpr bool internalTypeB_isOk = - internalTypeB_is_r1d || - (a_is_scalar && b_is_scalar && internalTypeA_is_scalar); - static_assert( - internalTypeB_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onDevice, xyRank1Case: InternalTypeB is wrong"); - - constexpr bool internalTypeY_isOk = std::is_same_v< - InternalTypeY, - Kokkos::View>>; - static_assert( - internalTypeY_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onDevice, xyRank1Case: InternalTypeY is wrong"); + internalTypeB_is_r1d || (a_is_scalar && b_is_scalar && internalTypeA_is_scalar); + static_assert(internalTypeB_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank1Case: InternalTypeB is wrong"); + + constexpr bool internalTypeY_isOk = + std::is_same_v>>; + static_assert(internalTypeY_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank1Case: InternalTypeY is wrong"); } else { constexpr bool internalTypeA_isOk = - internalTypeA_is_r1d || - (a_is_scalar && b_is_scalar && internalTypeA_is_scalar); - static_assert( - internalTypeA_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onDevice, xyRank2Case: InternalTypeA is wrong"); - - constexpr bool internalTypeX_isOk = std::is_same_v< - InternalTypeX, - Kokkos::View>>; - static_assert( - internalTypeX_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onDevice, xyRank2Case: InternalTypeX is wrong"); + internalTypeA_is_r1d 
|| (a_is_scalar && b_is_scalar && internalTypeA_is_scalar); + static_assert(internalTypeA_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank2Case: InternalTypeA is wrong"); + + constexpr bool internalTypeX_isOk = + std::is_same_v>>; + static_assert(internalTypeX_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank2Case: InternalTypeX is wrong"); constexpr bool internalTypeB_isOk = - internalTypeB_is_r1d || - (a_is_scalar && b_is_scalar && internalTypeB_is_scalar); - static_assert( - internalTypeB_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onDevice, xyRank2Case: InternalTypeB is wrong"); - - constexpr bool internalTypeY_isOk = std::is_same_v< - InternalTypeY, - Kokkos::View>>; - static_assert( - internalTypeY_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onDevice, xyRank2Case: InternalTypeY is wrong"); + internalTypeB_is_r1d || (a_is_scalar && b_is_scalar && internalTypeB_is_scalar); + static_assert(internalTypeB_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank2Case: InternalTypeB is wrong"); + + constexpr bool internalTypeY_isOk = + std::is_same_v>>; + static_assert(internalTypeY_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank2Case: InternalTypeY is wrong"); } } @@ -714,10 +607,9 @@ struct AxpbyUnificationAttemptTraits { // - [InternalTypeA, B] = [S_a, S_b], or // - [InternalTypeA, B] = [view, view] // **************************************************************** - static_assert( - internalTypesAB_bothScalars || internalTypesAB_bothViews, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onHost, invalid combination of types"); + static_assert(internalTypesAB_bothScalars || internalTypesAB_bothViews, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, 
invalid combination of types"); } // If onHost else if constexpr (onDevice) { // **************************************************************** @@ -733,35 +625,25 @@ struct AxpbyUnificationAttemptTraits { // - [InternalTypeA, B] = [S_a, S_b], or // - [InternalTypeA, B] = [view, view] // **************************************************************** - static_assert( - internalTypesAB_bothViews || - (a_is_scalar && b_is_scalar && internalTypesAB_bothScalars), - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onDevice, invalid combination of types"); + static_assert(internalTypesAB_bothViews || (a_is_scalar && b_is_scalar && internalTypesAB_bothScalars), + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, invalid combination of types"); } - if constexpr (xyRank2Case && (a_is_r1d || a_is_r1s) && - atInputLayoutA_isStride) { - static_assert( - std::is_same_v< - typename getLayoutFromView< - InternalTypeA, Kokkos::is_view_v>::type, - Kokkos::LayoutStride>, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", xyRank2Case: coeff 'a' is rank-1 and has LayoutStride at input" - ", but no LayoutStride internally"); + if constexpr (xyRank2Case && (a_is_r1d || a_is_r1s) && atInputLayoutA_isStride) { + static_assert(std::is_same_v>::type, + Kokkos::LayoutStride>, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", xyRank2Case: coeff 'a' is rank-1 and has LayoutStride at input" + ", but no LayoutStride internally"); } - if constexpr (xyRank2Case && (b_is_r1d || b_is_r1s) && - atInputLayoutB_isStride) { - static_assert( - std::is_same_v< - typename getLayoutFromView< - InternalTypeB, Kokkos::is_view_v>::type, - Kokkos::LayoutStride>, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", xyRank2Case: coeff 'b' is rank-1 and has LayoutStride at input" - ", but no LayoutStride internally"); + if constexpr (xyRank2Case && (b_is_r1d || b_is_r1s) && 
atInputLayoutB_isStride) { + static_assert(std::is_same_v>::type, + Kokkos::LayoutStride>, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", xyRank2Case: coeff 'b' is rank-1 and has LayoutStride at input" + ", but no LayoutStride internally"); } } // Constructor @@ -776,28 +658,20 @@ struct AxpbyUnificationAttemptTraits { //<< ", AV::non_const_data_type = " << // typeid(AV::non_const_data_type).name() << ", AtInputScalarTypeA = " << typeid(AtInputScalarTypeA).name() - << ", isConst = " - << std::is_const_v << ", isComplex = " + << ", isConst = " << std::is_const_v << ", isComplex = " << Kokkos::ArithTraits::is_complex - << ", AtInputScalarTypeA_nonConst = " - << typeid(AtInputScalarTypeA_nonConst).name() + << ", AtInputScalarTypeA_nonConst = " << typeid(AtInputScalarTypeA_nonConst).name() << ", InternalTypeA = " << typeid(InternalTypeA).name() << "\n" - << ", InternalTypeA_managed = " << typeid(InternalTypeA_managed).name() - << "\n" + << ", InternalTypeA_managed = " << typeid(InternalTypeA_managed).name() << "\n" << "\n" << "XMV = " << typeid(XMV).name() << "\n" - << "XMV::value_type = " << typeid(typename XMV::value_type).name() - << "\n" - << "XMV::const_data_type = " - << typeid(typename XMV::const_data_type).name() << "\n" - << "XMV::non_const_data_type = " - << typeid(typename XMV::non_const_data_type).name() << "\n" + << "XMV::value_type = " << typeid(typename XMV::value_type).name() << "\n" + << "XMV::const_data_type = " << typeid(typename XMV::const_data_type).name() << "\n" + << "XMV::non_const_data_type = " << typeid(typename XMV::non_const_data_type).name() << "\n" << "AtInputScalarTypeX = " << typeid(AtInputScalarTypeX).name() << "\n" << "isConst = " << std::is_const_v << "\n" - << "isComplex = " - << Kokkos::ArithTraits::is_complex << "\n" - << "AtInputScalarTypeX_nonConst = " - << typeid(AtInputScalarTypeX_nonConst).name() << "\n" + << "isComplex = " << Kokkos::ArithTraits::is_complex << "\n" + << "AtInputScalarTypeX_nonConst 
= " << typeid(AtInputScalarTypeX_nonConst).name() << "\n" << "InternalTypeX = " << typeid(InternalTypeX).name() << "\n" << "\n" << "BV = " @@ -806,28 +680,20 @@ struct AxpbyUnificationAttemptTraits { //<< ", BV::non_const_data_type = " << // typeid(BV::non_const_data_type).name() << ", AtInputScalarTypeB = " << typeid(AtInputScalarTypeB).name() - << ", isConst = " - << std::is_const_v << ", isComplex = " + << ", isConst = " << std::is_const_v << ", isComplex = " << Kokkos::ArithTraits::is_complex - << ", AtInputScalarTypeB_nonConst = " - << typeid(AtInputScalarTypeB_nonConst).name() + << ", AtInputScalarTypeB_nonConst = " << typeid(AtInputScalarTypeB_nonConst).name() << ", InternalTypeB = " << typeid(InternalTypeB).name() << "\n" - << ", InternalTypeB_managed = " << typeid(InternalTypeB_managed).name() - << "\n" + << ", InternalTypeB_managed = " << typeid(InternalTypeB_managed).name() << "\n" << "\n" << "YMV = " << typeid(YMV).name() << "\n" - << "YMV::value_type = " << typeid(typename YMV::value_type).name() - << "\n" - << "YMV::const_data_type = " - << typeid(typename YMV::const_data_type).name() << "\n" - << "YMV::non_const_data_type = " - << typeid(typename YMV::non_const_data_type).name() << "\n" + << "YMV::value_type = " << typeid(typename YMV::value_type).name() << "\n" + << "YMV::const_data_type = " << typeid(typename YMV::const_data_type).name() << "\n" + << "YMV::non_const_data_type = " << typeid(typename YMV::non_const_data_type).name() << "\n" << "AtInputScalarTypeY = " << typeid(AtInputScalarTypeY).name() << "\n" << "isConst = " << std::is_const_v << "\n" - << "isComplex = " - << Kokkos::ArithTraits::is_complex << "\n" - << "AtInputScalarTypeY_nonConst = " - << typeid(AtInputScalarTypeY_nonConst).name() << "\n" + << "isComplex = " << Kokkos::ArithTraits::is_complex << "\n" + << "AtInputScalarTypeY_nonConst = " << typeid(AtInputScalarTypeY_nonConst).name() << "\n" << "InternalTypeY = " << typeid(InternalTypeY).name() << "\n" << std::endl; } @@ -840,8 
+706,7 @@ struct AxpbyUnificationAttemptTraits { template struct getScalarValueFromVariableAtHost { getScalarValueFromVariableAtHost() { - static_assert((rankT == -1) || (rankT == 0) || (rankT == 1), - "Generic struct should not have been invoked!"); + static_assert((rankT == -1) || (rankT == 0) || (rankT == 1), "Generic struct should not have been invoked!"); } }; @@ -879,8 +744,7 @@ template size_t getStrideInCoefficient(T const& coeff) { size_t result = 1; if constexpr (Kokkos::is_view_v) { - if constexpr ((T::rank == 1) && (std::is_same_v)) { + if constexpr ((T::rank == 1) && (std::is_same_v)) { result = coeff.stride_0(); } } @@ -890,8 +754,7 @@ size_t getStrideInCoefficient(T const& coeff) { // -------------------------------- template -static void populateRank1Stride1ViewWithScalarOrNonStrideView( - T_in const& coeff_in, T_out& coeff_out) { +static void populateRank1Stride1ViewWithScalarOrNonStrideView(T_in const& coeff_in, T_out& coeff_out) { // *********************************************************************** // 'coeff_out' is assumed to be rank-1, of LayoutLeft or LayoutRight // @@ -899,8 +762,7 @@ static void populateRank1Stride1ViewWithScalarOrNonStrideView( // - a coeff_in that deals with 'double', and // - a coeff_out deals with 'complex' // *********************************************************************** - using ScalarOutType = - typename std::remove_const::type; + using ScalarOutType = typename std::remove_const::type; if constexpr (!Kokkos::is_view_v) { // ********************************************************************* @@ -924,17 +786,13 @@ static void populateRank1Stride1ViewWithScalarOrNonStrideView( std::ostringstream msg; msg << "In populateRank1Stride1ViewWithScalarOrNonStrideView()" << ": 'in' and 'out' should have the same extent(0)" - << ", T_in = " << typeid(T_in).name() - << ", coeff_in.label() = " << coeff_in.label() - << ", coeff_in.extent(0) = " << coeff_in.extent(0) - << ", T_out = " << typeid(T_out).name() - << ", 
coeff_out.label() = " << coeff_out.label() - << ", coeff_out.extent(0) = " << coeff_out.extent(0); + << ", T_in = " << typeid(T_in).name() << ", coeff_in.label() = " << coeff_in.label() + << ", coeff_in.extent(0) = " << coeff_in.extent(0) << ", T_out = " << typeid(T_out).name() + << ", coeff_out.label() = " << coeff_out.label() << ", coeff_out.extent(0) = " << coeff_out.extent(0); KokkosKernels::Impl::throw_runtime_exception(msg.str()); } - using ScalarInType = - typename std::remove_const::type; + using ScalarInType = typename std::remove_const::type; if constexpr (std::is_same_v) { coeff_out = coeff_in; } else if (coeff_out.extent(0) == 1) { @@ -946,14 +804,10 @@ static void populateRank1Stride1ViewWithScalarOrNonStrideView( std::ostringstream msg; msg << "In populateRank1Stride1ViewWithScalarOrNonStrideView()" << ": scalar types 'in' and 'out' should be the same" - << ", T_in = " << typeid(T_in).name() - << ", ScalarInType = " << typeid(ScalarInType).name() - << ", coeff_in.label() = " << coeff_in.label() - << ", coeff_in.extent(0) = " << coeff_in.extent(0) - << ", T_out = " << typeid(T_out).name() - << ", ScalarOutType = " << typeid(ScalarOutType).name() - << ", coeff_out.label() = " << coeff_out.label() - << ", coeff_out.extent(0) = " << coeff_out.extent(0); + << ", T_in = " << typeid(T_in).name() << ", ScalarInType = " << typeid(ScalarInType).name() + << ", coeff_in.label() = " << coeff_in.label() << ", coeff_in.extent(0) = " << coeff_in.extent(0) + << ", T_out = " << typeid(T_out).name() << ", ScalarOutType = " << typeid(ScalarOutType).name() + << ", coeff_out.label() = " << coeff_out.label() << ", coeff_out.extent(0) = " << coeff_out.extent(0); KokkosKernels::Impl::throw_runtime_exception(msg.str()); } } diff --git a/blas/impl/KokkosBlas1_dot_impl.hpp b/blas/impl/KokkosBlas1_dot_impl.hpp index 2003f7cc2c..61e7307bc8 100644 --- a/blas/impl/KokkosBlas1_dot_impl.hpp +++ b/blas/impl/KokkosBlas1_dot_impl.hpp @@ -30,8 +30,7 @@ namespace Impl { /// \tparam YVector 
Type of the second vector y; 1-D View /// \tparam SizeType Type of the row index used in the dot product. /// For best performance, use int instead of size_t here. -template +template struct DotFunctor { typedef SizeType size_type; typedef typename AV::non_const_value_type avalue_type; @@ -44,26 +43,19 @@ struct DotFunctor { DotFunctor(const XVector& x, const YVector& y) : m_x(x), m_y(y) {} void run(const char* label, const execution_space& space, AV result) { - Kokkos::RangePolicy policy(space, 0, - m_x.extent(0)); + Kokkos::RangePolicy policy(space, 0, m_x.extent(0)); Kokkos::parallel_reduce(label, policy, *this, result); } // Prefer const size_type& to const size_type or size_type, // since the compiler has an easier time inlining the former. - KOKKOS_FORCEINLINE_FUNCTION void operator()(const size_type& i, - value_type& sum) const { + KOKKOS_FORCEINLINE_FUNCTION void operator()(const size_type& i, value_type& sum) const { Kokkos::Details::updateDot(sum, m_x(i), m_y(i)); // sum += m_x(i) * m_y(i) } - KOKKOS_INLINE_FUNCTION void init(value_type& update) const { - update = Kokkos::ArithTraits::zero(); - } + KOKKOS_INLINE_FUNCTION void init(value_type& update) const { update = Kokkos::ArithTraits::zero(); } - KOKKOS_INLINE_FUNCTION void join(value_type& update, - const value_type& source) const { - update += source; - } + KOKKOS_INLINE_FUNCTION void join(value_type& update, const value_type& source) const { update += source; } }; } // namespace Impl diff --git a/blas/impl/KokkosBlas1_dot_mv_impl.hpp b/blas/impl/KokkosBlas1_dot_mv_impl.hpp index d19e512599..15db366ceb 100644 --- a/blas/impl/KokkosBlas1_dot_mv_impl.hpp +++ b/blas/impl/KokkosBlas1_dot_mv_impl.hpp @@ -27,9 +27,8 @@ namespace Impl { template struct Dot_MV_Functor { - using Scalar = typename RV::non_const_value_type; - using IPT = Kokkos::Details::InnerProductSpaceTraits< - typename XV::non_const_value_type>; + using Scalar = typename RV::non_const_value_type; + using IPT = 
Kokkos::Details::InnerProductSpaceTraits; using dot_type = typename IPT::dot_type; using KAT = Kokkos::ArithTraits; @@ -39,8 +38,7 @@ struct Dot_MV_Functor { XV x; YV y; - size_type - teamsPerDot; // number of teams collectively performing a dot product + size_type teamsPerDot; // number of teams collectively performing a dot product Dot_MV_Functor(const RV& r_, const XV& x_, const YV& y_, int teamsPerDot_) : r(r_), x(x_), y(y_), teamsPerDot(teamsPerDot_) {} @@ -60,13 +58,11 @@ struct Dot_MV_Functor { Kokkos::parallel_reduce( Kokkos::TeamThreadRange(t, begin, end), [&](size_type k, dot_type& update) { - Kokkos::Details::updateDot(update, x.access(k, xcol), - y.access(k, ycol)); + Kokkos::Details::updateDot(update, x.access(k, xcol), y.access(k, ycol)); }, localResult); - Kokkos::single(Kokkos::PerTeam(t), - [&]() { Kokkos::atomic_add(&r(i), Scalar(localResult)); }); + Kokkos::single(Kokkos::PerTeam(t), [&]() { Kokkos::atomic_add(&r(i), Scalar(localResult)); }); } }; @@ -75,14 +71,12 @@ struct Dot_MV_Functor { template void MV_Dot_Invoke( const execution_space& space, const RV& r, const XV& x, const YV& y, - typename std::enable_if::accessible>::type* = + typename std::enable_if::accessible>::type* = nullptr) { size_type numDots = std::max(x.extent(1), y.extent(1)); if (x.extent(0) != y.extent(0)) { std::ostringstream oss; - oss << "KokkosBlas::dot (rank-2): x and y have different lengths (" - << x.extent(0) << " and " << y.extent(0) << ")"; + oss << "KokkosBlas::dot (rank-2): x and y have different lengths (" << x.extent(0) << " and " << y.extent(0) << ")"; throw std::runtime_error(oss.str()); } if ((x.extent(1) != size_t(1) && x.extent(1) != size_t(numDots)) || @@ -95,23 +89,17 @@ void MV_Dot_Invoke( } if (r.extent(0) != size_t(numDots)) { std::ostringstream oss; - oss << "KokkosBlas::dot (rank-2): result vector has wrong length (" - << r.extent(0) << ", but " << numDots + oss << "KokkosBlas::dot (rank-2): result vector has wrong length (" << r.extent(0) << ", but 
" << numDots << " dot products will be computed)"; throw std::runtime_error(oss.str()); } // Zero out the result vector - Kokkos::deep_copy( - space, r, Kokkos::ArithTraits::zero()); + Kokkos::deep_copy(space, r, Kokkos::ArithTraits::zero()); size_type teamsPerDot; - KokkosBlas::Impl::multipleReductionWorkDistribution( - x.extent(0), numDots, teamsPerDot); + KokkosBlas::Impl::multipleReductionWorkDistribution(x.extent(0), numDots, teamsPerDot); size_type numTeams = numDots * teamsPerDot; Kokkos::TeamPolicy pol(space, numTeams, Kokkos::AUTO); - Kokkos::parallel_for("Dot_MV", pol, - Dot_MV_Functor( - r, x, y, teamsPerDot)); + Kokkos::parallel_for("Dot_MV", pol, Dot_MV_Functor(r, x, y, teamsPerDot)); } // Version for when a temporary result view is needed (implemented in terms of @@ -119,15 +107,11 @@ void MV_Dot_Invoke( template void MV_Dot_Invoke( const execution_space& space, const RV& r, const XV& x, const YV& y, - typename std::enable_if::accessible>::type* = - nullptr) { - Kokkos::View - tempResult( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Dot_MV temp result"), - r.extent(0)); - MV_Dot_Invoke( - space, tempResult, x, y); + typename std::enable_if< + !Kokkos::SpaceAccessibility::accessible>::type* = nullptr) { + Kokkos::View tempResult( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Dot_MV temp result"), r.extent(0)); + MV_Dot_Invoke(space, tempResult, x, y); Kokkos::deep_copy(space, r, tempResult); space.fence(); } diff --git a/blas/impl/KokkosBlas1_dot_spec.hpp b/blas/impl/KokkosBlas1_dot_spec.hpp index 02efee6bc5..982e2eaa0c 100644 --- a/blas/impl/KokkosBlas1_dot_spec.hpp +++ b/blas/impl/KokkosBlas1_dot_spec.hpp @@ -54,15 +54,11 @@ struct DotAccumulatingScalar> { template struct HasSpecialAccumulator { - enum : bool { - value = !std::is_same::type>::value - }; + enum : bool { value = !std::is_same::type>::value }; }; // Specialization struct which defines whether a specialization exists -template +template struct dot_eti_spec_avail { enum : bool 
{ value = false }; }; @@ -75,34 +71,27 @@ struct dot_eti_spec_avail { // the declarations of full specializations go in this header file. // We may spread out definitions (see _INST macro below) across one or // more .cpp files. -#define KOKKOSBLAS1_DOT_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct dot_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 1, 1> { \ - enum : bool { value = true }; \ - }; \ - template <> \ - struct dot_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 1, 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_DOT_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct dot_eti_spec_avail>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 1, 1> { \ + enum : bool { value = true }; \ + }; \ + template <> \ + struct dot_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 1, 1> { \ + enum : bool { value = true }; \ }; // @@ -112,55 +101,42 @@ struct dot_eti_spec_avail { // We may spread out definitions (see _DEF macro below) across one or // more .cpp files. 
// -#define KOKKOSBLAS1_DOT_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct dot_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 2, 2> { \ - enum : bool { value = true }; \ - }; \ - template <> \ - struct dot_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 2, 1> { \ - enum : bool { value = true }; \ - }; \ - template <> \ - struct dot_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 1, 2> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_DOT_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct dot_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 2, 2> { \ + enum : bool { value = true }; \ + }; \ + template <> \ + struct dot_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 2, 1> { \ + enum : bool { value = true }; \ + }; \ + template <> \ + struct dot_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 1, 2> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -172,36 +148,28 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template ::value, - bool eti_spec_avail = - dot_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = dot_eti_spec_avail::value> struct Dot { - static void dot(const execution_space& 
space, const RV&, const XV& R, - const YV& X); + static void dot(const execution_space& space, const RV&, const XV& R, const YV& X); }; // This version never has TPL support, but it does use the same ETI system template ::value> + bool eti_spec_avail = dot_eti_spec_avail::value> struct DotSpecialAccumulator { // Note: not doing the static_asserts to validate RV, XV, YV since those // errors would have already arisen when building the library. - using size_type = typename YV::size_type; - using dot_type = typename Kokkos::Details::InnerProductSpaceTraits< - typename XV::non_const_value_type>::dot_type; + using size_type = typename YV::size_type; + using dot_type = typename Kokkos::Details::InnerProductSpaceTraits::dot_type; using accum_type = typename DotAccumulatingScalar::type; // This is the same View type as RV, but using the special accumulator as the // value type - using RV_Result = Kokkos::View>; - static void dot(const execution_space& space, const RV_Result& R, const XV& X, - const YV& Y); + static void dot(const execution_space& space, const RV_Result& R, const XV& X, const YV& Y); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY @@ -209,8 +177,7 @@ struct DotSpecialAccumulator { // The rank-1 case is currently the only one that may use a different // accumulator type than InnerProductSpaceTraits::dot_type. template -struct Dot { +struct Dot { // Check some things about the template parameters at compile time to get nice // error messages, before using them under the assumption they are valid. static_assert(Kokkos::is_view::value, @@ -231,8 +198,7 @@ struct Dot: " "YV is not rank 1."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Dot<1D>: R is const. 
" "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -243,23 +209,18 @@ struct Dot> RV_Result; - static void dot(const execution_space& space, const RV& R, const XV& X, - const YV& Y) { - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::dot[ETI]" - : "KokkosBlas::dot[noETI]"); + static void dot(const execution_space& space, const RV& R, const XV& X, const YV& Y) { + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::dot[ETI]" + : "KokkosBlas::dot[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas::dot<> ETI specialization for < %s , %s >\n", - typeid(XV).name(), typeid(YV).name()); + printf("KokkosBlas::dot<> ETI specialization for < %s , %s >\n", typeid(XV).name(), typeid(YV).name()); else { - printf("KokkosBlas::dot<> non-ETI specialization for < %s , %s >\n", - typeid(XV).name(), typeid(YV).name()); + printf("KokkosBlas::dot<> non-ETI specialization for < %s , %s >\n", typeid(XV).name(), typeid(YV).name()); } #endif const size_type numElems = X.extent(0); @@ -282,8 +243,7 @@ struct Dot -struct DotSpecialAccumulator { +struct DotSpecialAccumulator { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "DotSpecialAccumulator: XV is not a Kokkos::View."); @@ -299,38 +259,30 @@ struct DotSpecialAccumulator::value, "KokkosBlas::Impl::" "DotSpecialAccumulator: RV is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::DotSpecialAccumulator: X and Y have " "different scalar types."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Dot<1D>: R is const. 
" "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - using size_type = typename YV::size_type; - using dot_type = typename Kokkos::Details::InnerProductSpaceTraits< - typename XV::non_const_value_type>::dot_type; + using size_type = typename YV::size_type; + using dot_type = typename Kokkos::Details::InnerProductSpaceTraits::dot_type; using accum_type = typename DotAccumulatingScalar::type; // This is the same View type as RV, but using the special accumulator as the // value type - using RV_Result = Kokkos::View>; - static void dot(const execution_space& space, const RV_Result& R, const XV& X, - const YV& Y) { - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::dot[ETI]" - : "KokkosBlas::dot[noETI]"); + static void dot(const execution_space& space, const RV_Result& R, const XV& X, const YV& Y) { + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::dot[ETI]" + : "KokkosBlas::dot[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas::dot<> ETI specialization for < %s , %s >\n", - typeid(XV).name(), typeid(YV).name()); + printf("KokkosBlas::dot<> ETI specialization for < %s , %s >\n", typeid(XV).name(), typeid(YV).name()); else { - printf("KokkosBlas::dot<> non-ETI specialization for < %s , %s >\n", - typeid(XV).name(), typeid(YV).name()); + printf("KokkosBlas::dot<> non-ETI specialization for < %s , %s >\n", typeid(XV).name(), typeid(YV).name()); } #endif const size_type numElems = X.extent(0); @@ -348,10 +300,8 @@ struct DotSpecialAccumulator -struct Dot { +template +struct Dot { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Dot<2-D>: XV is not a Kokkos::View."); @@ -367,29 +317,25 @@ struct Dot - static auto getFirstColumn( - const V& v, typename std::enable_if::type* = nullptr) { + static auto getFirstColumn(const V& v, typename std::enable_if::type* = nullptr) { 
return Kokkos::subview(v, Kokkos::ALL(), 0); } template - static V getFirstColumn( - const V& v, typename std::enable_if::type* = nullptr) { + static V getFirstColumn(const V& v, typename std::enable_if::type* = nullptr) { return v; } - static void dot(const execution_space& space, const RV& R, const XV& X, - const YV& Y) { - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::dot[ETI]" - : "KokkosBlas::dot[noETI]"); + static void dot(const execution_space& space, const RV& R, const XV& X, const YV& Y) { + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::dot[ETI]" + : "KokkosBlas::dot[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::dot<> ETI specialization for < %s , %s , %s >\n", - typeid(RV).name(), typeid(XV).name(), typeid(YV).name()); + printf("KokkosBlas1::dot<> ETI specialization for < %s , %s , %s >\n", typeid(RV).name(), typeid(XV).name(), + typeid(YV).name()); else { - printf("KokkosBlas1::dot<> non-ETI specialization for < %s , %s , %s >\n", - typeid(RV).name(), typeid(XV).name(), typeid(YV).name()); + printf("KokkosBlas1::dot<> non-ETI specialization for < %s , %s , %s >\n", typeid(RV).name(), typeid(XV).name(), + typeid(YV).name()); } #endif @@ -401,20 +347,15 @@ struct Dot(INT_MAX)) { typedef int index_type; - DotFunctor - f(X0, Y0); + DotFunctor f(X0, Y0); f.run("KokkosBlas::dot<1D>", space, R0); } else { typedef int64_t index_type; - DotFunctor - f(X0, Y0); + DotFunctor f(X0, Y0); f.run("KokkosBlas::dot<1D>", space, R0); } } else { - if (numRows < static_cast(INT_MAX) && - numRows * numDots < static_cast(INT_MAX)) { + if (numRows < static_cast(INT_MAX) && numRows * numDots < static_cast(INT_MAX)) { typedef int index_type; MV_Dot_Invoke(space, R, X, Y); } else { @@ -437,95 +378,68 @@ struct Dot>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 1, 1, false, true>; \ - 
extern template struct Dot< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 1, 1, false, true>; \ - extern template struct DotSpecialAccumulator< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true>; \ - extern template struct DotSpecialAccumulator< \ - EXEC_SPACE, \ - Kokkos::View>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSBLAS1_DOT_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Dot>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 1, 1, false, true>; \ + extern template struct Dot< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 1, 1, false, true>; \ + extern template struct DotSpecialAccumulator< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true>; \ + extern template struct DotSpecialAccumulator< \ + EXEC_SPACE, Kokkos::View>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ true>; -#define KOKKOSBLAS1_DOT_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct Dot>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 1, 1, false, true>; \ - template struct Dot< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 1, 1, false, true>; \ - template struct DotSpecialAccumulator< \ - EXEC_SPACE, \ - Kokkos::View>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 
true>; \ - template struct DotSpecialAccumulator< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSBLAS1_DOT_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Dot>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 1, 1, false, true>; \ + template struct Dot< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 1, 1, false, true>; \ + template struct DotSpecialAccumulator< \ + EXEC_SPACE, Kokkos::View>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true>; \ + template struct DotSpecialAccumulator< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ true>; // @@ -534,88 +448,62 @@ struct Dot, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 2, 2, false, true>; \ - extern template struct Dot< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 2, 1, false, true>; \ - extern template struct Dot< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSBLAS1_DOT_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Dot< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 2, 2, false, true>; \ + extern template struct Dot< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 
Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 2, 1, false, true>; \ + extern template struct Dot< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ 1, 2, false, true>; -#define KOKKOSBLAS1_DOT_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template struct Dot< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 2, 2, false, true>; \ - template struct Dot< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 2, 1, false, true>; \ - template struct Dot< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSBLAS1_DOT_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Dot< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 2, 2, false, true>; \ + template struct Dot< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 2, 1, false, true>; \ + template struct Dot< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ 1, 2, false, true>; #include diff --git a/blas/impl/KokkosBlas1_iamax_impl.hpp b/blas/impl/KokkosBlas1_iamax_impl.hpp index 4c7a3fcc0c..bef00fad8c 100644 --- a/blas/impl/KokkosBlas1_iamax_impl.hpp +++ b/blas/impl/KokkosBlas1_iamax_impl.hpp @@ -29,8 +29,7 @@ namespace Impl { /// \tparam XV 1-D input View /// \tparam MagType Magnitude type /// \tparam SizeType Index type. Use int (32 bits) if possible. 
-template +template struct V_Iamax_Functor { using size_type = SizeType; using mag_type = MagType; @@ -47,8 +46,7 @@ struct V_Iamax_Functor { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::V_Iamax_Functor: " "X is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::V_Iamax_Functor: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -57,8 +55,7 @@ struct V_Iamax_Functor { "RV must have rank 0 and XV must have rank 1."); } - KOKKOS_INLINE_FUNCTION void operator()(const size_type i, - value_type& lmaxloc) const { + KOKKOS_INLINE_FUNCTION void operator()(const size_type i, value_type& lmaxloc) const { mag_type val = IPT::norm(m_x(i - 1)); mag_type maxval = IPT::norm(m_x(lmaxloc - 1)); if (val > maxval) lmaxloc = i; @@ -68,8 +65,7 @@ struct V_Iamax_Functor { update = Kokkos::reduction_identity::max() + 1; } - KOKKOS_INLINE_FUNCTION void join(value_type& update, - const value_type& source) const { + KOKKOS_INLINE_FUNCTION void join(value_type& update, const value_type& source) const { mag_type source_val = IPT::norm(m_x(source - 1)); mag_type update_val = IPT::norm(m_x(update - 1)); if (update_val < source_val) update = source; @@ -107,8 +103,7 @@ void MV_Iamax_Invoke(const execution_space& space, const RV& r, const XMV& X) { for (size_t i = 0; i < X.extent(1); i++) { auto ri = Kokkos::subview(r, i); auto Xi = Kokkos::subview(X, Kokkos::ALL(), i); - V_Iamax_Invoke( - space, ri, Xi); + V_Iamax_Invoke(space, ri, Xi); } } diff --git a/blas/impl/KokkosBlas1_iamax_spec.hpp b/blas/impl/KokkosBlas1_iamax_spec.hpp index 341b949050..80e4cb6036 100644 --- a/blas/impl/KokkosBlas1_iamax_spec.hpp +++ b/blas/impl/KokkosBlas1_iamax_spec.hpp @@ -43,39 +43,29 @@ struct iamax_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. 
// -#define KOKKOSBLAS1_IAMAX_ETI_SPEC_AVAIL_INDEX(INDEX_TYPE, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct iamax_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ - }; \ - template <> \ - struct iamax_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_IAMAX_ETI_SPEC_AVAIL_INDEX(INDEX_TYPE, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct iamax_eti_spec_avail< \ + EXEC_SPACE, Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ + }; \ + template <> \ + struct iamax_eti_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -#define KOKKOSBLAS1_IAMAX_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_ETI_SPEC_AVAIL_INDEX(unsigned long, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_ETI_SPEC_AVAIL_INDEX(unsigned int, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_ETI_SPEC_AVAIL_INDEX(int, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) +#define KOKKOSBLAS1_IAMAX_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_ETI_SPEC_AVAIL_INDEX(unsigned long, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_ETI_SPEC_AVAIL_INDEX(unsigned int, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_ETI_SPEC_AVAIL_INDEX(int, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) // // Macro for declaration of full specialization availability @@ -84,39 +74,29 @@ struct iamax_eti_spec_avail { // We may spread out definitions (see _DEF macro below) across one or // more .cpp files. 
// -#define KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_AVAIL_INDEX_HOST( \ - INDEX_TYPE, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct iamax_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2> { \ - enum : bool { value = true }; \ - }; \ - template <> \ - struct iamax_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_AVAIL_INDEX_HOST(INDEX_TYPE, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct iamax_eti_spec_avail< \ + EXEC_SPACE, Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2> { \ + enum : bool { value = true }; \ + }; \ + template <> \ + struct iamax_eti_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2> { \ + enum : bool { value = true }; \ }; -#define KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_AVAIL_INDEX_HOST( \ - unsigned long, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_AVAIL_INDEX_HOST(unsigned int, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_AVAIL_INDEX_HOST(int, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_AVAIL_INDEX_HOST(unsigned long, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_AVAIL_INDEX_HOST(unsigned int, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_AVAIL_INDEX_HOST(int, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) // Include the actual specialization declarations #include @@ -128,10 +108,8 @@ namespace Impl { // Unification layer template ::value, - bool eti_spec_avail = - iamax_eti_spec_avail::value> + bool tpl_spec_avail = iamax_tpl_spec_avail::value, + bool 
eti_spec_avail = iamax_eti_spec_avail::value> struct Iamax { static void iamax(const execution_space& space, const RMV& R, const XMV& X); }; @@ -139,8 +117,7 @@ struct Iamax { #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Iamax for single vectors (1-D Views). template -struct Iamax { +struct Iamax { typedef typename XMV::size_type size_type; static void iamax(const execution_space& space, const RMV& R, const XMV& X) { @@ -156,16 +133,13 @@ struct Iamax: " "XMV is not rank 1."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::iamax[ETI]" - : "KokkosBlas::iamax[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::iamax[ETI]" + : "KokkosBlas::iamax[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::iamax<> ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::iamax<> ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); else { - printf("KokkosBlas1::iamax<> non-ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::iamax<> non-ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); } #endif const size_type numRows = X.extent(0); @@ -181,8 +155,7 @@ struct Iamax -struct Iamax { +struct Iamax { typedef typename XMV::size_type size_type; static void iamax(const execution_space& space, const RV& R, const XMV& X) { @@ -198,23 +171,19 @@ struct Iamax: " "XMV is not rank 2."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::iamax[ETI]" - : "KokkosBlas::iamax[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? 
"KokkosBlas::iamax[ETI]" + : "KokkosBlas::iamax[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::iamax<> ETI specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XMV).name()); + printf("KokkosBlas1::iamax<> ETI specialization for < %s , %s >\n", typeid(RV).name(), typeid(XMV).name()); else { - printf("KokkosBlas1::iamax<> non-ETI specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XMV).name()); + printf("KokkosBlas1::iamax<> non-ETI specialization for < %s , %s >\n", typeid(RV).name(), typeid(XMV).name()); } #endif const size_type numRows = X.extent(0); const size_type numCols = X.extent(1); - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { + if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { MV_Iamax_Invoke(space, R, X); } else { typedef std::int64_t index_type; @@ -235,64 +204,46 @@ struct Iamax >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, false, true>; \ - extern template struct Iamax< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, false, true>; +#define KOKKOSBLAS1_IAMAX_ETI_SPEC_DECL_INDEX(INDEX_TYPE, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Iamax< \ + EXEC_SPACE, Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, false, true>; \ + extern template struct Iamax, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, false, true>; -#define KOKKOSBLAS1_IAMAX_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_ETI_SPEC_DECL_INDEX(unsigned long, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_ETI_SPEC_DECL_INDEX(unsigned int, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_ETI_SPEC_DECL_INDEX(int, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) +#define KOKKOSBLAS1_IAMAX_ETI_SPEC_DECL(SCALAR, LAYOUT, 
EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_ETI_SPEC_DECL_INDEX(unsigned long, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_ETI_SPEC_DECL_INDEX(unsigned int, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_ETI_SPEC_DECL_INDEX(int, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) // // Macro for definition of full specialization of // KokkosBlas::Impl::Iamax for rank == 1. This is NOT for users!!! We // use this macro in one or more .cpp files in this directory. // -#define KOKKOSBLAS1_IAMAX_ETI_SPEC_INST_INDEX(INDEX_TYPE, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - template struct Iamax< \ - EXEC_SPACE, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, false, true>; \ - template struct Iamax< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, false, true>; +#define KOKKOSBLAS1_IAMAX_ETI_SPEC_INST_INDEX(INDEX_TYPE, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Iamax >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, false, true>; \ + template struct Iamax, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, false, true>; -#define KOKKOSBLAS1_IAMAX_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_ETI_SPEC_INST_INDEX(unsigned long, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_ETI_SPEC_INST_INDEX(unsigned int, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_ETI_SPEC_INST_INDEX(int, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) +#define KOKKOSBLAS1_IAMAX_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_ETI_SPEC_INST_INDEX(unsigned long, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_ETI_SPEC_INST_INDEX(unsigned int, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_ETI_SPEC_INST_INDEX(int, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) // // Macro for declaration of full specialization of @@ -301,66 +252,46 @@ 
struct Iamax >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2, false, true>; \ - extern template struct Iamax< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2, false, true>; +#define KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_DECL_INDEX(INDEX_TYPE, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Iamax< \ + EXEC_SPACE, Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2, false, true>; \ + extern template struct Iamax, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2, false, true>; -#define KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_DECL_INDEX(unsigned long, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_DECL_INDEX(unsigned int, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_DECL_INDEX(int, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) +#define KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_DECL_INDEX(unsigned long, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_DECL_INDEX(unsigned int, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_DECL_INDEX(int, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) // // Macro for definition of full specialization of // KokkosBlas::Impl::Iamax for rank == 2. This is NOT for users!!! We // use this macro in one or more .cpp files in this directory. 
// -#define KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_INST_INDEX(INDEX_TYPE, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - template struct Iamax< \ - EXEC_SPACE, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2, false, true>; \ - template struct Iamax< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2, false, true>; +#define KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_INST_INDEX(INDEX_TYPE, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Iamax< \ + EXEC_SPACE, Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2, false, true>; \ + template struct Iamax, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2, false, true>; -#define KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_INST_INDEX(unsigned long, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_INST_INDEX(unsigned int, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_INST_INDEX(int, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) +#define KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_INST_INDEX(unsigned long, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_INST_INDEX(unsigned int, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_INST_INDEX(int, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) #include diff --git a/blas/impl/KokkosBlas1_mult_impl.hpp b/blas/impl/KokkosBlas1_mult_impl.hpp index 048db395b0..3584240e70 100644 --- a/blas/impl/KokkosBlas1_mult_impl.hpp +++ b/blas/impl/KokkosBlas1_mult_impl.hpp @@ -34,8 +34,7 @@ namespace Impl { /// /// C(i,j) = c * C(i,j) + ab * A(i) * B(i,j), subject to the usual /// BLAS update rules. 
-template +template struct MV_MultFunctor { typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -47,8 +46,8 @@ struct MV_MultFunctor { AV m_A; BMV m_B; - MV_MultFunctor(typename CMV::const_value_type& c, const CMV& C, - typename AV::const_value_type& ab, const AV& A, const BMV& B) + MV_MultFunctor(typename CMV::const_value_type& c, const CMV& C, typename AV::const_value_type& ab, const AV& A, + const BMV& B) : m_n(C.extent(1)), m_c(c), m_C(C), m_ab(ab), m_A(A), m_B(B) {} KOKKOS_INLINE_FUNCTION void operator()(const size_type& i) const { @@ -101,8 +100,7 @@ struct MV_MultFunctor { /// /// C(i) = c * C(i) + ab * A(i) * B(i), subject to the usual /// BLAS update rules. -template +template struct V_MultFunctor { typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -113,8 +111,8 @@ struct V_MultFunctor { AV m_A; BV m_B; - V_MultFunctor(typename CV::const_value_type& c, const CV& C, - typename AV::const_value_type& ab, const AV& A, const BV& B) + V_MultFunctor(typename CV::const_value_type& c, const CV& C, typename AV::const_value_type& ab, const AV& A, + const BV& B) : m_c(c), m_C(C), m_ab(ab), m_A(A), m_B(B) {} KOKKOS_INLINE_FUNCTION void operator()(const size_type& i) const { @@ -145,10 +143,8 @@ struct V_MultFunctor { /// C(i) = c * C(i) + ab * A(i) * B(i), subject to the usual BLAS /// update rules. template -void V_Mult_Generic(const execution_space& space, - typename CV::const_value_type& c, const CV& C, - typename AV::const_value_type& ab, const AV& A, - const BV& B) { +void V_Mult_Generic(const execution_space& space, typename CV::const_value_type& c, const CV& C, + typename AV::const_value_type& ab, const AV& A, const BV& B) { using Kokkos::ALL; using Kokkos::subview; typedef Kokkos::ArithTraits ATA; @@ -192,10 +188,8 @@ void V_Mult_Generic(const execution_space& space, /// C(i,j) = c * C(i,j) + ab * A(i) * B(i,j), subject to the usual /// BLAS update rules. 
template -void MV_Mult_Generic(const execution_space& space, - typename CMV::const_value_type& c, const CMV& C, - typename AV::const_value_type& ab, const AV& A, - const BMV& B) { +void MV_Mult_Generic(const execution_space& space, typename CMV::const_value_type& c, const CMV& C, + typename AV::const_value_type& ab, const AV& A, const BMV& B) { typedef Kokkos::ArithTraits ATA; typedef Kokkos::ArithTraits ATC; @@ -205,8 +199,7 @@ void MV_Mult_Generic(const execution_space& space, typedef decltype(C_0) CV; typedef decltype(B_0) BV; - V_Mult_Generic(space, c, C_0, ab, A, - B_0); + V_Mult_Generic(space, c, C_0, ab, A, B_0); return; } diff --git a/blas/impl/KokkosBlas1_mult_spec.hpp b/blas/impl/KokkosBlas1_mult_spec.hpp index c81e00a6b0..3cd847dc1d 100644 --- a/blas/impl/KokkosBlas1_mult_spec.hpp +++ b/blas/impl/KokkosBlas1_mult_spec.hpp @@ -27,8 +27,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct mult_eti_spec_avail { enum : bool { value = false }; }; @@ -42,20 +41,17 @@ struct mult_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_MULT_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct mult_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_MULT_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct mult_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; // @@ -65,21 +61,17 @@ struct mult_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. 
// -#define KOKKOSBLAS1_MULT_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct mult_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_MULT_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct mult_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -101,32 +93,24 @@ namespace Impl { /// Y(i,j) = alpha*A(i,j)*X(i,j) + gamma*Y(i,j) /// /// with special cases for alpha, or gamma = 0. -template ::value, - bool eti_spec_avail = - mult_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = mult_eti_spec_avail::value> struct Mult { - static void mult(const execution_space& space, - const typename YMV::non_const_value_type& gamma, - const YMV& Y, - const typename XMV::non_const_value_type& alpha, const AV& A, - const XMV& X); + static void mult(const execution_space& space, const typename YMV::non_const_value_type& gamma, const YMV& Y, + const typename XMV::non_const_value_type& alpha, const AV& A, const XMV& X); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY // Partial specialization for YMV, AV, and XMV rank-2 Views. 
template -struct Mult { +struct Mult { typedef typename YMV::size_type size_type; typedef typename YMV::non_const_value_type YMV_scalar; typedef typename XMV::non_const_value_type XMV_scalar; - static void mult(const execution_space& space, const YMV_scalar& gamma, - const YMV& Y, const XMV_scalar& alpha, const AV& A, - const XMV& X) { + static void mult(const execution_space& space, const YMV_scalar& gamma, const YMV& Y, const XMV_scalar& alpha, + const AV& A, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Mult::mult: Y is not a Kokkos::View."); @@ -136,8 +120,7 @@ struct Mult::value, "KokkosBlas::Impl::" "Mult::mult: X is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Mult::mult: Y is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -149,31 +132,26 @@ struct Mult::mult: " "AV must have rank 1."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::mult[ETI]" - : "KokkosBlas::mult[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? 
"KokkosBlas::mult[ETI]" + : "KokkosBlas::mult[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::mult<> ETI specialization for < %s , %s , %s >\n", - typeid(YMV).name(), typeid(AV).name(), typeid(XMV).name()); + printf("KokkosBlas1::mult<> ETI specialization for < %s , %s , %s >\n", typeid(YMV).name(), typeid(AV).name(), + typeid(XMV).name()); else { - printf( - "KokkosBlas1::mult<> non-ETI specialization for < %s , %s , %s >\n", - typeid(YMV).name(), typeid(AV).name(), typeid(XMV).name()); + printf("KokkosBlas1::mult<> non-ETI specialization for < %s , %s , %s >\n", typeid(YMV).name(), typeid(AV).name(), + typeid(XMV).name()); } #endif const size_type numRows = X.extent(0); const size_type numCols = X.extent(1); - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { - MV_Mult_Generic(space, gamma, Y, - alpha, A, X); + if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { + MV_Mult_Generic(space, gamma, Y, alpha, A, X); } else { - MV_Mult_Generic(space, gamma, Y, - alpha, A, X); + MV_Mult_Generic(space, gamma, Y, alpha, A, X); } Kokkos::Profiling::popRegion(); } @@ -181,15 +159,13 @@ struct Mult -struct Mult { +struct Mult { typedef typename YV::size_type size_type; typedef typename YV::non_const_value_type YV_scalar; typedef typename XV::non_const_value_type XV_scalar; - static void mult(const execution_space& space, const YV_scalar& gamma, - const YV& Y, const XV_scalar& alpha, const AV& A, - const XV& X) { + static void mult(const execution_space& space, const YV_scalar& gamma, const YV& Y, const XV_scalar& alpha, + const AV& A, const XV& X) { // YV, AV, and XV must be Kokkos::View specializations. static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" @@ -201,35 +177,30 @@ struct Mult::mult: X is not a Kokkos::View."); // XV must be nonconst (else it can't be an output argument). 
- static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Mult::mult: Y is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); static_assert((int)XV::rank == (int)YV::rank && (int)AV::rank == 1, "KokkosBlas::Impl::Mult::mult: " "X, Y, and Z must have rank 1."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::mult[ETI]" - : "KokkosBlas::mult[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::mult[ETI]" + : "KokkosBlas::mult[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::mult<> ETI specialization for < %s , %s , %s >\n", - typeid(YV).name(), typeid(AV).name(), typeid(XV).name()); + printf("KokkosBlas1::mult<> ETI specialization for < %s , %s , %s >\n", typeid(YV).name(), typeid(AV).name(), + typeid(XV).name()); else { - printf( - "KokkosBlas1::mult<> non-ETI specialization for < %s , %s , %s >\n", - typeid(YV).name(), typeid(AV).name(), typeid(XV).name()); + printf("KokkosBlas1::mult<> non-ETI specialization for < %s , %s , %s >\n", typeid(YV).name(), typeid(AV).name(), + typeid(XV).name()); } #endif const size_type numRows = Y.extent(0); if (numRows < static_cast(INT_MAX)) { - V_Mult_Generic(space, gamma, Y, alpha, - A, X); + V_Mult_Generic(space, gamma, Y, alpha, A, X); } else { - V_Mult_Generic(space, gamma, Y, - alpha, A, X); + V_Mult_Generic(space, gamma, Y, alpha, A, X); } Kokkos::Profiling::popRegion(); } @@ -248,30 +219,24 @@ struct Mult, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_MULT_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Mult< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits 
>, \ 1, false, true>; -#define KOKKOSBLAS1_MULT_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct Mult< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_MULT_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Mult< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 1, false, true>; // @@ -282,32 +247,24 @@ struct Mult, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_MULT_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Mult< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 2, false, true>; -#define KOKKOSBLAS1_MULT_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template struct Mult< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_MULT_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Mult< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 2, false, true>; #include diff --git a/blas/impl/KokkosBlas1_nrm1_impl.hpp b/blas/impl/KokkosBlas1_nrm1_impl.hpp index a88c01023e..8ba857c9e9 100644 --- a/blas/impl/KokkosBlas1_nrm1_impl.hpp +++ b/blas/impl/KokkosBlas1_nrm1_impl.hpp @@ -50,8 +50,7 @@ struct V_Nrm1_Functor { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::V_Nrm1_Functor: " "X is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, 
"KokkosBlas::Impl::V_Nrm1_Functor: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -80,11 +79,9 @@ struct Nrm1_MV_Functor { RV r; XV x; - size_type - teamsPerVec; // number of teams collectively performing a dot product + size_type teamsPerVec; // number of teams collectively performing a dot product - Nrm1_MV_Functor(const RV& r_, const XV& x_, int teamsPerVec_) - : r(r_), x(x_), teamsPerVec(teamsPerVec_) {} + Nrm1_MV_Functor(const RV& r_, const XV& x_, int teamsPerVec_) : r(r_), x(x_), teamsPerVec(teamsPerVec_) {} KOKKOS_INLINE_FUNCTION void operator()(const TeamMem& t) const { @@ -103,9 +100,7 @@ struct Nrm1_MV_Functor { }, localResult); - Kokkos::single(Kokkos::PerTeam(t), [&]() { - Kokkos::atomic_add(&r(i), rvalue_type(localResult)); - }); + Kokkos::single(Kokkos::PerTeam(t), [&]() { Kokkos::atomic_add(&r(i), rvalue_type(localResult)); }); } }; @@ -128,27 +123,23 @@ void V_Nrm1_Invoke(const execution_space& space, const RV& r, const XV& X) { template void MV_Nrm1_Invoke( const execution_space& space, const RV& r, const XV& x, - typename std::enable_if::accessible>::type* = + typename std::enable_if::accessible>::type* = nullptr) { if (r.extent(0) != x.extent(1)) { std::ostringstream oss; - oss << "KokkosBlas::nrm1 (rank-2): result vector has wrong length (" - << r.extent(0) << ", but x has " << x.extent(1) << " columns)"; + oss << "KokkosBlas::nrm1 (rank-2): result vector has wrong length (" << r.extent(0) << ", but x has " << x.extent(1) + << " columns)"; throw std::runtime_error(oss.str()); } // Zero out the result vector - Kokkos::deep_copy( - space, r, Kokkos::ArithTraits::zero()); + Kokkos::deep_copy(space, r, Kokkos::ArithTraits::zero()); size_type teamsPerVec; - KokkosBlas::Impl::multipleReductionWorkDistribution( - x.extent(0), x.extent(1), teamsPerVec); + KokkosBlas::Impl::multipleReductionWorkDistribution(x.extent(0), x.extent(1), + teamsPerVec); size_type numTeams = 
x.extent(1) * teamsPerVec; Kokkos::TeamPolicy pol(space, numTeams, Kokkos::AUTO); - Kokkos::parallel_for( - "KokkosBlas1::Nrm1::S1", pol, - Nrm1_MV_Functor(r, x, teamsPerVec)); + Kokkos::parallel_for("KokkosBlas1::Nrm1::S1", pol, + Nrm1_MV_Functor(r, x, teamsPerVec)); } // Version for when a temporary result view is needed (implemented in terms of @@ -156,15 +147,11 @@ void MV_Nrm1_Invoke( template void MV_Nrm1_Invoke( const execution_space& space, const RV& r, const XV& x, - typename std::enable_if::accessible>::type* = - nullptr) { - Kokkos::View - tempResult( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Nrm1 temp result"), - r.extent(0)); - MV_Nrm1_Invoke( - space, tempResult, x); + typename std::enable_if< + !Kokkos::SpaceAccessibility::accessible>::type* = nullptr) { + Kokkos::View tempResult( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Nrm1 temp result"), r.extent(0)); + MV_Nrm1_Invoke(space, tempResult, x); Kokkos::deep_copy(space, r, tempResult); // Fence needed to ensure that the deep_copy // above finishes before we exit this function diff --git a/blas/impl/KokkosBlas1_nrm1_spec.hpp b/blas/impl/KokkosBlas1_nrm1_spec.hpp index 24f093c736..3977c5225c 100644 --- a/blas/impl/KokkosBlas1_nrm1_spec.hpp +++ b/blas/impl/KokkosBlas1_nrm1_spec.hpp @@ -43,19 +43,15 @@ struct nrm1_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. 
// -#define KOKKOSBLAS1_NRM1_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct nrm1_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View< \ - typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_NRM1_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct nrm1_eti_spec_avail::mag_type, LAYOUT, \ + Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; // @@ -65,22 +61,17 @@ struct nrm1_eti_spec_avail { // We may spread out definitions (see _DEF macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_NRM1_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct nrm1_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View::mag_type*, \ - LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_NRM1_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct nrm1_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -92,10 +83,9 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template < - class execution_space, class RMV, class XMV, int rank = XMV::rank, - bool tpl_spec_avail = nrm1_tpl_spec_avail::value, - bool eti_spec_avail = nrm1_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = nrm1_eti_spec_avail::value> struct Nrm1 { static void nrm1(const execution_space& space, const RMV& R, const XMV& X); }; @@ -103,8 +93,7 @@ struct Nrm1 { #if 
!defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Nrm1 for single vectors (1-D Views). template -struct Nrm1 { +struct Nrm1 { using size_type = typename XMV::size_type; static void nrm1(const execution_space& space, const RMV& R, const XMV& X) { @@ -120,16 +109,13 @@ struct Nrm1: " "XMV is not rank 1."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::nrm1[ETI]" - : "KokkosBlas::nrm1[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::nrm1[ETI]" + : "KokkosBlas::nrm1[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::nrm1<> ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrm1<> ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); else { - printf("KokkosBlas1::nrm1<> non-ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrm1<> non-ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); } #endif const size_type numRows = X.extent(0); @@ -145,8 +131,7 @@ struct Nrm1 -struct Nrm1 { +struct Nrm1 { using size_type = typename XMV::size_type; static void nrm1(const execution_space& space, const RV& R, const XMV& X) { @@ -165,32 +150,26 @@ struct Nrm1 ETI specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrm1<> ETI specialization for < %s , %s >\n", typeid(RV).name(), typeid(XMV).name()); else { - printf("KokkosBlas1::nrm1<> non-ETI specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrm1<> non-ETI specialization for < %s , %s >\n", typeid(RV).name(), typeid(XMV).name()); } #endif - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? 
"KokkosBlas::nrm1[ETI]" - : "KokkosBlas::nrm1[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::nrm1[ETI]" + : "KokkosBlas::nrm1[noETI]"); const size_type numRows = X.extent(0); const size_type numCols = X.extent(1); if (numCols == Kokkos::ArithTraits::one()) { auto R0 = Kokkos::subview(R, 0); auto X0 = Kokkos::subview(X, Kokkos::ALL(), 0); if (numRows < static_cast(INT_MAX)) { - V_Nrm1_Invoke(space, - R0, X0); + V_Nrm1_Invoke(space, R0, X0); } else { typedef std::int64_t index_type; - V_Nrm1_Invoke( - space, R0, X0); + V_Nrm1_Invoke(space, R0, X0); } } else { - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { + if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { MV_Nrm1_Invoke(space, R, X); } else { using index_type = std::int64_t; @@ -212,34 +191,26 @@ struct Nrm1::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, false, true>; +#define KOKKOSBLAS1_NRM1_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Nrm1::mag_type, \ + LAYOUT, Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, false, true>; // // Macro for definition of full specialization of // KokkosBlas::Impl::Nrm1 for rank == 2. This is NOT for users!!! We // use this macro in one or more .cpp files in this directory. 
// -#define KOKKOSBLAS1_NRM1_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct Nrm1< \ - EXEC_SPACE, \ - Kokkos::View< \ - typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, false, true>; +#define KOKKOSBLAS1_NRM1_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Nrm1::mag_type, LAYOUT, \ + Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, false, true>; // // Macro for declaration of full specialization of @@ -248,19 +219,14 @@ struct Nrm1::mag_type*, \ - LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_NRM1_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Nrm1< \ + EXEC_SPACE, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 2, false, true>; // @@ -268,20 +234,14 @@ struct Nrm1::mag_type*, \ - LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2, false, true>; +#define KOKKOSBLAS1_NRM1_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Nrm1::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2, false, true>; #include diff --git a/blas/impl/KokkosBlas1_nrm2_impl.hpp b/blas/impl/KokkosBlas1_nrm2_impl.hpp index 276023c171..e840d0bfd4 100644 --- a/blas/impl/KokkosBlas1_nrm2_impl.hpp +++ b/blas/impl/KokkosBlas1_nrm2_impl.hpp @@ -51,8 +51,7 @@ struct V_Nrm2_Functor { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::V_Nrm2_Functor: " "X is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::V_Nrm2_Functor: R is const. 
" "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -67,19 +66,12 @@ struct V_Nrm2_Functor { sum += tmp * tmp; } - KOKKOS_INLINE_FUNCTION void init(value_type& update) const { - update = AT::zero(); - } + KOKKOS_INLINE_FUNCTION void init(value_type& update) const { update = AT::zero(); } - KOKKOS_INLINE_FUNCTION void join(value_type& update, - const value_type& source) const { - update += source; - } + KOKKOS_INLINE_FUNCTION void join(value_type& update, const value_type& source) const { update += source; } KOKKOS_INLINE_FUNCTION void final(value_type& update) const { - if (m_take_sqrt) - update = - Kokkos::ArithTraits::sqrt(update); + if (m_take_sqrt) update = Kokkos::ArithTraits::sqrt(update); } }; @@ -102,11 +94,9 @@ struct Nrm2_MV_Functor { RV r; XV x; - size_type - teamsPerVec; // number of teams collectively performing a dot product + size_type teamsPerVec; // number of teams collectively performing a dot product - Nrm2_MV_Functor(const RV& r_, const XV& x_, int teamsPerVec_) - : r(r_), x(x_), teamsPerVec(teamsPerVec_) {} + Nrm2_MV_Functor(const RV& r_, const XV& x_, int teamsPerVec_) : r(r_), x(x_), teamsPerVec(teamsPerVec_) {} KOKKOS_INLINE_FUNCTION void operator()(const TeamMem& t) const { @@ -127,17 +117,14 @@ struct Nrm2_MV_Functor { }, localResult); - Kokkos::single(Kokkos::PerTeam(t), [&]() { - Kokkos::atomic_add(&r(i), rvalue_type(localResult)); - }); + Kokkos::single(Kokkos::PerTeam(t), [&]() { Kokkos::atomic_add(&r(i), rvalue_type(localResult)); }); } }; /// \brief Compute the 2-norm (or its square) of the single vector (1-D /// View) X, and store the result in the 0-D View r. 
template -void V_Nrm2_Invoke(const execution_space& space, const RV& r, const XV& X, - const bool& take_sqrt) { +void V_Nrm2_Invoke(const execution_space& space, const RV& r, const XV& X, const bool& take_sqrt) { const SizeType numRows = static_cast(X.extent(0)); Kokkos::RangePolicy policy(space, 0, numRows); @@ -153,32 +140,26 @@ void V_Nrm2_Invoke(const execution_space& space, const RV& r, const XV& X, template void MV_Nrm2_Invoke( const execution_space& space, const RV& r, const XV& x, bool take_sqrt, - typename std::enable_if::accessible>::type* = + typename std::enable_if::accessible>::type* = nullptr) { if (r.extent(0) != x.extent(1)) { std::ostringstream oss; - oss << "KokkosBlas::nrm2 (rank-2): result vector has wrong length (" - << r.extent(0) << ", but x has " << x.extent(1) << " columns)"; + oss << "KokkosBlas::nrm2 (rank-2): result vector has wrong length (" << r.extent(0) << ", but x has " << x.extent(1) + << " columns)"; throw std::runtime_error(oss.str()); } // Zero out the result vector - Kokkos::deep_copy( - space, r, Kokkos::ArithTraits::zero()); + Kokkos::deep_copy(space, r, Kokkos::ArithTraits::zero()); size_type teamsPerVec; - KokkosBlas::Impl::multipleReductionWorkDistribution( - x.extent(0), x.extent(1), teamsPerVec); + KokkosBlas::Impl::multipleReductionWorkDistribution(x.extent(0), x.extent(1), + teamsPerVec); size_type numTeams = x.extent(1) * teamsPerVec; Kokkos::TeamPolicy pol(space, numTeams, Kokkos::AUTO); - Kokkos::parallel_for( - "KokkosBlas1::Nrm2::S1", pol, - Nrm2_MV_Functor(r, x, teamsPerVec)); + Kokkos::parallel_for("KokkosBlas1::Nrm2::S1", pol, + Nrm2_MV_Functor(r, x, teamsPerVec)); if (take_sqrt) { - Kokkos::parallel_for( - "KokkosBlas1::Nrm2::Sqrt", - Kokkos::RangePolicy(space, 0, r.extent(0)), - TakeSqrtFunctor(r)); + Kokkos::parallel_for("KokkosBlas1::Nrm2::Sqrt", Kokkos::RangePolicy(space, 0, r.extent(0)), + TakeSqrtFunctor(r)); } } @@ -187,15 +168,11 @@ void MV_Nrm2_Invoke( template void MV_Nrm2_Invoke( const 
execution_space& space, const RV& r, const XV& x, bool take_sqrt, - typename std::enable_if::accessible>::type* = - nullptr) { - Kokkos::View - tempResult( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Nrm2 temp result"), - r.extent(0)); - MV_Nrm2_Invoke( - space, tempResult, x, take_sqrt); + typename std::enable_if< + !Kokkos::SpaceAccessibility::accessible>::type* = nullptr) { + Kokkos::View tempResult( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Nrm2 temp result"), r.extent(0)); + MV_Nrm2_Invoke(space, tempResult, x, take_sqrt); Kokkos::deep_copy(space, r, tempResult); space.fence(); } diff --git a/blas/impl/KokkosBlas1_nrm2_spec.hpp b/blas/impl/KokkosBlas1_nrm2_spec.hpp index 6c21e551a8..4d0b2e1396 100644 --- a/blas/impl/KokkosBlas1_nrm2_spec.hpp +++ b/blas/impl/KokkosBlas1_nrm2_spec.hpp @@ -43,19 +43,15 @@ struct nrm2_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_NRM2_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct nrm2_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View< \ - typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_NRM2_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct nrm2_eti_spec_avail::mag_type, LAYOUT, \ + Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; // @@ -65,22 +61,17 @@ struct nrm2_eti_spec_avail { // We may spread out definitions (see _DEF macro below) across one or // more .cpp files. 
// -#define KOKKOSBLAS1_NRM2_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct nrm2_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View::mag_type*, \ - LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_NRM2_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct nrm2_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -92,24 +83,20 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template < - class execution_space, class RMV, class XMV, int rank = XMV::rank, - bool tpl_spec_avail = nrm2_tpl_spec_avail::value, - bool eti_spec_avail = nrm2_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = nrm2_eti_spec_avail::value> struct Nrm2 { - static void nrm2(const execution_space& space, const RMV& R, const XMV& X, - const bool& take_sqrt); + static void nrm2(const execution_space& space, const RMV& R, const XMV& X, const bool& take_sqrt); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Nrm2 for single vectors (1-D Views). template -struct Nrm2 { +struct Nrm2 { typedef typename XMV::size_type size_type; - static void nrm2(const execution_space& space, const RMV& R, const XMV& X, - const bool& take_sqrt) { + static void nrm2(const execution_space& space, const RMV& R, const XMV& X, const bool& take_sqrt) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Nrm2<1-D>: RMV is not a Kokkos::View."); @@ -122,16 +109,13 @@ struct Nrm2: " "XMV is not rank 1."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? 
"KokkosBlas::nrm2[ETI]" - : "KokkosBlas::nrm2[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::nrm2[ETI]" + : "KokkosBlas::nrm2[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::nrm2<> ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrm2<> ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); else { - printf("KokkosBlas1::nrm2<> non-ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrm2<> non-ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); } #endif const size_type numRows = X.extent(0); @@ -140,20 +124,17 @@ struct Nrm2(space, R, X, take_sqrt); } else { typedef std::int64_t index_type; - V_Nrm2_Invoke(space, R, X, - take_sqrt); + V_Nrm2_Invoke(space, R, X, take_sqrt); } Kokkos::Profiling::popRegion(); } }; template -struct Nrm2 { +struct Nrm2 { typedef typename XMV::size_type size_type; - static void nrm2(const execution_space& space, const RV& R, const XMV& X, - const bool& take_sqrt) { + static void nrm2(const execution_space& space, const RV& R, const XMV& X, const bool& take_sqrt) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Nrm2<2-D>: RV is not a Kokkos::View."); @@ -166,16 +147,13 @@ struct Nrm2: " "XMV is not rank 2."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::nrm2[ETI]" - : "KokkosBlas::nrm2[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? 
"KokkosBlas::nrm2[ETI]" + : "KokkosBlas::nrm2[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::nrm2<> ETI specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrm2<> ETI specialization for < %s , %s >\n", typeid(RV).name(), typeid(XMV).name()); else { - printf("KokkosBlas1::nrm2<> non-ETI specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrm2<> non-ETI specialization for < %s , %s >\n", typeid(RV).name(), typeid(XMV).name()); } #endif @@ -185,21 +163,17 @@ struct Nrm2(INT_MAX)) { - V_Nrm2_Invoke( - space, R0, X0, take_sqrt); + V_Nrm2_Invoke(space, R0, X0, take_sqrt); } else { typedef std::int64_t index_type; - V_Nrm2_Invoke( - space, R0, X0, take_sqrt); + V_Nrm2_Invoke(space, R0, X0, take_sqrt); } } else { - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { + if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { MV_Nrm2_Invoke(space, R, X, take_sqrt); } else { typedef std::int64_t index_type; - MV_Nrm2_Invoke(space, R, X, - take_sqrt); + MV_Nrm2_Invoke(space, R, X, take_sqrt); } } Kokkos::Profiling::popRegion(); @@ -217,34 +191,26 @@ struct Nrm2::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, false, true>; +#define KOKKOSBLAS1_NRM2_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Nrm2::mag_type, \ + LAYOUT, Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, false, true>; // // Macro for definition of full specialization of // KokkosBlas::Impl::Nrm2 for rank == 2. This is NOT for users!!! We // use this macro in one or more .cpp files in this directory. 
// -#define KOKKOSBLAS1_NRM2_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct Nrm2< \ - EXEC_SPACE, \ - Kokkos::View< \ - typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, false, true>; +#define KOKKOSBLAS1_NRM2_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Nrm2::mag_type, LAYOUT, \ + Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, false, true>; // // Macro for declaration of full specialization of @@ -253,19 +219,14 @@ struct Nrm2::mag_type*, \ - LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_NRM2_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Nrm2< \ + EXEC_SPACE, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 2, false, true>; // @@ -273,20 +234,14 @@ struct Nrm2::mag_type*, \ - LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2, false, true>; +#define KOKKOSBLAS1_NRM2_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Nrm2::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2, false, true>; #include diff --git a/blas/impl/KokkosBlas1_nrm2w_impl.hpp b/blas/impl/KokkosBlas1_nrm2w_impl.hpp index fb9b1f7858..979ba2cec3 100644 --- a/blas/impl/KokkosBlas1_nrm2w_impl.hpp +++ b/blas/impl/KokkosBlas1_nrm2w_impl.hpp @@ -46,16 +46,14 @@ struct V_Nrm2w_Functor { typename XV::const_type m_x, m_w; bool m_take_sqrt; - V_Nrm2w_Functor(const XV& x, const XV& w, bool take_sqrt) - : m_x(x), m_w(w), m_take_sqrt(take_sqrt) { + V_Nrm2w_Functor(const XV& x, const XV& w, bool take_sqrt) : m_x(x), m_w(w), m_take_sqrt(take_sqrt) 
{ static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::V_Nrm2w_Functor: " "R is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::V_Nrm2w_Functor: " "X is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::V_Nrm2w_Functor: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -71,19 +69,12 @@ struct V_Nrm2w_Functor { ; } - KOKKOS_INLINE_FUNCTION void init(value_type& update) const { - update = AT::zero(); - } + KOKKOS_INLINE_FUNCTION void init(value_type& update) const { update = AT::zero(); } - KOKKOS_INLINE_FUNCTION void join(value_type& update, - const value_type& source) const { - update += source; - } + KOKKOS_INLINE_FUNCTION void join(value_type& update, const value_type& source) const { update += source; } KOKKOS_INLINE_FUNCTION void final(value_type& update) const { - if (m_take_sqrt) - update = - Kokkos::ArithTraits::sqrt(update); + if (m_take_sqrt) update = Kokkos::ArithTraits::sqrt(update); } }; @@ -101,8 +92,7 @@ struct Nrm2w_MV_Functor { XV x; XV w; - size_type - teamsPerVec; // number of teams collectively performing a dot product + size_type teamsPerVec; // number of teams collectively performing a dot product Nrm2w_MV_Functor(const RV& r_, const XV& x_, const XV& w_, int teamsPerVec_) : r(r_), x(x_), w(w_), teamsPerVec(teamsPerVec_) {} @@ -120,23 +110,19 @@ struct Nrm2w_MV_Functor { Kokkos::parallel_reduce( Kokkos::TeamThreadRange(t, begin, end), [&](size_type k, value_type& update) { - const typename IPT::mag_type tmp = - IPT::norm(x(k, i)) / IPT::norm(w(k, i)); + const typename IPT::mag_type tmp = IPT::norm(x(k, i)) / IPT::norm(w(k, i)); update += tmp * tmp; }, localResult); - Kokkos::single(Kokkos::PerTeam(t), [&]() { - Kokkos::atomic_add(&r(i), rvalue_type(localResult)); - }); + Kokkos::single(Kokkos::PerTeam(t), [&]() { Kokkos::atomic_add(&r(i), rvalue_type(localResult)); }); } 
}; /// \brief Compute the 2-norm (or its square) of the single vector (1-D /// View) X, and store the result in the 0-D View r. template -void V_Nrm2w_Invoke(const execution_space& space, const RV& r, const XV& X, - const XV& W, const bool& take_sqrt) { +void V_Nrm2w_Invoke(const execution_space& space, const RV& r, const XV& X, const XV& W, const bool& take_sqrt) { const SizeType numRows = static_cast(X.extent(0)); Kokkos::RangePolicy policy(space, 0, numRows); @@ -151,34 +137,27 @@ void V_Nrm2w_Invoke(const execution_space& space, const RV& r, const XV& X, // be computed in-place template void MV_Nrm2w_Invoke( - const execution_space& space, const RV& r, const XV& x, const XV& w, - bool take_sqrt, - typename std::enable_if::accessible>::type* = + const execution_space& space, const RV& r, const XV& x, const XV& w, bool take_sqrt, + typename std::enable_if::accessible>::type* = nullptr) { if (r.extent(0) != x.extent(1)) { std::ostringstream oss; - oss << "KokkosBlas::nrm2w (rank-2): result vector has wrong length (" - << r.extent(0) << ", but x has " << x.extent(1) << " columns)"; + oss << "KokkosBlas::nrm2w (rank-2): result vector has wrong length (" << r.extent(0) << ", but x has " + << x.extent(1) << " columns)"; throw std::runtime_error(oss.str()); } // Zero out the result vector - Kokkos::deep_copy( - space, r, Kokkos::ArithTraits::zero()); + Kokkos::deep_copy(space, r, Kokkos::ArithTraits::zero()); size_type teamsPerVec; - KokkosBlas::Impl::multipleReductionWorkDistribution( - x.extent(0), x.extent(1), teamsPerVec); + KokkosBlas::Impl::multipleReductionWorkDistribution(x.extent(0), x.extent(1), + teamsPerVec); size_type numTeams = x.extent(1) * teamsPerVec; Kokkos::TeamPolicy pol(space, numTeams, Kokkos::AUTO); Kokkos::parallel_for("KokkosBlas1::Nrm2w::S1", pol, - Nrm2w_MV_Functor( - r, x, w, teamsPerVec)); + Nrm2w_MV_Functor(r, x, w, teamsPerVec)); if (take_sqrt) { - Kokkos::parallel_for( - "KokkosBlas1::Nrm2w::Sqrt", - Kokkos::RangePolicy(space, 0, 
r.extent(0)), - TakeSqrtFunctor(r)); + Kokkos::parallel_for("KokkosBlas1::Nrm2w::Sqrt", Kokkos::RangePolicy(space, 0, r.extent(0)), + TakeSqrtFunctor(r)); } } @@ -186,17 +165,12 @@ void MV_Nrm2w_Invoke( // the other version) template void MV_Nrm2w_Invoke( - const execution_space& space, const RV& r, const XV& x, const XV& w, - bool take_sqrt, - typename std::enable_if::accessible>::type* = - nullptr) { - Kokkos::View - tempResult( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Nrm2w temp result"), - r.extent(0)); - MV_Nrm2w_Invoke(space, tempResult, x, w, - take_sqrt); + const execution_space& space, const RV& r, const XV& x, const XV& w, bool take_sqrt, + typename std::enable_if< + !Kokkos::SpaceAccessibility::accessible>::type* = nullptr) { + Kokkos::View tempResult( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Nrm2w temp result"), r.extent(0)); + MV_Nrm2w_Invoke(space, tempResult, x, w, take_sqrt); Kokkos::deep_copy(space, r, tempResult); space.fence(); } diff --git a/blas/impl/KokkosBlas1_nrm2w_spec.hpp b/blas/impl/KokkosBlas1_nrm2w_spec.hpp index f4bbe286ef..5660832139 100644 --- a/blas/impl/KokkosBlas1_nrm2w_spec.hpp +++ b/blas/impl/KokkosBlas1_nrm2w_spec.hpp @@ -42,20 +42,15 @@ struct nrm2w_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. 
// -#define KOKKOSBLAS1_NRM2W_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct nrm2w_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View< \ - typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_NRM2W_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct nrm2w_eti_spec_avail::mag_type, \ + LAYOUT, Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; // @@ -65,20 +60,16 @@ struct nrm2w_eti_spec_avail { // We may spread out definitions (see _DEF macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_NRM2W_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct nrm2w_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View::mag_type*, \ - LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_NRM2W_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct nrm2w_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -91,24 +82,19 @@ namespace Impl { // Unification layer template ::value, - bool eti_spec_avail = - nrm2w_eti_spec_avail::value> + bool tpl_spec_avail = nrm2w_tpl_spec_avail::value, + bool eti_spec_avail = nrm2w_eti_spec_avail::value> struct Nrm2w { - static void nrm2w(const execution_space& space, const RMV& R, const XMV& X, - const XMV& W, const bool& take_sqrt); + static void nrm2w(const execution_space& space, const RMV& R, const XMV& X, const XMV& W, const bool& 
take_sqrt); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Nrm2w for single vectors (1-D Views). template -struct Nrm2w { +struct Nrm2w { using size_type = typename XMV::size_type; - static void nrm2w(const execution_space& space, const RMV& R, const XMV& X, - const XMV& W, const bool& take_sqrt) { + static void nrm2w(const execution_space& space, const RMV& R, const XMV& X, const XMV& W, const bool& take_sqrt) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Nrm2w<1-D>: RMV is not a Kokkos::View."); @@ -121,16 +107,13 @@ struct Nrm2w: " "XMV is not rank 1."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::nrm2w[ETI]" - : "KokkosBlas::nrm2w[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::nrm2w[ETI]" + : "KokkosBlas::nrm2w[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::nrm2w<> ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrm2w<> ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); else { - printf("KokkosBlas1::nrm2w<> non-ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrm2w<> non-ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); } #endif const size_type numRows = X.extent(0); @@ -139,20 +122,17 @@ struct Nrm2w(space, R, X, W, take_sqrt); } else { typedef std::int64_t index_type; - V_Nrm2w_Invoke(space, R, X, W, - take_sqrt); + V_Nrm2w_Invoke(space, R, X, W, take_sqrt); } Kokkos::Profiling::popRegion(); } }; template -struct Nrm2w { +struct Nrm2w { using size_type = typename XMV::size_type; - static void nrm2w(const execution_space& space, const RV& R, const XMV& X, - const XMV& W, const bool& take_sqrt) { + static void nrm2w(const execution_space& space, const RV& R, 
const XMV& X, const XMV& W, const bool& take_sqrt) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Nrm2w<2-D>: RV is not a Kokkos::View."); @@ -165,16 +145,13 @@ struct Nrm2w: " "XMV is not rank 2."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::nrm2w[ETI]" - : "KokkosBlas::nrm2w[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::nrm2w[ETI]" + : "KokkosBlas::nrm2w[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::nrm2w<> ETI specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrm2w<> ETI specialization for < %s , %s >\n", typeid(RV).name(), typeid(XMV).name()); else { - printf("KokkosBlas1::nrm2w<> non-ETI specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrm2w<> non-ETI specialization for < %s , %s >\n", typeid(RV).name(), typeid(XMV).name()); } #endif @@ -185,22 +162,17 @@ struct Nrm2w(INT_MAX)) { - V_Nrm2w_Invoke( - space, R0, X0, W0, take_sqrt); + V_Nrm2w_Invoke(space, R0, X0, W0, take_sqrt); } else { typedef std::int64_t index_type; - V_Nrm2w_Invoke( - space, R0, X0, W0, take_sqrt); + V_Nrm2w_Invoke(space, R0, X0, W0, take_sqrt); } } else { - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { - MV_Nrm2w_Invoke(space, R, X, W, - take_sqrt); + if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { + MV_Nrm2w_Invoke(space, R, X, W, take_sqrt); } else { typedef std::int64_t index_type; - MV_Nrm2w_Invoke(space, R, X, W, - take_sqrt); + MV_Nrm2w_Invoke(space, R, X, W, take_sqrt); } } Kokkos::Profiling::popRegion(); @@ -218,33 +190,25 @@ struct Nrm2w::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, false, true>; +#define KOKKOSBLAS1_NRM2W_ETI_SPEC_DECL(SCALAR, LAYOUT, 
EXEC_SPACE, MEM_SPACE) \ + extern template struct Nrm2w::mag_type, \ + LAYOUT, Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, false, true>; // // Macro for definition of full specialization of // KokkosBlas::Impl::Nrm2w for rank == 2. This is NOT for users!!! We // use this macro in one or more .cpp files in this directory. // -#define KOKKOSBLAS1_NRM2W_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct Nrm2w< \ - EXEC_SPACE, \ - Kokkos::View< \ - typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, false, true>; +#define KOKKOSBLAS1_NRM2W_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Nrm2w::mag_type, LAYOUT, \ + Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, false, true>; // // Macro for declaration of full specialization of @@ -253,17 +217,13 @@ struct Nrm2w::mag_type*, \ - LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_NRM2W_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Nrm2w< \ + EXEC_SPACE, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 2, false, true>; // @@ -271,18 +231,13 @@ struct Nrm2w::mag_type*, \ - LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2, false, true>; +#define KOKKOSBLAS1_NRM2W_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Nrm2w::mag_type*, LAYOUT, \ + Kokkos::Device, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2, false, true>; #include diff --git a/blas/impl/KokkosBlas1_nrminf_impl.hpp b/blas/impl/KokkosBlas1_nrminf_impl.hpp index b8431ac8ea..e7479e6697 100644 --- 
a/blas/impl/KokkosBlas1_nrminf_impl.hpp +++ b/blas/impl/KokkosBlas1_nrminf_impl.hpp @@ -50,8 +50,7 @@ struct V_NrmInf_Functor { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::V_NrmInf_Functor: " "X is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::V_NrmInf_Functor: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -85,8 +84,7 @@ void V_NrmInf_Invoke(const execution_space& space, const RV& r, const XV& X) { typedef V_NrmInf_Functor functor_type; functor_type op(X); - Kokkos::parallel_reduce("KokkosBlas::NrmInf::S0", policy, op, - Kokkos::Max(r())); + Kokkos::parallel_reduce("KokkosBlas::NrmInf::S0", policy, op, Kokkos::Max(r())); } /// \brief Compute the 2-norms (or their square) of the columns of the @@ -96,8 +94,7 @@ void MV_NrmInf_Invoke(const execution_space& space, const RV& r, const XMV& X) { for (size_t i = 0; i < X.extent(1); i++) { auto ri = Kokkos::subview(r, i); auto Xi = Kokkos::subview(X, Kokkos::ALL(), i); - V_NrmInf_Invoke( - space, ri, Xi); + V_NrmInf_Invoke(space, ri, Xi); } } diff --git a/blas/impl/KokkosBlas1_nrminf_spec.hpp b/blas/impl/KokkosBlas1_nrminf_spec.hpp index 3659d61f19..e7b365ce85 100644 --- a/blas/impl/KokkosBlas1_nrminf_spec.hpp +++ b/blas/impl/KokkosBlas1_nrminf_spec.hpp @@ -43,20 +43,15 @@ struct nrminf_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. 
// -#define KOKKOSBLAS1_NRMINF_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct nrminf_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View< \ - typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_NRMINF_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct nrminf_eti_spec_avail::mag_type, \ + LAYOUT, Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; // @@ -66,22 +61,17 @@ struct nrminf_eti_spec_avail { // We may spread out definitions (see _DEF macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_NRMINF_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct nrminf_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View::mag_type*, \ - LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_NRMINF_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct nrminf_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -94,10 +84,8 @@ namespace Impl { // Unification layer template ::value, - bool eti_spec_avail = - nrminf_eti_spec_avail::value> + bool tpl_spec_avail = nrminf_tpl_spec_avail::value, + bool eti_spec_avail = nrminf_eti_spec_avail::value> struct NrmInf { static void nrminf(const execution_space& space, const RMV& R, const XMV& X); }; @@ -105,8 +93,7 @@ struct NrmInf { #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! 
Full specialization of NrmInf for single vectors (1-D Views). template -struct NrmInf { +struct NrmInf { typedef typename XMV::size_type size_type; static void nrminf(const execution_space& space, const RMV& R, const XMV& X) { @@ -122,16 +109,13 @@ struct NrmInf: " "XMV is not rank 1."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::nrminf[ETI]" - : "KokkosBlas::nrminf[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::nrminf[ETI]" + : "KokkosBlas::nrminf[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::nrminf<> ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrminf<> ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); else { - printf("KokkosBlas1::nrminf<> non-ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrminf<> non-ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); } #endif const size_type numRows = X.extent(0); @@ -147,8 +131,7 @@ struct NrmInf -struct NrmInf { +struct NrmInf { typedef typename XMV::size_type size_type; static void nrminf(const execution_space& space, const RV& R, const XMV& X) { @@ -164,23 +147,19 @@ struct NrmInf: " "XMV is not rank 2."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::nrminf[ETI]" - : "KokkosBlas::nrminf[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? 
"KokkosBlas::nrminf[ETI]" + : "KokkosBlas::nrminf[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::nrminf<> ETI specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrminf<> ETI specialization for < %s , %s >\n", typeid(RV).name(), typeid(XMV).name()); else { - printf("KokkosBlas1::nrminf<> non-ETI specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrminf<> non-ETI specialization for < %s , %s >\n", typeid(RV).name(), typeid(XMV).name()); } #endif const size_type numRows = X.extent(0); const size_type numCols = X.extent(1); - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { + if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { MV_NrmInf_Invoke(space, R, X); } else { typedef std::int64_t index_type; @@ -201,36 +180,26 @@ struct NrmInf::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, false, true>; +#define KOKKOSBLAS1_NRMINF_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct NrmInf::mag_type, \ + LAYOUT, Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, false, true>; // // Macro for definition of full specialization of // KokkosBlas::Impl::NrmInf for rank == 2. This is NOT for users!!! We // use this macro in one or more .cpp files in this directory. 
// -#define KOKKOSBLAS1_NRMINF_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template struct NrmInf< \ - EXEC_SPACE, \ - Kokkos::View< \ - typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, false, true>; +#define KOKKOSBLAS1_NRMINF_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct NrmInf::mag_type, LAYOUT, \ + Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, false, true>; // // Macro for declaration of full specialization of @@ -239,19 +208,14 @@ struct NrmInf::mag_type*, \ - LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_NRMINF_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct NrmInf< \ + EXEC_SPACE, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 2, false, true>; // @@ -259,20 +223,14 @@ struct NrmInf::mag_type*, \ - LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2, false, true>; +#define KOKKOSBLAS1_NRMINF_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct NrmInf::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2, false, true>; #include diff --git a/blas/impl/KokkosBlas1_reciprocal_impl.hpp b/blas/impl/KokkosBlas1_reciprocal_impl.hpp index 21f736ac4f..7ad6ab95db 100644 --- a/blas/impl/KokkosBlas1_reciprocal_impl.hpp +++ b/blas/impl/KokkosBlas1_reciprocal_impl.hpp @@ -37,8 +37,7 @@ struct MV_Reciprocal_Functor { RMV R_; XMV X_; - MV_Reciprocal_Functor(const RMV& R, const XMV& X) - : numCols(X.extent(1)), R_(R), X_(X) { + MV_Reciprocal_Functor(const RMV& R, const XMV& X) : numCols(X.extent(1)), R_(R), X_(X) { 
static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "MV_Reciprocal_Functor: RMV is not a Kokkos::View."); @@ -148,8 +147,7 @@ struct V_ReciprocalSelf_Functor { // Invoke the "generic" (not unrolled) multivector functor that // computes entry-wise reciprocalolute value. template -void MV_Reciprocal_Generic(const execution_space& space, const RMV& R, - const XMV& X) { +void MV_Reciprocal_Generic(const execution_space& space, const RMV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "MV_Reciprocal_Generic: RMV is not a Kokkos::View."); @@ -177,8 +175,7 @@ void MV_Reciprocal_Generic(const execution_space& space, const RMV& R, // Variant of MV_Reciprocal_Generic for single vectors (1-D Views) R and X. template -void V_Reciprocal_Generic(const execution_space& space, const RV& R, - const XV& X) { +void V_Reciprocal_Generic(const execution_space& space, const RV& R, const XV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "V_Reciprocal_Generic: RV is not a Kokkos::View."); diff --git a/blas/impl/KokkosBlas1_reciprocal_spec.hpp b/blas/impl/KokkosBlas1_reciprocal_spec.hpp index 08fc8bc341..988043511b 100644 --- a/blas/impl/KokkosBlas1_reciprocal_spec.hpp +++ b/blas/impl/KokkosBlas1_reciprocal_spec.hpp @@ -42,18 +42,15 @@ struct reciprocal_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. 
// -#define KOKKOSBLAS1_RECIPROCAL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct reciprocal_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_RECIPROCAL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct reciprocal_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; // @@ -63,18 +60,15 @@ struct reciprocal_eti_spec_avail { // We may spread out definitions (see _DEF macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_RECIPROCAL_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct reciprocal_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_RECIPROCAL_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct reciprocal_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -87,24 +81,19 @@ namespace Impl { // Unification layer template ::value, - bool eti_spec_avail = - reciprocal_eti_spec_avail::value> + bool tpl_spec_avail = reciprocal_tpl_spec_avail::value, + bool eti_spec_avail = reciprocal_eti_spec_avail::value> struct Reciprocal { - static void reciprocal(const execution_space& space, const RMV& R, - const XMV& X); + static void reciprocal(const execution_space& space, const RMV& R, const XMV& X); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Reciprocal for single vectors (1-D Views). 
template -struct Reciprocal { +struct Reciprocal { typedef typename XMV::size_type size_type; - static void reciprocal(const execution_space& space, const RMV& R, - const XMV& X) { + static void reciprocal(const execution_space& space, const RMV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Reciprocal<1-D>: RMV is not a Kokkos::View."); @@ -117,17 +106,14 @@ struct Reciprocal: " "XMV is not rank 1."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::reciprocal[ETI]" - : "KokkosBlas::reciprocal[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::reciprocal[ETI]" + : "KokkosBlas::reciprocal[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::reciprocal<> ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::reciprocal<> ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); else { - printf( - "KokkosBlas1::reciprocal<> non-ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::reciprocal<> non-ETI specialization for < %s , %s >\n", typeid(RMV).name(), + typeid(XMV).name()); } #endif const size_type numRows = X.extent(0); @@ -144,12 +130,10 @@ struct Reciprocal -struct Reciprocal { +struct Reciprocal { typedef typename XMV::size_type size_type; - static void reciprocal(const execution_space& space, const RMV& R, - const XMV& X) { + static void reciprocal(const execution_space& space, const RMV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Reciprocal<2-D>: RMV is not a Kokkos::View."); @@ -162,23 +146,19 @@ struct Reciprocal: " "XMV is not rank 2."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? 
"KokkosBlas::reciprocal[ETI]" - : "KokkosBlas::reciprocal[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::reciprocal[ETI]" + : "KokkosBlas::reciprocal[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::reciprocal<> ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::reciprocal<> ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); else { - printf("KokkosBlas1::asb<> non-ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::asb<> non-ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); } #endif const size_type numRows = X.extent(0); const size_type numCols = X.extent(1); - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { + if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { typedef int index_type; MV_Reciprocal_Generic(space, R, X); } else { @@ -200,15 +180,12 @@ struct Reciprocal, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_RECIPROCAL_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Reciprocal< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 1, false, true>; // @@ -216,15 +193,12 @@ struct Reciprocal, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_RECIPROCAL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Reciprocal< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 1, false, true>; // @@ -234,15 +208,12 @@ struct Reciprocal, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_RECIPROCAL_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, 
EXEC_SPACE, MEM_SPACE) \ + extern template struct Reciprocal< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 2, false, true>; // @@ -250,15 +221,12 @@ struct Reciprocal, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_RECIPROCAL_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Reciprocal< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 2, false, true>; #include diff --git a/blas/impl/KokkosBlas1_rot_impl.hpp b/blas/impl/KokkosBlas1_rot_impl.hpp index 93d3b3d9b9..e139e916be 100644 --- a/blas/impl/KokkosBlas1_rot_impl.hpp +++ b/blas/impl/KokkosBlas1_rot_impl.hpp @@ -30,8 +30,7 @@ struct rot_functor { VectorView X, Y; ScalarView c, s; - rot_functor(VectorView const& X_, VectorView const& Y_, ScalarView const& c_, - ScalarView const& s_) + rot_functor(VectorView const& X_, VectorView const& Y_, ScalarView const& c_, ScalarView const& s_) : X(X_), Y(Y_), c(c_), s(s_) {} KOKKOS_INLINE_FUNCTION @@ -43,8 +42,8 @@ struct rot_functor { }; template -void Rot_Invoke(ExecutionSpace const& space, VectorView const& X, - VectorView const& Y, ScalarView const& c, ScalarView const& s) { +void Rot_Invoke(ExecutionSpace const& space, VectorView const& X, VectorView const& Y, ScalarView const& c, + ScalarView const& s) { Kokkos::RangePolicy rot_policy(space, 0, X.extent(0)); rot_functor rot_func(X, Y, c, s); Kokkos::parallel_for("KokkosBlas::rot", rot_policy, rot_func); diff --git a/blas/impl/KokkosBlas1_rot_spec.hpp b/blas/impl/KokkosBlas1_rot_spec.hpp index 214e0399e5..4ca4d8d1ef 100644 --- a/blas/impl/KokkosBlas1_rot_spec.hpp +++ b/blas/impl/KokkosBlas1_rot_spec.hpp @@ -43,16 +43,14 @@ struct rot_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. 
// -#define KOKKOSBLAS1_ROT_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ - template <> \ - struct rot_eti_spec_avail< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ROT_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template <> \ + struct rot_eti_spec_avail< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -64,36 +62,28 @@ namespace Impl { // Unification layer template ::value, - bool eti_spec_avail = - rot_eti_spec_avail::value> + bool tpl_spec_avail = rot_tpl_spec_avail::value, + bool eti_spec_avail = rot_eti_spec_avail::value> struct Rot { - static void rot(ExecutionSpace const& space, VectorView const& X, - VectorView const& Y, ScalarView const& c, + static void rot(ExecutionSpace const& space, VectorView const& X, VectorView const& Y, ScalarView const& c, ScalarView const& s); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Rot. template -struct Rot { - static void rot(ExecutionSpace const& space, VectorView const& X, - VectorView const& Y, ScalarView const& c, +struct Rot { + static void rot(ExecutionSpace const& space, VectorView const& X, VectorView const& Y, ScalarView const& c, ScalarView const& s) { - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::rot[ETI]" - : "KokkosBlas::rot[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? 
"KokkosBlas::rot[ETI]" + : "KokkosBlas::rot[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::rot<> ETI specialization for < %s, %s, %s >\n", - typeid(ExecutionSpace).name(), typeid(VectorView).name(), - typeid(ScalarView).name()); + printf("KokkosBlas1::rot<> ETI specialization for < %s, %s, %s >\n", typeid(ExecutionSpace).name(), + typeid(VectorView).name(), typeid(ScalarView).name()); else { - printf("KokkosBlas1::rot<> non-ETI specialization for < %s, %s, %s >\n", - typeid(ExecutionSpace).name(), typeid(VectorView).name(), - typeid(ScalarView).name()); + printf("KokkosBlas1::rot<> non-ETI specialization for < %s, %s, %s >\n", typeid(ExecutionSpace).name(), + typeid(VectorView).name(), typeid(ScalarView).name()); } #endif Rot_Invoke(space, X, Y, c, s); @@ -112,14 +102,12 @@ struct Rot, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSBLAS1_ROT_ETI_SPEC_DECL(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + extern template struct Rot< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ false, true>; // @@ -127,14 +115,12 @@ struct Rot, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSBLAS1_ROT_ETI_SPEC_INST(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template struct Rot< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ false, true>; #include diff --git a/blas/impl/KokkosBlas1_rotg_impl.hpp b/blas/impl/KokkosBlas1_rotg_impl.hpp index ff7830e147..834c773a8d 100644 --- a/blas/impl/KokkosBlas1_rotg_impl.hpp +++ b/blas/impl/KokkosBlas1_rotg_impl.hpp @@ -24,10 +24,8 @@ namespace KokkosBlas { namespace Impl { template ::is_complex, - bool>::type = true> -KOKKOS_INLINE_FUNCTION void 
rotg_impl(Scalar* a, Scalar* b, Magnitude* c, - Scalar* s) { + typename std::enable_if::is_complex, bool>::type = true> +KOKKOS_INLINE_FUNCTION void rotg_impl(Scalar* a, Scalar* b, Magnitude* c, Scalar* s) { const Scalar one = Kokkos::ArithTraits::one(); const Scalar zero = Kokkos::ArithTraits::zero(); @@ -40,12 +38,11 @@ KOKKOS_INLINE_FUNCTION void rotg_impl(Scalar* a, Scalar* b, Magnitude* c, } else { const Scalar scaled_a = *a / numerical_scaling; const Scalar scaled_b = *b / numerical_scaling; - Scalar norm = Kokkos::sqrt(scaled_a * scaled_a + scaled_b * scaled_b) * - numerical_scaling; - Scalar sign = Kokkos::abs(*a) > Kokkos::abs(*b) ? *a : *b; - norm = Kokkos::copysign(norm, sign); - *c = *a / norm; - *s = *b / norm; + Scalar norm = Kokkos::sqrt(scaled_a * scaled_a + scaled_b * scaled_b) * numerical_scaling; + Scalar sign = Kokkos::abs(*a) > Kokkos::abs(*b) ? *a : *b; + norm = Kokkos::copysign(norm, sign); + *c = *a / norm; + *s = *b / norm; Scalar z = one; if (Kokkos::abs(*a) > Kokkos::abs(*b)) { @@ -60,10 +57,8 @@ KOKKOS_INLINE_FUNCTION void rotg_impl(Scalar* a, Scalar* b, Magnitude* c, } template ::is_complex, - bool>::type = true> -KOKKOS_INLINE_FUNCTION void rotg_impl(Scalar* a, Scalar* b, Magnitude* c, - Scalar* s) { + typename std::enable_if::is_complex, bool>::type = true> +KOKKOS_INLINE_FUNCTION void rotg_impl(Scalar* a, Scalar* b, Magnitude* c, Scalar* s) { using mag_type = typename Kokkos::ArithTraits::mag_type; const Scalar one = Kokkos::ArithTraits::one(); @@ -78,13 +73,11 @@ KOKKOS_INLINE_FUNCTION void rotg_impl(Scalar* a, Scalar* b, Magnitude* c, } else { const Scalar scaled_a = Kokkos::abs(*a / numerical_scaling); const Scalar scaled_b = Kokkos::abs(*b / numerical_scaling); - mag_type norm = - Kokkos::abs(Kokkos::sqrt(scaled_a * scaled_a + scaled_b * scaled_b)) * - numerical_scaling; - Scalar unit_a = *a / Kokkos::abs(*a); - *c = Kokkos::abs(*a) / norm; - *s = unit_a * Kokkos::conj(*b) / norm; - *a = unit_a * norm; + mag_type norm = 
Kokkos::abs(Kokkos::sqrt(scaled_a * scaled_a + scaled_b * scaled_b)) * numerical_scaling; + Scalar unit_a = *a / Kokkos::abs(*a); + *c = Kokkos::abs(*a) / norm; + *s = unit_a * Kokkos::conj(*b) / norm; + *a = unit_a * norm; } } @@ -94,20 +87,17 @@ struct rotg_functor { MViewType c; SViewType s; - rotg_functor(SViewType const& a_, SViewType const& b_, MViewType const& c_, - SViewType const& s_) + rotg_functor(SViewType const& a_, SViewType const& b_, MViewType const& c_, SViewType const& s_) : a(a_), b(b_), c(c_), s(s_) {} KOKKOS_INLINE_FUNCTION - void operator()(int const) const { - rotg_impl(a.data(), b.data(), c.data(), s.data()); - } + void operator()(int const) const { rotg_impl(a.data(), b.data(), c.data(), s.data()); } }; /// \brief Compute Givens rotation coefficients. template -void Rotg_Invoke(ExecutionSpace const& space, SViewType const& a, - SViewType const& b, MViewType const& c, SViewType const& s) { +void Rotg_Invoke(ExecutionSpace const& space, SViewType const& a, SViewType const& b, MViewType const& c, + SViewType const& s) { Kokkos::RangePolicy rotg_policy(space, 0, 1); rotg_functor rotg_func(a, b, c, s); Kokkos::parallel_for("KokkosBlas::rotg", rotg_policy, rotg_func); diff --git a/blas/impl/KokkosBlas1_rotg_spec.hpp b/blas/impl/KokkosBlas1_rotg_spec.hpp index bdf313e3d0..87618f12c9 100644 --- a/blas/impl/KokkosBlas1_rotg_spec.hpp +++ b/blas/impl/KokkosBlas1_rotg_spec.hpp @@ -42,16 +42,14 @@ struct rotg_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. 
// -#define KOKKOSBLAS1_ROTG_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ - template <> \ - struct rotg_eti_spec_avail< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ROTG_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template <> \ + struct rotg_eti_spec_avail< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -63,34 +61,28 @@ namespace Impl { // Unification layer template ::value, - bool eti_spec_avail = - rotg_eti_spec_avail::value> + bool tpl_spec_avail = rotg_tpl_spec_avail::value, + bool eti_spec_avail = rotg_eti_spec_avail::value> struct Rotg { - static void rotg(ExecutionSpace const& space, SViewType const& a, - SViewType const& b, MViewType const& c, SViewType const& s); + static void rotg(ExecutionSpace const& space, SViewType const& a, SViewType const& b, MViewType const& c, + SViewType const& s); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Rotg. template -struct Rotg { - static void rotg(ExecutionSpace const& space, SViewType const& a, - SViewType const& b, MViewType const& c, SViewType const& s) { - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::rotg[ETI]" - : "KokkosBlas::rotg[noETI]"); +struct Rotg { + static void rotg(ExecutionSpace const& space, SViewType const& a, SViewType const& b, MViewType const& c, + SViewType const& s) { + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? 
"KokkosBlas::rotg[ETI]" + : "KokkosBlas::rotg[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::rotg<> ETI specialization for < %s, %s, %s >\n", - typeid(ExecutionSpace).name(), typeid(SViewType).name(), - typeid(MViewType).name()); + printf("KokkosBlas1::rotg<> ETI specialization for < %s, %s, %s >\n", typeid(ExecutionSpace).name(), + typeid(SViewType).name(), typeid(MViewType).name()); else { - printf("KokkosBlas1::rotg<> non-ETI specialization for < %s, %s, %s >\n", - typeid(ExecutionSpace).name(), typeid(SViewType).name(), - typeid(MViewType).name()); + printf("KokkosBlas1::rotg<> non-ETI specialization for < %s, %s, %s >\n", typeid(ExecutionSpace).name(), + typeid(SViewType).name(), typeid(MViewType).name()); } #endif Rotg_Invoke(space, a, b, c, s); @@ -109,14 +101,12 @@ struct Rotg, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSBLAS1_ROTG_ETI_SPEC_DECL(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + extern template struct Rotg< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ false, true>; // @@ -124,14 +114,12 @@ struct Rotg, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSBLAS1_ROTG_ETI_SPEC_INST(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template struct Rotg< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ false, true>; #include diff --git a/blas/impl/KokkosBlas1_rotm_impl.hpp b/blas/impl/KokkosBlas1_rotm_impl.hpp index 91a2c7a1d8..697cb7902f 100644 --- a/blas/impl/KokkosBlas1_rotm_impl.hpp +++ b/blas/impl/KokkosBlas1_rotm_impl.hpp @@ -36,9 +36,7 @@ struct rotm_functor { VectorView X, Y; ParamView param; - rotm_functor(VectorView const& X_, VectorView const& Y_, - 
ParamView const& param_) - : X(X_), Y(Y_), param(param_) {} + rotm_functor(VectorView const& X_, VectorView const& Y_, ParamView const& param_) : X(X_), Y(Y_), param(param_) {} KOKKOS_INLINE_FUNCTION void operator()(const minus_one_tag&, const int idx) const { @@ -63,11 +61,9 @@ struct rotm_functor { }; template -void Rotm_Invoke(execution_space const& space, VectorView const& X, - VectorView const& Y, ParamView const& param) { +void Rotm_Invoke(execution_space const& space, VectorView const& X, VectorView const& Y, ParamView const& param) { using Scalar = typename VectorView::value_type; - static_assert(!Kokkos::ArithTraits::is_complex, - "rotm is not defined for complex types!"); + static_assert(!Kokkos::ArithTraits::is_complex, "rotm is not defined for complex types!"); Scalar const zero = Kokkos::ArithTraits::zero(); Scalar const one = Kokkos::ArithTraits::one(); @@ -82,24 +78,19 @@ void Rotm_Invoke(execution_space const& space, VectorView const& X, if (flag == -two) { return; } else if (flag == -one) { - Kokkos::RangePolicy< - execution_space, - typename rotm_functor::minus_one_tag> - rotm_policy(space, 0, X.extent(0)); + Kokkos::RangePolicy::minus_one_tag> rotm_policy( + space, 0, X.extent(0)); Kokkos::parallel_for("KokkosBlas1::rotm_minus_one", rotm_policy, myFunc); } else if (flag == zero) { - Kokkos::RangePolicy::zero_tag> - rotm_policy(space, 0, X.extent(0)); + Kokkos::RangePolicy::zero_tag> rotm_policy( + space, 0, X.extent(0)); Kokkos::parallel_for("KokkosBlas1::rotm_zero", rotm_policy, myFunc); } else if (flag == one) { - Kokkos::RangePolicy::one_tag> - rotm_policy(space, 0, X.extent(0)); + Kokkos::RangePolicy::one_tag> rotm_policy( + space, 0, X.extent(0)); Kokkos::parallel_for("KokkosBlas1::rotm_one", rotm_policy, myFunc); } else { - throw std::runtime_error( - "KokkosBlas::rotm: param(0) is not -2, -1, 0 or 1!"); + throw std::runtime_error("KokkosBlas::rotm: param(0) is not -2, -1, 0 or 1!"); } } diff --git a/blas/impl/KokkosBlas1_rotm_spec.hpp 
b/blas/impl/KokkosBlas1_rotm_spec.hpp index 854f2abacc..5000b35fc3 100644 --- a/blas/impl/KokkosBlas1_rotm_spec.hpp +++ b/blas/impl/KokkosBlas1_rotm_spec.hpp @@ -41,16 +41,14 @@ struct rotm_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_ROTM_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct rotm_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ROTM_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct rotm_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -61,34 +59,27 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template < - class execution_space, class VectorView, class ParamView, - bool tpl_spec_avail = - rotm_tpl_spec_avail::value, - bool eti_spec_avail = - rotm_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = rotm_eti_spec_avail::value> struct Rotm { - static void rotm(execution_space const& space, VectorView const& X, - VectorView const& Y, ParamView const& param); + static void rotm(execution_space const& space, VectorView const& X, VectorView const& Y, ParamView const& param); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Rotm. template -struct Rotm { - static void rotm(execution_space const& space, VectorView const& X, - VectorView const& Y, ParamView const& param) { - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? 
"KokkosBlas::rotm[ETI]" - : "KokkosBlas::rotm[noETI]"); +struct Rotm { + static void rotm(execution_space const& space, VectorView const& X, VectorView const& Y, ParamView const& param) { + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::rotm[ETI]" + : "KokkosBlas::rotm[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::rotm<> ETI specialization for < %s, %s >\n", - typeid(VectorView).name(), typeid(ParamView).name()); + printf("KokkosBlas1::rotm<> ETI specialization for < %s, %s >\n", typeid(VectorView).name(), + typeid(ParamView).name()); else { - printf("KokkosBlas1::rotm<> non-ETI specialization for < %s, %s >\n", - typeid(VectorView).name(), typeid(ParamView).name()); + printf("KokkosBlas1::rotm<> non-ETI specialization for < %s, %s >\n", typeid(VectorView).name(), + typeid(ParamView).name()); } #endif Rotm_Invoke(space, X, Y, param); @@ -107,14 +98,12 @@ struct Rotm, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSBLAS1_ROTM_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Rotm< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ false, true>; // @@ -122,14 +111,12 @@ struct Rotm, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSBLAS1_ROTM_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Rotm< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ false, true>; #include diff --git a/blas/impl/KokkosBlas1_rotmg_impl.hpp b/blas/impl/KokkosBlas1_rotmg_impl.hpp index b35fd62ece..558020e5a4 100644 --- a/blas/impl/KokkosBlas1_rotmg_impl.hpp +++ b/blas/impl/KokkosBlas1_rotmg_impl.hpp @@ -25,8 +25,7 @@ namespace KokkosBlas { namespace Impl { template -KOKKOS_INLINE_FUNCTION void rotmg_impl(DXView const& d1, DXView const& d2, - DXView 
const& x1, YView const& y1, +KOKKOS_INLINE_FUNCTION void rotmg_impl(DXView const& d1, DXView const& d2, DXView const& x1, YView const& y1, PView const& param) { using Scalar = typename DXView::non_const_value_type; @@ -133,8 +132,7 @@ KOKKOS_INLINE_FUNCTION void rotmg_impl(DXView const& d1, DXView const& d2, // Rescale d2, h21 and h22 if (d2() != zero) { - while ((Kokkos::abs(d2()) <= gammasqinv) || - (Kokkos::abs(d2()) >= gammasq)) { + while ((Kokkos::abs(d2()) <= gammasqinv) || (Kokkos::abs(d2()) >= gammasq)) { if (flag == zero) { h11 = one; h22 = one; @@ -182,8 +180,7 @@ struct rotmg_functor { YView y1; PView param; - rotmg_functor(DXView& d1_, DXView& d2_, DXView& x1_, const YView& y1_, - PView& param_) + rotmg_functor(DXView& d1_, DXView& d2_, DXView& x1_, const YView& y1_, PView& param_) : d1(d1_), d2(d2_), x1(x1_), y1(y1_), param(param_) {} KOKKOS_INLINE_FUNCTION @@ -191,12 +188,10 @@ struct rotmg_functor { }; template -void Rotmg_Invoke(execution_space const& space, DXView const& d1, - DXView const& d2, DXView const& x1, YView const& y1, +void Rotmg_Invoke(execution_space const& space, DXView const& d1, DXView const& d2, DXView const& x1, YView const& y1, PView const& param) { using Scalar = typename DXView::value_type; - static_assert(!Kokkos::ArithTraits::is_complex, - "rotmg is not defined for complex types!"); + static_assert(!Kokkos::ArithTraits::is_complex, "rotmg is not defined for complex types!"); rotmg_functor myFunc(d1, d2, x1, y1, param); Kokkos::RangePolicy rotmg_policy(space, 0, 1); diff --git a/blas/impl/KokkosBlas1_rotmg_spec.hpp b/blas/impl/KokkosBlas1_rotmg_spec.hpp index b90a158654..caa44dda5d 100644 --- a/blas/impl/KokkosBlas1_rotmg_spec.hpp +++ b/blas/impl/KokkosBlas1_rotmg_spec.hpp @@ -41,19 +41,16 @@ struct rotmg_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. 
// -#define KOKKOSBLAS1_ROTMG_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct rotmg_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ROTMG_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct rotmg_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -64,38 +61,30 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template < - class execution_space, class DXView, class YView, class PView, - bool tpl_spec_avail = - rotmg_tpl_spec_avail::value, - bool eti_spec_avail = - rotmg_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = rotmg_eti_spec_avail::value> struct Rotmg { - static void rotmg(execution_space const& space, DXView& d1, DXView& d2, - DXView& x1, YView& y1, PView& param); + static void rotmg(execution_space const& space, DXView& d1, DXView& d2, DXView& x1, YView& y1, PView& param); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Rotmg. template -struct Rotmg { - static void rotmg(execution_space const& space, DXView& d1, DXView& d2, - DXView& x1, YView& y1, PView& param) { - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::rotmg[ETI]" - : "KokkosBlas::rotmg[noETI]"); +struct Rotmg { + static void rotmg(execution_space const& space, DXView& d1, DXView& d2, DXView& x1, YView& y1, PView& param) { + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? 
"KokkosBlas::rotmg[ETI]" + : "KokkosBlas::rotmg[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::rotmg<> ETI specialization for < %s, %s, %s >\n", - typeid(DXView).name(), typeid(YView).name(), typeid(PView).name()); + printf("KokkosBlas1::rotmg<> ETI specialization for < %s, %s, %s >\n", typeid(DXView).name(), + typeid(YView).name(), typeid(PView).name()); else { - printf("KokkosBlas1::rotmg<> non-ETI specialization for < %s, %s, %s >\n", - typeid(DXView).name(), typeid(YView).name(), typeid(PView).name()); + printf("KokkosBlas1::rotmg<> non-ETI specialization for < %s, %s, %s >\n", typeid(DXView).name(), + typeid(YView).name(), typeid(PView).name()); } #endif - Rotmg_Invoke(space, d1, d2, x1, y1, - param); + Rotmg_Invoke(space, d1, d2, x1, y1, param); Kokkos::Profiling::popRegion(); } }; @@ -111,16 +100,13 @@ struct Rotmg, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSBLAS1_ROTMG_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Rotmg< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ false, true>; // @@ -128,16 +114,13 @@ struct Rotmg, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSBLAS1_ROTMG_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Rotmg< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ false, true>; #include diff --git a/blas/impl/KokkosBlas1_scal_impl.hpp b/blas/impl/KokkosBlas1_scal_impl.hpp index 541d9a4934..510ca3808f 100644 --- a/blas/impl/KokkosBlas1_scal_impl.hpp +++ b/blas/impl/KokkosBlas1_scal_impl.hpp @@ -51,23 +51,16 @@ struct V_Scal_Functor { XV m_x; AV 
m_a; - V_Scal_Functor(const RV& r, const XV& x, const AV& a, - const SizeType startingColumn) - : m_r(r), m_x(x), m_a(a) { - static_assert(Kokkos::is_view::value, - "V_Scal_Functor: RV is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "V_Scal_Functor: AV is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "V_Scal_Functor: XV is not a Kokkos::View."); + V_Scal_Functor(const RV& r, const XV& x, const AV& a, const SizeType startingColumn) : m_r(r), m_x(x), m_a(a) { + static_assert(Kokkos::is_view::value, "V_Scal_Functor: RV is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "V_Scal_Functor: AV is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "V_Scal_Functor: XV is not a Kokkos::View."); static_assert(RV::rank == 1, "V_Scal_Functor: RV is not rank 1."); static_assert(AV::rank == 1, "V_Scal_Functor: AV is not rank 1."); static_assert(XV::rank == 1, "V_Scal_Functor: XV is not rank 1."); if (startingColumn != 0) { - m_a = Kokkos::subview( - a, - std::make_pair(startingColumn, static_cast(a.extent(0)))); + m_a = Kokkos::subview(a, std::make_pair(startingColumn, static_cast(a.extent(0)))); } } @@ -98,8 +91,7 @@ struct V_Scal_Functor { // 1. Y(i) = alpha*X(i) for alpha in -1,0,1 // 2. 
Y(i) = a*X(i) template -struct V_Scal_Functor { +struct V_Scal_Functor { typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -107,8 +99,7 @@ struct V_Scal_Functor -void V_Scal_Generic(const execution_space& space, const RV& r, const AV& av, - const XV& x, const SizeType startingColumn, int a = 2) { - static_assert(Kokkos::is_view::value, - "V_Scal_Generic: RV is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "V_Scal_Generic: XV is not a Kokkos::View."); +void V_Scal_Generic(const execution_space& space, const RV& r, const AV& av, const XV& x, const SizeType startingColumn, + int a = 2) { + static_assert(Kokkos::is_view::value, "V_Scal_Generic: RV is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "V_Scal_Generic: XV is not a Kokkos::View."); static_assert(RV::rank == 1, "V_Scal_Generic: RV is not rank 1."); static_assert(XV::rank == 1, "V_Scal_Generic: XV is not rank 1."); diff --git a/blas/impl/KokkosBlas1_scal_mv_impl.hpp b/blas/impl/KokkosBlas1_scal_mv_impl.hpp index da4d7a5149..a729e85025 100644 --- a/blas/impl/KokkosBlas1_scal_mv_impl.hpp +++ b/blas/impl/KokkosBlas1_scal_mv_impl.hpp @@ -42,8 +42,7 @@ namespace Impl { // coefficient of zero has BLAS semantics of ignoring the // corresponding (multi)vector entry. This does not apply to // coefficients in the a vector, if they are used. 
-template +template struct MV_Scal_Functor { typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -53,13 +52,11 @@ struct MV_Scal_Functor { XMV X_; aVector a_; - MV_Scal_Functor(const RMV& R, const XMV& X, const aVector& a, - const SizeType startingColumn) + MV_Scal_Functor(const RMV& R, const XMV& X, const aVector& a, const SizeType startingColumn) : numCols(X.extent(1)), R_(R), X_(X), a_(a) { if (startingColumn != 0) { - auto rng = - std::make_pair(startingColumn, static_cast(a.extent(0))); - a_ = Kokkos::subview(a, rng); + auto rng = std::make_pair(startingColumn, static_cast(a.extent(0))); + a_ = Kokkos::subview(a, rng); } } @@ -124,8 +121,7 @@ struct MV_Scal_Functor { // This version works by partial specialization on aVector. // In this partial specialization, aVector is a scalar. template -struct MV_Scal_Functor { +struct MV_Scal_Functor { typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -134,8 +130,7 @@ struct MV_Scal_Functor +template struct MV_Scal_Unroll_Functor { typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -203,13 +197,11 @@ struct MV_Scal_Unroll_Functor { XMV m_x; aVector m_a; - MV_Scal_Unroll_Functor(const RMV& r, const XMV& x, const aVector& a, - const SizeType startingColumn) + MV_Scal_Unroll_Functor(const RMV& r, const XMV& x, const aVector& a, const SizeType startingColumn) : m_r(r), m_x(x), m_a(a) { if (startingColumn != 0) { - auto rng = - std::make_pair(startingColumn, static_cast(a.extent(0))); - m_a = Kokkos::subview(a, rng); + auto rng = std::make_pair(startingColumn, static_cast(a.extent(0))); + m_a = Kokkos::subview(a, rng); } } @@ -254,8 +246,7 @@ struct MV_Scal_Unroll_Functor { // than a vector of coefficients) a. The number of columns in X, // UNROLL, is a compile-time constant. 
template -struct MV_Scal_Unroll_Functor { +struct MV_Scal_Unroll_Functor { typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -263,8 +254,7 @@ struct MV_Scal_Unroll_Functor -void MV_Scal_Unrolled(const execution_space& space, const RMV& r, - const aVector& av, const XMV& x, +template +void MV_Scal_Unrolled(const execution_space& space, const RMV& r, const aVector& av, const XMV& x, const SizeType startingColumn, int a = 2) { if (a == 0) { - MV_Scal_Unroll_Functor op( - r, x, av, startingColumn); + MV_Scal_Unroll_Functor op(r, x, av, startingColumn); const SizeType numRows = x.extent(0); Kokkos::RangePolicy policy(space, 0, numRows); Kokkos::parallel_for("KokkosBlas::Scal::MV::S0", policy, op); return; } if (a == -1) { - MV_Scal_Unroll_Functor op( - r, x, av, startingColumn); + MV_Scal_Unroll_Functor op(r, x, av, startingColumn); const SizeType numRows = x.extent(0); Kokkos::RangePolicy policy(space, 0, numRows); Kokkos::parallel_for("KokkosBlas::Scal::MV::S1", policy, op); return; } if (a == 1) { - MV_Scal_Unroll_Functor op( - r, x, av, startingColumn); + MV_Scal_Unroll_Functor op(r, x, av, startingColumn); const SizeType numRows = x.extent(0); Kokkos::RangePolicy policy(space, 0, numRows); Kokkos::parallel_for("KokkosBlas::Scal::MV::S2", policy, op); @@ -350,8 +335,7 @@ void MV_Scal_Unrolled(const execution_space& space, const RMV& r, } // a arbitrary (not -1, 0, or 1) - MV_Scal_Unroll_Functor op( - r, x, av, startingColumn); + MV_Scal_Unroll_Functor op(r, x, av, startingColumn); const SizeType numRows = x.extent(0); Kokkos::RangePolicy policy(space, 0, numRows); Kokkos::parallel_for("KokkosBlas::Scal::MV::S3", policy, op); @@ -371,36 +355,30 @@ void MV_Scal_Unrolled(const execution_space& space, const RMV& r, // Any literal coefficient of zero has BLAS semantics of ignoring the // corresponding (multi)vector entry. This does NOT apply to // coefficient(s) in av, if used. 
-template -void MV_Scal_Generic(const execution_space& space, const RVector& r, - const aVector& av, const XVector& x, +template +void MV_Scal_Generic(const execution_space& space, const RVector& r, const aVector& av, const XVector& x, const SizeType startingColumn, int a = 2) { const SizeType numRows = x.extent(0); Kokkos::RangePolicy policy(space, 0, numRows); if (a == 0) { - MV_Scal_Functor op(r, x, av, - startingColumn); + MV_Scal_Functor op(r, x, av, startingColumn); Kokkos::parallel_for("KokkosBlas::Scal::MV::S4", policy, op); return; } if (a == -1) { - MV_Scal_Functor op(r, x, av, - startingColumn); + MV_Scal_Functor op(r, x, av, startingColumn); Kokkos::parallel_for("KokkosBlas::Scal::MV::S5", policy, op); return; } if (a == 1) { - MV_Scal_Functor op(r, x, av, - startingColumn); + MV_Scal_Functor op(r, x, av, startingColumn); Kokkos::parallel_for("KokkosBlas::Scal::MV::S6", policy, op); return; } // a arbitrary (not -1, 0, or 1) - MV_Scal_Functor op(r, x, av, - startingColumn); + MV_Scal_Functor op(r, x, av, startingColumn); Kokkos::parallel_for("KokkosBlas::Scal::MV::S7", policy, op); } @@ -419,8 +397,7 @@ void MV_Scal_Generic(const execution_space& space, const RVector& r, // corresponding (multi)vector entry. This does NOT apply to // coefficient(s) in av, if used. 
template -void MV_Scal_Invoke_Left(const execution_space& space, const RMV& r, - const AV& av, const XMV& x, int a = 2) { +void MV_Scal_Invoke_Left(const execution_space& space, const RMV& r, const AV& av, const XMV& x, int a = 2) { const SizeType numCols = x.extent(1); #if KOKKOSBLAS_OPTIMIZATION_LEVEL_SCAL <= 2 @@ -437,8 +414,7 @@ void MV_Scal_Invoke_Left(const execution_space& space, const RMV& r, typedef decltype(X_cur) XMV2D; typedef decltype(R_cur) RMV2D; - MV_Scal_Unrolled( - space, R_cur, av, X_cur, j, a); + MV_Scal_Unrolled(space, R_cur, av, X_cur, j, a); } for (; j + 4 <= numCols; j += 4) { const std::pair rng(j, j + 4); @@ -447,8 +423,7 @@ void MV_Scal_Invoke_Left(const execution_space& space, const RMV& r, typedef decltype(X_cur) XMV2D; typedef decltype(R_cur) RMV2D; - MV_Scal_Unrolled( - space, R_cur, av, X_cur, j, a); + MV_Scal_Unrolled(space, R_cur, av, X_cur, j, a); } for (; j < numCols; ++j) { // RMV and XMV need to turn 1-D. @@ -457,8 +432,7 @@ void MV_Scal_Invoke_Left(const execution_space& space, const RMV& r, typedef decltype(r_cur) RV; typedef decltype(x_cur) XV; - V_Scal_Generic(space, r_cur, av, - x_cur, j, a); + V_Scal_Generic(space, r_cur, av, x_cur, j, a); } #else // KOKKOSBLAS_OPTIMIZATION_LEVEL_SCAL > 2 @@ -470,73 +444,25 @@ void MV_Scal_Invoke_Left(const execution_space& space, const RMV& r, typedef decltype(r_0) RV; typedef decltype(x_0) XV; - V_Scal_Generic(space, r_0, av, x_0, - 0, a); + V_Scal_Generic(space, r_0, av, x_0, 0, a); break; } - case 2: - MV_Scal_Unrolled(space, r, av, - x, 0, a); - break; - case 3: - MV_Scal_Unrolled(space, r, av, - x, 0, a); - break; - case 4: - MV_Scal_Unrolled(space, r, av, - x, 0, a); - break; - case 5: - MV_Scal_Unrolled(space, r, av, - x, 0, a); - break; - case 6: - MV_Scal_Unrolled(space, r, av, - x, 0, a); - break; - case 7: - MV_Scal_Unrolled(space, r, av, - x, 0, a); - break; - case 8: - MV_Scal_Unrolled(space, r, av, - x, 0, a); - break; - case 9: - MV_Scal_Unrolled(space, r, av, - x, 0, a); 
- break; - case 10: - MV_Scal_Unrolled( - space, r, av, x, 0, a); - break; - case 11: - MV_Scal_Unrolled( - space, r, av, x, 0, a); - break; - case 12: - MV_Scal_Unrolled( - space, r, av, x, 0, a); - break; - case 13: - MV_Scal_Unrolled( - space, r, av, x, 0, a); - break; - case 14: - MV_Scal_Unrolled( - space, r, av, x, 0, a); - break; - case 15: - MV_Scal_Unrolled( - space, r, av, x, 0, a); - break; - case 16: - MV_Scal_Unrolled( - space, r, av, x, 0, a); - break; - default: - MV_Scal_Generic(space, r, av, x, - 0, a); + case 2: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 3: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 4: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 5: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 6: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 7: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 8: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 9: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 10: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 11: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 12: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 13: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 14: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 15: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 16: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + default: MV_Scal_Generic(space, r, av, x, 0, a); } #endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_SCAL @@ -556,27 +482,23 @@ void MV_Scal_Invoke_Left(const execution_space& space, const RMV& r, // Any literal coefficient of zero has BLAS semantics of ignoring the // corresponding (multi)vector entry. This does NOT apply to // coefficient(s) in av, if used. 
-template -void MV_Scal_Invoke_Right(const execution_space& space, const RMV& r, - const aVector& av, const XMV& x, int a = 2) { +template +void MV_Scal_Invoke_Right(const execution_space& space, const RMV& r, const aVector& av, const XMV& x, int a = 2) { const SizeType numCols = x.extent(1); if (numCols == 1) { - typedef Kokkos::View + typedef Kokkos::View RV; - typedef Kokkos::View + typedef Kokkos::View XV; RV r_0 = Kokkos::subview(r, Kokkos::ALL(), 0); XV x_0 = Kokkos::subview(x, Kokkos::ALL(), 0); - V_Scal_Generic(space, r_0, - av, x_0, a); + V_Scal_Generic(space, r_0, av, x_0, a); } else { - MV_Scal_Generic(space, r, av, - x, a); + MV_Scal_Generic(space, r, av, x, a); } } diff --git a/blas/impl/KokkosBlas1_scal_spec.hpp b/blas/impl/KokkosBlas1_scal_spec.hpp index 38972b2223..70a95d33e2 100644 --- a/blas/impl/KokkosBlas1_scal_spec.hpp +++ b/blas/impl/KokkosBlas1_scal_spec.hpp @@ -29,8 +29,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct scal_eti_spec_avail { enum : bool { value = false }; }; @@ -44,18 +43,16 @@ struct scal_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_SCAL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct scal_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_SCAL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct scal_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; // @@ -65,33 +62,27 @@ struct scal_eti_spec_avail { // We may spread out definitions (see _DEF macro below) across one or // more .cpp files. 
// -#define KOKKOSBLAS1_SCAL_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct scal_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2> { \ - enum : bool { value = true }; \ - }; \ - template <> \ - struct scal_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_SCAL_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct scal_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2> { \ + enum : bool { value = true }; \ + }; \ + template <> \ + struct scal_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -103,28 +94,22 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template ::value, - bool eti_spec_avail = - scal_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = scal_eti_spec_avail::value> struct Scal { - static void scal(const execution_space& space, const RV& R, const AV& A, - const XV& X); + static void scal(const execution_space& space, const RV& R, const AV& A, const XV& X); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Scal for single vectors (1-D Views). 
template -struct Scal { +struct Scal { typedef typename XV::non_const_value_type AV; typedef typename XV::size_type size_type; typedef Kokkos::ArithTraits ATA; - static void scal(const execution_space& space, const RV& R, const AV& alpha, - const XV& X) { + static void scal(const execution_space& space, const RV& R, const AV& alpha, const XV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Scal<1-D>: RV is not a Kokkos::View."); @@ -137,18 +122,16 @@ struct Scal: " "XV is not rank 1."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::scal[ETI]" - : "KokkosBlas::scal[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::scal[ETI]" + : "KokkosBlas::scal[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::scal<1D> ETI specialization for < %s , %s , %s >\n", - typeid(RV).name(), typeid(AV).name(), typeid(XV).name()); + printf("KokkosBlas1::scal<1D> ETI specialization for < %s , %s , %s >\n", typeid(RV).name(), typeid(AV).name(), + typeid(XV).name()); else - printf( - "KokkosBlas1::scal<1D> non-ETI specialization for < %s , %s , %s >\n", - typeid(RV).name(), typeid(AV).name(), typeid(XV).name()); + printf("KokkosBlas1::scal<1D> non-ETI specialization for < %s , %s , %s >\n", typeid(RV).name(), + typeid(AV).name(), typeid(XV).name()); #endif const size_type numRows = X.extent(0); @@ -163,12 +146,10 @@ struct Scal(INT_MAX)) { typedef int index_type; - V_Scal_Generic(space, R, alpha, - X, a); + V_Scal_Generic(space, R, alpha, X, a); } else { typedef typename XV::size_type index_type; - V_Scal_Generic(space, R, alpha, - X, a); + V_Scal_Generic(space, R, alpha, X, a); } Kokkos::Profiling::popRegion(); } @@ -181,13 +162,11 @@ struct Scal -struct Scal { +struct Scal { typedef typename XMV::size_type size_type; typedef Kokkos::ArithTraits ATA; - static void scal(const execution_space& space, const RMV& R, const AV& av, 
- const XMV& X) { + static void scal(const execution_space& space, const RMV& R, const AV& av, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Scal<2-D>: RMV is not a Kokkos::View."); @@ -206,31 +185,26 @@ struct Scal: " "XMV is not rank 2."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::scal[ETI]" - : "KokkosBlas::scal[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::scal[ETI]" + : "KokkosBlas::scal[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::scal<2D> ETI specialization for < %s , %s , %s >\n", - typeid(RMV).name(), typeid(AV).name(), typeid(XMV).name()); + printf("KokkosBlas1::scal<2D> ETI specialization for < %s , %s , %s >\n", typeid(RMV).name(), typeid(AV).name(), + typeid(XMV).name()); else - printf( - "KokkosBlas1::scal<2D> non-ETI specialization for < %s , %s , %s >\n", - typeid(RMV).name(), typeid(AV).name(), typeid(XMV).name()); + printf("KokkosBlas1::scal<2D> non-ETI specialization for < %s , %s , %s >\n", typeid(RMV).name(), + typeid(AV).name(), typeid(XMV).name()); #endif const size_type numRows = X.extent(0); const size_type numCols = X.extent(1); const int a = (av.extent(0) == 0) ? 
0 : 2; - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { + if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { typedef int index_type; - MV_Scal_Invoke_Left(space, R, - av, X, a); + MV_Scal_Invoke_Left(space, R, av, X, a); } else { typedef typename XMV::size_type index_type; - MV_Scal_Invoke_Left(space, R, - av, X, a); + MV_Scal_Invoke_Left(space, R, av, X, a); } Kokkos::Profiling::popRegion(); } @@ -243,14 +217,13 @@ struct Scal -struct Scal { +struct Scal { typedef typename XMV::non_const_value_type AV; typedef typename XMV::size_type size_type; typedef Kokkos::ArithTraits ATA; - static void scal(const execution_space& space, const RMV& R, const AV& alpha, - const XMV& X) { + static void scal(const execution_space& space, const RMV& R, const AV& alpha, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Scal<2-D, AV=scalar>: RMV is not a Kokkos::View."); @@ -263,18 +236,16 @@ struct Scal: " "XMV is not rank 2."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::scal[ETI]" - : "KokkosBlas::scal[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? 
"KokkosBlas::scal[ETI]" + : "KokkosBlas::scal[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::scal<2D> ETI specialization for < %s , %s , %s >\n", - typeid(RMV).name(), typeid(AV).name(), typeid(XMV).name()); + printf("KokkosBlas1::scal<2D> ETI specialization for < %s , %s , %s >\n", typeid(RMV).name(), typeid(AV).name(), + typeid(XMV).name()); else - printf( - "KokkosBlas1::scal<2D> non-ETI specialization for < %s , %s , %s >\n", - typeid(RMV).name(), typeid(AV).name(), typeid(XMV).name()); + printf("KokkosBlas1::scal<2D> non-ETI specialization for < %s , %s , %s >\n", typeid(RMV).name(), + typeid(AV).name(), typeid(XMV).name()); #endif const size_type numRows = X.extent(0); @@ -288,17 +259,14 @@ struct Scal(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { + if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { typedef int index_type; - MV_Scal_Invoke_Left( - space, R, alpha, X, a); + MV_Scal_Invoke_Left(space, R, alpha, X, + a); } else { typedef typename XMV::size_type index_type; - MV_Scal_Invoke_Left( - space, R, alpha, X, a); + MV_Scal_Invoke_Left(space, R, alpha, X, + a); } Kokkos::Profiling::popRegion(); } @@ -315,26 +283,22 @@ struct Scal, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_SCAL_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Scal< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 1, false, true>; -#define KOKKOSBLAS1_SCAL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct Scal< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_SCAL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Scal< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits 
>, \ + SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 1, false, true>; // @@ -343,50 +307,38 @@ struct Scal, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2, false, true>; \ - extern template struct Scal< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_SCAL_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Scal< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2, false, true>; \ + extern template struct Scal< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 2, false, true>; -#define KOKKOSBLAS1_SCAL_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template struct Scal< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2, false, true>; \ - template struct Scal< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_SCAL_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Scal< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2, false, true>; \ + template struct Scal< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 2, false, true>; #include diff --git a/blas/impl/KokkosBlas1_serial_scal_impl.hpp b/blas/impl/KokkosBlas1_serial_scal_impl.hpp index 4de4f18cc2..d783841929 100644 --- a/blas/impl/KokkosBlas1_serial_scal_impl.hpp +++ b/blas/impl/KokkosBlas1_serial_scal_impl.hpp @@ -28,8 
+28,7 @@ namespace Impl { struct SerialScaleInternal { template KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType alpha, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0) { + /* */ ValueType *KOKKOS_RESTRICT A, const int as0) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif @@ -39,10 +38,8 @@ struct SerialScaleInternal { } template - KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, - const ScalarType alpha, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, const ScalarType alpha, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { if (as0 > as1) for (int i = 0; i < m; ++i) invoke(n, alpha, A + i * as0, as1); else diff --git a/blas/impl/KokkosBlas1_set_impl.hpp b/blas/impl/KokkosBlas1_set_impl.hpp index 38604dc4b2..037720253b 100644 --- a/blas/impl/KokkosBlas1_set_impl.hpp +++ b/blas/impl/KokkosBlas1_set_impl.hpp @@ -30,8 +30,7 @@ namespace Impl { struct SerialSetInternal { template KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType alpha, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0) { + /* */ ValueType *KOKKOS_RESTRICT A, const int as0) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif @@ -41,10 +40,8 @@ struct SerialSetInternal { } template - KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, - const ScalarType alpha, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, const ScalarType alpha, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { if (as0 > as1) for (int i = 0; i < m; ++i) invoke(n, alpha, A + i * as0, as1); else @@ -59,32 +56,22 @@ struct SerialSetInternal { /// ================== struct TeamSetInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, const ScalarType alpha, 
- /* */ ValueType *KOKKOS_RESTRICT A, - const int as0) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), - [&](const int &i) { A[i * as0] = alpha; }); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const ScalarType alpha, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) { A[i * as0] = alpha; }); // member.team_barrier(); return 0; } template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, const int n, - const ScalarType alpha, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, const ScalarType alpha, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { if (m > n) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, m), [&](const int &i) { - SerialSetInternal::invoke(n, alpha, A + i * as0, as1); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), + [&](const int &i) { SerialSetInternal::invoke(n, alpha, A + i * as0, as1); }); } else { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, n), [&](const int &j) { - SerialSetInternal::invoke(m, alpha, A + j * as1, as0); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), + [&](const int &j) { SerialSetInternal::invoke(m, alpha, A + j * as1, as0); }); } // member.team_barrier(); return 0; @@ -96,36 +83,24 @@ struct TeamSetInternal { /// ======================== struct TeamVectorSetInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, const ScalarType alpha, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0) { - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), - [&](const int &i) { A[i * as0] = alpha; }); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const ScalarType alpha, + /* */ ValueType 
*KOKKOS_RESTRICT A, const int as0) { + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int &i) { A[i * as0] = alpha; }); // member.team_barrier(); return 0; } template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, const int n, - const ScalarType alpha, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, const ScalarType alpha, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { if (m > n) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, m), [&](const int &i) { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, n), - [&](const int &j) { A[i * as0 + j * as1] = alpha; }); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), [&](const int &j) { A[i * as0 + j * as1] = alpha; }); + }); } else { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, m), [&](const int &i) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, n), - [&](const int &j) { A[i * as0 + j * as1] = alpha; }); - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, m), [&](const int &i) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &j) { A[i * as0 + j * as1] = alpha; }); + }); } // member.team_barrier(); return 0; diff --git a/blas/impl/KokkosBlas1_sum_impl.hpp b/blas/impl/KokkosBlas1_sum_impl.hpp index 864c983541..222982dc24 100644 --- a/blas/impl/KokkosBlas1_sum_impl.hpp +++ b/blas/impl/KokkosBlas1_sum_impl.hpp @@ -51,8 +51,7 @@ struct V_Sum_Functor { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::V_Sum_Functor: " "X is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::V_Sum_Functor: R is const. 
" "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -75,11 +74,9 @@ struct Sum_MV_Functor { RV r; XV x; - size_type - teamsPerVec; // number of teams collectively performing a dot product + size_type teamsPerVec; // number of teams collectively performing a dot product - Sum_MV_Functor(const RV& r_, const XV& x_, int teamsPerVec_) - : r(r_), x(x_), teamsPerVec(teamsPerVec_) {} + Sum_MV_Functor(const RV& r_, const XV& x_, int teamsPerVec_) : r(r_), x(x_), teamsPerVec(teamsPerVec_) {} KOKKOS_INLINE_FUNCTION void operator()(const TeamMem& t) const { @@ -92,12 +89,10 @@ struct Sum_MV_Functor { value_type localResult = AT::zero(); Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(t, begin, end), - [&](size_type k, value_type& update) { update += x(k, i); }, + Kokkos::TeamThreadRange(t, begin, end), [&](size_type k, value_type& update) { update += x(k, i); }, localResult); - Kokkos::single(Kokkos::PerTeam(t), - [&]() { Kokkos::atomic_add(&r(i), localResult); }); + Kokkos::single(Kokkos::PerTeam(t), [&]() { Kokkos::atomic_add(&r(i), localResult); }); } }; @@ -120,27 +115,23 @@ void V_Sum_Invoke(const execution_space& space, const RV& r, const XV& X) { template void MV_Sum_Invoke( const execution_space& space, const RV& r, const XV& x, - typename std::enable_if::accessible>::type* = + typename std::enable_if::accessible>::type* = nullptr) { if (r.extent(0) != x.extent(1)) { std::ostringstream oss; - oss << "KokkosBlas::Sum (rank-2): result vector has wrong length (" - << r.extent(0) << ", but x has " << x.extent(1) << " columns)"; + oss << "KokkosBlas::Sum (rank-2): result vector has wrong length (" << r.extent(0) << ", but x has " << x.extent(1) + << " columns)"; throw std::runtime_error(oss.str()); } // Zero out the result vector - Kokkos::deep_copy( - space, r, Kokkos::ArithTraits::zero()); + Kokkos::deep_copy(space, r, Kokkos::ArithTraits::zero()); size_type teamsPerVec; - 
KokkosBlas::Impl::multipleReductionWorkDistribution( - x.extent(0), x.extent(1), teamsPerVec); + KokkosBlas::Impl::multipleReductionWorkDistribution(x.extent(0), x.extent(1), + teamsPerVec); size_type numTeams = x.extent(1) * teamsPerVec; Kokkos::TeamPolicy pol(space, numTeams, Kokkos::AUTO); - Kokkos::parallel_for( - "KokkosBlas1::Sum::S1", pol, - Sum_MV_Functor(r, x, teamsPerVec)); + Kokkos::parallel_for("KokkosBlas1::Sum::S1", pol, + Sum_MV_Functor(r, x, teamsPerVec)); } // Version for when a temporary result view is needed (implemented in terms of @@ -148,15 +139,11 @@ void MV_Sum_Invoke( template void MV_Sum_Invoke( const execution_space& space, const RV& r, const XV& x, - typename std::enable_if::accessible>::type* = - nullptr) { - Kokkos::View - tempResult( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Sum temp result"), - r.extent(0)); - MV_Sum_Invoke( - space, tempResult, x); + typename std::enable_if< + !Kokkos::SpaceAccessibility::accessible>::type* = nullptr) { + Kokkos::View tempResult( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Sum temp result"), r.extent(0)); + MV_Sum_Invoke(space, tempResult, x); Kokkos::deep_copy(space, r, tempResult); space.fence(); } diff --git a/blas/impl/KokkosBlas1_sum_spec.hpp b/blas/impl/KokkosBlas1_sum_spec.hpp index 458e7ffdb7..6df41e0309 100644 --- a/blas/impl/KokkosBlas1_sum_spec.hpp +++ b/blas/impl/KokkosBlas1_sum_spec.hpp @@ -43,17 +43,14 @@ struct sum_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. 
// -#define KOKKOSBLAS1_SUM_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct sum_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_SUM_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct sum_eti_spec_avail >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; // @@ -63,20 +60,16 @@ struct sum_eti_spec_avail { // We may spread out definitions (see _DEF macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_SUM_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct sum_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_SUM_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct sum_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -88,10 +81,9 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template < - class execution_space, class RMV, class XMV, int rank = XMV::rank, - bool tpl_spec_avail = sum_tpl_spec_avail::value, - bool eti_spec_avail = sum_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = sum_eti_spec_avail::value> struct Sum { static void sum(const execution_space& space, const RMV& R, const XMV& X); }; @@ -99,8 +91,7 @@ struct Sum { #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Sum for single vectors (1-D Views). 
template -struct Sum { +struct Sum { typedef typename XMV::size_type size_type; static void sum(const execution_space& space, const RMV& R, const XMV& X) { @@ -116,17 +107,14 @@ struct Sum: " "XMV is not rank 1."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::sum[ETI]" - : "KokkosBlas::sum[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::sum[ETI]" + : "KokkosBlas::sum[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::sum<> ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::sum<> ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); else { - printf("KokkosBlas1::sum<> non-ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::sum<> non-ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); } #endif const size_type numRows = X.extent(0); @@ -142,8 +130,7 @@ struct Sum -struct Sum { +struct Sum { typedef typename XMV::size_type size_type; static void sum(const execution_space& space, const RV& R, const XMV& X) { @@ -159,16 +146,13 @@ struct Sum: " "XMV is not rank 2."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::sum[ETI]" - : "KokkosBlas::sum[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? 
"KokkosBlas::sum[ETI]" + : "KokkosBlas::sum[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::sum<> ETI specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XMV).name()); + printf("KokkosBlas1::sum<> ETI specialization for < %s , %s >\n", typeid(RV).name(), typeid(XMV).name()); else { - printf("KokkosBlas1::sum<> non-ETI specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XMV).name()); + printf("KokkosBlas1::sum<> non-ETI specialization for < %s , %s >\n", typeid(RV).name(), typeid(XMV).name()); } #endif @@ -178,16 +162,13 @@ struct Sum(INT_MAX)) { - V_Sum_Invoke(space, - R0, X0); + V_Sum_Invoke(space, R0, X0); } else { typedef std::int64_t index_type; - V_Sum_Invoke( - space, R0, X0); + V_Sum_Invoke(space, R0, X0); } } else { - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { + if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { MV_Sum_Invoke(space, R, X); } else { typedef std::int64_t index_type; @@ -209,14 +190,11 @@ struct Sum >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_SUM_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Sum< \ + EXEC_SPACE, Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 1, false, true>; // @@ -224,13 +202,11 @@ struct Sum >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_SUM_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Sum >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 1, false, true>; // @@ -240,17 +216,13 @@ struct Sum, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_SUM_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Sum< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 2, false, true>; // @@ -258,17 +230,13 @@ 
struct Sum, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_SUM_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Sum< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 2, false, true>; #include diff --git a/blas/impl/KokkosBlas1_swap_impl.hpp b/blas/impl/KokkosBlas1_swap_impl.hpp index 32a13d6469..7d4d22b514 100644 --- a/blas/impl/KokkosBlas1_swap_impl.hpp +++ b/blas/impl/KokkosBlas1_swap_impl.hpp @@ -42,8 +42,7 @@ struct swap_functor { }; template -void Swap_Invoke(ExecutionSpace const& space, XVector const& X, - YVector const& Y) { +void Swap_Invoke(ExecutionSpace const& space, XVector const& X, YVector const& Y) { Kokkos::RangePolicy swap_policy(space, 0, X.extent(0)); swap_functor swap_func(X, Y); Kokkos::parallel_for("KokkosBlas::swap", swap_policy, swap_func); diff --git a/blas/impl/KokkosBlas1_swap_spec.hpp b/blas/impl/KokkosBlas1_swap_spec.hpp index db09a62f8f..749552a81c 100644 --- a/blas/impl/KokkosBlas1_swap_spec.hpp +++ b/blas/impl/KokkosBlas1_swap_spec.hpp @@ -44,15 +44,13 @@ struct swap_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. 
// -#define KOKKOSBLAS1_SWAP_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ - template <> \ - struct swap_eti_spec_avail< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_SWAP_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template <> \ + struct swap_eti_spec_avail< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -64,34 +62,26 @@ namespace Impl { // Unification layer template ::value, - bool eti_spec_avail = - swap_eti_spec_avail::value> + bool tpl_spec_avail = swap_tpl_spec_avail::value, + bool eti_spec_avail = swap_eti_spec_avail::value> struct Swap { - static void swap(ExecutionSpace const& space, XVector const& X, - YVector const& Y); + static void swap(ExecutionSpace const& space, XVector const& X, YVector const& Y); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Swap. template -struct Swap { - static void swap(ExecutionSpace const& space, XVector const& X, - YVector const& Y) { - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::swap[ETI]" - : "KokkosBlas::swap[noETI]"); +struct Swap { + static void swap(ExecutionSpace const& space, XVector const& X, YVector const& Y) { + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? 
"KokkosBlas::swap[ETI]" + : "KokkosBlas::swap[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::swap<> ETI specialization for < %s, %s, %s >\n", - typeid(ExecutionSpace).name(), typeid(XVector).name(), - typeid(YVector).name()); + printf("KokkosBlas1::swap<> ETI specialization for < %s, %s, %s >\n", typeid(ExecutionSpace).name(), + typeid(XVector).name(), typeid(YVector).name()); else { - printf("KokkosBlas1::swap<> non-ETI specialization for < %s, %s, %s >\n", - typeid(ExecutionSpace).name(), typeid(XVector).name(), - typeid(YVector).name()); + printf("KokkosBlas1::swap<> non-ETI specialization for < %s, %s, %s >\n", typeid(ExecutionSpace).name(), + typeid(XVector).name(), typeid(YVector).name()); } #endif Swap_Invoke(space, X, Y); @@ -110,13 +100,11 @@ struct Swap, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSBLAS1_SWAP_ETI_SPEC_DECL(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + extern template struct Swap< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ false, true>; // @@ -124,13 +112,11 @@ struct Swap, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSBLAS1_SWAP_ETI_SPEC_INST(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template struct Swap< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ false, true>; #include diff --git a/blas/impl/KokkosBlas1_team_abs_spec.hpp b/blas/impl/KokkosBlas1_team_abs_spec.hpp index bcd9545738..a5140a9b34 100644 --- a/blas/impl/KokkosBlas1_team_abs_spec.hpp +++ b/blas/impl/KokkosBlas1_team_abs_spec.hpp @@ -32,24 +32,20 @@ struct team_abs_tpl_spec_avail { }; // Unification and Specialization layer -template ::value> +template ::value> struct TeamAbs { typedef Kokkos::ArithTraits ATS; - static KOKKOS_INLINE_FUNCTION void team_abs(const TeamType& team, const RV& R, - const XV& X); + static 
KOKKOS_INLINE_FUNCTION void team_abs(const TeamType& team, const RV& R, const XV& X); }; template struct TeamAbs { typedef Kokkos::ArithTraits ATS; - static KOKKOS_INLINE_FUNCTION void team_abs(const TeamType& team, const RV& R, - const XV& X) { + static KOKKOS_INLINE_FUNCTION void team_abs(const TeamType& team, const RV& R, const XV& X) { int N = X.extent(0); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), - [&](const int& i) { R(i) = ATS::abs(X(i)); }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const int& i) { R(i) = ATS::abs(X(i)); }); } }; diff --git a/blas/impl/KokkosBlas1_team_axpby_spec.hpp b/blas/impl/KokkosBlas1_team_axpby_spec.hpp index 356be339c3..4cd42ae37d 100644 --- a/blas/impl/KokkosBlas1_team_axpby_spec.hpp +++ b/blas/impl/KokkosBlas1_team_axpby_spec.hpp @@ -33,24 +33,20 @@ struct team_axpby_tpl_spec_avail { // Unification and Specialization layer template ::value> + bool tpl_spec_avail = team_axpby_tpl_spec_avail::value> struct TeamAXPBY { - static KOKKOS_INLINE_FUNCTION void team_axpby( - const TeamType& team, const typename XVector::non_const_value_type& a, - const XVector& x, const typename YVector::non_const_value_type& b, - const YVector& y); + static KOKKOS_INLINE_FUNCTION void team_axpby(const TeamType& team, const typename XVector::non_const_value_type& a, + const XVector& x, const typename YVector::non_const_value_type& b, + const YVector& y); }; template struct TeamAXPBY { - static KOKKOS_INLINE_FUNCTION void team_axpby( - const TeamType& team, const typename XVector::non_const_value_type& a, - const XVector& x, const typename YVector::non_const_value_type& b, - const YVector& y) { + static KOKKOS_INLINE_FUNCTION void team_axpby(const TeamType& team, const typename XVector::non_const_value_type& a, + const XVector& x, const typename YVector::non_const_value_type& b, + const YVector& y) { const int N = x.extent(0); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), - [&](const int& i) { y(i) = b * y(i) + a 
* x(i); }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const int& i) { y(i) = b * y(i) + a * x(i); }); } }; diff --git a/blas/impl/KokkosBlas1_team_dot_spec.hpp b/blas/impl/KokkosBlas1_team_dot_spec.hpp index 041920d109..5c5e4ea85d 100644 --- a/blas/impl/KokkosBlas1_team_dot_spec.hpp +++ b/blas/impl/KokkosBlas1_team_dot_spec.hpp @@ -32,27 +32,20 @@ struct team_dot_tpl_spec_avail { }; // Unification and Specialization layer -template ::value> +template ::value> struct TeamDot { - typedef Kokkos::Details::InnerProductSpaceTraits< - typename XV::non_const_value_type> - IPT; + typedef Kokkos::Details::InnerProductSpaceTraits IPT; typedef typename IPT::dot_type dot_type; - static KOKKOS_INLINE_FUNCTION dot_type team_dot(const TeamType& team, - const XV& X, const YV& Y); + static KOKKOS_INLINE_FUNCTION dot_type team_dot(const TeamType& team, const XV& X, const YV& Y); }; template struct TeamDot { - typedef Kokkos::Details::InnerProductSpaceTraits< - typename XV::non_const_value_type> - IPT; + typedef Kokkos::Details::InnerProductSpaceTraits IPT; typedef typename IPT::dot_type dot_type; - static KOKKOS_INLINE_FUNCTION dot_type team_dot(const TeamType& team, - const XV& X, const YV& Y) { + static KOKKOS_INLINE_FUNCTION dot_type team_dot(const TeamType& team, const XV& X, const YV& Y) { dot_type result = 0.0; // Kokkos::ArithTraitszero(); int N = X.extent(0); Kokkos::parallel_reduce( diff --git a/blas/impl/KokkosBlas1_team_mult_spec.hpp b/blas/impl/KokkosBlas1_team_mult_spec.hpp index 381802eeb0..6138257582 100644 --- a/blas/impl/KokkosBlas1_team_mult_spec.hpp +++ b/blas/impl/KokkosBlas1_team_mult_spec.hpp @@ -33,25 +33,23 @@ struct team_mult_tpl_spec_avail { // Unification and Specialization layer template ::value> + bool tpl_spec_avail = team_mult_tpl_spec_avail::value> struct TeamMult { - static KOKKOS_INLINE_FUNCTION void team_mult( - const TeamType& team, const typename YVector::non_const_value_type& gamma, - const YVector& y, const typename 
AVector::non_const_value_type& alpha, - const AVector& a, const XVector& x); + static KOKKOS_INLINE_FUNCTION void team_mult(const TeamType& team, + const typename YVector::non_const_value_type& gamma, const YVector& y, + const typename AVector::non_const_value_type& alpha, const AVector& a, + const XVector& x); }; template struct TeamMult { - static KOKKOS_INLINE_FUNCTION void team_mult( - const TeamType& team, const typename YVector::non_const_value_type& gamma, - const YVector& y, const typename AVector::non_const_value_type& alpha, - const AVector& a, const XVector& x) { + static KOKKOS_INLINE_FUNCTION void team_mult(const TeamType& team, + const typename YVector::non_const_value_type& gamma, const YVector& y, + const typename AVector::non_const_value_type& alpha, const AVector& a, + const XVector& x) { const int N = x.extent(0); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const int& i) { - y(i) = gamma * y(i) + alpha * a(i) * x(i); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), + [&](const int& i) { y(i) = gamma * y(i) + alpha * a(i) * x(i); }); } }; diff --git a/blas/impl/KokkosBlas1_team_nrm2_spec.hpp b/blas/impl/KokkosBlas1_team_nrm2_spec.hpp index ef050cb73b..bf486d88e8 100644 --- a/blas/impl/KokkosBlas1_team_nrm2_spec.hpp +++ b/blas/impl/KokkosBlas1_team_nrm2_spec.hpp @@ -32,31 +32,22 @@ struct team_nrm2_tpl_spec_avail { }; // Unification and Specialization layer -template ::value> +template ::value> struct TeamNrm2 { - typedef typename Kokkos::Details::InnerProductSpaceTraits< - typename XV::non_const_value_type>::mag_type mag_type; - typedef Kokkos::Details::InnerProductSpaceTraits< - typename XV::non_const_value_type> - IPT; + typedef typename Kokkos::Details::InnerProductSpaceTraits::mag_type mag_type; + typedef Kokkos::Details::InnerProductSpaceTraits IPT; typedef Kokkos::ArithTraits AT; - static KOKKOS_INLINE_FUNCTION mag_type team_nrm2(const TeamType& team, - const XV& X); + static KOKKOS_INLINE_FUNCTION mag_type 
team_nrm2(const TeamType& team, const XV& X); }; template struct TeamNrm2 { - typedef typename Kokkos::Details::InnerProductSpaceTraits< - typename XV::non_const_value_type>::mag_type mag_type; - typedef Kokkos::Details::InnerProductSpaceTraits< - typename XV::non_const_value_type> - IPT; + typedef typename Kokkos::Details::InnerProductSpaceTraits::mag_type mag_type; + typedef Kokkos::Details::InnerProductSpaceTraits IPT; typedef Kokkos::ArithTraits AT; - static KOKKOS_INLINE_FUNCTION mag_type team_nrm2(const TeamType& team, - const XV& X) { + static KOKKOS_INLINE_FUNCTION mag_type team_nrm2(const TeamType& team, const XV& X) { mag_type result = 0.0; // Kokkos::ArithTraitszero(); int N = X.extent(0); Kokkos::parallel_reduce( diff --git a/blas/impl/KokkosBlas1_team_scal_impl.hpp b/blas/impl/KokkosBlas1_team_scal_impl.hpp index dc3aa4d42e..2ce2eece5e 100644 --- a/blas/impl/KokkosBlas1_team_scal_impl.hpp +++ b/blas/impl/KokkosBlas1_team_scal_impl.hpp @@ -28,32 +28,22 @@ namespace Impl { /// ==================== struct TeamScaleInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, const ScalarType alpha, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), - [&](const int &i) { A[i * as0] *= alpha; }); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const ScalarType alpha, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) { A[i * as0] *= alpha; }); // member.team_barrier(); return 0; } template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, const int n, - const ScalarType alpha, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, const ScalarType alpha, + /* */ ValueType *KOKKOS_RESTRICT A, 
const int as0, const int as1) { if (m > n) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, m), [&](const int &i) { - SerialScaleInternal::invoke(n, alpha, A + i * as0, as1); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), + [&](const int &i) { SerialScaleInternal::invoke(n, alpha, A + i * as0, as1); }); } else { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, n), [&](const int &j) { - SerialScaleInternal::invoke(m, alpha, A + j * as1, as0); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), + [&](const int &j) { SerialScaleInternal::invoke(m, alpha, A + j * as1, as0); }); } // member.team_barrier(); return 0; @@ -65,36 +55,25 @@ struct TeamScaleInternal { /// ======================== struct TeamVectorScaleInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, const ScalarType alpha, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0) { - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), - [&](const int &i) { A[i * as0] *= alpha; }); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const ScalarType alpha, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0) { + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int &i) { A[i * as0] *= alpha; }); // member.team_barrier(); return 0; } template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, const int n, - const ScalarType alpha, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, const ScalarType alpha, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { if (as0 > as1) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, m), [&](const int &i) { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, n), - [&](const int &j) { A[i * as0 + j * as1] *= alpha; }); - }); + 
Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), + [&](const int &j) { A[i * as0 + j * as1] *= alpha; }); + }); } else { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, m), [&](const int &i) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, n), - [&](const int &j) { A[i * as0 + j * as1] *= alpha; }); - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, m), [&](const int &i) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &j) { A[i * as0 + j * as1] *= alpha; }); + }); } // member.team_barrier(); return 0; diff --git a/blas/impl/KokkosBlas1_team_scal_spec.hpp b/blas/impl/KokkosBlas1_team_scal_spec.hpp index ac6d36306a..3782fb4081 100644 --- a/blas/impl/KokkosBlas1_team_scal_spec.hpp +++ b/blas/impl/KokkosBlas1_team_scal_spec.hpp @@ -32,22 +32,18 @@ struct team_scal_tpl_spec_avail { }; // Unification and Specialization layer -template ::value> +template ::value> struct TeamScal { - static KOKKOS_INLINE_FUNCTION void team_scal( - const TeamType& team, const RV& R, - const typename XV::non_const_value_type& a, const XV& X); + static KOKKOS_INLINE_FUNCTION void team_scal(const TeamType& team, const RV& R, + const typename XV::non_const_value_type& a, const XV& X); }; template struct TeamScal { - static KOKKOS_INLINE_FUNCTION void team_scal( - const TeamType& team, const RV& R, - const typename XV::non_const_value_type& a, const XV& X) { + static KOKKOS_INLINE_FUNCTION void team_scal(const TeamType& team, const RV& R, + const typename XV::non_const_value_type& a, const XV& X) { const int N = X.extent(0); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), - [&](const int& i) { R(i) = a * X(i); }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const int& i) { R(i) = a * X(i); }); } }; diff --git a/blas/impl/KokkosBlas1_team_update_spec.hpp b/blas/impl/KokkosBlas1_team_update_spec.hpp index 
94a9221f4e..2fbf071d98 100644 --- a/blas/impl/KokkosBlas1_team_update_spec.hpp +++ b/blas/impl/KokkosBlas1_team_update_spec.hpp @@ -33,27 +33,24 @@ struct team_update_tpl_spec_avail { // Unification and Specialization layer template ::value> + bool tpl_spec_avail = team_update_tpl_spec_avail::value> struct TeamUpdate { - static KOKKOS_INLINE_FUNCTION void team_update( - const TeamType& team, const typename XVector::non_const_value_type& alpha, - const XVector& x, const typename YVector::non_const_value_type& beta, - const YVector& y, const typename ZVector::non_const_value_type& gamma, - const ZVector& z); + static KOKKOS_INLINE_FUNCTION void team_update(const TeamType& team, + const typename XVector::non_const_value_type& alpha, const XVector& x, + const typename YVector::non_const_value_type& beta, const YVector& y, + const typename ZVector::non_const_value_type& gamma, const ZVector& z); }; template struct TeamUpdate { - static KOKKOS_INLINE_FUNCTION void team_update( - const TeamType& team, const typename XVector::non_const_value_type& alpha, - const XVector& x, const typename YVector::non_const_value_type& beta, - const YVector& y, const typename ZVector::non_const_value_type& gamma, - const ZVector& z) { + static KOKKOS_INLINE_FUNCTION void team_update(const TeamType& team, + const typename XVector::non_const_value_type& alpha, const XVector& x, + const typename YVector::non_const_value_type& beta, const YVector& y, + const typename ZVector::non_const_value_type& gamma, + const ZVector& z) { const int N = x.extent(0); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const int& i) { - z(i) = gamma * z(i) + alpha * x(i) + beta * y(i); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), + [&](const int& i) { z(i) = gamma * z(i) + alpha * x(i) + beta * y(i); }); } }; diff --git a/blas/impl/KokkosBlas1_update_impl.hpp b/blas/impl/KokkosBlas1_update_impl.hpp index 96aca5c70e..31502bee8b 100644 --- a/blas/impl/KokkosBlas1_update_impl.hpp +++ 
b/blas/impl/KokkosBlas1_update_impl.hpp @@ -40,8 +40,8 @@ namespace Impl { // corresponding input coefficient. Any literal coefficient of zero // has BLAS semantics of ignoring the corresponding (multi)vector // entry. -template +template struct MV_Update_Functor { typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -54,19 +54,10 @@ struct MV_Update_Functor { const typename ZMV::non_const_value_type gamma_; ZMV Z_; - MV_Update_Functor(const typename XMV::non_const_value_type& alpha, - const XMV& X, - const typename YMV::non_const_value_type& beta, - const YMV& Y, - const typename ZMV::non_const_value_type& gamma, - const ZMV& Z) - : numCols(X.extent(1)), - alpha_(alpha), - X_(X), - beta_(beta), - Y_(Y), - gamma_(gamma), - Z_(Z) { + MV_Update_Functor(const typename XMV::non_const_value_type& alpha, const XMV& X, + const typename YMV::non_const_value_type& beta, const YMV& Y, + const typename ZMV::non_const_value_type& gamma, const ZMV& Z) + : numCols(X.extent(1)), alpha_(alpha), X_(X), beta_(beta), Y_(Y), gamma_(gamma), Z_(Z) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "MV_Update_Functor: X is not a Kokkos::View."); @@ -76,17 +67,15 @@ struct MV_Update_Functor { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "MV_Update_Functor: Z is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::MV_Update_Functor: Z is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); // Casting enum values to int avoids compiler warnings about // comparing different kinds of enum values. 
- static_assert( - (int)ZMV::rank == (int)XMV::rank && (int)ZMV::rank == (int)YMV::rank, - "KokkosBlas::Impl::MV_Update_Functor: " - "X, Y, and Z must have the same rank."); + static_assert((int)ZMV::rank == (int)XMV::rank && (int)ZMV::rank == (int)YMV::rank, + "KokkosBlas::Impl::MV_Update_Functor: " + "X, Y, and Z must have the same rank."); static_assert(ZMV::rank == 2, "KokkosBlas::Impl::MV_Update_Functor: " "XMV, YMV, and ZMV must have rank 2."); @@ -209,8 +198,8 @@ struct MV_Update_Functor { // coefficients. The value 2 tells the functor to use the // corresponding input coefficient. Any literal coefficient of zero // has BLAS semantics of ignoring the corresponding vector entry. -template +template struct V_Update_Functor { typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -226,13 +215,7 @@ struct V_Update_Functor { V_Update_Functor(const typename XV::non_const_value_type& alpha, const XV& X, const typename YV::non_const_value_type& beta, const YV& Y, const typename ZV::non_const_value_type& gamma, const ZV& Z) - : numCols(X.extent(1)), - alpha_(alpha), - X_(X), - beta_(beta), - Y_(Y), - gamma_(gamma), - Z_(Z) { + : numCols(X.extent(1)), alpha_(alpha), X_(X), beta_(beta), Y_(Y), gamma_(gamma), Z_(Z) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "V_Update_Functor: X is not a Kokkos::View."); @@ -242,17 +225,15 @@ struct V_Update_Functor { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "V_Update_Functor: Z is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::V_Update_Functor: Z is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); // Casting to int avoids compiler warnings about comparing // different kinds of enum values. 
- static_assert( - (int)ZV::rank == (int)XV::rank && (int)ZV::rank == (int)YV::rank, - "KokkosBlas::Impl::V_Update_Functor: " - "X, Y, and Z must have the same rank."); + static_assert((int)ZV::rank == (int)XV::rank && (int)ZV::rank == (int)YV::rank, + "KokkosBlas::Impl::V_Update_Functor: " + "X, Y, and Z must have the same rank."); static_assert(ZV::rank == 1, "KokkosBlas::Impl::V_Update_Functor: " "XV, YV, and ZV must have rank 1."); @@ -314,15 +295,10 @@ struct V_Update_Functor { // // Any literal coefficient of zero has BLAS semantics of ignoring the // corresponding multivector entry. -template -void MV_Update_Generic(const execution_space& space, - const typename XMV::non_const_value_type& alpha, - const XMV& X, - const typename YMV::non_const_value_type& beta, - const YMV& Y, - const typename ZMV::non_const_value_type& gamma, - const ZMV& Z, int a = 2, int b = 2, int c = 2) { +template +void MV_Update_Generic(const execution_space& space, const typename XMV::non_const_value_type& alpha, const XMV& X, + const typename YMV::non_const_value_type& beta, const YMV& Y, + const typename ZMV::non_const_value_type& gamma, const ZMV& Z, int a = 2, int b = 2, int c = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "MV_Update_Generic: X is not a Kokkos::View."); @@ -332,17 +308,15 @@ void MV_Update_Generic(const execution_space& space, static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "MV_Update_Generic: Z is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::MV_Update_Generic: Z is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); // Casting to int avoids compiler warnings about comparing different // kinds of enum values. 
- static_assert( - (int)ZMV::rank == (int)XMV::rank && (int)ZMV::rank == (int)YMV::rank, - "KokkosBlas::Impl::MV_Update_Generic: " - "X, Y, and Z must have the same rank."); + static_assert((int)ZMV::rank == (int)XMV::rank && (int)ZMV::rank == (int)YMV::rank, + "KokkosBlas::Impl::MV_Update_Generic: " + "X, Y, and Z must have the same rank."); static_assert(ZMV::rank == 2, "KokkosBlas::Impl::MV_Update_Generic: " "XMV, YMV, and ZMV must have rank 2."); @@ -353,22 +327,18 @@ void MV_Update_Generic(const execution_space& space, if (a == 0) { if (b == 0) { if (c == 0) { - MV_Update_Functor op(alpha, X, beta, - Y, gamma, Z); + MV_Update_Functor op(alpha, X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update", policy, op); } else { - MV_Update_Functor op(alpha, X, beta, - Y, gamma, Z); + MV_Update_Functor op(alpha, X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update", policy, op); } } else { if (c == 0) { - MV_Update_Functor op(alpha, X, beta, - Y, gamma, Z); + MV_Update_Functor op(alpha, X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update", policy, op); } else { - MV_Update_Functor op(alpha, X, beta, - Y, gamma, Z); + MV_Update_Functor op(alpha, X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update", policy, op); } } @@ -379,22 +349,18 @@ void MV_Update_Generic(const execution_space& space, else { if (b == 0) { if (c == 0) { - MV_Update_Functor op(alpha, X, beta, - Y, gamma, Z); + MV_Update_Functor op(alpha, X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update", policy, op); } else { - MV_Update_Functor op(alpha, X, beta, - Y, gamma, Z); + MV_Update_Functor op(alpha, X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update", policy, op); } } else { if (c == 0) { - MV_Update_Functor op(alpha, X, beta, - Y, gamma, Z); + MV_Update_Functor op(alpha, X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update", policy, op); } else { - MV_Update_Functor op(alpha, X, beta, - Y, gamma, Z); + MV_Update_Functor 
op(alpha, X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update", policy, op); } } @@ -417,13 +383,9 @@ void MV_Update_Generic(const execution_space& space, // Any literal coefficient of zero has BLAS semantics of ignoring the // corresponding vector entry. template -void V_Update_Generic(const execution_space& space, - const typename XV::non_const_value_type& alpha, - const XV& X, - const typename YV::non_const_value_type& beta, - const YV& Y, - const typename ZV::non_const_value_type& gamma, - const ZV& Z, int a = 2, int b = 2, int c = 2) { +void V_Update_Generic(const execution_space& space, const typename XV::non_const_value_type& alpha, const XV& X, + const typename YV::non_const_value_type& beta, const YV& Y, + const typename ZV::non_const_value_type& gamma, const ZV& Z, int a = 2, int b = 2, int c = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "V_Update_Generic: X is not a Kokkos::View."); @@ -433,17 +395,15 @@ void V_Update_Generic(const execution_space& space, static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "V_Update_Generic: Z is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::V_Update_Generic: Z is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); // Casting to int avoids compiler warnings about comparing // different kinds of enum values. 
- static_assert( - (int)ZV::rank == (int)XV::rank && (int)ZV::rank == (int)YV::rank, - "KokkosBlas::Impl::V_Update_Generic: " - "X, Y, and Z must have the same rank."); + static_assert((int)ZV::rank == (int)XV::rank && (int)ZV::rank == (int)YV::rank, + "KokkosBlas::Impl::V_Update_Generic: " + "X, Y, and Z must have the same rank."); static_assert(ZV::rank == 1, "KokkosBlas::Impl::V_Update_Generic: " "XV, YV, and ZV must have rank 1."); @@ -454,22 +414,18 @@ void V_Update_Generic(const execution_space& space, if (a == 0) { if (b == 0) { if (c == 0) { - V_Update_Functor op(alpha, X, beta, Y, - gamma, Z); + V_Update_Functor op(alpha, X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update<0,0,0>", policy, op); } else { - V_Update_Functor op(alpha, X, beta, Y, - gamma, Z); + V_Update_Functor op(alpha, X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update<0,0,c>", policy, op); } } else { if (c == 0) { - V_Update_Functor op(alpha, X, beta, Y, - gamma, Z); + V_Update_Functor op(alpha, X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update<0,b,0>", policy, op); } else { - V_Update_Functor op(alpha, X, beta, Y, - gamma, Z); + V_Update_Functor op(alpha, X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update<0,b,c>", policy, op); } } @@ -480,22 +436,18 @@ void V_Update_Generic(const execution_space& space, else { if (b == 0) { if (c == 0) { - V_Update_Functor op(alpha, X, beta, Y, - gamma, Z); + V_Update_Functor op(alpha, X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update", policy, op); } else { - V_Update_Functor op(alpha, X, beta, Y, - gamma, Z); + V_Update_Functor op(alpha, X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update", policy, op); } } else { if (c == 0) { - V_Update_Functor op(alpha, X, beta, Y, - gamma, Z); + V_Update_Functor op(alpha, X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update", policy, op); } else { - V_Update_Functor op(alpha, X, beta, Y, - gamma, Z); + V_Update_Functor op(alpha, 
X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update", policy, op); } } diff --git a/blas/impl/KokkosBlas1_update_spec.hpp b/blas/impl/KokkosBlas1_update_spec.hpp index 9a54888012..b031a529b8 100644 --- a/blas/impl/KokkosBlas1_update_spec.hpp +++ b/blas/impl/KokkosBlas1_update_spec.hpp @@ -27,8 +27,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct update_eti_spec_avail { enum : bool { value = false }; }; @@ -42,21 +41,17 @@ struct update_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_UPDATE_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct update_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_UPDATE_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct update_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; // @@ -66,21 +61,17 @@ struct update_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. 
// -#define KOKKOSBLAS1_UPDATE_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct update_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_UPDATE_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct update_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + 2> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -103,39 +94,27 @@ namespace Impl { /// Z(i,j) = alpha*X(i,j) + beta*Y(i,j) + gamma*Z(i,j), /// /// with special cases for alpha, beta, or gamma = 0. -template ::value, - bool eti_spec_avail = - update_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = update_eti_spec_avail::value> struct Update { - static void update(const execution_space& space, - const typename XMV::non_const_value_type& alpha, - const XMV& X, - const typename YMV::non_const_value_type& beta, - const YMV& Y, - const typename ZMV::non_const_value_type& gamma, - const ZMV& Z); + static void update(const execution_space& space, const typename XMV::non_const_value_type& alpha, const XMV& X, + const typename YMV::non_const_value_type& beta, const YMV& Y, + const typename ZMV::non_const_value_type& gamma, const ZMV& Z); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY // Partial specialization for XMV, YMV, and ZMV rank-2 Views. 
template -struct Update { +struct Update { typedef typename XMV::size_type size_type; typedef Kokkos::ArithTraits ATA; typedef Kokkos::ArithTraits ATB; typedef Kokkos::ArithTraits ATC; - static void update(const execution_space& space, - const typename XMV::non_const_value_type& alpha, - const XMV& X, - const typename YMV::non_const_value_type& beta, - const YMV& Y, - const typename ZMV::non_const_value_type& gamma, - const ZMV& Z) { + static void update(const execution_space& space, const typename XMV::non_const_value_type& alpha, const XMV& X, + const typename YMV::non_const_value_type& beta, const YMV& Y, + const typename ZMV::non_const_value_type& gamma, const ZMV& Z) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Update::update: X is not a Kokkos::View."); @@ -145,32 +124,28 @@ struct Update::value, "KokkosBlas::Impl::" "Update::update: Z is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Update::update: Z is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); // Casting to int avoids compiler warnings about comparing // different kinds of enum values. - static_assert( - (int)ZMV::rank == (int)XMV::rank && (int)ZMV::rank == (int)YMV::rank, - "KokkosBlas::Impl::Update::update: " - "X, Y, and Z must have the same rank."); + static_assert((int)ZMV::rank == (int)XMV::rank && (int)ZMV::rank == (int)YMV::rank, + "KokkosBlas::Impl::Update::update: " + "X, Y, and Z must have the same rank."); static_assert(ZMV::rank == 2, "KokkosBlas::Impl::Update::update: " "XMV, YMV, and ZMV must have rank 2."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::update[ETI]" - : "KokkosBlas::update[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? 
"KokkosBlas::update[ETI]" + : "KokkosBlas::update[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::update<> ETI specialization for < %s , %s , %s >\n", - typeid(XMV).name(), typeid(YMV).name(), typeid(ZMV).name()); + printf("KokkosBlas1::update<> ETI specialization for < %s , %s , %s >\n", typeid(XMV).name(), typeid(YMV).name(), + typeid(ZMV).name()); else { - printf( - "KokkosBlas1::update<> non-ETI specialization for < %s , %s , %s >\n", - typeid(XMV).name(), typeid(YMV).name(), typeid(ZMV).name()); + printf("KokkosBlas1::update<> non-ETI specialization for < %s , %s , %s >\n", typeid(XMV).name(), + typeid(YMV).name(), typeid(ZMV).name()); } #endif @@ -203,24 +178,20 @@ struct Update(INT_MAX)) { typedef int index_type; - V_Update_Generic(space, alpha, X_0, beta, - Y_0, gamma, Z_0, a, b, c); + V_Update_Generic( + space, alpha, X_0, beta, Y_0, gamma, Z_0, a, b, c); } else { typedef typename XMV::size_type index_type; - V_Update_Generic(space, alpha, X_0, beta, - Y_0, gamma, Z_0, a, b, c); + V_Update_Generic( + space, alpha, X_0, beta, Y_0, gamma, Z_0, a, b, c); } } else { if (numRows * numCols < static_cast(INT_MAX)) { typedef int index_type; - MV_Update_Generic( - space, alpha, X, beta, Y, gamma, Z, a, b, c); + MV_Update_Generic(space, alpha, X, beta, Y, gamma, Z, a, b, c); } else { typedef typename XMV::size_type index_type; - MV_Update_Generic( - space, alpha, X, beta, Y, gamma, Z, a, b, c); + MV_Update_Generic(space, alpha, X, beta, Y, gamma, Z, a, b, c); } } Kokkos::Profiling::popRegion(); @@ -229,19 +200,15 @@ struct Update -struct Update { +struct Update { typedef typename XV::size_type size_type; typedef Kokkos::ArithTraits ATA; typedef Kokkos::ArithTraits ATB; typedef Kokkos::ArithTraits ATC; - static void update(const execution_space& space, - const typename XV::non_const_value_type& alpha, - const XV& X, const typename YV::non_const_value_type& beta, - const YV& Y, - const typename 
ZV::non_const_value_type& gamma, - const ZV& Z) { + static void update(const execution_space& space, const typename XV::non_const_value_type& alpha, const XV& X, + const typename YV::non_const_value_type& beta, const YV& Y, + const typename ZV::non_const_value_type& gamma, const ZV& Z) { // XV, YV, and ZV must be Kokkos::View specializations. static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" @@ -253,29 +220,25 @@ struct Update::update: Z is not a Kokkos::View."); // ZV must be nonconst (else it can't be an output argument). - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Update::update: Z is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert( - (int)ZV::rank == (int)XV::rank && (int)ZV::rank == (int)YV::rank, - "KokkosBlas::Impl::Update::update: " - "X, Y, and Z must have the same rank."); + static_assert((int)ZV::rank == (int)XV::rank && (int)ZV::rank == (int)YV::rank, + "KokkosBlas::Impl::Update::update: " + "X, Y, and Z must have the same rank."); static_assert(ZV::rank == 1, "KokkosBlas::Impl::Update::update: " "XV, YV, and ZV must have rank 1."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::update[ETI]" - : "KokkosBlas::update[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? 
"KokkosBlas::update[ETI]" + : "KokkosBlas::update[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::update<> ETI specialization for < %s , %s , %s >\n", - typeid(XV).name(), typeid(YV).name(), typeid(ZV).name()); + printf("KokkosBlas1::update<> ETI specialization for < %s , %s , %s >\n", typeid(XV).name(), typeid(YV).name(), + typeid(ZV).name()); else { - printf( - "KokkosBlas1::update<> non-ETI specialization for < %s , %s , %s >\n", - typeid(XV).name(), typeid(YV).name(), typeid(ZV).name()); + printf("KokkosBlas1::update<> non-ETI specialization for < %s , %s , %s >\n", typeid(XV).name(), + typeid(YV).name(), typeid(ZV).name()); } #endif @@ -299,15 +262,12 @@ struct Update(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { + if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { typedef int index_type; - V_Update_Generic( - space, alpha, X, beta, Y, gamma, Z, a, b, c); + V_Update_Generic(space, alpha, X, beta, Y, gamma, Z, a, b, c); } else { typedef typename XV::size_type index_type; - V_Update_Generic( - space, alpha, X, beta, Y, gamma, Z, a, b, c); + V_Update_Generic(space, alpha, X, beta, Y, gamma, Z, a, b, c); } Kokkos::Profiling::popRegion(); } @@ -326,32 +286,24 @@ struct Update, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_UPDATE_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Update< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ 1, false, true>; -#define KOKKOSBLAS1_UPDATE_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template struct Update< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define 
KOKKOSBLAS1_UPDATE_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Update< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ 1, false, true>; // @@ -362,32 +314,24 @@ struct Update, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_UPDATE_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Update< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ 2, false, true>; -#define KOKKOSBLAS1_UPDATE_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template struct Update< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_UPDATE_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Update< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ 2, false, true>; #include diff --git a/blas/impl/KokkosBlas2_gemv_impl.hpp b/blas/impl/KokkosBlas2_gemv_impl.hpp index dc0f531583..b1976e2622 100644 --- a/blas/impl/KokkosBlas2_gemv_impl.hpp +++ b/blas/impl/KokkosBlas2_gemv_impl.hpp @@ -29,35 +29,26 @@ namespace Impl { template struct SingleLevelNontransposeGEMV { using AlphaCoeffType = typename AViewType::non_const_value_type; using BetaCoeffType = typename YViewType::non_const_value_type; using y_value_type = typename YViewType::non_const_value_type; - using AccumScalar = typename std::conditional< - std::is_same::value || - std::is_same::value, - float, y_value_type>::type; - - SingleLevelNontransposeGEMV(const AlphaCoeffType& alpha, const AViewType& A, - const XViewType& x, const 
BetaCoeffType& beta, - const YViewType& y) + using AccumScalar = typename std::conditional::value || + std::is_same::value, + float, y_value_type>::type; + + SingleLevelNontransposeGEMV(const AlphaCoeffType& alpha, const AViewType& A, const XViewType& x, + const BetaCoeffType& beta, const YViewType& y) : alpha_(alpha), A_(A), x_(x), beta_(beta), y_(y) { - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "XViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "YViewType must be a Kokkos::View."); - static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); - static_assert(static_cast(XViewType::rank) == 1, - "XViewType must have rank 1."); - static_assert(static_cast(YViewType::rank) == 1, - "YViewType must have rank 1."); - static_assert(std::is_integral::value, - "IndexType must be an integer."); + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "YViewType must be a Kokkos::View."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); + static_assert(static_cast(YViewType::rank) == 1, "YViewType must have rank 1."); + static_assert(std::is_integral::value, "IndexType must be an integer."); static_assert(alphaPreset == 0 || alphaPreset == 1 || alphaPreset == -1, "Invalid alphaPreset value; valid values are 0, 1, and -1."); static_assert(betaPreset == 0 || betaPreset == 1 || betaPreset == -1, @@ -112,43 +103,29 @@ struct SingleLevelNontransposeGEMV { template struct SingleLevelTransposeGEMV { using y_value_type = typename YViewType::non_const_value_type; using AlphaCoeffType = typename AViewType::non_const_value_type; using BetaCoeffType = typename YViewType::non_const_value_type; - using 
AccumScalar = typename std::conditional< - std::is_same::value || - std::is_same::value, - float, y_value_type>::type; + using AccumScalar = typename std::conditional::value || + std::is_same::value, + float, y_value_type>::type; typedef AccumScalar value_type[]; IndexType value_count; // Kokkos needs this for reductions w/ array results - SingleLevelTransposeGEMV(const AlphaCoeffType& alpha, const AViewType& A, - const XViewType& x, const BetaCoeffType& beta, - const YViewType& y) - : value_count(A.extent(1)), - alpha_(alpha), - A_(A), - x_(x), - beta_(beta), - y_(y) { - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "XViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "YViewType must be a Kokkos::View."); - static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); - static_assert(static_cast(XViewType::rank) == 1, - "XViewType must have rank 1."); - static_assert(static_cast(YViewType::rank) == 1, - "YViewType must have rank 1."); - static_assert(std::is_integral::value, - "IndexType must be an integer."); + SingleLevelTransposeGEMV(const AlphaCoeffType& alpha, const AViewType& A, const XViewType& x, + const BetaCoeffType& beta, const YViewType& y) + : value_count(A.extent(1)), alpha_(alpha), A_(A), x_(x), beta_(beta), y_(y) { + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "YViewType must be a Kokkos::View."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); + static_assert(static_cast(YViewType::rank) == 1, "YViewType must have rank 1."); + static_assert(std::is_integral::value, "IndexType must be an integer."); static_assert(alphaPreset == 0 || alphaPreset == 1 || alphaPreset 
== -1, "Invalid alphaPreset value; valid values are 0, 1, and -1."); static_assert(betaPreset == 0 || betaPreset == 1 || betaPreset == -1, @@ -178,8 +155,7 @@ struct SingleLevelTransposeGEMV { } } - KOKKOS_INLINE_FUNCTION void operator()(const IndexType& i, - value_type y_cur) const { + KOKKOS_INLINE_FUNCTION void operator()(const IndexType& i, value_type y_cur) const { using Kokkos::ArithTraits; using KAT = ArithTraits; @@ -199,27 +175,18 @@ struct SingleLevelTransposeGEMV { }; // Single-level parallel version of GEMV. -template -void singleLevelGemv(const ExecutionSpace& space, const char trans[], - typename AViewType::const_value_type& alpha, - const AViewType& A, const XViewType& x, - typename YViewType::const_value_type& beta, +template +void singleLevelGemv(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, + const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, const YViewType& y) { - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "XViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "YViewType must be a Kokkos::View."); - static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); - static_assert(static_cast(XViewType::rank) == 1, - "XViewType must have rank 1."); - static_assert(static_cast(YViewType::rank) == 1, - "YViewType must have rank 1."); - static_assert(std::is_integral::value, - "IndexType must be an integer"); + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "YViewType must be a Kokkos::View."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); + static_assert(static_cast(YViewType::rank) == 1, 
"YViewType must have rank 1."); + static_assert(std::is_integral::value, "IndexType must be an integer"); using y_value_type = typename YViewType::non_const_value_type; using policy_type = Kokkos::RangePolicy; @@ -242,12 +209,9 @@ void singleLevelGemv(const ExecutionSpace& space, const char trans[], // "Fake out" a scal() by using the non-transpose alpha=0, // general beta case. This assumes that the functor doesn't // check dimensions. - using functor_type = - SingleLevelNontransposeGEMV; + using functor_type = SingleLevelNontransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]", - policy_type(0, A.extent(1)), functor); + Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]", policy_type(0, A.extent(1)), functor); } return; } @@ -260,49 +224,35 @@ void singleLevelGemv(const ExecutionSpace& space, const char trans[], } else if (beta == Kokkos::ArithTraits::one()) { // Do nothing (y := 1 * y) } else { // beta != 0 && beta != 1 - using functor_type = - SingleLevelNontransposeGEMV; + using functor_type = SingleLevelNontransposeGEMV; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]", range, functor); } } else if (alpha == Kokkos::ArithTraits::one()) { if (beta == Kokkos::ArithTraits::zero()) { - using functor_type = - SingleLevelNontransposeGEMV; + using functor_type = SingleLevelNontransposeGEMV; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]", range, functor); } else if (beta == Kokkos::ArithTraits::one()) { - using functor_type = - SingleLevelNontransposeGEMV; + using functor_type = SingleLevelNontransposeGEMV; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]", range, functor); } else { // beta != 0 && beta != 1 - using functor_type = - SingleLevelNontransposeGEMV; + using functor_type = SingleLevelNontransposeGEMV; functor_type functor(alpha, A, x, beta, y); 
Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]", range, functor); } } else { // alpha != 0 and alpha != 1 if (beta == Kokkos::ArithTraits::zero()) { - using functor_type = - SingleLevelNontransposeGEMV; + using functor_type = SingleLevelNontransposeGEMV; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]", range, functor); } else if (beta == Kokkos::ArithTraits::one()) { - using functor_type = - SingleLevelNontransposeGEMV; + using functor_type = SingleLevelNontransposeGEMV; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]", range, functor); } else { // beta != 0 && beta != 1 - using functor_type = - SingleLevelNontransposeGEMV; + using functor_type = SingleLevelNontransposeGEMV; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]", range, functor); } @@ -315,58 +265,37 @@ void singleLevelGemv(const ExecutionSpace& space, const char trans[], } else if (beta == Kokkos::ArithTraits::one()) { // Do nothing (y := 1 * y) } else { // beta != 0 && beta != 1 - using functor_type = - SingleLevelTransposeGEMV; + using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, - functor); + Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } } else if (alpha == Kokkos::ArithTraits::one()) { if (beta == Kokkos::ArithTraits::zero()) { - using functor_type = - SingleLevelTransposeGEMV; + using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, - functor); + Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } else if (beta == Kokkos::ArithTraits::one()) { - using functor_type = - SingleLevelTransposeGEMV; + using functor_type = SingleLevelTransposeGEMV; functor_type 
functor(alpha, A, x, beta, y); - Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, - functor); + Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } else { // beta != 0 && beta != 1 - using functor_type = - SingleLevelTransposeGEMV; + using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, - functor); + Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } } else { // alpha != 0 and alpha != 1 if (beta == Kokkos::ArithTraits::zero()) { - using functor_type = - SingleLevelTransposeGEMV; + using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, - functor); + Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } else if (beta == Kokkos::ArithTraits::one()) { - using functor_type = - SingleLevelTransposeGEMV; + using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, - functor); + Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } else { // beta != 0 && beta != 1 - using functor_type = - SingleLevelTransposeGEMV; + using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, - functor); + Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } } } else if (tr == 'C' || tr == 'c' || tr == 'H' || tr == 'h') { // conj xpose @@ -377,58 +306,37 @@ void singleLevelGemv(const ExecutionSpace& space, const char trans[], } else if (beta == Kokkos::ArithTraits::one()) { // Do nothing (y := 1 * y) } else { // beta != 0 && beta != 1 - using functor_type = - 
SingleLevelTransposeGEMV; + using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, - functor); + Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } } else if (alpha == Kokkos::ArithTraits::one()) { if (beta == Kokkos::ArithTraits::zero()) { - using functor_type = - SingleLevelTransposeGEMV; + using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, - functor); + Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } else if (beta == Kokkos::ArithTraits::one()) { - using functor_type = - SingleLevelTransposeGEMV; + using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, - functor); + Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } else { // beta != 0 && beta != 1 - using functor_type = - SingleLevelTransposeGEMV; + using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, - functor); + Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } } else { // alpha != 0 and alpha != 1 if (beta == Kokkos::ArithTraits::zero()) { - using functor_type = - SingleLevelTransposeGEMV; + using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, - functor); + Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } else if (beta == Kokkos::ArithTraits::one()) { - using functor_type = - SingleLevelTransposeGEMV; + using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, 
beta, y); - Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, - functor); + Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } else { // beta != 0 && beta != 1 - using functor_type = - SingleLevelTransposeGEMV; + using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, - functor); + Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } } } @@ -440,38 +348,29 @@ struct TwoLevelGEMV_LayoutRightTag {}; // --------------------------------------------------------------------------------------------- // Functor for a two-level parallel_reduce version of GEMV (non-transpose), // designed for performance on GPU. Kernel depends on the layout of A. -template +template struct TwoLevelGEMV { using y_value_type = typename YViewType::non_const_value_type; using AlphaCoeffType = typename AViewType::non_const_value_type; using BetaCoeffType = typename YViewType::non_const_value_type; - using AccumScalar = typename std::conditional< - std::is_same::value || - std::is_same::value, - float, y_value_type>::type; + using AccumScalar = typename std::conditional::value || + std::is_same::value, + float, y_value_type>::type; using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; - TwoLevelGEMV(const AlphaCoeffType& alpha, const AViewType& A, - const XViewType& x, const BetaCoeffType& beta, + TwoLevelGEMV(const AlphaCoeffType& alpha, const AViewType& A, const XViewType& x, const BetaCoeffType& beta, const YViewType& y) : alpha_(alpha), A_(A), x_(x), beta_(beta), y_(y) { - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "XViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "YViewType must be a Kokkos::View."); - static_assert(static_cast(AViewType::rank) == 2, - 
"AViewType must have rank 2."); - static_assert(static_cast(XViewType::rank) == 1, - "XViewType must have rank 1."); - static_assert(static_cast(YViewType::rank) == 1, - "YViewType must have rank 1."); - static_assert(std::is_integral::value, - "IndexType must be an integer."); + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "YViewType must be a Kokkos::View."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); + static_assert(static_cast(YViewType::rank) == 1, "YViewType must have rank 1."); + static_assert(std::is_integral::value, "IndexType must be an integer."); } public: @@ -480,15 +379,12 @@ struct TwoLevelGEMV { // -Groups of 32 threads handle N/teamsize columns sequentially, placing // results into shared. -Then individual thread results are combined with // parallel_reduce. 
- KOKKOS_INLINE_FUNCTION void operator()(TwoLevelGEMV_LayoutLeftTag, - const member_type& team) const { + KOKKOS_INLINE_FUNCTION void operator()(TwoLevelGEMV_LayoutLeftTag, const member_type& team) const { using KAT = Kokkos::ArithTraits; using AKAT = Kokkos::ArithTraits; // Allocate a Scalar in shared for each thread - AccumScalar* blockResult = - (AccumScalar*)team.team_shmem().get_shmem(32 * sizeof(AccumScalar)); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 32), - [&](int i) { blockResult[i] = AKAT::zero(); }); + AccumScalar* blockResult = (AccumScalar*)team.team_shmem().get_shmem(32 * sizeof(AccumScalar)); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 32), [&](int i) { blockResult[i] = AKAT::zero(); }); team.team_barrier(); // Which block this thread will work on int block = team.team_rank() / 32; @@ -498,9 +394,7 @@ struct TwoLevelGEMV { AccumScalar localSum = AKAT::zero(); // compute local sum if (row < (IndexType)A_.extent(0)) { - for (IndexType col = blockColStart; - col < blockColStart + columnsPerThread && col < A_.extent(1); - col++) { + for (IndexType col = blockColStart; col < blockColStart + columnsPerThread && col < A_.extent(1); col++) { // A access is coalesced, x access is a broadcast localSum += AccumScalar(A_(row, col)) * AccumScalar(x_(col)); } @@ -514,15 +408,13 @@ struct TwoLevelGEMV { if (beta_ == KAT::zero()) y_(yrow) = y_value_type(alpha_ * blockResult[i]); else - y_(yrow) = y_value_type(beta_ * AccumScalar(y_(yrow)) + - alpha_ * blockResult[i]); + y_(yrow) = y_value_type(beta_ * AccumScalar(y_(yrow)) + alpha_ * blockResult[i]); } }); } // LayoutRight version: one team per row - KOKKOS_INLINE_FUNCTION void operator()(TwoLevelGEMV_LayoutRightTag, - const member_type& team) const { + KOKKOS_INLINE_FUNCTION void operator()(TwoLevelGEMV_LayoutRightTag, const member_type& team) const { using KAT = Kokkos::ArithTraits; const IndexType N = A_.extent(1); @@ -532,10 +424,7 @@ struct TwoLevelGEMV { AccumScalar val; 
Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, N), - [&](const int j, AccumScalar& update) { - update += AccumScalar(A_(i, j)) * x_(j); - }, - val); + [&](const int j, AccumScalar& update) { update += AccumScalar(A_(i, j)) * x_(j); }, val); // compute yj = beta*yj + alpha*val Kokkos::single(Kokkos::PerTeam(team), [&]() { @@ -561,39 +450,29 @@ struct TwoLevelGEMV { // transpose GEMV. The functor uses parallel-for over the columns of the input // matrix A and each team uses parallel-reduce over the row of its column. // The output vector y is the reduction result. -template struct TwoLevelTransposeGEMV { using y_value_type = typename YViewType::non_const_value_type; using AlphaCoeffType = typename AViewType::non_const_value_type; using BetaCoeffType = typename YViewType::non_const_value_type; - using AccumScalar = typename std::conditional< - std::is_same::value || - std::is_same::value, - float, y_value_type>::type; + using AccumScalar = typename std::conditional::value || + std::is_same::value, + float, y_value_type>::type; using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; - TwoLevelTransposeGEMV(const AlphaCoeffType& alpha, const AViewType& A, - const XViewType& x, const BetaCoeffType& beta, + TwoLevelTransposeGEMV(const AlphaCoeffType& alpha, const AViewType& A, const XViewType& x, const BetaCoeffType& beta, const YViewType& y) : alpha_(alpha), A_(A), x_(x), beta_(beta), y_(y) { - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "XViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "YViewType must be a Kokkos::View."); - static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); - static_assert(static_cast(XViewType::rank) == 1, - "XViewType must have rank 1."); - static_assert(static_cast(YViewType::rank) == 1, - "YViewType must have rank 1."); - static_assert(std::is_integral::value, - 
"IndexType must be an integer."); + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "YViewType must be a Kokkos::View."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); + static_assert(static_cast(YViewType::rank) == 1, "YViewType must have rank 1."); + static_assert(std::is_integral::value, "IndexType must be an integer."); } public: @@ -634,27 +513,18 @@ struct TwoLevelTransposeGEMV { }; // Two-level parallel version of GEMV. -template -void twoLevelGemv(const ExecutionSpace& space, const char trans[], - typename AViewType::const_value_type& alpha, - const AViewType& A, const XViewType& x, - typename YViewType::const_value_type& beta, +template +void twoLevelGemv(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, + const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, const YViewType& y) { - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "XViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "YViewType must be a Kokkos::View."); - static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); - static_assert(static_cast(XViewType::rank) == 1, - "XViewType must have rank 1."); - static_assert(static_cast(YViewType::rank) == 1, - "YViewType must have rank 1."); - static_assert(std::is_integral::value, - "IndexType must be an integer"); + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "YViewType must be a Kokkos::View."); + static_assert(static_cast(AViewType::rank) == 2, 
"AViewType must have rank 2."); + static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); + static_assert(static_cast(YViewType::rank) == 1, "YViewType must have rank 1."); + static_assert(std::is_integral::value, "IndexType must be an integer"); using y_value_type = typename YViewType::non_const_value_type; using team_policy_type = Kokkos::TeamPolicy; @@ -681,40 +551,33 @@ void twoLevelGemv(const ExecutionSpace& space, const char trans[], // "Fake out" a scal() by using the non-transpose alpha=0, // general beta case. This assumes that the functor doesn't // check dimensions. - using functor_type = - SingleLevelNontransposeGEMV; + using functor_type = SingleLevelNontransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]", - range_policy_type(space, 0, y.extent(0)), functor); + Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]", range_policy_type(space, 0, y.extent(0)), functor); } return; } if (tr == 'N') { - constexpr bool isLayoutLeft = std::is_same::value; + constexpr bool isLayoutLeft = std::is_same::value; // Both kernels work for both layouts - the only difference is access // pattern. 
using layout_tag = - typename std::conditional::type; + typename std::conditional::type; using tagged_policy = Kokkos::TeamPolicy; - using functor_type = TwoLevelGEMV; + using functor_type = TwoLevelGEMV; functor_type functor(alpha, A, x, beta, y); tagged_policy team; if constexpr (isLayoutLeft) { - using AccumScalar = typename std::conditional< - std::is_same::value || - std::is_same::value, - float, y_value_type>::type; + using AccumScalar = + typename std::conditional::value || + std::is_same::value, + float, y_value_type>::type; size_t sharedPerTeam = 32 * sizeof(AccumScalar); IndexType numTeams = (A.extent(0) + 31) / 32; tagged_policy temp(space, 1, 1); temp.set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam)); - int teamSize = - temp.team_size_recommended(functor, Kokkos::ParallelForTag()); + int teamSize = temp.team_size_recommended(functor, Kokkos::ParallelForTag()); // make sure teamSize is a multiple of 32 teamSize -= teamSize % 32; // don't make teamSize larger than what's useful @@ -728,8 +591,7 @@ void twoLevelGemv(const ExecutionSpace& space, const char trans[], #endif int numBlocks = teamSize / 32; functor.columnsPerThread = (A.extent(1) + numBlocks - 1) / numBlocks; - team = tagged_policy(space, numTeams, teamSize) - .set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam)); + team = tagged_policy(space, numTeams, teamSize).set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam)); } else { // LayoutRight: one team per row team = tagged_policy(space, A.extent(0), Kokkos::AUTO); @@ -744,21 +606,15 @@ void twoLevelGemv(const ExecutionSpace& space, const char trans[], } else if (tr == 'T') { // transpose, and not conj transpose team_policy_type team(space, A.extent(1), Kokkos::AUTO); - using functor_type = - TwoLevelTransposeGEMV; + using functor_type = TwoLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_for("KokkosBlas::gemv[twoLevelTranspose]", team, - functor); + Kokkos::parallel_for("KokkosBlas::gemv[twoLevelTranspose]", team, 
functor); } else if (tr == 'C' || tr == 'H') { // conjugate transpose team_policy_type team(space, A.extent(1), Kokkos::AUTO); - using functor_type = - TwoLevelTransposeGEMV; + using functor_type = TwoLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_for("KokkosBlas::gemv[twoLevelTranspose]", team, - functor); + Kokkos::parallel_for("KokkosBlas::gemv[twoLevelTranspose]", team, functor); } } } @@ -766,26 +622,18 @@ void twoLevelGemv(const ExecutionSpace& space, const char trans[], // generalGemv: use 1 level (Range) or 2 level (Team) implementation, // depending on whether execution space is CPU or GPU. enable_if makes sure // unused kernels are not instantiated. -template ()>::type* = nullptr> -void generalGemvImpl(const ExecutionSpace& space, const char trans[], - typename AViewType::const_value_type& alpha, - const AViewType& A, const XViewType& x, - typename YViewType::const_value_type& beta, +template ()>::type* = nullptr> +void generalGemvImpl(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, + const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, const YViewType& y) { singleLevelGemv(space, trans, alpha, A, x, beta, y); } -template ()>::type* = nullptr> -void generalGemvImpl(const ExecutionSpace& space, const char trans[], - typename AViewType::const_value_type& alpha, - const AViewType& A, const XViewType& x, - typename YViewType::const_value_type& beta, +template ()>::type* = nullptr> +void generalGemvImpl(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, + const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, const YViewType& y) { twoLevelGemv(space, trans, alpha, A, x, beta, y); } diff --git a/blas/impl/KokkosBlas2_gemv_spec.hpp b/blas/impl/KokkosBlas2_gemv_spec.hpp index 97e6e2717e..05e2d28bc7 100644 --- a/blas/impl/KokkosBlas2_gemv_spec.hpp +++ 
b/blas/impl/KokkosBlas2_gemv_spec.hpp @@ -41,19 +41,16 @@ struct gemv_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS2_GEMV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct gemv_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GEMV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct gemv_eti_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -68,47 +65,32 @@ namespace Impl { // // Implementation of KokkosBlas::gemv. -template < - class ExecutionSpace, class AViewType, class XViewType, class YViewType, - bool tpl_spec_avail = gemv_tpl_spec_avail::value, - bool eti_spec_avail = gemv_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = gemv_eti_spec_avail::value> struct GEMV { - static void gemv(const ExecutionSpace& space, const char trans[], - typename AViewType::const_value_type& alpha, - const AViewType& A, const XViewType& x, - typename YViewType::const_value_type& beta, + static void gemv(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, + const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, const YViewType& y) #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY { - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "XViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "YViewType must be a Kokkos::View."); - static_assert(static_cast(AViewType::rank) == 2, - "AViewType 
must have rank 2."); - static_assert(static_cast(XViewType::rank) == 1, - "XViewType must have rank 1."); - static_assert(static_cast(YViewType::rank) == 1, - "YViewType must have rank 1."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::gemv[ETI]" - : "KokkosBlas::gemv[noETI]"); + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "YViewType must be a Kokkos::View."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); + static_assert(static_cast(YViewType::rank) == 1, "YViewType must have rank 1."); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::gemv[ETI]" + : "KokkosBlas::gemv[noETI]"); typedef typename AViewType::size_type size_type; const size_type numRows = A.extent(0); const size_type numCols = A.extent(1); // Prefer int as the index type, but use a larger type if needed. - if (numRows < static_cast(INT_MAX) && - numCols < static_cast(INT_MAX)) { - generalGemvImpl( - space, trans, alpha, A, x, beta, y); + if (numRows < static_cast(INT_MAX) && numCols < static_cast(INT_MAX)) { + generalGemvImpl(space, trans, alpha, A, x, beta, y); } else { - generalGemvImpl( - space, trans, alpha, A, x, beta, y); + generalGemvImpl(space, trans, alpha, A, x, beta, y); } Kokkos::Profiling::popRegion(); } @@ -129,30 +111,24 @@ struct GEMV { // one or more .cpp files. 
// -#define KOKKOSBLAS2_GEMV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - extern template struct GEMV< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS2_GEMV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct GEMV< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ false, true>; -#define KOKKOSBLAS2_GEMV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct GEMV< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS2_GEMV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct GEMV< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ false, true>; #include diff --git a/blas/impl/KokkosBlas2_ger_impl.hpp b/blas/impl/KokkosBlas2_ger_impl.hpp index 651db7f11a..94eb1868f9 100644 --- a/blas/impl/KokkosBlas2_ger_impl.hpp +++ b/blas/impl/KokkosBlas2_ger_impl.hpp @@ -34,8 +34,8 @@ struct ThreadParallelGER { using YComponentType = typename YViewType::non_const_value_type; using AComponentType = typename AViewType::non_const_value_type; - ThreadParallelGER(const bool justTranspose, const AlphaCoeffType& alpha, - const XViewType& x, const YViewType& y, const AViewType& A) + ThreadParallelGER(const bool justTranspose, const AlphaCoeffType& alpha, const XViewType& x, const YViewType& y, + const AViewType& A) : justTranspose_(justTranspose), alpha_(alpha), x_(x), y_(y), A_(A) { // Nothing to do } @@ -53,9 +53,7 @@ struct ThreadParallelGER { } } else { for (IndexType j = 0; j < N; ++j) { - A_(i, j) += - AComponentType(alpha_ * x_fixed * - 
Kokkos::ArithTraits::conj(y_(j))); + A_(i, j) += AComponentType(alpha_ * x_fixed * Kokkos::ArithTraits::conj(y_(j))); } } } @@ -70,14 +68,12 @@ struct ThreadParallelGER { }; // Thread parallel version of GER. -template +template void threadParallelGer(const ExecutionSpace& space, const char trans[], - const typename AViewType::const_value_type& alpha, - const XViewType& x, const YViewType& y, + const typename AViewType::const_value_type& alpha, const XViewType& x, const YViewType& y, const AViewType& A) { - static_assert(std::is_integral::value, - "IndexType must be an integer"); + static_assert(std::is_integral::value, "IndexType must be an integer"); using AlphaCoeffType = typename AViewType::non_const_value_type; @@ -88,12 +84,10 @@ void threadParallelGer(const ExecutionSpace& space, const char trans[], } else if (alpha == Kokkos::ArithTraits::zero()) { // no entries to update } else { - Kokkos::RangePolicy rangePolicy(space, 0, - A.extent(0)); - ThreadParallelGER functor( - (trans[0] == 'T') || (trans[0] == 't'), alpha, x, y, A); - Kokkos::parallel_for("KokkosBlas::ger[threadParallel]", rangePolicy, - functor); + Kokkos::RangePolicy rangePolicy(space, 0, A.extent(0)); + ThreadParallelGER functor((trans[0] == 'T') || (trans[0] == 't'), alpha, + x, y, A); + Kokkos::parallel_for("KokkosBlas::ger[threadParallel]", rangePolicy, functor); } } @@ -104,8 +98,7 @@ struct TeamParallelGER_LayoutRightTag {}; // Functor for the team parallel version of GER, designed for // performance on GPU. The kernel depends on the layout of A. 
-template +template struct TeamParallelGER { using AlphaCoeffType = typename AViewType::non_const_value_type; using XComponentType = typename XViewType::non_const_value_type; @@ -115,16 +108,15 @@ struct TeamParallelGER { using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; - TeamParallelGER(const bool justTranspose, const AlphaCoeffType& alpha, - const XViewType& x, const YViewType& y, const AViewType& A) + TeamParallelGER(const bool justTranspose, const AlphaCoeffType& alpha, const XViewType& x, const YViewType& y, + const AViewType& A) : justTranspose_(justTranspose), alpha_(alpha), x_(x), y_(y), A_(A) { // Nothing to do } public: // LayoutLeft version: one team per column - KOKKOS_INLINE_FUNCTION void operator()(TeamParallelGER_LayoutLeftTag, - const member_type& team) const { + KOKKOS_INLINE_FUNCTION void operator()(TeamParallelGER_LayoutLeftTag, const member_type& team) const { if (alpha_ == Kokkos::ArithTraits::zero()) { // Nothing to do } else { @@ -132,24 +124,18 @@ struct TeamParallelGER { const IndexType j(team.league_rank()); if (justTranspose_) { const YComponentType y_fixed(y_(j)); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { - A_(i, j) += AComponentType(alpha_ * x_(i) * y_fixed); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), + [&](const IndexType& i) { A_(i, j) += AComponentType(alpha_ * x_(i) * y_fixed); }); } else { - const YComponentType y_fixed( - Kokkos::ArithTraits::conj(y_(j))); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { - A_(i, j) += AComponentType(alpha_ * x_(i) * y_fixed); - }); + const YComponentType y_fixed(Kokkos::ArithTraits::conj(y_(j))); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), + [&](const IndexType& i) { A_(i, j) += AComponentType(alpha_ * x_(i) * y_fixed); }); } } } // LayoutRight version: one team per row - KOKKOS_INLINE_FUNCTION void 
operator()(TeamParallelGER_LayoutRightTag, - const member_type& team) const { + KOKKOS_INLINE_FUNCTION void operator()(TeamParallelGER_LayoutRightTag, const member_type& team) const { if (alpha_ == Kokkos::ArithTraits::zero()) { // Nothing to do } else { @@ -157,17 +143,12 @@ struct TeamParallelGER { const IndexType i(team.league_rank()); const XComponentType x_fixed(x_(i)); if (justTranspose_) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { - A_(i, j) += AComponentType(alpha_ * x_fixed * y_(j)); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), + [&](const IndexType& j) { A_(i, j) += AComponentType(alpha_ * x_fixed * y_(j)); }); } else { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { - A_(i, j) += AComponentType( - alpha_ * x_fixed * - Kokkos::ArithTraits::conj(y_(j))); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + A_(i, j) += AComponentType(alpha_ * x_fixed * Kokkos::ArithTraits::conj(y_(j))); + }); } } } @@ -181,14 +162,11 @@ struct TeamParallelGER { }; // Team parallel version of GER. 
-template -void teamParallelGer(const ExecutionSpace& space, const char trans[], - const typename AViewType::const_value_type& alpha, - const XViewType& x, const YViewType& y, - const AViewType& A) { - static_assert(std::is_integral::value, - "IndexType must be an integer"); +template +void teamParallelGer(const ExecutionSpace& space, const char trans[], const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, const AViewType& A) { + static_assert(std::is_integral::value, "IndexType must be an integer"); using AlphaCoeffType = typename AViewType::non_const_value_type; @@ -203,11 +181,9 @@ void teamParallelGer(const ExecutionSpace& space, const char trans[], return; } - constexpr bool isLayoutLeft = - std::is_same::value; + constexpr bool isLayoutLeft = std::is_same::value; using layout_tag = - typename std::conditional::type; + typename std::conditional::type; using TeamPolicyType = Kokkos::TeamPolicy; TeamPolicyType teamPolicy; if (isLayoutLeft) { @@ -218,8 +194,8 @@ void teamParallelGer(const ExecutionSpace& space, const char trans[], teamPolicy = TeamPolicyType(space, A.extent(0), Kokkos::AUTO); } - TeamParallelGER - functor((trans[0] == 'T') || (trans[0] == 't'), alpha, x, y, A); + TeamParallelGER functor( + (trans[0] == 'T') || (trans[0] == 't'), alpha, x, y, A); Kokkos::parallel_for("KokkosBlas::ger[teamParallel]", teamPolicy, functor); } @@ -231,25 +207,17 @@ void teamParallelGer(const ExecutionSpace& space, const char trans[], // // The 'enable_if' makes sure unused kernels are not instantiated. 
-template ()>::type* = nullptr> -void generalGerImpl(const ExecutionSpace& space, const char trans[], - const typename AViewType::const_value_type& alpha, - const XViewType& x, const YViewType& y, - const AViewType& A) { +template ()>::type* = nullptr> +void generalGerImpl(const ExecutionSpace& space, const char trans[], const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, const AViewType& A) { threadParallelGer(space, trans, alpha, x, y, A); } -template ()>::type* = nullptr> -void generalGerImpl(const ExecutionSpace& space, const char trans[], - const typename AViewType::const_value_type& alpha, - const XViewType& x, const YViewType& y, - const AViewType& A) { +template ()>::type* = nullptr> +void generalGerImpl(const ExecutionSpace& space, const char trans[], const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, const AViewType& A) { teamParallelGer(space, trans, alpha, x, y, A); } diff --git a/blas/impl/KokkosBlas2_ger_spec.hpp b/blas/impl/KokkosBlas2_ger_spec.hpp index 9802194b98..04e25ab422 100644 --- a/blas/impl/KokkosBlas2_ger_spec.hpp +++ b/blas/impl/KokkosBlas2_ger_spec.hpp @@ -40,19 +40,16 @@ struct ger_eti_spec_avail { // specializations go in this header file. We may spread out definitions (see // _INST macro below) across one or more .cpp files. 
// -#define KOKKOSBLAS2_GER_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct ger_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GER_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct ger_eti_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -67,34 +64,26 @@ namespace Impl { // // Implementation of KokkosBlas::ger. -template ::value, - bool eti_spec_avail = ger_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = ger_eti_spec_avail::value> struct GER { - static void ger(const ExecutionSpace& space, const char trans[], - const typename AViewType::const_value_type& alpha, + static void ger(const ExecutionSpace& space, const char trans[], const typename AViewType::const_value_type& alpha, const XViewType& x, const YViewType& y, const AViewType& A) #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY { - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::ger[ETI]" - : "KokkosBlas::ger[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::ger[ETI]" + : "KokkosBlas::ger[noETI]"); typedef typename AViewType::size_type size_type; const size_type numRows = A.extent(0); const size_type numCols = A.extent(1); // Prefer int as the index type, but use a larger type if needed. 
- if ((numRows < static_cast(INT_MAX)) && - (numCols < static_cast(INT_MAX))) { - generalGerImpl( - space, trans, alpha, x, y, A); + if ((numRows < static_cast(INT_MAX)) && (numCols < static_cast(INT_MAX))) { + generalGerImpl(space, trans, alpha, x, y, A); } else { - generalGerImpl( - space, trans, alpha, x, y, A); + generalGerImpl(space, trans, alpha, x, y, A); } Kokkos::Profiling::popRegion(); @@ -115,30 +104,24 @@ struct GER { // We may spread out definitions (see _DEF macro below) across one or more .cpp // files. // -#define KOKKOSBLAS2_GER_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - extern template struct GER< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS2_GER_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ false, true>; -#define KOKKOSBLAS2_GER_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct GER< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS2_GER_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ false, true>; #include diff --git a/blas/impl/KokkosBlas2_serial_gemv_impl.hpp b/blas/impl/KokkosBlas2_serial_gemv_impl.hpp index 1fec8769cb..79f49fdd0e 100644 --- a/blas/impl/KokkosBlas2_serial_gemv_impl.hpp +++ b/blas/impl/KokkosBlas2_serial_gemv_impl.hpp @@ -25,13 +25,9 @@ namespace KokkosBlas { template struct SerialGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType /*alpha*/, - 
const AViewType & /*A*/, - const xViewType & /*x*/, - const ScalarType /*beta*/, - const yViewType & /*y*/); + template + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType /*alpha*/, const AViewType & /*A*/, const xViewType & /*x*/, + const ScalarType /*beta*/, const yViewType & /*y*/); }; } // namespace KokkosBlas @@ -49,27 +45,21 @@ namespace KokkosBlas { /// template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemv::invoke( - const ScalarType alpha, const AViewType &A, const xViewType &x, - const ScalarType beta, const yViewType &y) { - return Impl::SerialGemvInternal::invoke( - A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_1(), - x.data(), x.stride_0(), beta, y.data(), y.stride_0()); +template +KOKKOS_INLINE_FUNCTION int SerialGemv::invoke( + const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { + return Impl::SerialGemvInternal::invoke(A.extent(0), A.extent(1), alpha, A.data(), + A.stride_0(), A.stride_1(), x.data(), x.stride_0(), + beta, y.data(), y.stride_0()); } template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemv::invoke( - const ScalarType alpha, const AViewType &A, const xViewType &x, - const ScalarType beta, const yViewType &y) { - return Impl::SerialGemvInternal::invoke( - A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_1(), - x.data(), x.stride_0(), beta, y.data(), y.stride_0()); +template +KOKKOS_INLINE_FUNCTION int SerialGemv::invoke( + const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { + return Impl::SerialGemvInternal::invoke(A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), + A.stride_1(), x.data(), x.stride_0(), beta, y.data(), + y.stride_0()); } /// @@ -77,27 +67,21 @@ SerialGemv::invoke( /// template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemv::invoke( - const ScalarType alpha, const AViewType &A, const xViewType &x, - const ScalarType beta, const yViewType 
&y) { - return Impl::SerialGemvInternal::invoke( - A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), A.stride_0(), - x.data(), x.stride_0(), beta, y.data(), y.stride_0()); +template +KOKKOS_INLINE_FUNCTION int SerialGemv::invoke( + const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { + return Impl::SerialGemvInternal::invoke(A.extent(1), A.extent(0), alpha, A.data(), + A.stride_1(), A.stride_0(), x.data(), x.stride_0(), + beta, y.data(), y.stride_0()); } template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemv::invoke( - const ScalarType alpha, const AViewType &A, const xViewType &x, - const ScalarType beta, const yViewType &y) { - return Impl::SerialGemvInternal::invoke( - A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), A.stride_0(), - x.data(), x.stride_0(), beta, y.data(), y.stride_0()); +template +KOKKOS_INLINE_FUNCTION int SerialGemv::invoke( + const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { + return Impl::SerialGemvInternal::invoke(A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), + A.stride_0(), x.data(), x.stride_0(), beta, y.data(), + y.stride_0()); } /// @@ -105,27 +89,21 @@ SerialGemv::invoke( /// template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemv::invoke( - const ScalarType alpha, const AViewType &A, const xViewType &x, - const ScalarType beta, const yViewType &y) { - return Impl::SerialGemvInternal::invoke( - Impl::OpConj(), A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), - A.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); +template +KOKKOS_INLINE_FUNCTION int SerialGemv::invoke( + const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { + return Impl::SerialGemvInternal::invoke(Impl::OpConj(), A.extent(1), A.extent(0), alpha, + A.data(), A.stride_1(), A.stride_0(), x.data(), + x.stride_0(), beta, y.data(), 
y.stride_0()); } template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemv::invoke( - const ScalarType alpha, const AViewType &A, const xViewType &x, - const ScalarType beta, const yViewType &y) { - return Impl::SerialGemvInternal::invoke( - Impl::OpConj(), A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), - A.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); +template +KOKKOS_INLINE_FUNCTION int SerialGemv::invoke( + const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { + return Impl::SerialGemvInternal::invoke(Impl::OpConj(), A.extent(1), A.extent(0), alpha, + A.data(), A.stride_1(), A.stride_0(), x.data(), + x.stride_0(), beta, y.data(), y.stride_0()); } } // namespace KokkosBlas diff --git a/blas/impl/KokkosBlas2_serial_gemv_inner_multiple_dot.hpp b/blas/impl/KokkosBlas2_serial_gemv_inner_multiple_dot.hpp index aa7efc9122..1b70413119 100644 --- a/blas/impl/KokkosBlas2_serial_gemv_inner_multiple_dot.hpp +++ b/blas/impl/KokkosBlas2_serial_gemv_inner_multiple_dot.hpp @@ -41,24 +41,17 @@ struct InnerMultipleDotProduct { const int _as0, _as1, _xs0, _ys0; KOKKOS_INLINE_FUNCTION - InnerMultipleDotProduct(const int as0, const int as1, const int xs0, - const int ys0) + InnerMultipleDotProduct(const int as0, const int as1, const int xs0, const int ys0) : _as0(as0), _as1(as1), _xs0(xs0), _ys0(ys0) {} - template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, - const ValueAType *KOKKOS_RESTRICT A, - const ValueXType *KOKKOS_RESTRICT x, - const int n, + template + KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, + const ValueXType *KOKKOS_RESTRICT x, const int n, ValueYType *KOKKOS_RESTRICT y); - template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, - const ValueAType *KOKKOS_RESTRICT A, - const ValueXType *KOKKOS_RESTRICT x, - const int m, const int n, + template + KOKKOS_INLINE_FUNCTION int 
serial_invoke(const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, + const ValueXType *KOKKOS_RESTRICT x, const int m, const int n, ValueYType *KOKKOS_RESTRICT y); }; @@ -67,16 +60,14 @@ struct InnerMultipleDotProduct { /// ==================== template <> -template -KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<5>::serial_invoke( - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, - const ValueXType *KOKKOS_RESTRICT x, const int n, - ValueYType *KOKKOS_RESTRICT y) { +template +KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<5>::serial_invoke(const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, + const ValueXType *KOKKOS_RESTRICT x, const int n, + ValueYType *KOKKOS_RESTRICT y) { if (n <= 0) return 0; - const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, - i4 = 4 * _as0; + const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, i4 = 4 * _as0; // unroll by rows ValueYType y_0 = 0, y_1 = 0, y_2 = 0, y_3 = 0, y_4 = 0; @@ -105,12 +96,11 @@ KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<5>::serial_invoke( } template <> -template -KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<4>::serial_invoke( - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, - const ValueXType *KOKKOS_RESTRICT x, const int n, - ValueYType *KOKKOS_RESTRICT y) { +template +KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<4>::serial_invoke(const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, + const ValueXType *KOKKOS_RESTRICT x, const int n, + ValueYType *KOKKOS_RESTRICT y) { if (!n) return 0; OpA op; @@ -141,12 +131,11 @@ KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<4>::serial_invoke( } template <> -template -KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<3>::serial_invoke( - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, - const ValueXType *KOKKOS_RESTRICT x, const int n, - ValueYType *KOKKOS_RESTRICT y) { +template +KOKKOS_INLINE_FUNCTION int 
InnerMultipleDotProduct<3>::serial_invoke(const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, + const ValueXType *KOKKOS_RESTRICT x, const int n, + ValueYType *KOKKOS_RESTRICT y) { if (n <= 0) return 0; OpA op; @@ -175,12 +164,11 @@ KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<3>::serial_invoke( } template <> -template -KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<2>::serial_invoke( - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, - const ValueXType *KOKKOS_RESTRICT x, const int n, - ValueYType *KOKKOS_RESTRICT y) { +template +KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<2>::serial_invoke(const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, + const ValueXType *KOKKOS_RESTRICT x, const int n, + ValueYType *KOKKOS_RESTRICT y) { if (n <= 0) return 0; OpA op; @@ -207,12 +195,11 @@ KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<2>::serial_invoke( } template <> -template -KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<1>::serial_invoke( - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, - const ValueXType *KOKKOS_RESTRICT x, const int n, - ValueYType *KOKKOS_RESTRICT y) { +template +KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<1>::serial_invoke(const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, + const ValueXType *KOKKOS_RESTRICT x, const int n, + ValueYType *KOKKOS_RESTRICT y) { if (n <= 0) return 0; OpA op; @@ -230,12 +217,11 @@ KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<1>::serial_invoke( } template <> -template -KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<5>::serial_invoke( - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, - const ValueXType *KOKKOS_RESTRICT x, const int m, const int n, - ValueYType *KOKKOS_RESTRICT y) { +template +KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<5>::serial_invoke(const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, + const ValueXType *KOKKOS_RESTRICT x, const int m, + const int n, ValueYType 
*KOKKOS_RESTRICT y) { if (m <= 0 || n <= 0) return 0; switch (m) { case 5: { @@ -268,12 +254,11 @@ KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<5>::serial_invoke( } template <> -template -KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<4>::serial_invoke( - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, - const ValueXType *KOKKOS_RESTRICT x, const int m, const int n, - ValueYType *KOKKOS_RESTRICT y) { +template +KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<4>::serial_invoke(const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, + const ValueXType *KOKKOS_RESTRICT x, const int m, + const int n, ValueYType *KOKKOS_RESTRICT y) { if (m <= 0 || n <= 0) return 0; switch (m) { case 4: { @@ -301,13 +286,12 @@ KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<4>::serial_invoke( } template <> -template +template -KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<3>::serial_invoke( - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, - const ValueXType *KOKKOS_RESTRICT x, const int m, const int n, - ValueYType *KOKKOS_RESTRICT y) { +KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<3>::serial_invoke(const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, + const ValueXType *KOKKOS_RESTRICT x, const int m, + const int n, ValueYType *KOKKOS_RESTRICT y) { if (m <= 0 || n <= 0) return 0; switch (m) { case 3: { @@ -330,13 +314,12 @@ KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<3>::serial_invoke( } template <> -template +template -KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<2>::serial_invoke( - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, - const ValueXType *KOKKOS_RESTRICT x, const int m, const int n, - ValueYType *KOKKOS_RESTRICT y) { +KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<2>::serial_invoke(const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, + const ValueXType *KOKKOS_RESTRICT x, const int m, + const int n, ValueYType *KOKKOS_RESTRICT y) { if (m <= 0 || n <= 0) return 0; 
switch (m) { case 2: { @@ -354,13 +337,12 @@ KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<2>::serial_invoke( } template <> -template +template -KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<1>::serial_invoke( - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, - const ValueXType *KOKKOS_RESTRICT x, const int m, const int n, - ValueYType *KOKKOS_RESTRICT y) { +KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<1>::serial_invoke(const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, + const ValueXType *KOKKOS_RESTRICT x, const int m, + const int n, ValueYType *KOKKOS_RESTRICT y) { if (m <= 0 || n <= 0) return 0; switch (m) { case 1: { diff --git a/blas/impl/KokkosBlas2_serial_gemv_internal.hpp b/blas/impl/KokkosBlas2_serial_gemv_internal.hpp index 2d78102c7a..912972c7ee 100644 --- a/blas/impl/KokkosBlas2_serial_gemv_internal.hpp +++ b/blas/impl/KokkosBlas2_serial_gemv_internal.hpp @@ -31,33 +31,27 @@ namespace Impl { template struct SerialGemvInternal { - template - KOKKOS_INLINE_FUNCTION static int invoke( - OpA op, const int m, const int n, const ScalarType alpha, - const ValueAType *KOKKOS_RESTRICT A, const int as0, const int as1, - const ValueXType *KOKKOS_RESTRICT x, const int xs0, const ScalarType beta, - /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0); + template + KOKKOS_INLINE_FUNCTION static int invoke(OpA op, const int m, const int n, const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, const int as0, const int as1, + const ValueXType *KOKKOS_RESTRICT x, const int xs0, const ScalarType beta, + /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0); // default OpA = OpID - template - KOKKOS_INLINE_FUNCTION static int invoke( - const int m, const int n, const ScalarType alpha, - const ValueAType *KOKKOS_RESTRICT A, const int as0, const int as1, - const ValueXType *KOKKOS_RESTRICT x, const int xs0, const ScalarType beta, - /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0) { + template + KOKKOS_INLINE_FUNCTION static int 
invoke(const int m, const int n, const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, const int as0, const int as1, + const ValueXType *KOKKOS_RESTRICT x, const int xs0, const ScalarType beta, + /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0) { return invoke(OpID(), m, n, alpha, A, as0, as1, x, xs0, beta, y, ys0); } }; template <> -template +template KOKKOS_INLINE_FUNCTION int SerialGemvInternal::invoke( - OpA op, const int m, const int n, const ScalarType alpha, - const ValueAType *KOKKOS_RESTRICT A, const int as0, const int as1, - const ValueXType *KOKKOS_RESTRICT x, const int xs0, const ScalarType beta, + OpA op, const int m, const int n, const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, const int as0, + const int as1, const ValueXType *KOKKOS_RESTRICT x, const int xs0, const ScalarType beta, /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0) { const ScalarType one(1.0), zero(0.0); @@ -91,12 +85,10 @@ KOKKOS_INLINE_FUNCTION int SerialGemvInternal::invoke( } template <> -template +template KOKKOS_INLINE_FUNCTION int SerialGemvInternal::invoke( - OpA /* op */, const int m, const int n, const ScalarType alpha, - const ValueAType *KOKKOS_RESTRICT A, const int as0, const int as1, - const ValueXType *KOKKOS_RESTRICT x, const int xs0, const ScalarType beta, + OpA /* op */, const int m, const int n, const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, const int as0, + const int as1, const ValueXType *KOKKOS_RESTRICT x, const int xs0, const ScalarType beta, /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0) { const ScalarType one(1.0), zero(0.0); @@ -116,8 +108,7 @@ KOKKOS_INLINE_FUNCTION int SerialGemvInternal::invoke( Impl::InnerMultipleDotProduct inner(as0, as1, xs0, ys0); const int mb = mbAlgo; for (int i = 0; i < m; i += mb) - inner.serial_invoke(alpha, A + i * as0, x, - (i + mb) > m ? (m - i) : mb, n, y + i * ys0); + inner.serial_invoke(alpha, A + i * as0, x, (i + mb) > m ? 
(m - i) : mb, n, y + i * ys0); } return 0; } diff --git a/blas/impl/KokkosBlas2_syr2_impl.hpp b/blas/impl/KokkosBlas2_syr2_impl.hpp index 69284e9547..7bcb0069ab 100644 --- a/blas/impl/KokkosBlas2_syr2_impl.hpp +++ b/blas/impl/KokkosBlas2_syr2_impl.hpp @@ -27,16 +27,14 @@ namespace Impl { // Functor for the thread parallel version of SYR2. // This functor parallelizes over rows of the input matrix A. -template +template struct ThreadParallelSYR2 { using AlphaCoeffType = typename AViewType::non_const_value_type; using XComponentType = typename XViewType::non_const_value_type; using YComponentType = typename YViewType::non_const_value_type; using AComponentType = typename AViewType::non_const_value_type; - ThreadParallelSYR2(const AlphaCoeffType& alpha, const XViewType& x, - const YViewType& y, const AViewType& A) + ThreadParallelSYR2(const AlphaCoeffType& alpha, const XViewType& x, const YViewType& y, const AViewType& A) : alpha_(alpha), x_(x), y_(y), A_(A) { // Nothing to do } @@ -55,16 +53,14 @@ struct ThreadParallelSYR2 { if constexpr (tJustTranspose) { if (x_fixed != Kokkos::ArithTraits::zero()) { for (IndexType j = 0; j < N; ++j) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { A_(i, j) += AComponentType(alpha_ * x_fixed * y_(j)); } } } if (y_fixed != Kokkos::ArithTraits::zero()) { for (IndexType j = 0; j < N; ++j) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { A_(i, j) += AComponentType(alpha_ * y_fixed * x_(j)); } } @@ -72,21 +68,16 @@ struct ThreadParallelSYR2 { } else { if (x_fixed != Kokkos::ArithTraits::zero()) { for (IndexType j = 0; j < N; ++j) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { - A_(i, j) += AComponentType( - alpha_ * x_fixed * - Kokkos::ArithTraits::conj(y_(j))); + if (((tJustUp == true) && 
(i <= j)) || ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_fixed * Kokkos::ArithTraits::conj(y_(j))); } } } if (y_fixed != Kokkos::ArithTraits::zero()) { for (IndexType j = 0; j < N; ++j) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { - A_(i, j) += AComponentType( - Kokkos::ArithTraits::conj(alpha_) * y_fixed * - Kokkos::ArithTraits::conj(x_(j))); + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(Kokkos::ArithTraits::conj(alpha_) * y_fixed * + Kokkos::ArithTraits::conj(x_(j))); } } } @@ -102,14 +93,11 @@ struct ThreadParallelSYR2 { }; // Thread parallel version of SYR2. -template -void threadParallelSyr2(const ExecutionSpace& space, - const typename AViewType::const_value_type& alpha, - const XViewType& x, const YViewType& y, - const AViewType& A) { - static_assert(std::is_integral::value, - "IndexType must be an integer"); +template +void threadParallelSyr2(const ExecutionSpace& space, const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, const AViewType& A) { + static_assert(std::is_integral::value, "IndexType must be an integer"); using AlphaCoeffType = typename AViewType::non_const_value_type; @@ -120,13 +108,9 @@ void threadParallelSyr2(const ExecutionSpace& space, } else if (alpha == Kokkos::ArithTraits::zero()) { // no entries to update } else { - Kokkos::RangePolicy rangePolicy(space, 0, - A.extent(0)); - ThreadParallelSYR2 - functor(alpha, x, y, A); - Kokkos::parallel_for("KokkosBlas::syr2[threadParallel]", rangePolicy, - functor); + Kokkos::RangePolicy rangePolicy(space, 0, A.extent(0)); + ThreadParallelSYR2 functor(alpha, x, y, A); + Kokkos::parallel_for("KokkosBlas::syr2[threadParallel]", rangePolicy, functor); } } @@ -137,8 +121,8 @@ struct TeamParallelSYR2_LayoutRightTag {}; // Functor for the team parallel version of SYR2, designed for // performance on GPUs. 
The kernel depends on the layout of A. -template +template struct TeamParallelSYR2 { using AlphaCoeffType = typename AViewType::non_const_value_type; using XComponentType = typename XViewType::non_const_value_type; @@ -148,16 +132,14 @@ struct TeamParallelSYR2 { using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; - TeamParallelSYR2(const AlphaCoeffType& alpha, const XViewType& x, - const YViewType& y, const AViewType& A) + TeamParallelSYR2(const AlphaCoeffType& alpha, const XViewType& x, const YViewType& y, const AViewType& A) : alpha_(alpha), x_(x), y_(y), A_(A) { // Nothing to do } public: // LayoutLeft version: one team per column - KOKKOS_INLINE_FUNCTION void operator()(TeamParallelSYR2_LayoutLeftTag, - const member_type& team) const { + KOKKOS_INLINE_FUNCTION void operator()(TeamParallelSYR2_LayoutLeftTag, const member_type& team) const { if (alpha_ == Kokkos::ArithTraits::zero()) { // Nothing to do } else { @@ -171,47 +153,35 @@ struct TeamParallelSYR2 { const XComponentType x_fixed(x_(j)); const YComponentType y_fixed(y_(j)); if (y_fixed != Kokkos::ArithTraits::zero()) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { - A_(i, j) += AComponentType(alpha_ * x_(i) * y_fixed); - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_(i) * y_fixed); + } + }); } if (x_fixed != Kokkos::ArithTraits::zero()) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { - A_(i, j) += AComponentType(alpha_ * y_(i) * x_fixed); - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + if (((tJustUp == true) && (i <= j)) || 
((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * y_(i) * x_fixed); + } + }); } } else { - const XComponentType x_fixed( - Kokkos::ArithTraits::conj(x_(j))); - const YComponentType y_fixed( - Kokkos::ArithTraits::conj(y_(j))); + const XComponentType x_fixed(Kokkos::ArithTraits::conj(x_(j))); + const YComponentType y_fixed(Kokkos::ArithTraits::conj(y_(j))); if (y_fixed != Kokkos::ArithTraits::zero()) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { - A_(i, j) += AComponentType(alpha_ * x_(i) * y_fixed); - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_(i) * y_fixed); + } + }); } if (x_fixed != Kokkos::ArithTraits::zero()) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { - A_(i, j) += AComponentType( - Kokkos::ArithTraits::conj(alpha_) * - y_(i) * x_fixed); - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(Kokkos::ArithTraits::conj(alpha_) * y_(i) * x_fixed); + } + }); } } } @@ -219,8 +189,7 @@ struct TeamParallelSYR2 { } // LayoutRight version: one team per row - KOKKOS_INLINE_FUNCTION void operator()(TeamParallelSYR2_LayoutRightTag, - const member_type& team) const { + KOKKOS_INLINE_FUNCTION void operator()(TeamParallelSYR2_LayoutRightTag, const member_type& team) const { if (alpha_ == Kokkos::ArithTraits::zero()) { // Nothing to do } else { @@ -234,46 +203,34 @@ struct TeamParallelSYR2 { const YComponentType y_fixed(y_(i)); if constexpr (tJustTranspose) { if (x_fixed != Kokkos::ArithTraits::zero()) { - 
Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { - A_(i, j) += AComponentType(alpha_ * x_fixed * y_(j)); - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_fixed * y_(j)); + } + }); } if (y_fixed != Kokkos::ArithTraits::zero()) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { - A_(i, j) += AComponentType(alpha_ * y_fixed * x_(j)); - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * y_fixed * x_(j)); + } + }); } } else { if (x_fixed != Kokkos::ArithTraits::zero()) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { - A_(i, j) += AComponentType( - alpha_ * x_fixed * - Kokkos::ArithTraits::conj(y_(j))); - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_fixed * Kokkos::ArithTraits::conj(y_(j))); + } + }); } if (y_fixed != Kokkos::ArithTraits::zero()) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { - A_(i, j) += AComponentType( - Kokkos::ArithTraits::conj(alpha_) * - y_fixed * - Kokkos::ArithTraits::conj(x_(j))); - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + if (((tJustUp == true) && (i <= j)) || 
((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(Kokkos::ArithTraits::conj(alpha_) * y_fixed * + Kokkos::ArithTraits::conj(x_(j))); + } + }); } } } @@ -288,14 +245,11 @@ struct TeamParallelSYR2 { }; // Team parallel version of SYR2. -template -void teamParallelSyr2(const ExecutionSpace& space, - const typename AViewType::const_value_type& alpha, - const XViewType& x, const YViewType& y, - const AViewType& A) { - static_assert(std::is_integral::value, - "IndexType must be an integer"); +template +void teamParallelSyr2(const ExecutionSpace& space, const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, const AViewType& A) { + static_assert(std::is_integral::value, "IndexType must be an integer"); using AlphaCoeffType = typename AViewType::non_const_value_type; @@ -310,11 +264,9 @@ void teamParallelSyr2(const ExecutionSpace& space, return; } - constexpr bool isLayoutLeft = - std::is_same::value; + constexpr bool isLayoutLeft = std::is_same::value; using layout_tag = - typename std::conditional::type; + typename std::conditional::type; using TeamPolicyType = Kokkos::TeamPolicy; TeamPolicyType teamPolicy; if (isLayoutLeft) { @@ -325,9 +277,8 @@ void teamParallelSyr2(const ExecutionSpace& space, teamPolicy = TeamPolicyType(space, A.extent(0), Kokkos::AUTO); } - TeamParallelSYR2 - functor(alpha, x, y, A); + TeamParallelSYR2 functor( + alpha, x, y, A); Kokkos::parallel_for("KokkosBlas::syr2[teamParallel]", teamPolicy, functor); } @@ -339,28 +290,22 @@ void teamParallelSyr2(const ExecutionSpace& space, // // The 'enable_if' makes sure unused kernels are not instantiated. 
-template ()>::type* = nullptr> -void generalSyr2Impl(const ExecutionSpace& space, - const typename AViewType::const_value_type& alpha, - const XViewType& x, const YViewType& y, - const AViewType& A) { - threadParallelSyr2(space, alpha, x, y, A); +template ()>::type* = nullptr> +void generalSyr2Impl(const ExecutionSpace& space, const typename AViewType::const_value_type& alpha, const XViewType& x, + const YViewType& y, const AViewType& A) { + threadParallelSyr2(space, alpha, + x, y, A); } -template ()>::type* = nullptr> -void generalSyr2Impl(const ExecutionSpace& space, - const typename AViewType::const_value_type& alpha, - const XViewType& x, const YViewType& y, - const AViewType& A) { - teamParallelSyr2(space, alpha, x, y, A); +template ()>::type* = nullptr> +void generalSyr2Impl(const ExecutionSpace& space, const typename AViewType::const_value_type& alpha, const XViewType& x, + const YViewType& y, const AViewType& A) { + teamParallelSyr2(space, alpha, x, + y, A); } } // namespace Impl diff --git a/blas/impl/KokkosBlas2_syr2_spec.hpp b/blas/impl/KokkosBlas2_syr2_spec.hpp index 01637ba1d4..a8ae741ede 100644 --- a/blas/impl/KokkosBlas2_syr2_spec.hpp +++ b/blas/impl/KokkosBlas2_syr2_spec.hpp @@ -40,19 +40,16 @@ struct syr2_eti_spec_avail { // specializations go in this header file. We may spread out definitions (see // _INST macro below) across one or more .cpp files. 
// -#define KOKKOSBLAS2_SYR2_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct syr2_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_SYR2_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct syr2_eti_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -67,22 +64,17 @@ namespace Impl { // // Implementation of KokkosBlas::syr2. -template < - class ExecutionSpace, class XViewType, class YViewType, class AViewType, - bool tpl_spec_avail = syr2_tpl_spec_avail::value, - bool eti_spec_avail = syr2_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = syr2_eti_spec_avail::value> struct SYR2 { - static void syr2(const ExecutionSpace& space, const char trans[], - const char uplo[], - const typename AViewType::const_value_type& alpha, - const XViewType& x, const YViewType& y, const AViewType& A) + static void syr2(const ExecutionSpace& space, const char trans[], const char uplo[], + const typename AViewType::const_value_type& alpha, const XViewType& x, const YViewType& y, + const AViewType& A) #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY { - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::syr2[ETI]" - : "KokkosBlas::syr2[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::syr2[ETI]" + : "KokkosBlas::syr2[noETI]"); typedef typename AViewType::size_type size_type; const size_type numRows = A.extent(0); @@ -92,41 +84,33 @@ struct SYR2 { bool justUp = (uplo[0] == 'U') || (uplo[0] == 'u'); // Prefer int as the index type, but use a larsyr2 type if needed. 
- if ((numRows < static_cast(INT_MAX)) && - (numCols < static_cast(INT_MAX))) { + if ((numRows < static_cast(INT_MAX)) && (numCols < static_cast(INT_MAX))) { if (justTranspose) { if (justUp) { - generalSyr2Impl(space, alpha, x, y, A); + generalSyr2Impl(space, alpha, x, y, A); } else { - generalSyr2Impl(space, alpha, x, y, A); + generalSyr2Impl(space, alpha, x, y, A); } } else { if (justUp) { - generalSyr2Impl(space, alpha, x, y, A); + generalSyr2Impl(space, alpha, x, y, A); } else { - generalSyr2Impl(space, alpha, x, y, A); + generalSyr2Impl(space, alpha, x, y, A); } } } else { if (justTranspose) { if (justUp) { - generalSyr2Impl(space, alpha, x, y, A); + generalSyr2Impl(space, alpha, x, y, A); } else { - generalSyr2Impl(space, alpha, x, y, A); + generalSyr2Impl(space, alpha, x, y, A); } } else { if (justUp) { - generalSyr2Impl(space, alpha, x, y, A); + generalSyr2Impl(space, alpha, x, y, A); } else { - generalSyr2Impl(space, alpha, x, y, A); + generalSyr2Impl(space, alpha, x, y, + A); } } } @@ -149,30 +133,24 @@ struct SYR2 { // We may spread out definitions (see _DEF macro below) across one or more .cpp // files. 
// -#define KOKKOSBLAS2_SYR2_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - extern template struct SYR2< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS2_SYR2_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ false, true>; -#define KOKKOSBLAS2_SYR2_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct SYR2< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS2_SYR2_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ false, true>; #include diff --git a/blas/impl/KokkosBlas2_syr_impl.hpp b/blas/impl/KokkosBlas2_syr_impl.hpp index 685ca75997..7685fd4b4b 100644 --- a/blas/impl/KokkosBlas2_syr_impl.hpp +++ b/blas/impl/KokkosBlas2_syr_impl.hpp @@ -27,16 +27,13 @@ namespace Impl { // Functor for the thread parallel version of SYR. // This functor parallelizes over rows of the input matrix A. 
-template +template struct ThreadParallelSYR { using AlphaCoeffType = typename AViewType::non_const_value_type; using XComponentType = typename XViewType::non_const_value_type; using AComponentType = typename AViewType::non_const_value_type; - ThreadParallelSYR(const AlphaCoeffType& alpha, const XViewType& x, - const AViewType& A) - : alpha_(alpha), x_(x), A_(A) { + ThreadParallelSYR(const AlphaCoeffType& alpha, const XViewType& x, const AViewType& A) : alpha_(alpha), x_(x), A_(A) { // Nothing to do } @@ -50,18 +47,14 @@ struct ThreadParallelSYR { if constexpr (tJustTranspose) { for (IndexType j = 0; j < N; ++j) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { A_(i, j) += AComponentType(alpha_ * x_fixed * x_(j)); } } } else { for (IndexType j = 0; j < N; ++j) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { - A_(i, j) += AComponentType( - alpha_ * x_fixed * - Kokkos::ArithTraits::conj(x_(j))); + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_fixed * Kokkos::ArithTraits::conj(x_(j))); } } } @@ -75,13 +68,10 @@ struct ThreadParallelSYR { }; // Thread parallel version of SYR. 
-template -void threadParallelSyr(const ExecutionSpace& space, - const typename AViewType::const_value_type& alpha, +template +void threadParallelSyr(const ExecutionSpace& space, const typename AViewType::const_value_type& alpha, const XViewType& x, const AViewType& A) { - static_assert(std::is_integral::value, - "IndexType must be an integer"); + static_assert(std::is_integral::value, "IndexType must be an integer"); using AlphaCoeffType = typename AViewType::non_const_value_type; @@ -90,12 +80,9 @@ void threadParallelSyr(const ExecutionSpace& space, } else if (alpha == Kokkos::ArithTraits::zero()) { // no entries to update } else { - Kokkos::RangePolicy rangePolicy(space, 0, - A.extent(0)); - ThreadParallelSYR - functor(alpha, x, A); - Kokkos::parallel_for("KokkosBlas::syr[threadParallel]", rangePolicy, - functor); + Kokkos::RangePolicy rangePolicy(space, 0, A.extent(0)); + ThreadParallelSYR functor(alpha, x, A); + Kokkos::parallel_for("KokkosBlas::syr[threadParallel]", rangePolicy, functor); } } @@ -106,8 +93,7 @@ struct TeamParallelSYR_LayoutRightTag {}; // Functor for the team parallel version of SYR, designed for // performance on GPUs. The kernel depends on the layout of A. 
-template +template struct TeamParallelSYR { using AlphaCoeffType = typename AViewType::non_const_value_type; using XComponentType = typename XViewType::non_const_value_type; @@ -116,16 +102,13 @@ struct TeamParallelSYR { using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; - TeamParallelSYR(const AlphaCoeffType& alpha, const XViewType& x, - const AViewType& A) - : alpha_(alpha), x_(x), A_(A) { + TeamParallelSYR(const AlphaCoeffType& alpha, const XViewType& x, const AViewType& A) : alpha_(alpha), x_(x), A_(A) { // Nothing to do } public: // LayoutLeft version: one team per column - KOKKOS_INLINE_FUNCTION void operator()(TeamParallelSYR_LayoutLeftTag, - const member_type& team) const { + KOKKOS_INLINE_FUNCTION void operator()(TeamParallelSYR_LayoutLeftTag, const member_type& team) const { // Condition 'alpha_ == zero' has already been checked const IndexType j(team.league_rank()); if (x_(j) == Kokkos::ArithTraits::zero()) { @@ -134,30 +117,24 @@ struct TeamParallelSYR { const IndexType M(A_.extent(0)); if constexpr (tJustTranspose) { const XComponentType x_fixed(x_(j)); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { - A_(i, j) += AComponentType(alpha_ * x_(i) * x_fixed); - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_(i) * x_fixed); + } + }); } else { - const XComponentType x_fixed( - Kokkos::ArithTraits::conj(x_(j))); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { - A_(i, j) += AComponentType(alpha_ * x_(i) * x_fixed); - } - }); + const XComponentType x_fixed(Kokkos::ArithTraits::conj(x_(j))); + 
Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_(i) * x_fixed); + } + }); } } } // LayoutRight version: one team per row - KOKKOS_INLINE_FUNCTION void operator()(TeamParallelSYR_LayoutRightTag, - const member_type& team) const { + KOKKOS_INLINE_FUNCTION void operator()(TeamParallelSYR_LayoutRightTag, const member_type& team) const { // Condition 'alpha_ == zero' has already been checked const IndexType i(team.league_rank()); if (x_(i) == Kokkos::ArithTraits::zero()) { @@ -166,23 +143,17 @@ struct TeamParallelSYR { const IndexType N(A_.extent(1)); const XComponentType x_fixed(x_(i)); if constexpr (tJustTranspose) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { - A_(i, j) += AComponentType(alpha_ * x_fixed * x_(j)); - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_fixed * x_(j)); + } + }); } else { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { - A_(i, j) += AComponentType( - alpha_ * x_fixed * - Kokkos::ArithTraits::conj(x_(j))); - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_fixed * Kokkos::ArithTraits::conj(x_(j))); + } + }); } } } @@ -194,13 +165,10 @@ struct TeamParallelSYR { }; // Team parallel version of SYR. 
-template -void teamParallelSyr(const ExecutionSpace& space, - const typename AViewType::const_value_type& alpha, - const XViewType& x, const AViewType& A) { - static_assert(std::is_integral::value, - "IndexType must be an integer"); +template +void teamParallelSyr(const ExecutionSpace& space, const typename AViewType::const_value_type& alpha, const XViewType& x, + const AViewType& A) { + static_assert(std::is_integral::value, "IndexType must be an integer"); using AlphaCoeffType = typename AViewType::non_const_value_type; @@ -212,11 +180,9 @@ void teamParallelSyr(const ExecutionSpace& space, return; } - constexpr bool isLayoutLeft = - std::is_same_v; + constexpr bool isLayoutLeft = std::is_same_v; using layout_tag = - typename std::conditional::type; + typename std::conditional::type; using TeamPolicyType = Kokkos::TeamPolicy; TeamPolicyType teamPolicy; if (isLayoutLeft) { @@ -227,9 +193,7 @@ void teamParallelSyr(const ExecutionSpace& space, teamPolicy = TeamPolicyType(space, A.extent(0), Kokkos::AUTO); } - TeamParallelSYR - functor(alpha, x, A); + TeamParallelSYR functor(alpha, x, A); Kokkos::parallel_for("KokkosBlas::syr[teamParallel]", teamPolicy, functor); } @@ -241,26 +205,18 @@ void teamParallelSyr(const ExecutionSpace& space, // // The 'enable_if' makes sure unused kernels are not instantiated. 
-template ()>::type* = nullptr> -void generalSyrImpl(const ExecutionSpace& space, - const typename AViewType::const_value_type& alpha, - const XViewType& x, const AViewType& A) { - threadParallelSyr(space, alpha, x, A); +template ()>::type* = nullptr> +void generalSyrImpl(const ExecutionSpace& space, const typename AViewType::const_value_type& alpha, const XViewType& x, + const AViewType& A) { + threadParallelSyr(space, alpha, x, A); } -template ()>::type* = nullptr> -void generalSyrImpl(const ExecutionSpace& space, - const typename AViewType::const_value_type& alpha, - const XViewType& x, const AViewType& A) { - teamParallelSyr(space, alpha, x, A); +template ()>::type* = nullptr> +void generalSyrImpl(const ExecutionSpace& space, const typename AViewType::const_value_type& alpha, const XViewType& x, + const AViewType& A) { + teamParallelSyr(space, alpha, x, A); } } // namespace Impl diff --git a/blas/impl/KokkosBlas2_syr_spec.hpp b/blas/impl/KokkosBlas2_syr_spec.hpp index b07c3a1446..58c7753618 100644 --- a/blas/impl/KokkosBlas2_syr_spec.hpp +++ b/blas/impl/KokkosBlas2_syr_spec.hpp @@ -40,16 +40,14 @@ struct syr_eti_spec_avail { // specializations go in this header file. We may spread out definitions (see // _INST macro below) across one or more .cpp files. // -#define KOKKOSBLAS2_SYR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct syr_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_SYR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct syr_eti_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -65,20 +63,15 @@ namespace Impl { // Implementation of KokkosBlas::syr. 
template ::value, - bool eti_spec_avail = - syr_eti_spec_avail::value> + bool tpl_spec_avail = syr_tpl_spec_avail::value, + bool eti_spec_avail = syr_eti_spec_avail::value> struct SYR { - static void syr(const ExecutionSpace& space, const char trans[], - const char uplo[], - const typename AViewType::const_value_type& alpha, - const XViewType& x, const AViewType& A) + static void syr(const ExecutionSpace& space, const char trans[], const char uplo[], + const typename AViewType::const_value_type& alpha, const XViewType& x, const AViewType& A) #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY { - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::syr[ETI]" - : "KokkosBlas::syr[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::syr[ETI]" + : "KokkosBlas::syr[noETI]"); typedef typename AViewType::size_type size_type; const size_type numRows = A.extent(0); @@ -88,41 +81,32 @@ struct SYR { bool justUp = (uplo[0] == 'U') || (uplo[0] == 'u'); // Prefer int as the index type, but use a larsyr type if needed. 
- if ((numRows < static_cast(INT_MAX)) && - (numCols < static_cast(INT_MAX))) { + if ((numRows < static_cast(INT_MAX)) && (numCols < static_cast(INT_MAX))) { if (justTranspose) { if (justUp) { - generalSyrImpl( - space, alpha, x, A); + generalSyrImpl(space, alpha, x, A); } else { - generalSyrImpl(space, alpha, x, A); + generalSyrImpl(space, alpha, x, A); } } else { if (justUp) { - generalSyrImpl(space, alpha, x, A); + generalSyrImpl(space, alpha, x, A); } else { - generalSyrImpl(space, alpha, x, A); + generalSyrImpl(space, alpha, x, A); } } } else { if (justTranspose) { if (justUp) { - generalSyrImpl(space, alpha, x, A); + generalSyrImpl(space, alpha, x, A); } else { - generalSyrImpl(space, alpha, x, A); + generalSyrImpl(space, alpha, x, A); } } else { if (justUp) { - generalSyrImpl(space, alpha, x, A); + generalSyrImpl(space, alpha, x, A); } else { - generalSyrImpl(space, alpha, x, A); + generalSyrImpl(space, alpha, x, A); } } } @@ -145,24 +129,20 @@ struct SYR { // We may spread out definitions (see _DEF macro below) across one or more .cpp // files. 
// -#define KOKKOSBLAS2_SYR_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - extern template struct SYR< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS2_SYR_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ false, true>; -#define KOKKOSBLAS2_SYR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct SYR< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS2_SYR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ false, true>; #include diff --git a/blas/impl/KokkosBlas2_team_gemv_impl.hpp b/blas/impl/KokkosBlas2_team_gemv_impl.hpp index 5e43cae7d4..19e2bde931 100644 --- a/blas/impl/KokkosBlas2_team_gemv_impl.hpp +++ b/blas/impl/KokkosBlas2_team_gemv_impl.hpp @@ -26,51 +26,41 @@ namespace Impl { template struct TeamGemvInternal { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, OpA op, const int m, const int n, - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, - const int as0, const int as1, const ValueXType *KOKKOS_RESTRICT x, - const int xs0, const ScalarType beta, - /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, OpA op, const int m, const int n, + const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, const int as0, + const int as1, const ValueXType *KOKKOS_RESTRICT x, const int xs0, + const ScalarType beta, + /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0); // default OpA = OpID - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int m, 
const int n, - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, - const int as0, const int as1, const ValueXType *KOKKOS_RESTRICT x, - const int xs0, const ScalarType beta, - /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0) { - return invoke(member, OpID{}, m, n, alpha, A, as0, as1, x, xs0, beta, y, - ys0); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, const int as0, const int as1, + const ValueXType *KOKKOS_RESTRICT x, const int xs0, const ScalarType beta, + /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0) { + return invoke(member, OpID{}, m, n, alpha, A, as0, as1, x, xs0, beta, y, ys0); } }; template struct TeamVectorGemvInternal { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, OpA op, const int m, const int n, - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, - const int as0, const int as1, const ValueXType *KOKKOS_RESTRICT x, - const int xs0, const ScalarType beta, - /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, OpA op, const int m, const int n, + const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, const int as0, + const int as1, const ValueXType *KOKKOS_RESTRICT x, const int xs0, + const ScalarType beta, + /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0); // default OpA = OpID - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const int n, - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, - const int as0, const int as1, const ValueXType *KOKKOS_RESTRICT x, - const int xs0, const ScalarType beta, - /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0) { - return invoke(member, OpID{}, m, n, alpha, A, as0, as1, x, xs0, beta, y, - ys0); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, 
const int n, const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, const int as0, const int as1, + const ValueXType *KOKKOS_RESTRICT x, const int xs0, const ScalarType beta, + /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0) { + return invoke(member, OpID{}, m, n, alpha, A, as0, as1, x, xs0, beta, y, ys0); } }; @@ -79,13 +69,12 @@ struct TeamVectorGemvInternal { /// ==================== template <> -template +template KOKKOS_INLINE_FUNCTION int TeamGemvInternal::invoke( - const MemberType &member, OpA op, const int m, const int n, - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, const int as0, - const int as1, const ValueXType *KOKKOS_RESTRICT x, const int xs0, - const ScalarType beta, + const MemberType &member, OpA op, const int m, const int n, const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, const int as0, const int as1, const ValueXType *KOKKOS_RESTRICT x, + const int xs0, const ScalarType beta, /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0) { const ScalarType one(1.0), zero(0.0); @@ -102,29 +91,26 @@ KOKKOS_INLINE_FUNCTION int TeamGemvInternal::invoke( if (beta != one) member.team_barrier(); - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, m), - [&](const int &i) { - ValueYType t(0); - const ValueAType *KOKKOS_RESTRICT tA = (A + i * as0); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, m), [&](const int &i) { + ValueYType t(0); + const ValueAType *KOKKOS_RESTRICT tA = (A + i * as0); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif - for (int j = 0; j < n; ++j) - t += op(tA[j * as1]) * x[j * xs0]; - y[i * ys0] += alpha * t; - }); + for (int j = 0; j < n; ++j) t += op(tA[j * as1]) * x[j * xs0]; + y[i * ys0] += alpha * t; + }); } return 0; } template <> -template +template KOKKOS_INLINE_FUNCTION int TeamGemvInternal::invoke( - const MemberType &member, OpA /* op */, const int m, const int n, - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, const int as0, - const int as1, 
const ValueXType *KOKKOS_RESTRICT x, const int xs0, - const ScalarType beta, + const MemberType &member, OpA /* op */, const int m, const int n, const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, const int as0, const int as1, const ValueXType *KOKKOS_RESTRICT x, + const int xs0, const ScalarType beta, /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0) { const ScalarType one(1.0), zero(0.0); @@ -149,13 +135,10 @@ KOKKOS_INLINE_FUNCTION int TeamGemvInternal::invoke( // Made this non-const in order to WORKAROUND issue #349 int mb = mb_a < mb_b ? mb_a : mb_b, mp = m % mb; - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, (m / mb) + (mp > 0)), - [&](const int &ii) { - const int i = ii * mb; - inner.serial_invoke(alpha, A + i * as0, x, - (i + mb) > m ? (m - i) : mb, - n, y + i * ys0); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, (m / mb) + (mp > 0)), [&](const int &ii) { + const int i = ii * mb; + inner.serial_invoke(alpha, A + i * as0, x, (i + mb) > m ? (m - i) : mb, n, y + i * ys0); + }); member.team_barrier(); } @@ -167,14 +150,12 @@ KOKKOS_INLINE_FUNCTION int TeamGemvInternal::invoke( /// ==================== template <> -template -KOKKOS_INLINE_FUNCTION int -TeamVectorGemvInternal::invoke( - const MemberType &member, OpA op, const int m, const int n, - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, const int as0, - const int as1, const ValueXType *KOKKOS_RESTRICT x, const int xs0, - const ScalarType beta, +template +KOKKOS_INLINE_FUNCTION int TeamVectorGemvInternal::invoke( + const MemberType &member, OpA op, const int m, const int n, const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, const int as0, const int as1, const ValueXType *KOKKOS_RESTRICT x, + const int xs0, const ScalarType beta, /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0) { const ScalarType one(1.0), zero(0.0); @@ -196,12 +177,8 @@ TeamVectorGemvInternal::invoke( const ValueAType *KOKKOS_RESTRICT tA = (A + i * as0); Kokkos::parallel_reduce( 
Kokkos::ThreadVectorRange(member, n), - [&](const int &j, ValueYType &update) { - update += op(tA[j * as1]) * x[j * xs0]; - }, - t); - Kokkos::single(Kokkos::PerThread(member), - [&]() { y[i * ys0] += alpha * t; }); + [&](const int &j, ValueYType &update) { update += op(tA[j * as1]) * x[j * xs0]; }, t); + Kokkos::single(Kokkos::PerThread(member), [&]() { y[i * ys0] += alpha * t; }); }); } return 0; diff --git a/blas/impl/KokkosBlas2_team_gemv_spec.hpp b/blas/impl/KokkosBlas2_team_gemv_spec.hpp index d46fb7be6f..c3cf43b743 100644 --- a/blas/impl/KokkosBlas2_team_gemv_spec.hpp +++ b/blas/impl/KokkosBlas2_team_gemv_spec.hpp @@ -25,28 +25,19 @@ namespace KokkosBlas { -template +template struct TeamGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& /*member*/, - const ScalarType /*alpha*/, - const AViewType& /*A*/, - const xViewType& /*x*/, - const ScalarType /*beta*/, + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& /*member*/, const ScalarType /*alpha*/, + const AViewType& /*A*/, const xViewType& /*x*/, const ScalarType /*beta*/, const yViewType& /*y*/); }; template struct TeamVectorGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& /*member*/, - const ScalarType /*alpha*/, - const AViewType& /*A*/, - const xViewType& /*x*/, - const ScalarType /*beta*/, + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& /*member*/, const ScalarType /*alpha*/, + const AViewType& /*A*/, const xViewType& /*x*/, const ScalarType /*beta*/, const yViewType& /*y*/); }; @@ -56,31 +47,25 @@ struct TeamVectorGemv { template struct TeamGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const ScalarType alpha, const AViewType& A, - const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::rank == 2, - "KokkosBlas::TeamGemv requires rank-2 A matrix"); - return Impl::TeamGemvInternal::invoke( - member, A.extent(0), A.extent(1), 
alpha, A.data(), A.stride_0(), - A.stride_1(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const ScalarType alpha, const AViewType& A, + const xViewType& x, const ScalarType beta, const yViewType& y) { + static_assert(AViewType::rank == 2, "KokkosBlas::TeamGemv requires rank-2 A matrix"); + return Impl::TeamGemvInternal::invoke(member, A.extent(0), A.extent(1), alpha, A.data(), + A.stride_0(), A.stride_1(), x.data(), x.stride_0(), + beta, y.data(), y.stride_0()); } }; template struct TeamGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const ScalarType alpha, const AViewType& A, - const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::rank == 2, - "KokkosBlas::TeamGemv requires rank-2 A matrix"); - return Impl::TeamGemvInternal::invoke( - member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), - A.stride_1(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const ScalarType alpha, const AViewType& A, + const xViewType& x, const ScalarType beta, const yViewType& y) { + static_assert(AViewType::rank == 2, "KokkosBlas::TeamGemv requires rank-2 A matrix"); + return Impl::TeamGemvInternal::invoke(member, A.extent(0), A.extent(1), alpha, A.data(), + A.stride_0(), A.stride_1(), x.data(), x.stride_0(), beta, + y.data(), y.stride_0()); } }; @@ -90,31 +75,25 @@ struct TeamGemv { template struct TeamGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const ScalarType alpha, const AViewType& A, - const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::rank == 2, - "BLAS TeamGemv requires rank-2 A matrix"); - return Impl::TeamGemvInternal::invoke( - member, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), - A.stride_0(), x.data(), 
x.stride_0(), beta, y.data(), y.stride_0()); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const ScalarType alpha, const AViewType& A, + const xViewType& x, const ScalarType beta, const yViewType& y) { + static_assert(AViewType::rank == 2, "BLAS TeamGemv requires rank-2 A matrix"); + return Impl::TeamGemvInternal::invoke(member, A.extent(1), A.extent(0), alpha, A.data(), + A.stride_1(), A.stride_0(), x.data(), x.stride_0(), + beta, y.data(), y.stride_0()); } }; template struct TeamGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const ScalarType alpha, const AViewType& A, - const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::rank == 2, - "BLAS TeamGemv requires rank-2 A matrix"); - return Impl::TeamGemvInternal::invoke( - member, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), - A.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const ScalarType alpha, const AViewType& A, + const xViewType& x, const ScalarType beta, const yViewType& y) { + static_assert(AViewType::rank == 2, "BLAS TeamGemv requires rank-2 A matrix"); + return Impl::TeamGemvInternal::invoke(member, A.extent(1), A.extent(0), alpha, A.data(), + A.stride_1(), A.stride_0(), x.data(), x.stride_0(), beta, + y.data(), y.stride_0()); } }; @@ -124,33 +103,25 @@ struct TeamGemv { template struct TeamGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const ScalarType alpha, const AViewType& A, - const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::rank == 2, - "BLAS TeamGemv requires rank-2 A matrix"); - return Impl::TeamGemvInternal::invoke( - member, Impl::OpConj{}, A.extent(1), A.extent(0), alpha, A.data(), - A.stride_1(), A.stride_0(), x.data(), x.stride_0(), beta, y.data(), - y.stride_0()); + template + 
KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const ScalarType alpha, const AViewType& A, + const xViewType& x, const ScalarType beta, const yViewType& y) { + static_assert(AViewType::rank == 2, "BLAS TeamGemv requires rank-2 A matrix"); + return Impl::TeamGemvInternal::invoke(member, Impl::OpConj{}, A.extent(1), A.extent(0), + alpha, A.data(), A.stride_1(), A.stride_0(), x.data(), + x.stride_0(), beta, y.data(), y.stride_0()); } }; template struct TeamGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const ScalarType alpha, const AViewType& A, - const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::rank == 2, - "BLAS TeamGemv requires rank-2 A matrix"); - return Impl::TeamGemvInternal::invoke( - member, Impl::OpConj{}, A.extent(1), A.extent(0), alpha, A.data(), - A.stride_1(), A.stride_0(), x.data(), x.stride_0(), beta, y.data(), - y.stride_0()); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const ScalarType alpha, const AViewType& A, + const xViewType& x, const ScalarType beta, const yViewType& y) { + static_assert(AViewType::rank == 2, "BLAS TeamGemv requires rank-2 A matrix"); + return Impl::TeamGemvInternal::invoke(member, Impl::OpConj{}, A.extent(1), A.extent(0), alpha, + A.data(), A.stride_1(), A.stride_0(), x.data(), + x.stride_0(), beta, y.data(), y.stride_0()); } }; @@ -160,16 +131,13 @@ struct TeamGemv { template struct TeamVectorGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const ScalarType alpha, const AViewType& A, - const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::rank == 2, - "Batched TeamVectorGemv requires rank-2 A matrix"); - return Impl::TeamVectorGemvInternal::invoke( - member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), - A.stride_1(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + template + 
KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const ScalarType alpha, const AViewType& A, + const xViewType& x, const ScalarType beta, const yViewType& y) { + static_assert(AViewType::rank == 2, "Batched TeamVectorGemv requires rank-2 A matrix"); + return Impl::TeamVectorGemvInternal::invoke(member, A.extent(0), A.extent(1), alpha, + A.data(), A.stride_0(), A.stride_1(), x.data(), + x.stride_0(), beta, y.data(), y.stride_0()); } }; @@ -179,16 +147,13 @@ struct TeamVectorGemv { template struct TeamVectorGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const ScalarType alpha, const AViewType& A, - const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::rank == 2, - "Batched TeamVectorGemv requires rank-2 A matrix"); - return Impl::TeamVectorGemvInternal::invoke( - member, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), - A.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const ScalarType alpha, const AViewType& A, + const xViewType& x, const ScalarType beta, const yViewType& y) { + static_assert(AViewType::rank == 2, "Batched TeamVectorGemv requires rank-2 A matrix"); + return Impl::TeamVectorGemvInternal::invoke(member, A.extent(1), A.extent(0), alpha, + A.data(), A.stride_1(), A.stride_0(), x.data(), + x.stride_0(), beta, y.data(), y.stride_0()); } }; @@ -198,17 +163,13 @@ struct TeamVectorGemv { template struct TeamVectorGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const ScalarType alpha, const AViewType& A, - const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::rank == 2, - "Batched TeamVectorGemv requires rank-2 A matrix"); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const ScalarType alpha, const AViewType& A, + const xViewType& x, const 
ScalarType beta, const yViewType& y) { + static_assert(AViewType::rank == 2, "Batched TeamVectorGemv requires rank-2 A matrix"); return Impl::TeamVectorGemvInternal::invoke( - member, Impl::OpConj{}, A.extent(1), A.extent(0), alpha, A.data(), - A.stride_1(), A.stride_0(), x.data(), x.stride_0(), beta, y.data(), - y.stride_0()); + member, Impl::OpConj{}, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), A.stride_0(), x.data(), + x.stride_0(), beta, y.data(), y.stride_0()); } }; diff --git a/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp b/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp index 26c4c9624a..15c3c74ecd 100644 --- a/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp +++ b/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp @@ -57,8 +57,7 @@ struct DotBasedGEMM { const size_A dotSize; // the length of the vectors in the dot products - DotBasedGEMM(const scalar_A& alpha_, const AV& A_, const BV& B_, - const scalar_C& beta_, const CV& C_) + DotBasedGEMM(const scalar_A& alpha_, const AV& A_, const BV& B_, const scalar_C& beta_, const CV& C_) : A(A_), B(B_), C(C_), @@ -69,52 +68,39 @@ struct DotBasedGEMM { dotSize(A.extent(0)) {} void run(const ExecSpace& space, bool conjugateTranspose) { - multipleReductionWorkDistribution( - dotSize, numCrows * numCcols, numDivPerDot); + multipleReductionWorkDistribution(dotSize, numCrows * numCcols, numDivPerDot); const size_C ndots = numCrows * numCcols; // Number of dot products numTeams = ndots * numDivPerDot; // Initialize C matrix if beta != 1 if (beta == CVT::zero()) { - Kokkos::MDRangePolicy> policyInit( - space, {0, 0}, {numCrows, numCcols}); - Kokkos::parallel_for("Initialize C for Dot Product Based GEMM", - policyInit, *this); + Kokkos::MDRangePolicy> policyInit(space, {0, 0}, {numCrows, numCcols}); + Kokkos::parallel_for("Initialize C for Dot Product Based GEMM", policyInit, *this); } else if (beta != CVT::one()) { - Kokkos::MDRangePolicy> policyInit( - space, {0, 0}, {numCrows, numCcols}); - 
Kokkos::parallel_for("Initialize C for Dot Product Based GEMM", - policyInit, *this); + Kokkos::MDRangePolicy> policyInit(space, {0, 0}, {numCrows, numCcols}); + Kokkos::parallel_for("Initialize C for Dot Product Based GEMM", policyInit, *this); } // Multiply alpha*A^TB and add it to beta*C if (conjugateTranspose) { - Kokkos::TeamPolicy policyMult(space, numTeams, - Kokkos::AUTO); + Kokkos::TeamPolicy policyMult(space, numTeams, Kokkos::AUTO); Kokkos::parallel_for("Perform Dot Product Based GEMM", policyMult, *this); } else { - Kokkos::TeamPolicy policyMult(space, numTeams, - Kokkos::AUTO); + Kokkos::TeamPolicy policyMult(space, numTeams, Kokkos::AUTO); Kokkos::parallel_for("Perform Dot Product Based GEMM", policyMult, *this); } } KOKKOS_INLINE_FUNCTION - void operator()(const TagZero&, const size_C& rowId, - const size_C& colId) const { - C(rowId, colId) = CVT::zero(); - } + void operator()(const TagZero&, const size_C& rowId, const size_C& colId) const { C(rowId, colId) = CVT::zero(); } KOKKOS_INLINE_FUNCTION - void operator()(const TagInit&, const size_C& rowId, - const size_C& colId) const { + void operator()(const TagInit&, const size_C& rowId, const size_C& colId) const { C(rowId, colId) = beta * C(rowId, colId); } KOKKOS_INLINE_FUNCTION - void operator()(const TagMult&, - const typename Kokkos::TeamPolicy::member_type& - teamMember) const { + void operator()(const TagMult&, const typename Kokkos::TeamPolicy::member_type& teamMember) const { const size_C globalRank = teamMember.league_rank(); const size_C localRank = globalRank % numDivPerDot; const size_C i = globalRank / numDivPerDot; @@ -127,19 +113,13 @@ struct DotBasedGEMM { if (localRank == numDivPerDot - 1) end = dotSize; Kokkos::parallel_reduce( Kokkos::TeamThreadRange(teamMember, begin, end), - [&](const size_A k, scalar_C& update) { - update += alpha * A(k, rowId) * B(k, colId); - }, - result); + [&](const size_A k, scalar_C& update) { update += alpha * A(k, rowId) * B(k, colId); }, result); - 
Kokkos::single(Kokkos::PerTeam(teamMember), - [&]() { Kokkos::atomic_add(&C(rowId, colId), result); }); + Kokkos::single(Kokkos::PerTeam(teamMember), [&]() { Kokkos::atomic_add(&C(rowId, colId), result); }); } KOKKOS_INLINE_FUNCTION - void operator()(const TagMultCT&, - const typename Kokkos::TeamPolicy::member_type& - teamMember) const { + void operator()(const TagMultCT&, const typename Kokkos::TeamPolicy::member_type& teamMember) const { const size_C globalRank = teamMember.league_rank(); const size_C localRank = globalRank % numDivPerDot; const size_C i = globalRank / numDivPerDot; @@ -152,13 +132,9 @@ struct DotBasedGEMM { if (localRank == numDivPerDot - 1) end = dotSize; Kokkos::parallel_reduce( Kokkos::TeamThreadRange(teamMember, begin, end), - [&](const size_A k, scalar_C& update) { - update += alpha * AVT::conj(A(k, rowId)) * B(k, colId); - }, - result); + [&](const size_A k, scalar_C& update) { update += alpha * AVT::conj(A(k, rowId)) * B(k, colId); }, result); - Kokkos::single(Kokkos::PerTeam(teamMember), - [&]() { Kokkos::atomic_add(&C(rowId, colId), result); }); + Kokkos::single(Kokkos::PerTeam(teamMember), [&]() { Kokkos::atomic_add(&C(rowId, colId), result); }); } }; diff --git a/blas/impl/KokkosBlas3_gemm_impl.hpp b/blas/impl/KokkosBlas3_gemm_impl.hpp index 1a0ab46bb3..675ef5d3a4 100644 --- a/blas/impl/KokkosBlas3_gemm_impl.hpp +++ b/blas/impl/KokkosBlas3_gemm_impl.hpp @@ -55,419 +55,320 @@ struct impl_gemm_choose_copy_layout { #endif // DeepCopy matrix block into scratch -template +template struct impl_deep_copy_matrix_block; -template -struct impl_deep_copy_matrix_block { +template +struct impl_deep_copy_matrix_block { typedef typename ViewType::non_const_value_type value_type; typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION - static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, - const ViewType& A, const int& offset_i, + static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, const ViewType& A, const int& 
offset_i, const int& offset_j) { - if (offset_i + blockDim_i <= A.extent_int(0) && - offset_j + blockDim_j <= A.extent_int(1)) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockDim_j), [&](const int j) { + if (offset_i + blockDim_i <= A.extent_int(0) && offset_j + blockDim_j <= A.extent_int(1)) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockDim_j), [&](const int j) { #ifndef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - const int idx_j = offset_j + j; + const int idx_j = offset_j + j; #endif - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_i), - [&](const int i) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_i), [&](const int i) { #ifdef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - const int idx_j = offset_j + j; + const int idx_j = offset_j + j; #endif - const int idx_i = offset_i + i; - A_scr(i, j) = A(idx_i, idx_j); - }); - }); + const int idx_i = offset_i + i; + A_scr(i, j) = A(idx_i, idx_j); + }); + }); } else { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockDim_j), [&](const int j) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockDim_j), [&](const int j) { #ifndef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - int idx_j = offset_j + j; + int idx_j = offset_j + j; #endif - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(team, blockDim_i), [&](const int i) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_i), [&](const int i) { #ifdef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - int idx_j = offset_j + j; -#endif - const int idx_i = offset_i + i; - A_scr(i, j) = - idx_i < A.extent_int(0) && idx_j < A.extent_int(1) - ? A(idx_i, idx_j) - : ATV::zero(); - }); - }); + int idx_j = offset_j + j; +#endif + const int idx_i = offset_i + i; + A_scr(i, j) = idx_i < A.extent_int(0) && idx_j < A.extent_int(1) ? 
A(idx_i, idx_j) : ATV::zero(); + }); + }); } } }; -template -struct impl_deep_copy_matrix_block +struct impl_deep_copy_matrix_block { typedef typename ViewType::non_const_value_type value_type; typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION - static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, - const ViewType& A, const int& offset_i, + static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, const ViewType& A, const int& offset_i, const int& offset_j) { - if (offset_i + blockDim_i <= A.extent_int(0) && - offset_j + blockDim_j <= A.extent_int(1)) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockDim_i), [&](const int i) { - const int idx_i = offset_i + i; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_j), - [&](const int j) { - const int idx_j = offset_j + j; - A_scr(i, j) = A(idx_i, idx_j); - }); - }); + if (offset_i + blockDim_i <= A.extent_int(0) && offset_j + blockDim_j <= A.extent_int(1)) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockDim_i), [&](const int i) { + const int idx_i = offset_i + i; + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_j), [&](const int j) { + const int idx_j = offset_j + j; + A_scr(i, j) = A(idx_i, idx_j); + }); + }); } else { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockDim_i), [&](const int i) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockDim_i), [&](const int i) { #ifndef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - int idx_i = offset_i + i; + int idx_i = offset_i + i; #endif - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(team, blockDim_j), [&](const int j) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_j), [&](const int j) { #ifdef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - int idx_i = offset_i + i; -#endif - const int idx_j = offset_j + j; - A_scr(i, j) = - idx_i < A.extent_int(0) && idx_j < A.extent_int(1) - ? 
A(idx_i, idx_j) - : ATV::zero(); - }); - }); + int idx_i = offset_i + i; +#endif + const int idx_j = offset_j + j; + A_scr(i, j) = idx_i < A.extent_int(0) && idx_j < A.extent_int(1) ? A(idx_i, idx_j) : ATV::zero(); + }); + }); } } }; -template -struct impl_deep_copy_matrix_block { +template +struct impl_deep_copy_matrix_block { typedef typename ViewType::non_const_value_type value_type; typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION - static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, - const ViewType& A, const int& offset_i, + static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, const ViewType& A, const int& offset_i, const int& offset_j) { - if (offset_i + blockDim_i <= A.extent_int(1) && - offset_j + blockDim_j <= A.extent_int(0)) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockDim_j), [&](const int j) { + if (offset_i + blockDim_i <= A.extent_int(1) && offset_j + blockDim_j <= A.extent_int(0)) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockDim_j), [&](const int j) { #ifndef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - const int idx_j = offset_j + j; + const int idx_j = offset_j + j; #endif - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_i), - [&](const int i) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_i), [&](const int i) { #ifdef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - const int idx_j = offset_j + j; + const int idx_j = offset_j + j; #endif - const int idx_i = offset_i + i; - A_scr(i, j) = A(idx_j, idx_i); - }); - }); + const int idx_i = offset_i + i; + A_scr(i, j) = A(idx_j, idx_i); + }); + }); } else { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockDim_j), [&](const int j) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockDim_j), [&](const int j) { #ifndef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - int idx_j = offset_j + j; + int idx_j = offset_j + j; #endif - Kokkos::parallel_for( - 
Kokkos::ThreadVectorRange(team, blockDim_i), [&](const int i) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_i), [&](const int i) { #ifdef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - int idx_j = offset_j + j; -#endif - const int idx_i = offset_i + i; - A_scr(i, j) = - idx_i < A.extent_int(1) && idx_j < A.extent_int(0) - ? A(idx_j, idx_i) - : ATV::zero(); - }); - }); + int idx_j = offset_j + j; +#endif + const int idx_i = offset_i + i; + A_scr(i, j) = idx_i < A.extent_int(1) && idx_j < A.extent_int(0) ? A(idx_j, idx_i) : ATV::zero(); + }); + }); } } }; -template -struct impl_deep_copy_matrix_block +struct impl_deep_copy_matrix_block { typedef typename ViewType::non_const_value_type value_type; typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION - static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, - const ViewType& A, const int& offset_i, + static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, const ViewType& A, const int& offset_i, const int& offset_j) { - if (offset_i + blockDim_i <= A.extent_int(1) && - offset_j + blockDim_j <= A.extent_int(0)) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockDim_i), [&](const int i) { + if (offset_i + blockDim_i <= A.extent_int(1) && offset_j + blockDim_j <= A.extent_int(0)) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockDim_i), [&](const int i) { #ifndef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - const int idx_i = offset_i + i; + const int idx_i = offset_i + i; #endif - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_j), - [&](const int j) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_j), [&](const int j) { #ifdef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - const int idx_i = offset_i + i; + const int idx_i = offset_i + i; #endif - const int idx_j = offset_j + j; - A_scr(i, j) = A(idx_j, idx_i); - }); - }); + const int idx_j = offset_j + j; + A_scr(i, j) = A(idx_j, idx_i); + }); + }); } else { - 
Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockDim_i), [&](const int i) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockDim_i), [&](const int i) { #ifndef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - int idx_i = offset_i + i; + int idx_i = offset_i + i; #endif - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(team, blockDim_j), [&](const int j) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_j), [&](const int j) { #ifdef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - int idx_i = offset_i + i; -#endif - const int idx_j = offset_j + j; - A_scr(i, j) = - idx_i < A.extent_int(1) && idx_j < A.extent_int(0) - ? A(idx_j, idx_i) - : ATV::zero(); - }); - }); + int idx_i = offset_i + i; +#endif + const int idx_j = offset_j + j; + A_scr(i, j) = idx_i < A.extent_int(1) && idx_j < A.extent_int(0) ? A(idx_j, idx_i) : ATV::zero(); + }); + }); } } }; -template -struct impl_deep_copy_matrix_block { +template +struct impl_deep_copy_matrix_block { typedef typename ViewType::non_const_value_type value_type; typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION - static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, - const ViewType& A, const int& offset_i, + static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, const ViewType& A, const int& offset_i, const int& offset_j) { - if (offset_i + blockDim_i <= A.extent_int(1) && - offset_j + blockDim_j <= A.extent_int(0)) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockDim_j), [&](const int j) { + if (offset_i + blockDim_i <= A.extent_int(1) && offset_j + blockDim_j <= A.extent_int(0)) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockDim_j), [&](const int j) { #ifndef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - const int idx_j = offset_j + j; + const int idx_j = offset_j + j; #endif - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_i), - [&](const int i) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, 
blockDim_i), [&](const int i) { #ifdef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - const int idx_j = offset_j + j; + const int idx_j = offset_j + j; #endif - const int idx_i = offset_i + i; - A_scr(i, j) = ATV::conj(A(idx_j, idx_i)); - }); - }); + const int idx_i = offset_i + i; + A_scr(i, j) = ATV::conj(A(idx_j, idx_i)); + }); + }); } else { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockDim_j), [&](const int j) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockDim_j), [&](const int j) { #ifndef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - int idx_j = offset_j + j; + int idx_j = offset_j + j; #endif - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(team, blockDim_i), [&](const int i) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_i), [&](const int i) { #ifdef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - int idx_j = offset_j + j; -#endif - const int idx_i = offset_i + i; - A_scr(i, j) = - idx_i < A.extent_int(1) && idx_j < A.extent_int(0) - ? ATV::conj(A(idx_j, idx_i)) - : ATV::zero(); - }); - }); + int idx_j = offset_j + j; +#endif + const int idx_i = offset_i + i; + A_scr(i, j) = idx_i < A.extent_int(1) && idx_j < A.extent_int(0) ? 
ATV::conj(A(idx_j, idx_i)) : ATV::zero(); + }); + }); } } }; -template -struct impl_deep_copy_matrix_block +struct impl_deep_copy_matrix_block { typedef typename ViewType::non_const_value_type value_type; typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION - static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, - const ViewType& A, const int& offset_i, + static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, const ViewType& A, const int& offset_i, const int& offset_j) { - if (offset_i + blockDim_i <= A.extent_int(1) && - offset_j + blockDim_j <= A.extent_int(0)) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockDim_i), [&](const int i) { + if (offset_i + blockDim_i <= A.extent_int(1) && offset_j + blockDim_j <= A.extent_int(0)) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockDim_i), [&](const int i) { #ifndef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - const int idx_i = offset_i + i; + const int idx_i = offset_i + i; #endif - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_j), - [&](const int j) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_j), [&](const int j) { #ifdef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - const int idx_i = offset_i + i; + const int idx_i = offset_i + i; #endif - const int idx_j = offset_j + j; - A_scr(i, j) = ATV::conj(A(idx_j, idx_i)); - }); - }); + const int idx_j = offset_j + j; + A_scr(i, j) = ATV::conj(A(idx_j, idx_i)); + }); + }); } else { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockDim_i), [&](const int i) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockDim_i), [&](const int i) { #ifndef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - int idx_i = offset_i + i; + int idx_i = offset_i + i; #endif - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(team, blockDim_j), [&](const int j) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_j), [&](const int j) { #ifdef 
KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - int idx_i = offset_i + i; -#endif - const int idx_j = offset_j + j; - A_scr(i, j) = - idx_i < A.extent_int(1) && idx_j < A.extent_int(0) - ? ATV::conj(A(idx_j, idx_i)) - : ATV::zero(); - }); - }); + int idx_i = offset_i + i; +#endif + const int idx_j = offset_j + j; + A_scr(i, j) = idx_i < A.extent_int(1) && idx_j < A.extent_int(0) ? ATV::conj(A(idx_j, idx_i)) : ATV::zero(); + }); + }); } } }; -template +template struct impl_update_matrix_block { typedef typename ViewType::non_const_value_type value_type; typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION - static void update(const TeamHandle& team, const value_type& beta, - const ViewType& A, const value_type& alpha, - const ViewTypeScratch& A_scr, const int& offset_i, - const int& offset_j) { - if (offset_i + blockDim_i <= A.extent_int(0) && - offset_j + blockDim_j <= A.extent_int(1)) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockDim_j), [&](const int j) { - const int idx_j = offset_j + j; - if (beta == ATV::zero()) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_i), - [&](const int i) { - const int idx_i = offset_i + i; - A(idx_i, idx_j) = alpha * A_scr(i, j); - }); - } else { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_i), - [&](const int i) { - const int idx_i = offset_i + i; - A(idx_i, idx_j) = beta * A(idx_i, idx_j) + - alpha * A_scr(i, j); - }); - } + static void update(const TeamHandle& team, const value_type& beta, const ViewType& A, const value_type& alpha, + const ViewTypeScratch& A_scr, const int& offset_i, const int& offset_j) { + if (offset_i + blockDim_i <= A.extent_int(0) && offset_j + blockDim_j <= A.extent_int(1)) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockDim_j), [&](const int j) { + const int idx_j = offset_j + j; + if (beta == ATV::zero()) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_i), [&](const int i) { + const int idx_i = offset_i + i; + A(idx_i, 
idx_j) = alpha * A_scr(i, j); + }); + } else { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_i), [&](const int i) { + const int idx_i = offset_i + i; + A(idx_i, idx_j) = beta * A(idx_i, idx_j) + alpha * A_scr(i, j); }); + } + }); } else { - const int range_i = offset_i + blockDim_i <= A.extent_int(0) - ? blockDim_i - : A.extent_int(0) % blockDim_i; - const int range_j = offset_j + blockDim_j <= A.extent_int(1) - ? blockDim_j - : A.extent_int(1) % blockDim_j; - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, range_j), [&](const int j) { - const int idx_j = offset_j + j; - if (beta == ATV::zero()) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, range_i), - [&](const int i) { - const int idx_i = offset_i + i; - A(idx_i, idx_j) = alpha * A_scr(i, j); - }); - } else { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(team, range_i), [&](const int i) { - const int idx_i = offset_i + i; - A(idx_i, idx_j) = - beta * A(idx_i, idx_j) + alpha * A_scr(i, j); - }); - } + const int range_i = offset_i + blockDim_i <= A.extent_int(0) ? blockDim_i : A.extent_int(0) % blockDim_i; + const int range_j = offset_j + blockDim_j <= A.extent_int(1) ? 
blockDim_j : A.extent_int(1) % blockDim_j; + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, range_j), [&](const int j) { + const int idx_j = offset_j + j; + if (beta == ATV::zero()) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, range_i), [&](const int i) { + const int idx_i = offset_i + i; + A(idx_i, idx_j) = alpha * A_scr(i, j); }); + } else { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, range_i), [&](const int i) { + const int idx_i = offset_i + i; + A(idx_i, idx_j) = beta * A(idx_i, idx_j) + alpha * A_scr(i, j); + }); + } + }); } } }; -template -struct impl_update_matrix_block { +template +struct impl_update_matrix_block { typedef typename ViewType::non_const_value_type value_type; typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION - static void update(const TeamHandle& team, const value_type& beta, - const ViewType& A, const value_type& alpha, - const ViewTypeScratch& A_scr, const int& offset_i, - const int& offset_j) { - if (offset_i + blockDim_i <= A.extent_int(0) && - offset_j + blockDim_j <= A.extent_int(1)) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockDim_i), [&](const int i) { - const int idx_i = offset_i + i; - if (beta == ATV::zero()) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_j), - [&](const int j) { - const int idx_j = offset_j + j; - A(idx_i, idx_j) = alpha * A_scr(i, j); - }); - } else { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_j), - [&](const int j) { - const int idx_j = offset_j + j; - A(idx_i, idx_j) = beta * A(idx_i, idx_j) + - alpha * A_scr(i, j); - }); - } + static void update(const TeamHandle& team, const value_type& beta, const ViewType& A, const value_type& alpha, + const ViewTypeScratch& A_scr, const int& offset_i, const int& offset_j) { + if (offset_i + blockDim_i <= A.extent_int(0) && offset_j + blockDim_j <= A.extent_int(1)) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockDim_i), [&](const int i) { + const int idx_i = offset_i + i; 
+ if (beta == ATV::zero()) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_j), [&](const int j) { + const int idx_j = offset_j + j; + A(idx_i, idx_j) = alpha * A_scr(i, j); + }); + } else { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_j), [&](const int j) { + const int idx_j = offset_j + j; + A(idx_i, idx_j) = beta * A(idx_i, idx_j) + alpha * A_scr(i, j); }); + } + }); } else { - const int range_i = offset_i + blockDim_i <= A.extent_int(0) - ? blockDim_i - : A.extent_int(0) % blockDim_i; - const int range_j = offset_j + blockDim_j <= A.extent_int(1) - ? blockDim_j - : A.extent_int(1) % blockDim_j; - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, range_i), [&](const int i) { - const int idx_i = offset_i + i; - if (beta == ATV::zero()) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, range_j), - [&](const int j) { - const int idx_j = offset_j + j; - A(idx_i, idx_j) = alpha * A_scr(i, j); - }); - } else { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(team, range_j), [&](const int j) { - const int idx_j = offset_j + j; - A(idx_i, idx_j) = - beta * A(idx_i, idx_j) + alpha * A_scr(i, j); - }); - } + const int range_i = offset_i + blockDim_i <= A.extent_int(0) ? blockDim_i : A.extent_int(0) % blockDim_i; + const int range_j = offset_j + blockDim_j <= A.extent_int(1) ? 
blockDim_j : A.extent_int(1) % blockDim_j; + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, range_i), [&](const int i) { + const int idx_i = offset_i + i; + if (beta == ATV::zero()) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, range_j), [&](const int j) { + const int idx_j = offset_j + j; + A(idx_i, idx_j) = alpha * A_scr(i, j); + }); + } else { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, range_j), [&](const int j) { + const int idx_j = offset_j + j; + A(idx_i, idx_j) = beta * A(idx_i, idx_j) + alpha * A_scr(i, j); }); + } + }); } } }; @@ -475,14 +376,11 @@ struct impl_update_matrix_block -KOKKOS_INLINE_FUNCTION void impl_team_gemm_block(const TeamHandle& team, - const ViewTypeC& C, - const ViewTypeA& A, +KOKKOS_INLINE_FUNCTION void impl_team_gemm_block(const TeamHandle& team, const ViewTypeC& C, const ViewTypeA& A, const ViewTypeB& B) { typedef typename ViewTypeC::non_const_value_type ScalarC; // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && \ - (!defined(__CUDA_ARCH__) || !defined(__HIP_DEVICE_COMPILE__)) +#if defined(KOKKOS_COMPILER_GNU) && (!defined(__CUDA_ARCH__) || !defined(__HIP_DEVICE_COMPILE__)) int blockA0 = A.extent_int(0); int blockA1 = A.extent_int(1); int blockB1 = B.extent_int(1); @@ -491,36 +389,34 @@ KOKKOS_INLINE_FUNCTION void impl_team_gemm_block(const TeamHandle& team, const int blockA1 = A.extent_int(1); const int blockB1 = B.extent_int(1); #endif - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockA0), [&](const int i) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockA0), [&](const int i) { #ifndef KOKKOSKERNELS_ENABLE_OMP_SIMD - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockB1 / 4), - [&](const int B_j) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockB1 / 4), [&](const int B_j) { #else #pragma omp simd for(int B_j=0; B_j @@ -565,8 +461,8 @@ struct impl_gemm_label<2, 2> { static constexpr const char* label = "KokkosBlas::gemm[CC]"; }; -template 
+template struct GEMMImpl { ViewTypeA A; ViewTypeB B; @@ -580,18 +476,14 @@ struct GEMMImpl { int scratch_level; ScalarC alpha, beta; - typedef Kokkos::View + typedef Kokkos::View ViewTypeAScratch; - typedef Kokkos::View + typedef Kokkos::View ViewTypeBScratch; - typedef Kokkos::View + typedef Kokkos::View ViewTypeCScratch; - GEMMImpl(const ScalarA& alpha_, const ViewTypeA& A_, const ViewTypeB& B_, - const ScalarC& beta_, const ViewTypeC& C_) + GEMMImpl(const ScalarA& alpha_, const ViewTypeA& A_, const ViewTypeB& B_, const ScalarC& beta_, const ViewTypeC& C_) : A(A_), B(B_), C(C_), @@ -602,12 +494,10 @@ struct GEMMImpl { beta = beta_; } - void run(const ExecSpace& space, int team_size, int vector_length, - int scr_level) { - scratch_level = scr_level; - int scratch_memory_size = ViewTypeAScratch::shmem_size() + - ViewTypeBScratch::shmem_size() + - ViewTypeCScratch::shmem_size(); + void run(const ExecSpace& space, int team_size, int vector_length, int scr_level) { + scratch_level = scr_level; + int scratch_memory_size = + ViewTypeAScratch::shmem_size() + ViewTypeBScratch::shmem_size() + ViewTypeCScratch::shmem_size(); #if defined(KOKKOS_ENABLE_HIP) // Note lbv, 10/29/20: The LaunchBounds<384, 2> leads @@ -616,23 +506,19 @@ struct GEMMImpl { // are allocated... Switching to LaunchBounds<384, 0> fixes // that problem but I'm not sure if that it a good perf // parameter or why it is set to 2 for Cuda? 
- Kokkos::TeamPolicy> policy( - space, num_blocks_0 * num_blocks_1, team_size, vector_length); + Kokkos::TeamPolicy> policy(space, num_blocks_0 * num_blocks_1, team_size, + vector_length); #else - Kokkos::TeamPolicy> policy( - space, num_blocks_0 * num_blocks_1, team_size, vector_length); + Kokkos::TeamPolicy> policy(space, num_blocks_0 * num_blocks_1, team_size, + vector_length); #endif - Kokkos::parallel_for( - impl_gemm_label::label, - policy.set_scratch_size(scratch_level, - Kokkos::PerTeam(scratch_memory_size)), - *this); + Kokkos::parallel_for(impl_gemm_label::label, + policy.set_scratch_size(scratch_level, Kokkos::PerTeam(scratch_memory_size)), *this); } KOKKOS_INLINE_FUNCTION - void operator()( - const typename Kokkos::TeamPolicy::member_type& team) const { + void operator()(const typename Kokkos::TeamPolicy::member_type& team) const { // This team is responsible for computing a single block of C const int league_rank = team.league_rank(); const int num_blocks = num_blocks_1; @@ -642,11 +528,9 @@ struct GEMMImpl { ViewTypeAScratch A_scr(team.team_scratch(scratch_level)); ViewTypeBScratch B_scr(team.team_scratch(scratch_level)); ViewTypeCScratch C_scr(team.team_scratch(scratch_level)); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockA0), [&](const int i) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockB1), - [&](const int j) { C_scr(i, j) = 0; }); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockA0), [&](const int i) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockB1), [&](const int j) { C_scr(i, j) = 0; }); + }); team.team_barrier(); // Move along the inner dimension in blocks @@ -654,22 +538,16 @@ struct GEMMImpl { for (int A_j = 0; A_j < length; A_j += blockA1) { // Load A block into scratch - impl_deep_copy_matrix_block< - typename Kokkos::TeamPolicy::member_type, ViewTypeAScratch, - ViewTypeA, - typename impl_gemm_choose_copy_layout< - ExecSpace, typename ViewTypeA::array_layout, - typename 
ViewTypeAScratch::array_layout>::type, - blockA0, blockA1, TransposeA>::copy(team, A_scr, A, i_offset, A_j); + impl_deep_copy_matrix_block::member_type, ViewTypeAScratch, ViewTypeA, + typename impl_gemm_choose_copy_layout::type, + blockA0, blockA1, TransposeA>::copy(team, A_scr, A, i_offset, A_j); // Load B block into scratch - impl_deep_copy_matrix_block< - typename Kokkos::TeamPolicy::member_type, ViewTypeBScratch, - ViewTypeB, - typename impl_gemm_choose_copy_layout< - ExecSpace, typename ViewTypeB::array_layout, - typename ViewTypeBScratch::array_layout>::type, - blockA1, blockB1, TransposeB>::copy(team, B_scr, B, A_j, j_offset); + impl_deep_copy_matrix_block::member_type, ViewTypeBScratch, ViewTypeB, + typename impl_gemm_choose_copy_layout::type, + blockA1, blockB1, TransposeB>::copy(team, B_scr, B, A_j, j_offset); // Wait for A and B block to be in scratch memory team.team_barrier(); @@ -682,10 +560,9 @@ struct GEMMImpl { team.team_barrier(); } // Write back the C block from scratch to main memory - impl_update_matrix_block< - typename Kokkos::TeamPolicy::member_type, ViewTypeC, - ViewTypeCScratch, typename ViewTypeC::array_layout, blockA0, - blockB1>::update(team, beta, C, alpha, C_scr, i_offset, j_offset); + impl_update_matrix_block::member_type, ViewTypeC, ViewTypeCScratch, + typename ViewTypeC::array_layout, blockA0, blockB1>::update(team, beta, C, alpha, C_scr, + i_offset, j_offset); } }; diff --git a/blas/impl/KokkosBlas3_gemm_spec.hpp b/blas/impl/KokkosBlas3_gemm_spec.hpp index 367a8dad3f..f085b5fc92 100644 --- a/blas/impl/KokkosBlas3_gemm_spec.hpp +++ b/blas/impl/KokkosBlas3_gemm_spec.hpp @@ -43,35 +43,27 @@ struct gemm_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. 
// -#define KOKKOSBLAS3_GEMM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, LAYOUTA, LAYOUTB, \ - LAYOUTC, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct gemm_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS3_GEMM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, LAYOUTA, LAYOUTB, LAYOUTC, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct gemm_eti_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -#define KOKKOSBLAS3_GEMM_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_GEMM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, Kokkos::LayoutLeft, \ - Kokkos::LayoutLeft, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_GEMM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, Kokkos::LayoutLeft, \ - Kokkos::LayoutRight, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_GEMM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, Kokkos::LayoutRight, \ - Kokkos::LayoutLeft, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_GEMM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, Kokkos::LayoutRight, \ - Kokkos::LayoutRight, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBLAS3_GEMM_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS3_GEMM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, Kokkos::LayoutLeft, Kokkos::LayoutLeft, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBLAS3_GEMM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, Kokkos::LayoutLeft, Kokkos::LayoutRight, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBLAS3_GEMM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, Kokkos::LayoutRight, Kokkos::LayoutLeft, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBLAS3_GEMM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, Kokkos::LayoutRight, Kokkos::LayoutRight, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) // Include the actual specialization declarations #include @@ -85,37 +77,24 @@ namespace Impl { // // Implementation of KokkosBlas::gemm. 
-template < - class execution_space, class AViewType, class BViewType, class CViewType, - bool tpl_spec_avail = gemm_tpl_spec_avail::value, - bool eti_spec_avail = gemm_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = gemm_eti_spec_avail::value> struct GEMM { - static void gemm(const execution_space& space, const char transA[], - const char transB[], - typename AViewType::const_value_type& alpha, - const AViewType& A, const BViewType& B, - typename CViewType::const_value_type& beta, - const CViewType& C) + static void gemm(const execution_space& space, const char transA[], const char transB[], + typename AViewType::const_value_type& alpha, const AViewType& A, const BViewType& B, + typename CViewType::const_value_type& beta, const CViewType& C) #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY { - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "BViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "CViewType must be a Kokkos::View."); - static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); - static_assert(static_cast(BViewType::rank) == 2, - "BViewType must have rank 2."); - static_assert(static_cast(CViewType::rank) == 2, - "CViewType must have rank 2."); + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "BViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "CViewType must be a Kokkos::View."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + static_assert(static_cast(BViewType::rank) == 2, "BViewType must have rank 2."); + static_assert(static_cast(CViewType::rank) == 2, "CViewType must have rank 2."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? 
"KokkosBlas::gemm[ETI]" - : "KokkosBlas::gemm[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::gemm[ETI]" + : "KokkosBlas::gemm[noETI]"); // Figure out Scalar Types typedef typename AViewType::non_const_value_type ScalarA; typedef typename BViewType::non_const_value_type ScalarB; @@ -125,29 +104,22 @@ struct GEMM { const int M = static_cast(C.extent(0)); const int N = static_cast(C.extent(1)); - const bool is_device_space = - KokkosKernels::Impl::kk_is_gpu_exec_space(); - const bool A_is_lr = std::is_same::value; - const bool A_is_tr = ((transA[0] == 'T') || (transA[0] == 't') || - (transA[0] == 'C') || (transA[0] == 'c')); - const bool B_is_tr = ((transB[0] == 'T') || (transB[0] == 't') || - (transB[0] == 'C') || (transB[0] == 'c')); + const bool is_device_space = KokkosKernels::Impl::kk_is_gpu_exec_space(); + const bool A_is_lr = std::is_same::value; + const bool A_is_tr = ((transA[0] == 'T') || (transA[0] == 't') || (transA[0] == 'C') || (transA[0] == 'c')); + const bool B_is_tr = ((transB[0] == 'T') || (transB[0] == 't') || (transB[0] == 'C') || (transB[0] == 'c')); // NOTE: these thresholds were copied from TPL CUBLAS, and may need to be // retuned constexpr int numDotsLayoutLeftThreshold = 1600; constexpr int numDotsLayoutRightThreshold = 100; - if (((!A_is_lr && A_is_tr && !B_is_tr && - M * N < numDotsLayoutLeftThreshold) || - (A_is_lr && A_is_tr && !B_is_tr && - M * N < numDotsLayoutRightThreshold)) && + if (((!A_is_lr && A_is_tr && !B_is_tr && M * N < numDotsLayoutLeftThreshold) || + (A_is_lr && A_is_tr && !B_is_tr && M * N < numDotsLayoutRightThreshold)) && is_device_space) { // call dot-based GEMM, only for C := beta * C + alpha * A^T * B, on // device bool A_is_conj = ((transA[0] == 'C') || (transA[0] == 'c')); - DotBasedGEMM - dotBasedGemm(alpha, A, B, beta, C); + DotBasedGEMM dotBasedGemm(alpha, A, B, beta, C); dotBasedGemm.run(space, A_is_conj); } else { @@ -155,116 +127,87 @@ struct GEMM { static constexpr 
int blockA0 = 24; static constexpr int blockB1 = 64; static constexpr int blockA1 = - (sizeof(ScalarA) * blockA0 * 16 + sizeof(ScalarB) * 16 * blockB1 + - sizeof(ScalarC) * blockA0 * blockB1 < + (sizeof(ScalarA) * blockA0 * 16 + sizeof(ScalarB) * 16 * blockB1 + sizeof(ScalarC) * blockA0 * blockB1 < 24000) ? 16 - : (sizeof(ScalarA) * blockA0 * 8 + sizeof(ScalarB) * 8 * blockB1 + - sizeof(ScalarC) * blockA0 * blockB1 < - 24000) - ? 8 - : (sizeof(ScalarA) * blockA0 * 4 + - sizeof(ScalarB) * 4 * blockB1 + - sizeof(ScalarC) * blockA0 * blockB1 < - 24000) - ? 4 - : 16; - int vector_length = blockB1 / 4; - int max_vector_length = - KokkosKernels::Impl::kk_get_max_vector_size(); + : (sizeof(ScalarA) * blockA0 * 8 + sizeof(ScalarB) * 8 * blockB1 + sizeof(ScalarC) * blockA0 * blockB1 < + 24000) + ? 8 + : (sizeof(ScalarA) * blockA0 * 4 + sizeof(ScalarB) * 4 * blockB1 + sizeof(ScalarC) * blockA0 * blockB1 < + 24000) + ? 4 + : 16; + int vector_length = blockB1 / 4; + int max_vector_length = KokkosKernels::Impl::kk_get_max_vector_size(); if (vector_length > max_vector_length) vector_length = max_vector_length; // Compute scratch space size - typedef KokkosBlas::Impl::GEMMImpl + typedef KokkosBlas::Impl::GEMMImpl gemm_dummy_type; - const int scratch_memory_size = - gemm_dummy_type::ViewTypeAScratch::required_allocation_size() + - gemm_dummy_type::ViewTypeBScratch::required_allocation_size() + - gemm_dummy_type::ViewTypeCScratch::required_allocation_size(); + const int scratch_memory_size = gemm_dummy_type::ViewTypeAScratch::required_allocation_size() + + gemm_dummy_type::ViewTypeBScratch::required_allocation_size() + + gemm_dummy_type::ViewTypeCScratch::required_allocation_size(); const int scratch_level = scratch_memory_size < 24000 ? 
0 : 1; // Figure out Team Sizes int team_size = 1; #if defined(KOKKOS_ENABLE_CUDA) - if (std::is_same::value) - team_size = blockA0; + if (std::is_same::value) team_size = blockA0; #endif #if defined(KOKKOS_ENABLE_HIP) - if (std::is_same::value) - team_size = blockA0; + if (std::is_same::value) team_size = blockA0; #endif #if defined(KOKKOS_ENABLE_ROCM) - if (std::is_same::value) - team_size = blockA0; + if (std::is_same::value) team_size = blockA0; #endif #if defined(KOKKOS_ENABLE_SYCL) - if (std::is_same::value) - team_size = blockA0; + if (std::is_same::value) team_size = blockA0; #endif // Call the correct kernel - if ((transA[0] == 'N' || transA[0] == 'n') && - (transB[0] == 'N' || transB[0] == 'n')) { - KokkosBlas::Impl::GEMMImpl + if ((transA[0] == 'N' || transA[0] == 'n') && (transB[0] == 'N' || transB[0] == 'n')) { + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } - if ((transA[0] == 'T' || transA[0] == 't') && - (transB[0] == 'N' || transB[0] == 'n')) { - KokkosBlas::Impl::GEMMImpl + if ((transA[0] == 'T' || transA[0] == 't') && (transB[0] == 'N' || transB[0] == 'n')) { + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } - if ((transA[0] == 'C' || transA[0] == 'c') && - (transB[0] == 'N' || transB[0] == 'n')) { - KokkosBlas::Impl::GEMMImpl + if ((transA[0] == 'C' || transA[0] == 'c') && (transB[0] == 'N' || transB[0] == 'n')) { + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } - if ((transA[0] == 'N' || transA[0] == 'n') && - (transB[0] == 'T' || transB[0] == 't')) { - KokkosBlas::Impl::GEMMImpl + if ((transA[0] == 'N' || transA[0] == 'n') && (transB[0] == 'T' || transB[0] == 't')) { + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } - if ((transA[0] == 'T' || transA[0] == 't') && - (transB[0] 
== 'T' || transB[0] == 't')) { - KokkosBlas::Impl::GEMMImpl + if ((transA[0] == 'T' || transA[0] == 't') && (transB[0] == 'T' || transB[0] == 't')) { + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } - if ((transA[0] == 'C' || transA[0] == 'c') && - (transB[0] == 'T' || transB[0] == 't')) { - KokkosBlas::Impl::GEMMImpl + if ((transA[0] == 'C' || transA[0] == 'c') && (transB[0] == 'T' || transB[0] == 't')) { + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } - if ((transA[0] == 'N' || transA[0] == 'n') && - (transB[0] == 'C' || transB[0] == 'c')) { - KokkosBlas::Impl::GEMMImpl + if ((transA[0] == 'N' || transA[0] == 'n') && (transB[0] == 'C' || transB[0] == 'c')) { + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } - if ((transA[0] == 'T' || transA[0] == 't') && - (transB[0] == 'C' || transB[0] == 'c')) { - KokkosBlas::Impl::GEMMImpl + if ((transA[0] == 'T' || transA[0] == 't') && (transB[0] == 'C' || transB[0] == 'c')) { + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } - if ((transA[0] == 'C' || transA[0] == 'c') && - (transB[0] == 'C' || transB[0] == 'c')) { - KokkosBlas::Impl::GEMMImpl + if ((transA[0] == 'C' || transA[0] == 'c') && (transB[0] == 'C' || transB[0] == 'c')) { + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } @@ -288,61 +231,45 @@ struct GEMM { // one or more .cpp files. 
// -#define KOKKOSBLAS3_GEMM_ETI_SPEC_DECL_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, \ - LAYOUTC, EXEC_SPACE, MEM_SPACE) \ - extern template struct GEMM< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - false, true>; +#define KOKKOSBLAS3_GEMM_ETI_SPEC_DECL_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, LAYOUTC, EXEC_SPACE, MEM_SPACE) \ + extern template struct GEMM, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; -#define KOKKOSBLAS3_GEMM_ETI_SPEC_INST_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, \ - LAYOUTC, EXEC_SPACE, MEM_SPACE) \ - template struct GEMM< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - false, true>; +#define KOKKOSBLAS3_GEMM_ETI_SPEC_INST_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, LAYOUTC, EXEC_SPACE, MEM_SPACE) \ + template struct GEMM, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; -#define KOKKOSBLAS3_GEMM_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_GEMM_ETI_SPEC_DECL_LAYOUTS(SCALAR, Kokkos::LayoutLeft, \ - Kokkos::LayoutLeft, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_GEMM_ETI_SPEC_DECL_LAYOUTS(SCALAR, Kokkos::LayoutLeft, \ - Kokkos::LayoutRight, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_GEMM_ETI_SPEC_DECL_LAYOUTS(SCALAR, Kokkos::LayoutRight, \ - Kokkos::LayoutLeft, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_GEMM_ETI_SPEC_DECL_LAYOUTS(SCALAR, Kokkos::LayoutRight, \ - Kokkos::LayoutRight, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBLAS3_GEMM_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS3_GEMM_ETI_SPEC_DECL_LAYOUTS(SCALAR, Kokkos::LayoutLeft, Kokkos::LayoutLeft, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + 
KOKKOSBLAS3_GEMM_ETI_SPEC_DECL_LAYOUTS(SCALAR, Kokkos::LayoutLeft, Kokkos::LayoutRight, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBLAS3_GEMM_ETI_SPEC_DECL_LAYOUTS(SCALAR, Kokkos::LayoutRight, Kokkos::LayoutLeft, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBLAS3_GEMM_ETI_SPEC_DECL_LAYOUTS(SCALAR, Kokkos::LayoutRight, Kokkos::LayoutRight, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) -#define KOKKOSBLAS3_GEMM_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_GEMM_ETI_SPEC_INST_LAYOUTS(SCALAR, Kokkos::LayoutLeft, \ - Kokkos::LayoutLeft, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_GEMM_ETI_SPEC_INST_LAYOUTS(SCALAR, Kokkos::LayoutLeft, \ - Kokkos::LayoutRight, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_GEMM_ETI_SPEC_INST_LAYOUTS(SCALAR, Kokkos::LayoutRight, \ - Kokkos::LayoutLeft, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_GEMM_ETI_SPEC_INST_LAYOUTS(SCALAR, Kokkos::LayoutRight, \ - Kokkos::LayoutRight, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBLAS3_GEMM_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS3_GEMM_ETI_SPEC_INST_LAYOUTS(SCALAR, Kokkos::LayoutLeft, Kokkos::LayoutLeft, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBLAS3_GEMM_ETI_SPEC_INST_LAYOUTS(SCALAR, Kokkos::LayoutLeft, Kokkos::LayoutRight, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBLAS3_GEMM_ETI_SPEC_INST_LAYOUTS(SCALAR, Kokkos::LayoutRight, Kokkos::LayoutLeft, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBLAS3_GEMM_ETI_SPEC_INST_LAYOUTS(SCALAR, Kokkos::LayoutRight, Kokkos::LayoutRight, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) #include diff --git a/blas/impl/KokkosBlas3_trmm_impl.hpp b/blas/impl/KokkosBlas3_trmm_impl.hpp index a183675889..8a1e9a7a4a 100644 --- a/blas/impl/KokkosBlas3_trmm_impl.hpp +++ b/blas/impl/KokkosBlas3_trmm_impl.hpp @@ -32,10 +32,8 @@ namespace KokkosBlas { namespace Impl { template -void SerialTrmm_Invoke(const char side[], const char uplo[], const char trans[], - const char /*diag*/[], - typename BViewType::const_value_type& 
alpha, - const AViewType& A, const BViewType& B) { +void SerialTrmm_Invoke(const char side[], const char uplo[], const char trans[], const char /*diag*/[], + typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) { using KokkosBatched::Algo; using KokkosBatched::Diag; using KokkosBatched::SerialTrmmInternalLeftLower; @@ -43,8 +41,7 @@ void SerialTrmm_Invoke(const char side[], const char uplo[], const char trans[], using KokkosBatched::SerialTrmmInternalRightLower; using KokkosBatched::SerialTrmmInternalRightUpper; - char __side = tolower(side[0]), __uplo = tolower(uplo[0]), - __trans = tolower(trans[0]); + char __side = tolower(side[0]), __uplo = tolower(uplo[0]), __trans = tolower(trans[0]); //__diag = tolower(diag[0]); bool do_conj = true; @@ -53,79 +50,67 @@ void SerialTrmm_Invoke(const char side[], const char uplo[], const char trans[], //// Lower non-transpose //// if (__side == 'l' && __uplo == 'l' && __trans == 'n') SerialTrmmInternalLeftLower::invoke( - Diag::Unit::use_unit_diag, !do_conj, A.extent(0), A.extent(1), - B.extent(0), B.extent(1), alpha, A.data(), A.stride(0), A.stride(1), - B.data(), B.stride(0), B.stride(1)); + Diag::Unit::use_unit_diag, !do_conj, A.extent(0), A.extent(1), B.extent(0), B.extent(1), alpha, A.data(), + A.stride(0), A.stride(1), B.data(), B.stride(0), B.stride(1)); if (__side == 'r' && __uplo == 'l' && __trans == 'n') SerialTrmmInternalRightLower::invoke( - Diag::Unit::use_unit_diag, !do_conj, A.extent(0), A.extent(1), - B.extent(0), B.extent(1), alpha, A.data(), A.stride(0), A.stride(1), - B.data(), B.stride(0), B.stride(1)); + Diag::Unit::use_unit_diag, !do_conj, A.extent(0), A.extent(1), B.extent(0), B.extent(1), alpha, A.data(), + A.stride(0), A.stride(1), B.data(), B.stride(0), B.stride(1)); //// Lower transpose ///// // Transpose A by simply swapping the dimensions (extent) and stride // parameters if (__side == 'l' && __uplo == 'l' && __trans == 't') SerialTrmmInternalLeftUpper::invoke( - 
Diag::Unit::use_unit_diag, !do_conj, A.extent(1), A.extent(0), - B.extent(0), B.extent(1), alpha, A.data(), A.stride(1), A.stride(0), - B.data(), B.stride(0), B.stride(1)); + Diag::Unit::use_unit_diag, !do_conj, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), + A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); if (__side == 'r' && __uplo == 'l' && __trans == 't') SerialTrmmInternalRightUpper::invoke( - Diag::Unit::use_unit_diag, !do_conj, A.extent(1), A.extent(0), - B.extent(0), B.extent(1), alpha, A.data(), A.stride(1), A.stride(0), - B.data(), B.stride(0), B.stride(1)); + Diag::Unit::use_unit_diag, !do_conj, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), + A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); //// Lower conjugate-transpose //// // Conjugate-Transpose A by simply swapping the dimensions (extent) and stride // parameters if (__side == 'l' && __uplo == 'l' && __trans == 'c') SerialTrmmInternalLeftUpper::invoke( - Diag::Unit::use_unit_diag, do_conj, A.extent(1), A.extent(0), - B.extent(0), B.extent(1), alpha, A.data(), A.stride(1), A.stride(0), - B.data(), B.stride(0), B.stride(1)); + Diag::Unit::use_unit_diag, do_conj, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), + A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); if (__side == 'r' && __uplo == 'l' && __trans == 'c') SerialTrmmInternalRightUpper::invoke( - Diag::Unit::use_unit_diag, do_conj, A.extent(1), A.extent(0), - B.extent(0), B.extent(1), alpha, A.data(), A.stride(1), A.stride(0), - B.data(), B.stride(0), B.stride(1)); + Diag::Unit::use_unit_diag, do_conj, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), + A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); //// Upper non-transpose //// if (__side == 'l' && __uplo == 'u' && __trans == 'n') SerialTrmmInternalLeftUpper::invoke( - Diag::Unit::use_unit_diag, !do_conj, A.extent(0), A.extent(1), - B.extent(0), 
B.extent(1), alpha, A.data(), A.stride(0), A.stride(1), - B.data(), B.stride(0), B.stride(1)); + Diag::Unit::use_unit_diag, !do_conj, A.extent(0), A.extent(1), B.extent(0), B.extent(1), alpha, A.data(), + A.stride(0), A.stride(1), B.data(), B.stride(0), B.stride(1)); if (__side == 'r' && __uplo == 'u' && __trans == 'n') SerialTrmmInternalRightUpper::invoke( - Diag::Unit::use_unit_diag, !do_conj, A.extent(0), A.extent(1), - B.extent(0), B.extent(1), alpha, A.data(), A.stride(0), A.stride(1), - B.data(), B.stride(0), B.stride(1)); + Diag::Unit::use_unit_diag, !do_conj, A.extent(0), A.extent(1), B.extent(0), B.extent(1), alpha, A.data(), + A.stride(0), A.stride(1), B.data(), B.stride(0), B.stride(1)); //// Upper transpose // Transpose A by simply swapping the dimensions (extent) and stride // parameters if (__side == 'l' && __uplo == 'u' && __trans == 't') SerialTrmmInternalLeftLower::invoke( - Diag::Unit::use_unit_diag, !do_conj, A.extent(1), A.extent(0), - B.extent(0), B.extent(1), alpha, A.data(), A.stride(1), A.stride(0), - B.data(), B.stride(0), B.stride(1)); + Diag::Unit::use_unit_diag, !do_conj, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), + A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); if (__side == 'r' && __uplo == 'u' && __trans == 't') SerialTrmmInternalRightLower::invoke( - Diag::Unit::use_unit_diag, !do_conj, A.extent(1), A.extent(0), - B.extent(0), B.extent(1), alpha, A.data(), A.stride(1), A.stride(0), - B.data(), B.stride(0), B.stride(1)); + Diag::Unit::use_unit_diag, !do_conj, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), + A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); //// Upper conjugate-transpose //// // Conjugate-Transpose A by simply swapping the dimensions (extent) and stride // parameters if (__side == 'l' && __uplo == 'u' && __trans == 'c') SerialTrmmInternalLeftLower::invoke( - Diag::Unit::use_unit_diag, do_conj, A.extent(1), A.extent(0), - B.extent(0), 
B.extent(1), alpha, A.data(), A.stride(1), A.stride(0), - B.data(), B.stride(0), B.stride(1)); + Diag::Unit::use_unit_diag, do_conj, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), + A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); if (__side == 'r' && __uplo == 'u' && __trans == 'c') SerialTrmmInternalRightLower::invoke( - Diag::Unit::use_unit_diag, do_conj, A.extent(1), A.extent(0), - B.extent(0), B.extent(1), alpha, A.data(), A.stride(1), A.stride(0), - B.data(), B.stride(0), B.stride(1)); + Diag::Unit::use_unit_diag, do_conj, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), + A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); } } // namespace Impl } // namespace KokkosBlas diff --git a/blas/impl/KokkosBlas3_trmm_spec.hpp b/blas/impl/KokkosBlas3_trmm_spec.hpp index 85a8b1c6db..6399f9e57e 100644 --- a/blas/impl/KokkosBlas3_trmm_spec.hpp +++ b/blas/impl/KokkosBlas3_trmm_spec.hpp @@ -36,25 +36,21 @@ struct trmm_eti_spec_avail { // // This Macro is for readability of the template arguments. 
// -#define KOKKOSBLAS3_TRMM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, LAYOUTA, LAYOUTB, \ - EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct trmm_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS3_TRMM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, LAYOUTA, LAYOUTB, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct trmm_eti_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; // // This Macros provides the ETI specialization of trmm // #define KOKKOSBLAS3_TRMM_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_TRMM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, LAYOUT, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) + KOKKOSBLAS3_TRMM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, LAYOUT, LAYOUT, EXEC_SPACE, MEM_SPACE) // Include the actual specialization declarations #include @@ -69,33 +65,25 @@ namespace Impl { // Unification layer template ::value, - bool eti_spec_avail = - trmm_eti_spec_avail::value> + bool tpl_spec_avail = trmm_tpl_spec_avail::value, + bool eti_spec_avail = trmm_eti_spec_avail::value> struct TRMM { - static void trmm(const execution_space& space, const char side[], - const char uplo[], const char trans[], const char diag[], - typename BVIT::const_value_type& alpha, const AVIT& A, - const BVIT& B); + static void trmm(const execution_space& space, const char side[], const char uplo[], const char trans[], + const char diag[], typename BVIT::const_value_type& alpha, const AVIT& A, const BVIT& B); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY template -struct TRMM { - static void trmm(const execution_space& /*space*/, const char side[], - const char uplo[], const char trans[], const char diag[], - typename BVIT::const_value_type& alpha, const AVIT& A, - const BVIT& B) { +struct TRMM { + static void trmm(const execution_space& /*space*/, const char side[], const char 
uplo[], const char trans[], + const char diag[], typename BVIT::const_value_type& alpha, const AVIT& A, const BVIT& B) { static_assert(Kokkos::is_view::value, "AVIT must be a Kokkos::View."); static_assert(Kokkos::is_view::value, "BVIT must be a Kokkos::View."); static_assert(static_cast(AVIT::rank) == 2, "AVIT must have rank 2."); static_assert(static_cast(BVIT::rank) == 2, "BVIT must have rank 2."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::trmm[ETI]" - : "KokkosBlas::trmm[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::trmm[ETI]" + : "KokkosBlas::trmm[noETI]"); typename AVIT::HostMirror host_A = Kokkos::create_mirror_view(A); typename BVIT::HostMirror host_B = Kokkos::create_mirror_view(B); @@ -105,8 +93,8 @@ struct TRMM( - side, uplo, trans, diag, alpha, host_A, host_B); + SerialTrmm_Invoke(side, uplo, trans, diag, alpha, host_A, + host_B); // Copy host_B to B // no-op if B's MemorySpace is HostSpace @@ -124,27 +112,21 @@ struct TRMM, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - false, true>; - -#define KOKKOSBLAS3_TRMM_ETI_SPEC_INST_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, \ - EXEC_SPACE, MEM_SPACE) \ - template struct TRMM< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - false, true>; +#define KOKKOSBLAS3_TRMM_ETI_SPEC_DECL_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, EXEC_SPACE, MEM_SPACE) \ + extern template struct TRMM, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; + +#define KOKKOSBLAS3_TRMM_ETI_SPEC_INST_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, EXEC_SPACE, MEM_SPACE) \ + template struct TRMM, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; // // These Macros are only included when we are not compiling libkokkoskernels but @@ -154,12 +136,10 @@ struct TRMM diff --git a/blas/impl/KokkosBlas3_trsm_impl.hpp 
b/blas/impl/KokkosBlas3_trsm_impl.hpp index 87cac8b86a..57c1342eb5 100644 --- a/blas/impl/KokkosBlas3_trsm_impl.hpp +++ b/blas/impl/KokkosBlas3_trsm_impl.hpp @@ -34,12 +34,9 @@ namespace KokkosBlas { namespace Impl { template -int SerialTrsmInternalLeftLowerConj(const bool use_unit_diag, const int m, - const int n, const ScalarType alpha, - const ValueType* KOKKOS_RESTRICT A, - const int as0, const int as1, - /**/ ValueType* KOKKOS_RESTRICT B, - const int bs0, const int bs1) { +int SerialTrsmInternalLeftLowerConj(const bool use_unit_diag, const int m, const int n, const ScalarType alpha, + const ValueType* KOKKOS_RESTRICT A, const int as0, const int as1, + /**/ ValueType* KOKKOS_RESTRICT B, const int bs0, const int bs1) { typedef Kokkos::ArithTraits AT; const ScalarType one(1.0), zero(0.0); @@ -47,8 +44,7 @@ int SerialTrsmInternalLeftLowerConj(const bool use_unit_diag, const int m, if (alpha == zero) SerialSetInternal::invoke(m, n, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); + if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; for (int p = 0; p < m; ++p) { @@ -56,8 +52,7 @@ int SerialTrsmInternalLeftLowerConj(const bool use_unit_diag, const int m, const ValueType* KOKKOS_RESTRICT a21 = A + (p + 1) * as0 + p * as1; - ValueType *KOKKOS_RESTRICT b1t = B + p * bs0, - *KOKKOS_RESTRICT B2 = B + (p + 1) * bs0; + ValueType *KOKKOS_RESTRICT b1t = B + p * bs0, *KOKKOS_RESTRICT B2 = B + (p + 1) * bs0; if (!use_unit_diag) { const ValueType alpha11 = AT::conj(A[p * as0 + p * as1]); @@ -65,20 +60,16 @@ int SerialTrsmInternalLeftLowerConj(const bool use_unit_diag, const int m, } for (int i = 0; i < iend; ++i) - for (int j = 0; j < jend; ++j) - B2[i * bs0 + j * bs1] -= AT::conj(a21[i * as0]) * b1t[j * bs1]; + for (int j = 0; j < jend; ++j) B2[i * bs0 + j * bs1] -= AT::conj(a21[i * as0]) * b1t[j * bs1]; } } return 0; } template -int 
SerialTrsmInternalLeftUpperConj(const bool use_unit_diag, const int m, - const int n, const ScalarType alpha, - const ValueType* KOKKOS_RESTRICT A, - const int as0, const int as1, - /**/ ValueType* KOKKOS_RESTRICT B, - const int bs0, const int bs1) { +int SerialTrsmInternalLeftUpperConj(const bool use_unit_diag, const int m, const int n, const ScalarType alpha, + const ValueType* KOKKOS_RESTRICT A, const int as0, const int as1, + /**/ ValueType* KOKKOS_RESTRICT B, const int bs0, const int bs1) { typedef Kokkos::ArithTraits AT; const ScalarType one(1.0), zero(0.0); @@ -86,8 +77,7 @@ int SerialTrsmInternalLeftUpperConj(const bool use_unit_diag, const int m, if (alpha == zero) SerialSetInternal::invoke(m, n, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); + if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; ValueType* KOKKOS_RESTRICT B0 = B; @@ -105,8 +95,7 @@ int SerialTrsmInternalLeftUpperConj(const bool use_unit_diag, const int m, if (p > 0) { // Note: A workaround to produce correct results for // complex with Intel-18.2.199 for (int i = 0; i < iend; ++i) - for (int j = 0; j < jend; ++j) - B0[i * bs0 + j * bs1] -= AT::conj(a01[i * as0]) * b1t[j * bs1]; + for (int j = 0; j < jend; ++j) B0[i * bs0 + j * bs1] -= AT::conj(a01[i * as0]) * b1t[j * bs1]; } } } @@ -114,204 +103,146 @@ int SerialTrsmInternalLeftUpperConj(const bool use_unit_diag, const int m, } template -void SerialTrsm_Invoke(const char side[], const char uplo[], const char trans[], - const char diag[], - typename BViewType::const_value_type& alpha, - const AViewType& A, const BViewType& B) { +void SerialTrsm_Invoke(const char side[], const char uplo[], const char trans[], const char diag[], + typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) { using KokkosBatched::Algo; using KokkosBatched::Diag; // Side::Left, Uplo::Lower, 
Trans::NoTranspose - if (((side[0] == 'L') || (side[0] == 'l')) && - ((uplo[0] == 'L') || (uplo[0] == 'l')) && - ((trans[0] == 'N') || (trans[0] == 'n')) && - ((diag[0] == 'U') || (diag[0] == 'u'))) + if (((side[0] == 'L') || (side[0] == 'l')) && ((uplo[0] == 'L') || (uplo[0] == 'l')) && + ((trans[0] == 'N') || (trans[0] == 'n')) && ((diag[0] == 'U') || (diag[0] == 'u'))) KokkosBatched::SerialTrsmInternalLeftLower::invoke( - Diag::Unit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride(0), A.stride(1), B.data(), B.stride(0), B.stride(1)); - if (((side[0] == 'L') || (side[0] == 'l')) && - ((uplo[0] == 'L') || (uplo[0] == 'l')) && - ((trans[0] == 'N') || (trans[0] == 'n')) && - ((diag[0] == 'N') || (diag[0] == 'n'))) + Diag::Unit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), A.stride(0), A.stride(1), B.data(), + B.stride(0), B.stride(1)); + if (((side[0] == 'L') || (side[0] == 'l')) && ((uplo[0] == 'L') || (uplo[0] == 'l')) && + ((trans[0] == 'N') || (trans[0] == 'n')) && ((diag[0] == 'N') || (diag[0] == 'n'))) KokkosBatched::SerialTrsmInternalLeftLower::invoke( - Diag::NonUnit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride(0), A.stride(1), B.data(), B.stride(0), B.stride(1)); + Diag::NonUnit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), A.stride(0), A.stride(1), B.data(), + B.stride(0), B.stride(1)); // Side::Left, Uplo::Lower, Trans::Transpose - if (((side[0] == 'L') || (side[0] == 'l')) && - ((uplo[0] == 'L') || (uplo[0] == 'l')) && - ((trans[0] == 'T') || (trans[0] == 't')) && - ((diag[0] == 'U') || (diag[0] == 'u'))) + if (((side[0] == 'L') || (side[0] == 'l')) && ((uplo[0] == 'L') || (uplo[0] == 'l')) && + ((trans[0] == 'T') || (trans[0] == 't')) && ((diag[0] == 'U') || (diag[0] == 'u'))) KokkosBatched::SerialTrsmInternalLeftUpper::invoke( - Diag::Unit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); - if (((side[0] == 
'L') || (side[0] == 'l')) && - ((uplo[0] == 'L') || (uplo[0] == 'l')) && - ((trans[0] == 'T') || (trans[0] == 't')) && - ((diag[0] == 'N') || (diag[0] == 'n'))) + Diag::Unit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), A.stride(1), A.stride(0), B.data(), + B.stride(0), B.stride(1)); + if (((side[0] == 'L') || (side[0] == 'l')) && ((uplo[0] == 'L') || (uplo[0] == 'l')) && + ((trans[0] == 'T') || (trans[0] == 't')) && ((diag[0] == 'N') || (diag[0] == 'n'))) KokkosBatched::SerialTrsmInternalLeftUpper::invoke( - Diag::NonUnit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); + Diag::NonUnit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), A.stride(1), A.stride(0), B.data(), + B.stride(0), B.stride(1)); // Side::Left, Uplo::Lower, Trans::ConjTranspose - if (((side[0] == 'L') || (side[0] == 'l')) && - ((uplo[0] == 'L') || (uplo[0] == 'l')) && - ((trans[0] == 'C') || (trans[0] == 'c')) && - ((diag[0] == 'U') || (diag[0] == 'u'))) - SerialTrsmInternalLeftUpperConj( - Diag::Unit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); - if (((side[0] == 'L') || (side[0] == 'l')) && - ((uplo[0] == 'L') || (uplo[0] == 'l')) && - ((trans[0] == 'C') || (trans[0] == 'c')) && - ((diag[0] == 'N') || (diag[0] == 'n'))) - SerialTrsmInternalLeftUpperConj( - Diag::NonUnit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); + if (((side[0] == 'L') || (side[0] == 'l')) && ((uplo[0] == 'L') || (uplo[0] == 'l')) && + ((trans[0] == 'C') || (trans[0] == 'c')) && ((diag[0] == 'U') || (diag[0] == 'u'))) + SerialTrsmInternalLeftUpperConj(Diag::Unit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), A.stride(1), + A.stride(0), B.data(), B.stride(0), B.stride(1)); + if (((side[0] == 'L') || (side[0] == 'l')) && ((uplo[0] == 'L') || (uplo[0] == 'l')) && + 
((trans[0] == 'C') || (trans[0] == 'c')) && ((diag[0] == 'N') || (diag[0] == 'n'))) + SerialTrsmInternalLeftUpperConj(Diag::NonUnit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), + A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); // Side::Left, Uplo::Upper, Trans::NoTranspose - if (((side[0] == 'L') || (side[0] == 'l')) && - ((uplo[0] == 'U') || (uplo[0] == 'u')) && - ((trans[0] == 'N') || (trans[0] == 'n')) && - ((diag[0] == 'U') || (diag[0] == 'u'))) + if (((side[0] == 'L') || (side[0] == 'l')) && ((uplo[0] == 'U') || (uplo[0] == 'u')) && + ((trans[0] == 'N') || (trans[0] == 'n')) && ((diag[0] == 'U') || (diag[0] == 'u'))) KokkosBatched::SerialTrsmInternalLeftUpper::invoke( - Diag::Unit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride(0), A.stride(1), B.data(), B.stride(0), B.stride(1)); - if (((side[0] == 'L') || (side[0] == 'l')) && - ((uplo[0] == 'U') || (uplo[0] == 'u')) && - ((trans[0] == 'N') || (trans[0] == 'n')) && - ((diag[0] == 'N') || (diag[0] == 'n'))) + Diag::Unit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), A.stride(0), A.stride(1), B.data(), + B.stride(0), B.stride(1)); + if (((side[0] == 'L') || (side[0] == 'l')) && ((uplo[0] == 'U') || (uplo[0] == 'u')) && + ((trans[0] == 'N') || (trans[0] == 'n')) && ((diag[0] == 'N') || (diag[0] == 'n'))) KokkosBatched::SerialTrsmInternalLeftUpper::invoke( - Diag::NonUnit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride(0), A.stride(1), B.data(), B.stride(0), B.stride(1)); + Diag::NonUnit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), A.stride(0), A.stride(1), B.data(), + B.stride(0), B.stride(1)); // Side::Left, Uplo::Upper, Trans::Transpose - if (((side[0] == 'L') || (side[0] == 'l')) && - ((uplo[0] == 'U') || (uplo[0] == 'u')) && - ((trans[0] == 'T') || (trans[0] == 't')) && - ((diag[0] == 'U') || (diag[0] == 'u'))) + if (((side[0] == 'L') || (side[0] == 'l')) && ((uplo[0] == 'U') || (uplo[0] == 'u')) && + 
((trans[0] == 'T') || (trans[0] == 't')) && ((diag[0] == 'U') || (diag[0] == 'u'))) KokkosBatched::SerialTrsmInternalLeftLower::invoke( - Diag::Unit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); - if (((side[0] == 'L') || (side[0] == 'l')) && - ((uplo[0] == 'U') || (uplo[0] == 'u')) && - ((trans[0] == 'T') || (trans[0] == 't')) && - ((diag[0] == 'N') || (diag[0] == 'n'))) + Diag::Unit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), A.stride(1), A.stride(0), B.data(), + B.stride(0), B.stride(1)); + if (((side[0] == 'L') || (side[0] == 'l')) && ((uplo[0] == 'U') || (uplo[0] == 'u')) && + ((trans[0] == 'T') || (trans[0] == 't')) && ((diag[0] == 'N') || (diag[0] == 'n'))) KokkosBatched::SerialTrsmInternalLeftLower::invoke( - Diag::NonUnit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); + Diag::NonUnit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), A.stride(1), A.stride(0), B.data(), + B.stride(0), B.stride(1)); // Side::Left, Uplo::Upper, Trans::ConjTranspose - if (((side[0] == 'L') || (side[0] == 'l')) && - ((uplo[0] == 'U') || (uplo[0] == 'u')) && - ((trans[0] == 'C') || (trans[0] == 'c')) && - ((diag[0] == 'U') || (diag[0] == 'u'))) - SerialTrsmInternalLeftLowerConj( - Diag::Unit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); - if (((side[0] == 'L') || (side[0] == 'l')) && - ((uplo[0] == 'U') || (uplo[0] == 'u')) && - ((trans[0] == 'C') || (trans[0] == 'c')) && - ((diag[0] == 'N') || (diag[0] == 'n'))) - SerialTrsmInternalLeftLowerConj( - Diag::NonUnit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); + if (((side[0] == 'L') || (side[0] == 'l')) && ((uplo[0] == 'U') || (uplo[0] == 'u')) && + ((trans[0] == 'C') || (trans[0] == 'c')) && ((diag[0] 
== 'U') || (diag[0] == 'u'))) + SerialTrsmInternalLeftLowerConj(Diag::Unit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), A.stride(1), + A.stride(0), B.data(), B.stride(0), B.stride(1)); + if (((side[0] == 'L') || (side[0] == 'l')) && ((uplo[0] == 'U') || (uplo[0] == 'u')) && + ((trans[0] == 'C') || (trans[0] == 'c')) && ((diag[0] == 'N') || (diag[0] == 'n'))) + SerialTrsmInternalLeftLowerConj(Diag::NonUnit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), + A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); //// // Side::Right, Uplo::Lower, Trans::NoTranspose - if (((side[0] == 'R') || (side[0] == 'r')) && - ((uplo[0] == 'L') || (uplo[0] == 'l')) && - ((trans[0] == 'N') || (trans[0] == 'n')) && - ((diag[0] == 'U') || (diag[0] == 'u'))) + if (((side[0] == 'R') || (side[0] == 'r')) && ((uplo[0] == 'L') || (uplo[0] == 'l')) && + ((trans[0] == 'N') || (trans[0] == 'n')) && ((diag[0] == 'U') || (diag[0] == 'u'))) KokkosBatched::SerialTrsmInternalLeftUpper::invoke( - Diag::Unit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride(1), A.stride(0), B.data(), B.stride(1), B.stride(0)); - if (((side[0] == 'R') || (side[0] == 'r')) && - ((uplo[0] == 'L') || (uplo[0] == 'l')) && - ((trans[0] == 'N') || (trans[0] == 'n')) && - ((diag[0] == 'N') || (diag[0] == 'n'))) + Diag::Unit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), A.stride(1), A.stride(0), B.data(), + B.stride(1), B.stride(0)); + if (((side[0] == 'R') || (side[0] == 'r')) && ((uplo[0] == 'L') || (uplo[0] == 'l')) && + ((trans[0] == 'N') || (trans[0] == 'n')) && ((diag[0] == 'N') || (diag[0] == 'n'))) KokkosBatched::SerialTrsmInternalLeftUpper::invoke( - Diag::NonUnit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride(1), A.stride(0), B.data(), B.stride(1), B.stride(0)); + Diag::NonUnit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), A.stride(1), A.stride(0), B.data(), + B.stride(1), B.stride(0)); // Side::Right, Uplo::Lower, 
Trans::Transpose - if (((side[0] == 'R') || (side[0] == 'r')) && - ((uplo[0] == 'L') || (uplo[0] == 'l')) && - ((trans[0] == 'T') || (trans[0] == 't')) && - ((diag[0] == 'U') || (diag[0] == 'u'))) + if (((side[0] == 'R') || (side[0] == 'r')) && ((uplo[0] == 'L') || (uplo[0] == 'l')) && + ((trans[0] == 'T') || (trans[0] == 't')) && ((diag[0] == 'U') || (diag[0] == 'u'))) KokkosBatched::SerialTrsmInternalLeftLower::invoke( - Diag::Unit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride(0), A.stride(1), B.data(), B.stride(1), B.stride(0)); - if (((side[0] == 'R') || (side[0] == 'r')) && - ((uplo[0] == 'L') || (uplo[0] == 'l')) && - ((trans[0] == 'T') || (trans[0] == 't')) && - ((diag[0] == 'N') || (diag[0] == 'n'))) + Diag::Unit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), A.stride(0), A.stride(1), B.data(), + B.stride(1), B.stride(0)); + if (((side[0] == 'R') || (side[0] == 'r')) && ((uplo[0] == 'L') || (uplo[0] == 'l')) && + ((trans[0] == 'T') || (trans[0] == 't')) && ((diag[0] == 'N') || (diag[0] == 'n'))) KokkosBatched::SerialTrsmInternalLeftLower::invoke( - Diag::NonUnit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride(0), A.stride(1), B.data(), B.stride(1), B.stride(0)); + Diag::NonUnit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), A.stride(0), A.stride(1), B.data(), + B.stride(1), B.stride(0)); // Side::Right, Uplo::Lower, Trans::ConjTranspose - if (((side[0] == 'R') || (side[0] == 'r')) && - ((uplo[0] == 'L') || (uplo[0] == 'l')) && - ((trans[0] == 'C') || (trans[0] == 'c')) && - ((diag[0] == 'U') || (diag[0] == 'u'))) - SerialTrsmInternalLeftLowerConj( - Diag::Unit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride(0), A.stride(1), B.data(), B.stride(1), B.stride(0)); - if (((side[0] == 'R') || (side[0] == 'r')) && - ((uplo[0] == 'L') || (uplo[0] == 'l')) && - ((trans[0] == 'C') || (trans[0] == 'c')) && - ((diag[0] == 'N') || (diag[0] == 'n'))) - 
SerialTrsmInternalLeftLowerConj( - Diag::NonUnit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride(0), A.stride(1), B.data(), B.stride(1), B.stride(0)); + if (((side[0] == 'R') || (side[0] == 'r')) && ((uplo[0] == 'L') || (uplo[0] == 'l')) && + ((trans[0] == 'C') || (trans[0] == 'c')) && ((diag[0] == 'U') || (diag[0] == 'u'))) + SerialTrsmInternalLeftLowerConj(Diag::Unit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), A.stride(0), + A.stride(1), B.data(), B.stride(1), B.stride(0)); + if (((side[0] == 'R') || (side[0] == 'r')) && ((uplo[0] == 'L') || (uplo[0] == 'l')) && + ((trans[0] == 'C') || (trans[0] == 'c')) && ((diag[0] == 'N') || (diag[0] == 'n'))) + SerialTrsmInternalLeftLowerConj(Diag::NonUnit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), + A.stride(0), A.stride(1), B.data(), B.stride(1), B.stride(0)); // Side::Right, Uplo::Upper, Trans::NoTranspose - if (((side[0] == 'R') || (side[0] == 'r')) && - ((uplo[0] == 'U') || (uplo[0] == 'u')) && - ((trans[0] == 'N') || (trans[0] == 'n')) && - ((diag[0] == 'U') || (diag[0] == 'u'))) + if (((side[0] == 'R') || (side[0] == 'r')) && ((uplo[0] == 'U') || (uplo[0] == 'u')) && + ((trans[0] == 'N') || (trans[0] == 'n')) && ((diag[0] == 'U') || (diag[0] == 'u'))) KokkosBatched::SerialTrsmInternalLeftLower::invoke( - Diag::Unit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride(1), A.stride(0), B.data(), B.stride(1), B.stride(0)); - if (((side[0] == 'R') || (side[0] == 'r')) && - ((uplo[0] == 'U') || (uplo[0] == 'u')) && - ((trans[0] == 'N') || (trans[0] == 'n')) && - ((diag[0] == 'N') || (diag[0] == 'n'))) + Diag::Unit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), A.stride(1), A.stride(0), B.data(), + B.stride(1), B.stride(0)); + if (((side[0] == 'R') || (side[0] == 'r')) && ((uplo[0] == 'U') || (uplo[0] == 'u')) && + ((trans[0] == 'N') || (trans[0] == 'n')) && ((diag[0] == 'N') || (diag[0] == 'n'))) 
KokkosBatched::SerialTrsmInternalLeftLower::invoke( - Diag::NonUnit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride(1), A.stride(0), B.data(), B.stride(1), B.stride(0)); + Diag::NonUnit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), A.stride(1), A.stride(0), B.data(), + B.stride(1), B.stride(0)); // Side::Right, Uplo::Upper, Trans::Transpose - if (((side[0] == 'R') || (side[0] == 'r')) && - ((uplo[0] == 'U') || (uplo[0] == 'u')) && - ((trans[0] == 'T') || (trans[0] == 't')) && - ((diag[0] == 'U') || (diag[0] == 'u'))) + if (((side[0] == 'R') || (side[0] == 'r')) && ((uplo[0] == 'U') || (uplo[0] == 'u')) && + ((trans[0] == 'T') || (trans[0] == 't')) && ((diag[0] == 'U') || (diag[0] == 'u'))) KokkosBatched::SerialTrsmInternalLeftUpper::invoke( - Diag::Unit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride(0), A.stride(1), B.data(), B.stride(1), B.stride(0)); - if (((side[0] == 'R') || (side[0] == 'r')) && - ((uplo[0] == 'U') || (uplo[0] == 'u')) && - ((trans[0] == 'T') || (trans[0] == 't')) && - ((diag[0] == 'N') || (diag[0] == 'n'))) + Diag::Unit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), A.stride(0), A.stride(1), B.data(), + B.stride(1), B.stride(0)); + if (((side[0] == 'R') || (side[0] == 'r')) && ((uplo[0] == 'U') || (uplo[0] == 'u')) && + ((trans[0] == 'T') || (trans[0] == 't')) && ((diag[0] == 'N') || (diag[0] == 'n'))) KokkosBatched::SerialTrsmInternalLeftUpper::invoke( - Diag::NonUnit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride(0), A.stride(1), B.data(), B.stride(1), B.stride(0)); + Diag::NonUnit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), A.stride(0), A.stride(1), B.data(), + B.stride(1), B.stride(0)); // Side::Right, Uplo::Upper, Trans::ConjTranspose - if (((side[0] == 'R') || (side[0] == 'r')) && - ((uplo[0] == 'U') || (uplo[0] == 'u')) && - ((trans[0] == 'C') || (trans[0] == 'c')) && - ((diag[0] == 'U') || (diag[0] == 'u'))) - 
SerialTrsmInternalLeftUpperConj( - Diag::Unit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride(0), A.stride(1), B.data(), B.stride(1), B.stride(0)); - if (((side[0] == 'R') || (side[0] == 'r')) && - ((uplo[0] == 'U') || (uplo[0] == 'u')) && - ((trans[0] == 'C') || (trans[0] == 'c')) && - ((diag[0] == 'N') || (diag[0] == 'n'))) - SerialTrsmInternalLeftUpperConj( - Diag::NonUnit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride(0), A.stride(1), B.data(), B.stride(1), B.stride(0)); + if (((side[0] == 'R') || (side[0] == 'r')) && ((uplo[0] == 'U') || (uplo[0] == 'u')) && + ((trans[0] == 'C') || (trans[0] == 'c')) && ((diag[0] == 'U') || (diag[0] == 'u'))) + SerialTrsmInternalLeftUpperConj(Diag::Unit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), A.stride(0), + A.stride(1), B.data(), B.stride(1), B.stride(0)); + if (((side[0] == 'R') || (side[0] == 'r')) && ((uplo[0] == 'U') || (uplo[0] == 'u')) && + ((trans[0] == 'C') || (trans[0] == 'c')) && ((diag[0] == 'N') || (diag[0] == 'n'))) + SerialTrsmInternalLeftUpperConj(Diag::NonUnit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), + A.stride(0), A.stride(1), B.data(), B.stride(1), B.stride(0)); } } // namespace Impl diff --git a/blas/impl/KokkosBlas3_trsm_spec.hpp b/blas/impl/KokkosBlas3_trsm_spec.hpp index 93d01ed53b..8c9088e970 100644 --- a/blas/impl/KokkosBlas3_trsm_spec.hpp +++ b/blas/impl/KokkosBlas3_trsm_spec.hpp @@ -42,22 +42,18 @@ struct trsm_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. 
// -#define KOKKOSBLAS3_TRSM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, LAYOUTA, LAYOUTB, \ - EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct trsm_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS3_TRSM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, LAYOUTA, LAYOUTB, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct trsm_eti_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; #define KOKKOSBLAS3_TRSM_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_TRSM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, LAYOUT, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) + KOKKOSBLAS3_TRSM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, LAYOUT, LAYOUT, EXEC_SPACE, MEM_SPACE) // Include the actual specialization declarations #include @@ -72,38 +68,28 @@ namespace Impl { // Unification layer template ::value, - bool eti_spec_avail = - trsm_eti_spec_avail::value> + bool tpl_spec_avail = trsm_tpl_spec_avail::value, + bool eti_spec_avail = trsm_eti_spec_avail::value> struct TRSM { - static void trsm(const execution_space& space, const char side[], - const char uplo[], const char trans[], const char diag[], - typename BViewType::const_value_type& alpha, - const AViewType& A, const BViewType& B); + static void trsm(const execution_space& space, const char side[], const char uplo[], const char trans[], + const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, + const BViewType& B); }; // Implementation of KokkosBlas::trsm. 
#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY template -struct TRSM { - static void trsm(const execution_space& /*space*/, const char side[], - const char uplo[], const char trans[], const char diag[], - typename BViewType::const_value_type& alpha, - const AViewType& A, const BViewType& B) { - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "BViewType must be a Kokkos::View."); - static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); - static_assert(static_cast(BViewType::rank) == 2, - "BViewType must have rank 2."); - - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::trsm[ETI]" - : "KokkosBlas::trsm[noETI]"); +struct TRSM { + static void trsm(const execution_space& /*space*/, const char side[], const char uplo[], const char trans[], + const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, + const BViewType& B) { + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "BViewType must be a Kokkos::View."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + static_assert(static_cast(BViewType::rank) == 2, "BViewType must have rank 2."); + + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? 
"KokkosBlas::trsm[ETI]" + : "KokkosBlas::trsm[noETI]"); typename AViewType::HostMirror h_A = Kokkos::create_mirror_view(A); typename BViewType::HostMirror h_B = Kokkos::create_mirror_view(B); @@ -111,9 +97,8 @@ struct TRSM(side, uplo, trans, diag, - alpha, h_A, h_B); + SerialTrsm_Invoke(side, uplo, trans, diag, alpha, + h_A, h_B); Kokkos::deep_copy(B, h_B); @@ -134,35 +119,27 @@ struct TRSM, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - false, true>; - -#define KOKKOSBLAS3_TRSM_ETI_SPEC_INST_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, \ - EXEC_SPACE, MEM_SPACE) \ - template struct TRSM< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - false, true>; +#define KOKKOSBLAS3_TRSM_ETI_SPEC_DECL_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, EXEC_SPACE, MEM_SPACE) \ + extern template struct TRSM, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; + +#define KOKKOSBLAS3_TRSM_ETI_SPEC_INST_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, EXEC_SPACE, MEM_SPACE) \ + template struct TRSM, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; #define KOKKOSBLAS3_TRSM_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_TRSM_ETI_SPEC_DECL_LAYOUTS(SCALAR, LAYOUT, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) + KOKKOSBLAS3_TRSM_ETI_SPEC_DECL_LAYOUTS(SCALAR, LAYOUT, LAYOUT, EXEC_SPACE, MEM_SPACE) #define KOKKOSBLAS3_TRSM_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_TRSM_ETI_SPEC_INST_LAYOUTS(SCALAR, LAYOUT, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) + KOKKOSBLAS3_TRSM_ETI_SPEC_INST_LAYOUTS(SCALAR, LAYOUT, LAYOUT, EXEC_SPACE, MEM_SPACE) #include diff --git a/blas/impl/KokkosBlas_serial_axpy.hpp b/blas/impl/KokkosBlas_serial_axpy.hpp index 344632b8eb..83bb2b9c98 100644 --- a/blas/impl/KokkosBlas_serial_axpy.hpp +++ b/blas/impl/KokkosBlas_serial_axpy.hpp @@ -26,9 +26,8 @@ namespace Impl { /// Serial Internal Impl /// 
==================== template -KOKKOS_INLINE_FUNCTION static void serial_axpy( - const int m, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT X, - /* */ ValueType *KOKKOS_RESTRICT Y, const int xs0, const int ys0) { +KOKKOS_INLINE_FUNCTION static void serial_axpy(const int m, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT X, + /* */ ValueType *KOKKOS_RESTRICT Y, const int xs0, const int ys0) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif @@ -38,17 +37,14 @@ KOKKOS_INLINE_FUNCTION static void serial_axpy( } template -KOKKOS_INLINE_FUNCTION static void serial_axpy_mv( - const int m, const int n, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT X, - /* */ ValueType *KOKKOS_RESTRICT Y, const int xs0, const int xs1, - const int ys0, const int ys1) { +KOKKOS_INLINE_FUNCTION static void serial_axpy_mv(const int m, const int n, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT X, + /* */ ValueType *KOKKOS_RESTRICT Y, const int xs0, const int xs1, + const int ys0, const int ys1) { if (xs0 > xs1) { - for (int i = 0; i < m; ++i) - serial_axpy(n, alpha, X + i * xs0, Y + i * ys0, xs1, ys1); + for (int i = 0; i < m; ++i) serial_axpy(n, alpha, X + i * xs0, Y + i * ys0, xs1, ys1); } else { - for (int j = 0; j < n; ++j) - serial_axpy(m, alpha, X + j * xs1, Y + j * ys1, xs0, ys0); + for (int j = 0; j < n; ++j) serial_axpy(m, alpha, X + j * xs1, Y + j * ys1, xs0, ys0); } return; diff --git a/blas/impl/KokkosBlas_serial_nrm2.hpp b/blas/impl/KokkosBlas_serial_nrm2.hpp index 1b40ea32a8..db17736c0f 100644 --- a/blas/impl/KokkosBlas_serial_nrm2.hpp +++ b/blas/impl/KokkosBlas_serial_nrm2.hpp @@ -27,10 +27,8 @@ namespace Impl { /// Serial Internal Impl /// ==================== template -KOKKOS_INLINE_FUNCTION static - typename Kokkos::Details::InnerProductSpaceTraits::mag_type - serial_nrm2(const int m, const ValueType *KOKKOS_RESTRICT X, - const int xs0) { +KOKKOS_INLINE_FUNCTION static typename 
Kokkos::Details::InnerProductSpaceTraits::mag_type serial_nrm2( + const int m, const ValueType *KOKKOS_RESTRICT X, const int xs0) { using IPT = Kokkos::Details::InnerProductSpaceTraits; using norm_type = typename IPT::mag_type; @@ -39,21 +37,16 @@ KOKKOS_INLINE_FUNCTION static #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif - for (int i = 0; i < m; ++i) - nrm += IPT::norm(IPT::dot(X[i * xs0], X[i * xs0])); + for (int i = 0; i < m; ++i) nrm += IPT::norm(IPT::dot(X[i * xs0], X[i * xs0])); return Kokkos::ArithTraits::sqrt(nrm); } template KOKKOS_INLINE_FUNCTION static void serial_nrm2( - const int m, const int n, const ValueType *KOKKOS_RESTRICT X, const int xs0, - const int xs1, - typename Kokkos::Details::InnerProductSpaceTraits::mag_type - *KOKKOS_RESTRICT R, - const int ys0) { - for (int vecIdx = 0; vecIdx < n; ++vecIdx) - R[vecIdx * ys0] = serial_nrm2(m, X + vecIdx * xs1, xs0); + const int m, const int n, const ValueType *KOKKOS_RESTRICT X, const int xs0, const int xs1, + typename Kokkos::Details::InnerProductSpaceTraits::mag_type *KOKKOS_RESTRICT R, const int ys0) { + for (int vecIdx = 0; vecIdx < n; ++vecIdx) R[vecIdx * ys0] = serial_nrm2(m, X + vecIdx * xs1, xs0); return; } diff --git a/blas/impl/KokkosBlas_util.hpp b/blas/impl/KokkosBlas_util.hpp index 1fc6b7d480..885625673f 100644 --- a/blas/impl/KokkosBlas_util.hpp +++ b/blas/impl/KokkosBlas_util.hpp @@ -135,12 +135,9 @@ namespace Impl { // Output params: // * teamsPerReduction: number of teams to use for each reduction template -void multipleReductionWorkDistribution(size_type length, - size_type numReductions, - size_type &teamsPerDot) { - constexpr size_type workPerTeam = 4096; // Amount of work per team - size_type appxNumTeams = - (length * numReductions) / workPerTeam; // Estimation for appxNumTeams +void multipleReductionWorkDistribution(size_type length, size_type numReductions, size_type &teamsPerDot) { + constexpr size_type workPerTeam = 4096; // Amount of work per team + size_type 
appxNumTeams = (length * numReductions) / workPerTeam; // Estimation for appxNumTeams // Adjust appxNumTeams in case it is too small or too large if (appxNumTeams < 1) appxNumTeams = 1; diff --git a/blas/src/KokkosBlas1_abs.hpp b/blas/src/KokkosBlas1_abs.hpp index bd63ccedf1..f3ea88bb03 100644 --- a/blas/src/KokkosBlas1_abs.hpp +++ b/blas/src/KokkosBlas1_abs.hpp @@ -46,19 +46,14 @@ void abs(const execution_space& space, const RMV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::abs: " "R is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::abs: RMV must be accessible from execution space"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::abs: RMV must be accessible from execution space"); static_assert(Kokkos::is_view::value, "KokkosBlas::abs: " "X is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::abs: XMV must be accessible from execution space"); - static_assert(std::is_same::value, + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::abs: XMV must be accessible from execution space"); + static_assert(std::is_same::value, "KokkosBlas::abs: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -73,30 +68,25 @@ void abs(const execution_space& space, const RMV& R, const XMV& X) { if (X.extent(0) != R.extent(0) || X.extent(1) != R.extent(1)) { std::ostringstream os; os << "KokkosBlas::abs (MV): Dimensions of R and X do not match: " - << "R: " << R.extent(0) << " x " << R.extent(1) << ", X: " << X.extent(0) - << " x " << X.extent(1); + << "R: " << R.extent(0) << " x " << R.extent(1) << ", X: " << X.extent(0) << " x " << X.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } // Create unmanaged versions of the input Views. RMV and XMV may be // rank 1 or rank 2. 
- using RMV_Internal = Kokkos::View< - typename std::conditional::type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename RMV::device_type, Kokkos::MemoryTraits >; - using XMV_Internal = Kokkos::View< - typename std::conditional::type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XMV::device_type, Kokkos::MemoryTraits >; + using RMV_Internal = Kokkos::View::type, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename RMV::device_type, Kokkos::MemoryTraits >; + using XMV_Internal = Kokkos::View::type, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename XMV::device_type, Kokkos::MemoryTraits >; RMV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Abs::abs(space, R_internal, - X_internal); + Impl::Abs::abs(space, R_internal, X_internal); } /// \brief R(i,j) = abs(X(i,j)) diff --git a/blas/src/KokkosBlas1_axpby.hpp b/blas/src/KokkosBlas1_axpby.hpp index 5cd03dd7c7..788995679c 100644 --- a/blas/src/KokkosBlas1_axpby.hpp +++ b/blas/src/KokkosBlas1_axpby.hpp @@ -60,10 +60,8 @@ namespace KokkosBlas { /// \param Y [in/out] View of type YMV in which the results will be /// stored. template -void axpby(const execution_space& exec_space, const AV& a, const XMV& X, - const BV& b, const YMV& Y) { - using AxpbyTraits = - Impl::AxpbyUnificationAttemptTraits; +void axpby(const execution_space& exec_space, const AV& a, const XMV& X, const BV& b, const YMV& Y) { + using AxpbyTraits = Impl::AxpbyUnificationAttemptTraits; using InternalTypeA = typename AxpbyTraits::InternalTypeA; using InternalTypeX = typename AxpbyTraits::InternalTypeX; using InternalTypeB = typename AxpbyTraits::InternalTypeB; @@ -95,37 +93,28 @@ void axpby(const execution_space& exec_space, const AV& a, const XMV& X, // and 'b' become scalars as well, eventually changing precision in // order to match the precisions of 'X' and 'Y'. 
// ******************************************************************** - if constexpr (AxpbyTraits::a_is_scalar && AxpbyTraits::b_is_scalar && - AxpbyTraits::onDevice) { + if constexpr (AxpbyTraits::a_is_scalar && AxpbyTraits::b_is_scalar && AxpbyTraits::onDevice) { // ****************************************************************** // We are in the exception situation for rule 2 // ****************************************************************** InternalTypeA internal_a(a); InternalTypeA internal_b(b); - Impl::Axpby::axpby(exec_space, internal_a, internal_X, - internal_b, internal_Y); + Impl::Axpby::axpby( + exec_space, internal_a, internal_X, internal_b, internal_Y); } else { // ****************************************************************** // We are in rule 1, that is, we are in a 'onHost' case now // ****************************************************************** - InternalTypeA internal_a(Impl::getScalarValueFromVariableAtHost< - AV, Impl::typeRank()>::getValue(a)); - InternalTypeB internal_b(Impl::getScalarValueFromVariableAtHost< - BV, Impl::typeRank()>::getValue(b)); + InternalTypeA internal_a(Impl::getScalarValueFromVariableAtHost()>::getValue(a)); + InternalTypeB internal_b(Impl::getScalarValueFromVariableAtHost()>::getValue(b)); - Impl::Axpby::axpby(exec_space, internal_a, internal_X, - internal_b, internal_Y); + Impl::Axpby::axpby( + exec_space, internal_a, internal_X, internal_b, internal_Y); } } else if constexpr (AxpbyTraits::internalTypesAB_bothViews) { - constexpr bool internalLayoutA_isStride( - std::is_same_v); - constexpr bool internalLayoutB_isStride( - std::is_same_v); + constexpr bool internalLayoutA_isStride(std::is_same_v); + constexpr bool internalLayoutB_isStride(std::is_same_v); const size_t numScalarsA(Impl::getAmountOfScalarsInCoefficient(a)); const size_t numScalarsB(Impl::getAmountOfScalarsInCoefficient(b)); @@ -143,8 +132,7 @@ void axpby(const execution_space& exec_space, const AV& a, const XMV& X, // 
****************************************************************** // Prepare internal_a // ****************************************************************** - typename AxpbyTraits::InternalTypeA_managed managed_a("managed_a", - layoutStrideA); + typename AxpbyTraits::InternalTypeA_managed managed_a("managed_a", layoutStrideA); if constexpr (AxpbyTraits::atInputLayoutA_isStride) { Kokkos::deep_copy(managed_a, a); } else { @@ -156,8 +144,7 @@ void axpby(const execution_space& exec_space, const AV& a, const XMV& X, // **************************************************************** // Prepare internal_b // **************************************************************** - typename AxpbyTraits::InternalTypeB_managed managed_b("managed_b", - layoutStrideB); + typename AxpbyTraits::InternalTypeB_managed managed_b("managed_b", layoutStrideB); if constexpr (AxpbyTraits::atInputLayoutB_isStride) { Kokkos::deep_copy(managed_b, b); } else { @@ -168,16 +155,13 @@ void axpby(const execution_space& exec_space, const AV& a, const XMV& X, // **************************************************************** // Call Impl::Axpby<...>::axpby(...) 
// **************************************************************** - Impl::Axpby::axpby(exec_space, internal_a, - internal_X, internal_b, - internal_Y); + Impl::Axpby::axpby( + exec_space, internal_a, internal_X, internal_b, internal_Y); } else { // **************************************************************** // Prepare internal_b // **************************************************************** - typename AxpbyTraits::InternalTypeB_managed managed_b("managed_b", - numScalarsB); + typename AxpbyTraits::InternalTypeB_managed managed_b("managed_b", numScalarsB); if constexpr (AxpbyTraits::atInputLayoutB_isStride) { Kokkos::deep_copy(managed_b, b); } else { @@ -188,17 +172,14 @@ void axpby(const execution_space& exec_space, const AV& a, const XMV& X, // **************************************************************** // Call Impl::Axpby<...>::axpby(...) // **************************************************************** - Impl::Axpby::axpby(exec_space, internal_a, - internal_X, internal_b, - internal_Y); + Impl::Axpby::axpby( + exec_space, internal_a, internal_X, internal_b, internal_Y); } } else { // ****************************************************************** // Prepare internal_a // ****************************************************************** - typename AxpbyTraits::InternalTypeA_managed managed_a("managed_a", - numScalarsA); + typename AxpbyTraits::InternalTypeA_managed managed_a("managed_a", numScalarsA); if constexpr (AxpbyTraits::atInputLayoutA_isStride) { Kokkos::deep_copy(managed_a, a); } else { @@ -210,8 +191,7 @@ void axpby(const execution_space& exec_space, const AV& a, const XMV& X, // **************************************************************** // Prepare internal_b // **************************************************************** - typename AxpbyTraits::InternalTypeB_managed managed_b("managed_b", - layoutStrideB); + typename AxpbyTraits::InternalTypeB_managed managed_b("managed_b", layoutStrideB); if constexpr 
(AxpbyTraits::atInputLayoutB_isStride) { Kokkos::deep_copy(managed_b, b); } else { @@ -222,16 +202,13 @@ void axpby(const execution_space& exec_space, const AV& a, const XMV& X, // **************************************************************** // Call Impl::Axpby<...>::axpby(...) // **************************************************************** - Impl::Axpby::axpby(exec_space, internal_a, - internal_X, internal_b, - internal_Y); + Impl::Axpby::axpby( + exec_space, internal_a, internal_X, internal_b, internal_Y); } else { // **************************************************************** // Prepare internal_b // **************************************************************** - typename AxpbyTraits::InternalTypeB_managed managed_b("managed_b", - numScalarsB); + typename AxpbyTraits::InternalTypeB_managed managed_b("managed_b", numScalarsB); if constexpr (AxpbyTraits::atInputLayoutB_isStride) { Kokkos::deep_copy(managed_b, b); } else { @@ -242,10 +219,8 @@ void axpby(const execution_space& exec_space, const AV& a, const XMV& X, // **************************************************************** // Call Impl::Axpby<...>::axpby(...) // **************************************************************** - Impl::Axpby::axpby(exec_space, internal_a, - internal_X, internal_b, - internal_Y); + Impl::Axpby::axpby( + exec_space, internal_a, internal_X, internal_b, internal_Y); } } } @@ -299,10 +274,8 @@ void axpby(const AV& a, const XMV& X, const BV& b, const YMV& Y) { /// \param Y [in/out] View of type YMV in which the results will be /// stored. 
template -void axpy(const execution_space& exec_space, const AV& a, const XMV& X, - const YMV& Y) { - axpby(exec_space, a, X, - Kokkos::ArithTraits::one(), Y); +void axpy(const execution_space& exec_space, const AV& a, const XMV& X, const YMV& Y) { + axpby(exec_space, a, X, Kokkos::ArithTraits::one(), Y); } /// \brief Computes Y := a*X + Y @@ -334,23 +307,17 @@ void axpy(const AV& a, const XMV& X, const YMV& Y) { template KOKKOS_FUNCTION void serial_axpy(const scalar_type alpha, const XMV X, YMV Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBlas::serial_axpy: XMV is not a Kokkos::View"); - static_assert(Kokkos::is_view::value, - "KokkosBlas::serial_axpy: YMV is not a Kokkos::View"); - static_assert(XMV::rank == 1 || XMV::rank == 2, - "KokkosBlas::serial_axpy: XMV must have rank 1 or 2."); - static_assert( - XMV::rank == YMV::rank, - "KokkosBlas::serial_axpy: XMV and YMV must have the same rank."); + static_assert(Kokkos::is_view::value, "KokkosBlas::serial_axpy: XMV is not a Kokkos::View"); + static_assert(Kokkos::is_view::value, "KokkosBlas::serial_axpy: YMV is not a Kokkos::View"); + static_assert(XMV::rank == 1 || XMV::rank == 2, "KokkosBlas::serial_axpy: XMV must have rank 1 or 2."); + static_assert(XMV::rank == YMV::rank, "KokkosBlas::serial_axpy: XMV and YMV must have the same rank."); if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { Kokkos::abort("KokkosBlas::serial_axpy: X and Y dimensions do not match"); } #endif // KOKKOSKERNELS_DEBUG_LEVEL - return Impl::serial_axpy_mv(X.extent(0), X.extent(1), alpha, X.data(), - Y.data(), X.stride_0(), X.stride_1(), + return Impl::serial_axpy_mv(X.extent(0), X.extent(1), alpha, X.data(), Y.data(), X.stride_0(), X.stride_1(), Y.stride_0(), Y.stride_1()); } diff --git a/blas/src/KokkosBlas1_dot.hpp b/blas/src/KokkosBlas1_dot.hpp index aa995836eb..6e1a428b51 100644 --- a/blas/src/KokkosBlas1_dot.hpp +++ b/blas/src/KokkosBlas1_dot.hpp @@ -37,28 +37,19 @@ namespace 
KokkosBlas { /// /// \return The dot product result; a single value. template , - int>::type = 0> -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::dot_type -dot(const execution_space& space, const XVector& x, const YVector& y) { + typename std::enable_if, int>::type = 0> +typename Kokkos::Details::InnerProductSpaceTraits::dot_type dot( + const execution_space& space, const XVector& x, const YVector& y) { static_assert(Kokkos::is_execution_space_v, "KokkosBlas::dot: execution_space must be a valid Kokkos " "execution space."); - static_assert(Kokkos::is_view::value, - "KokkosBlas::dot: XVector must be a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::dot: XVector must be accessible from execution_space"); - static_assert(Kokkos::is_view::value, - "KokkosBlas::dot: YVector must be a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::dot: YVector must be accessible from execution_space"); - static_assert((int)XVector::rank == (int)YVector::rank, - "KokkosBlas::dot: Vector ranks do not match."); + static_assert(Kokkos::is_view::value, "KokkosBlas::dot: XVector must be a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::dot: XVector must be accessible from execution_space"); + static_assert(Kokkos::is_view::value, "KokkosBlas::dot: YVector must be a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::dot: YVector must be accessible from execution_space"); + static_assert((int)XVector::rank == (int)YVector::rank, "KokkosBlas::dot: Vector ranks do not match."); static_assert(XVector::rank == 1, "KokkosBlas::dot: " "Both Vector inputs must have rank 1."); @@ -72,29 +63,23 @@ dot(const execution_space& space, const XVector& x, const YVector& y) { KokkosKernels::Impl::throw_runtime_exception(os.str()); } - using XVector_Internal = Kokkos::View< - typename 
XVector::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, Kokkos::MemoryTraits>; - using YVector_Internal = Kokkos::View< - typename YVector::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename YVector::device_type, Kokkos::MemoryTraits>; + using XVector_Internal = Kokkos::View::array_layout, + typename XVector::device_type, Kokkos::MemoryTraits>; + using YVector_Internal = Kokkos::View::array_layout, + typename YVector::device_type, Kokkos::MemoryTraits>; - using dot_type = typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::dot_type; + using dot_type = typename Kokkos::Details::InnerProductSpaceTraits::dot_type; // result_type is usually just dot_type, except: // if dot_type is float, result_type is double // if dot_type is complex, result_type is complex // These special cases are to maintain accuracy. - using result_type = - typename KokkosBlas::Impl::DotAccumulatingScalar::type; + using result_type = typename KokkosBlas::Impl::DotAccumulatingScalar::type; using RVector_Internal = - Kokkos::View>; + Kokkos::View>; using RVector_Result = - Kokkos::View>; + Kokkos::View>; XVector_Internal X = x; YVector_Internal Y = y; @@ -108,24 +93,19 @@ dot(const execution_space& space, const XVector& x, const YVector& y) { // two different scalar types. result_type result{}; RVector_Result R = RVector_Result(&result); - Impl::DotSpecialAccumulator::dot(space, - R, X, - Y); + Impl::DotSpecialAccumulator::dot(space, R, X, + Y); space.fence(); // mfh 22 Jan 2020: We need the line below because // Kokkos::complex lacks a constructor that takes a // Kokkos::complex with U != T. 
- return Kokkos::Details::CastPossiblyComplex::cast( - result); + return Kokkos::Details::CastPossiblyComplex::cast(result); } else { dot_type result{}; RVector_Internal R = RVector_Internal(&result); - Impl::Dot::dot(space, R, X, Y); + Impl::Dot::dot(space, R, X, Y); space.fence(); - return Kokkos::Details::CastPossiblyComplex::cast( - result); + return Kokkos::Details::CastPossiblyComplex::cast(result); } } @@ -142,9 +122,8 @@ dot(const execution_space& space, const XVector& x, const YVector& y) { /// /// \return The dot product result; a single value. template -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::dot_type -dot(const XVector& x, const YVector& y) { +typename Kokkos::Details::InnerProductSpaceTraits::dot_type dot( + const XVector& x, const YVector& y) { return dot(typename XVector::execution_space{}, x, y); } @@ -192,35 +171,26 @@ void dot(const execution_space& space, const RV& R, const XMV& X, const YMV& Y, static_assert(Kokkos::is_view::value, "KokkosBlas::dot: " "X is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::dot: XMV must be accessible from execution_space."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::dot: XMV must be accessible from execution_space."); static_assert(Kokkos::is_view::value, "KokkosBlas::dot: " "Y is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::dot: XMV must be accessible from execution_space."); - static_assert(std::is_same::value, + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::dot: XMV must be accessible from execution_space."); + static_assert(std::is_same::value, "KokkosBlas::dot: R is const. 
" "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert(RV::rank == 0 || RV::rank == 1, - "KokkosBlas::dot: R must have rank 0 or 1."); - static_assert(XMV::rank == 1 || XMV::rank == 2, - "KokkosBlas::dot: X must have rank 1 or 2."); - static_assert(YMV::rank == 1 || YMV::rank == 2, - "KokkosBlas::dot: Y must have rank 1 or 2."); - static_assert((XMV::rank == 2 && YMV::rank == 2 && RV::rank == 1) || - (XMV::rank == 1 && YMV::rank == 1 && RV::rank == 0) || - (XMV::rank == 2 && YMV::rank == 1 && RV::rank == 1) || - (XMV::rank == 1 && YMV::rank == 2 && RV::rank == 1), - "KokkosBlas::dot: Ranks of RV, XMV, and YMV don't match. " - "See this function's documentation for the allowed " - "combinations of ranks."); + static_assert(RV::rank == 0 || RV::rank == 1, "KokkosBlas::dot: R must have rank 0 or 1."); + static_assert(XMV::rank == 1 || XMV::rank == 2, "KokkosBlas::dot: X must have rank 1 or 2."); + static_assert(YMV::rank == 1 || YMV::rank == 2, "KokkosBlas::dot: Y must have rank 1 or 2."); + static_assert( + (XMV::rank == 2 && YMV::rank == 2 && RV::rank == 1) || (XMV::rank == 1 && YMV::rank == 1 && RV::rank == 0) || + (XMV::rank == 2 && YMV::rank == 1 && RV::rank == 1) || (XMV::rank == 1 && YMV::rank == 2 && RV::rank == 1), + "KokkosBlas::dot: Ranks of RV, XMV, and YMV don't match. " + "See this function's documentation for the allowed " + "combinations of ranks."); // Check compatibility of dimensions at run time. @@ -228,8 +198,7 @@ void dot(const execution_space& space, const RV& R, const XMV& X, const YMV& Y, bool dimsMatch = true; if (X.extent(0) != Y.extent(0)) { dimsMatch = false; - } else if (X.extent(1) != Y.extent(1) && X.extent(1) != 1 && - Y.extent(1) != 1) { + } else if (X.extent(1) != Y.extent(1) && X.extent(1) != 1 && Y.extent(1) != 1) { // Numbers of columns don't match, and neither X nor Y have one column. 
dimsMatch = false; } @@ -244,43 +213,33 @@ void dot(const execution_space& space, const RV& R, const XMV& X, const YMV& Y, if (RV::rank == 1) { os << "R: " << R.extent(0) << " x " << X.extent(1) << ", "; } - os << "X: " << X.extent(0) << " x " << X.extent(1) << ", Y: " << Y.extent(0) - << " x " << Y.extent(1); + os << "X: " << X.extent(0) << " x " << X.extent(1) << ", Y: " << Y.extent(0) << " x " << Y.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } // Create unmanaged versions of the input Views. - using UnifiedXLayout = - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; - using UnifiedRVLayout = - typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< - RV, UnifiedXLayout>::array_layout; + using UnifiedXLayout = typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using UnifiedRVLayout = typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout; - typedef Kokkos::View::type, - UnifiedRVLayout, typename RV::device_type, - Kokkos::MemoryTraits> + typedef Kokkos::View::type, + UnifiedRVLayout, typename RV::device_type, Kokkos::MemoryTraits> RV_Internal; - typedef Kokkos::View< - typename std::conditional::type, - UnifiedXLayout, typename XMV::device_type, - Kokkos::MemoryTraits> + typedef Kokkos::View::type, + UnifiedXLayout, typename XMV::device_type, Kokkos::MemoryTraits> XMV_Internal; - typedef Kokkos::View< - typename std::conditional::type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename YMV::device_type, Kokkos::MemoryTraits> + typedef Kokkos::View::type, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename YMV::device_type, + Kokkos::MemoryTraits> YMV_Internal; RV_Internal R_internal = R; XMV_Internal X_internal = X; YMV_Internal Y_internal = Y; - Impl::Dot::dot( - space, R_internal, X_internal, Y_internal); + Impl::Dot::dot(space, R_internal, X_internal, Y_internal); } /// \brief Compute the column-wise dot products of two multivectors. 
@@ -314,8 +273,7 @@ void dot(const execution_space& space, const RV& R, const XMV& X, const YMV& Y, /// doesn't confuse this version of dot() with the three-argument /// version of dot() in Kokkos_Blas1.hpp. template -void dot(const RV& R, const XMV& X, const YMV& Y, - typename std::enable_if::value, int>::type = 0) { +void dot(const RV& R, const XMV& X, const YMV& Y, typename std::enable_if::value, int>::type = 0) { dot(typename XMV::execution_space{}, R, X, Y); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas1_fill.hpp b/blas/src/KokkosBlas1_fill.hpp index 403411f7b8..486ee46c71 100644 --- a/blas/src/KokkosBlas1_fill.hpp +++ b/blas/src/KokkosBlas1_fill.hpp @@ -33,8 +33,7 @@ namespace KokkosBlas { /// \param X [out] Output View (1-D or 2-D). /// \param val [in] Value with which to fill the entries of X. template -void fill(const execution_space& space, const XMV& X, - const typename XMV::non_const_value_type& val) { +void fill(const execution_space& space, const XMV& X, const typename XMV::non_const_value_type& val) { Kokkos::Profiling::pushRegion("KokkosBlas::fill"); Kokkos::deep_copy(space, X, val); Kokkos::Profiling::popRegion(); diff --git a/blas/src/KokkosBlas1_iamax.hpp b/blas/src/KokkosBlas1_iamax.hpp index cfaaaeed63..4b69f8d507 100644 --- a/blas/src/KokkosBlas1_iamax.hpp +++ b/blas/src/KokkosBlas1_iamax.hpp @@ -36,43 +36,35 @@ namespace KokkosBlas { /// single value. /// Note: Returned index is 1-based for compatibility with Fortran. 
template , - int>::type = 0> -typename XVector::size_type iamax(const execution_space& space, - const XVector& x) { + typename std::enable_if, int>::type = 0> +typename XVector::size_type iamax(const execution_space& space, const XVector& x) { static_assert(Kokkos::is_execution_space_v, "KokkosBlas::iamax: execution_space must be a valid Kokkos " "execution space"); - static_assert(Kokkos::is_view::value, - "KokkosBlas::iamax: XVector must be a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::iamax: XVector must be accessible from execution_space"); + static_assert(Kokkos::is_view::value, "KokkosBlas::iamax: XVector must be a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::iamax: XVector must be accessible from execution_space"); static_assert(XVector::rank == 1, "KokkosBlas::iamax: " "Both Vector inputs must have rank 1."); typedef typename XVector::size_type index_type; - typedef Kokkos::View< - typename XVector::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, Kokkos::MemoryTraits > + typedef Kokkos::View::array_layout, + typename XVector::device_type, Kokkos::MemoryTraits > XVector_Internal; using layout_t = typename XVector_Internal::array_layout; - typedef Kokkos::View > + typedef Kokkos::View > RVector_Internal; index_type result; RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; - Impl::Iamax::iamax(space, - R, X); + Impl::Iamax::iamax(space, R, X); space.fence(); return result; } @@ -122,17 +114,13 @@ void iamax(const execution_space& space, const RV& R, const XMV& X, static_assert(Kokkos::is_view::value, "KokkosBlas::iamax: " "X is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::iamax: XMV must be accessible from execution_space."); - static_assert(std::is_same::value, + static_assert(Kokkos::SpaceAccessibility::accessible, + 
"KokkosBlas::iamax: XMV must be accessible from execution_space."); + static_assert(std::is_same::value, "KokkosBlas::iamax: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert(((RV::rank == 0) && (XMV::rank == 1)) || - ((RV::rank == 1) && (XMV::rank == 2)), + static_assert(((RV::rank == 0) && (XMV::rank == 1)) || ((RV::rank == 1) && (XMV::rank == 2)), "KokkosBlas::iamax: " "RV and XMV must either have rank 0 and 1 or rank 1 and 2."); @@ -146,41 +134,32 @@ void iamax(const execution_space& space, const RV& R, const XMV& X, if (X.extent(1) != R.extent(0)) { std::ostringstream os; os << "KokkosBlas::iamax (MV): Dimensions of R and X do not match: " - << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " - << X.extent(1); + << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " << X.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } - using UnifiedXLayout = - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; - using UnifiedRVLayout = - typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< - RV, UnifiedXLayout>::array_layout; + using UnifiedXLayout = typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using UnifiedRVLayout = typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout; // Create unmanaged versions of the input Views. RV may be rank 0 or rank 1. // XMV may be rank 1 or rank 2. 
- typedef Kokkos::View::type, - UnifiedRVLayout, - typename std::conditional< - std::is_same::value, - Kokkos::HostSpace, typename RV::device_type>::type, - Kokkos::MemoryTraits > - RV_Internal; typedef Kokkos::View< - typename std::conditional::type, - UnifiedXLayout, typename XMV::device_type, + typename std::conditional::type, + UnifiedRVLayout, + typename std::conditional::value, + Kokkos::HostSpace, typename RV::device_type>::type, Kokkos::MemoryTraits > + RV_Internal; + typedef Kokkos::View::type, + UnifiedXLayout, typename XMV::device_type, Kokkos::MemoryTraits > XMV_Internal; RV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Iamax::iamax( - space, R_internal, X_internal); + Impl::Iamax::iamax(space, R_internal, X_internal); } /// \brief R(j) = iamax(X(i,j)) @@ -197,8 +176,7 @@ void iamax(const execution_space& space, const RV& R, const XMV& X, /// Note for TPL cuBLAS: When TPL cuBLAS iamax is used and returns result to a /// view, RMV must be 0-D view and XMV must be 1-D view. template -void iamax(const RV& R, const XMV& X, - typename std::enable_if::value, int>::type = 0) { +void iamax(const RV& R, const XMV& X, typename std::enable_if::value, int>::type = 0) { iamax(typename XMV::execution_space{}, R, X); } diff --git a/blas/src/KokkosBlas1_mult.hpp b/blas/src/KokkosBlas1_mult.hpp index 32ede3090c..9d76d6a822 100644 --- a/blas/src/KokkosBlas1_mult.hpp +++ b/blas/src/KokkosBlas1_mult.hpp @@ -41,75 +41,56 @@ namespace KokkosBlas { /// \param A [in] The vector to apply to X. /// \param X [in] The X vector. 
template -void mult(const execution_space& space, typename YMV::const_value_type& gamma, - const YMV& Y, typename AV::const_value_type& alpha, const AV& A, - const XMV& X) { +void mult(const execution_space& space, typename YMV::const_value_type& gamma, const YMV& Y, + typename AV::const_value_type& alpha, const AV& A, const XMV& X) { static_assert(Kokkos::is_execution_space_v, "KokkosBlas::mult: execution_space must be a valid Kokkos " "execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::mult: " "Y is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::mult: YMV must be accessible from execution_space."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::mult: YMV must be accessible from execution_space."); static_assert(Kokkos::is_view::value, "KokkosBlas::mult: " "A is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::mult: AV must be accessible from execution_space."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::mult: AV must be accessible from execution_space."); static_assert(Kokkos::is_view::value, "KokkosBlas::mult: " "X is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::mult: AV must be accessible from execution_space."); - static_assert(std::is_same::value, + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::mult: AV must be accessible from execution_space."); + static_assert(std::is_same::value, "KokkosBlas::mult: Y is const. 
" "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert( - (XMV::rank == 1 && YMV::rank == 1) || (XMV::rank == 2 && YMV::rank == 2), - "KokkosBlas::mult: Y and X must be either both rank 1, " - "or both rank 2."); + static_assert((XMV::rank == 1 && YMV::rank == 1) || (XMV::rank == 2 && YMV::rank == 2), + "KokkosBlas::mult: Y and X must be either both rank 1, " + "or both rank 2."); static_assert(AV::rank == 1, "KokkosBlas::mult: A must have rank 1."); // Check compatibility of dimensions at run time. - if (Y.extent(0) != A.extent(0) || Y.extent(0) != X.extent(0) || - Y.extent(1) != X.extent(1)) { + if (Y.extent(0) != A.extent(0) || Y.extent(0) != X.extent(0) || Y.extent(1) != X.extent(1)) { std::ostringstream os; os << "KokkosBlas::mult: Dimensions do not match: " - << "Y: " << Y.extent(0) << " x " << Y.extent(1) << ", A: " << A.extent(0) - << " x " << A.extent(0) << ", X: " << X.extent(0) << " x " - << X.extent(1); + << "Y: " << Y.extent(0) << " x " << Y.extent(1) << ", A: " << A.extent(0) << " x " << A.extent(0) + << ", X: " << X.extent(0) << " x " << X.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } - using YUnifiedLayout = - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; - using AUnifiedLayout = - typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< - AV, YUnifiedLayout>::array_layout; - using XUnifiedLayout = - typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< - XMV, YUnifiedLayout>::array_layout; + using YUnifiedLayout = typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using AUnifiedLayout = typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout; + using XUnifiedLayout = typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout; // Create unmanaged versions of the input Views. 
- typedef Kokkos::View > YMV_Internal; - typedef Kokkos::View > AV_Internal; - typedef Kokkos::View > XMV_Internal; @@ -117,8 +98,8 @@ void mult(const execution_space& space, typename YMV::const_value_type& gamma, AV_Internal A_internal = A; XMV_Internal X_internal = X; - Impl::Mult::mult( - space, gamma, Y_internal, alpha, A_internal, X_internal); + Impl::Mult::mult(space, gamma, Y_internal, alpha, + A_internal, X_internal); } /// \brief Element wise multiplication of two vectors: @@ -138,8 +119,8 @@ void mult(const execution_space& space, typename YMV::const_value_type& gamma, /// \param A [in] The vector to apply to X. /// \param X [in] The X vector. template -void mult(typename YMV::const_value_type& gamma, const YMV& Y, - typename AV::const_value_type& alpha, const AV& A, const XMV& X) { +void mult(typename YMV::const_value_type& gamma, const YMV& Y, typename AV::const_value_type& alpha, const AV& A, + const XMV& X) { mult(typename YMV::execution_space{}, gamma, Y, alpha, A, X); } diff --git a/blas/src/KokkosBlas1_nrm1.hpp b/blas/src/KokkosBlas1_nrm1.hpp index e9b26e6177..bf7119a585 100644 --- a/blas/src/KokkosBlas1_nrm1.hpp +++ b/blas/src/KokkosBlas1_nrm1.hpp @@ -33,39 +33,30 @@ namespace KokkosBlas { /// \param x [in] Input 1-D View. /// /// \return The nrm1 product result; a single value. 
-template < - class execution_space, class XVector, - typename std::enable_if::value, - int>::type = 0> -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type -nrm1(const execution_space& space, const XVector& x) { - static_assert( - Kokkos::is_execution_space::value, - "KokkosBlas::nrm1: execution_space must be a Kokkos::execution_space."); - static_assert(Kokkos::is_view::value, - "KokkosBlas::nrm1: XVector must be a Kokkos::View."); +template ::value, int>::type = 0> +typename Kokkos::Details::InnerProductSpaceTraits::mag_type nrm1( + const execution_space& space, const XVector& x) { + static_assert(Kokkos::is_execution_space::value, + "KokkosBlas::nrm1: execution_space must be a Kokkos::execution_space."); + static_assert(Kokkos::is_view::value, "KokkosBlas::nrm1: XVector must be a Kokkos::View."); static_assert(XVector::rank == 1, "KokkosBlas::nrm1: " "Both Vector inputs must have rank 1."); - using mag_type = typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type; + using mag_type = typename Kokkos::Details::InnerProductSpaceTraits::mag_type; - using XVector_Internal = Kokkos::View< - typename XVector::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, Kokkos::MemoryTraits >; + using XVector_Internal = Kokkos::View::array_layout, + typename XVector::device_type, Kokkos::MemoryTraits >; using RVector_Internal = - Kokkos::View >; + Kokkos::View >; mag_type result; RVector_Internal R = RVector_Internal(&result); XVector_Internal X = x; - Impl::Nrm1::nrm1(space, - R, X); + Impl::Nrm1::nrm1(space, R, X); space.fence(); return result; } @@ -78,9 +69,8 @@ nrm1(const execution_space& space, const XVector& x) { /// /// \return The nrm1 product result; a single value. 
template -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type -nrm1(const XVector& x) { +typename Kokkos::Details::InnerProductSpaceTraits::mag_type nrm1( + const XVector& x) { return nrm1(typename XVector::execution_space{}, x); } @@ -109,22 +99,17 @@ void nrm1(const execution_space& space, const RV& R, const XMV& X, static_assert(Kokkos::is_view::value, "KokkosBlas::nrm1: " "X is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::nrm1: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert(((RV::rank == 0) && (XMV::rank == 1)) || - ((RV::rank == 1) && (XMV::rank == 2)), + static_assert(((RV::rank == 0) && (XMV::rank == 1)) || ((RV::rank == 1) && (XMV::rank == 2)), "KokkosBlas::nrm1: " "RV and XMV must either have rank 0 and 1 or rank 1 and 2."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrm1: execution_space cannot access data in XMV"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm1: execution_space cannot access data in XMV"); - typedef typename Kokkos::Details::InnerProductSpaceTraits< - typename XMV::non_const_value_type>::mag_type mag_type; + typedef typename Kokkos::Details::InnerProductSpaceTraits::mag_type mag_type; static_assert(std::is_same::value, "KokkosBlas::nrm1: R must have the magnitude type of" "the xvectors value_type it is an output argument " @@ -134,37 +119,28 @@ void nrm1(const execution_space& space, const RV& R, const XMV& X, if (X.extent(1) != R.extent(0)) { std::ostringstream os; os << "KokkosBlas::nrm1 (MV): Dimensions of R and X do not match: " - << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " - << X.extent(1); + << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " << X.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } - using UnifiedXLayout = - 
typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; - using UnifiedRVLayout = - typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< - RV, UnifiedXLayout>::array_layout; + using UnifiedXLayout = typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using UnifiedRVLayout = typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout; // Create unmanaged versions of the input Views. RV and XMV may be // rank 1 or rank 2. - typedef Kokkos::View::type, - UnifiedRVLayout, typename RV::device_type, - Kokkos::MemoryTraits > + typedef Kokkos::View::type, + UnifiedRVLayout, typename RV::device_type, Kokkos::MemoryTraits > RV_Internal; - typedef Kokkos::View< - typename std::conditional::type, - UnifiedXLayout, typename XMV::device_type, - Kokkos::MemoryTraits > + typedef Kokkos::View::type, + UnifiedXLayout, typename XMV::device_type, Kokkos::MemoryTraits > XMV_Internal; RV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Nrm1::nrm1( - space, R_internal, X_internal); + Impl::Nrm1::nrm1(space, R_internal, X_internal); } /// \brief R(j) = nrm1(X(i,j)) @@ -182,16 +158,14 @@ void nrm1(const execution_space& space, const RV& R, const XMV& X, /// \param R [out] Output 1-D View containing the result /// \param X [in] Input 1-D View. template -void nrm1(const RV& R, const XMV& X, - typename std::enable_if::value, int>::type = 0) { +void nrm1(const RV& R, const XMV& X, typename std::enable_if::value, int>::type = 0) { nrm1(typename XMV::execution_space{}, R, X); } /// \brief Return the nrm1 of the vector x via asum (the actual blas name). 
template -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type -asum(const XVector& x) { +typename Kokkos::Details::InnerProductSpaceTraits::mag_type asum( + const XVector& x) { return nrm1(x); } diff --git a/blas/src/KokkosBlas1_nrm2.hpp b/blas/src/KokkosBlas1_nrm2.hpp index 59f105f5a4..2e8558ba32 100644 --- a/blas/src/KokkosBlas1_nrm2.hpp +++ b/blas/src/KokkosBlas1_nrm2.hpp @@ -34,46 +34,36 @@ namespace KokkosBlas { /// \param x [in] Input 1-D View. /// /// \return The nrm2 product result; a single value. -template < - class execution_space, class XVector, - typename std::enable_if::value, - int>::type = 0> -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type -nrm2(const execution_space& space, const XVector& x) { +template ::value, int>::type = 0> +typename Kokkos::Details::InnerProductSpaceTraits::mag_type nrm2( + const execution_space& space, const XVector& x) { static_assert(Kokkos::is_execution_space::value, "KokkosBlas::nrm2: execution_space must be a valid" " Kokkos execution space."); - static_assert(Kokkos::is_view::value, - "KokkosBlas::nrm2: XVector must be a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrm2: XVector must be accessible from execution_space"); + static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2: XVector must be a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2: XVector must be accessible from execution_space"); static_assert(XVector::rank == 1, "KokkosBlas::nrm2: " "XVector must have rank 1."); - typedef typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type mag_type; + typedef typename Kokkos::Details::InnerProductSpaceTraits::mag_type mag_type; - typedef Kokkos::View< - typename XVector::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename 
XVector::device_type, Kokkos::MemoryTraits > + typedef Kokkos::View::array_layout, + typename XVector::device_type, Kokkos::MemoryTraits > XVector_Internal; using layout_t = typename XVector_Internal::array_layout; - typedef Kokkos::View > + typedef Kokkos::View > RVector_Internal; mag_type result; RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; - Impl::Nrm2::nrm2( - space, R, X, true); + Impl::Nrm2::nrm2(space, R, X, true); space.fence(); return result; } @@ -89,9 +79,8 @@ nrm2(const execution_space& space, const XVector& x) { /// /// \return The nrm2 product result; a single value. template -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type -nrm2(const XVector& x) { +typename Kokkos::Details::InnerProductSpaceTraits::mag_type nrm2( + const XVector& x) { return nrm2(typename XVector::execution_space{}, x); } @@ -122,22 +111,17 @@ void nrm2(const execution_space& space, const RV& R, const XMV& X, static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2: " "X is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrm2: X cannot be accessed from execution_space."); - static_assert(std::is_same::value, + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2: X cannot be accessed from execution_space."); + static_assert(std::is_same::value, "KokkosBlas::nrm2: R is const. 
" "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert(((RV::rank == 0) && (XMV::rank == 1)) || - ((RV::rank == 1) && (XMV::rank == 2)), + static_assert(((RV::rank == 0) && (XMV::rank == 1)) || ((RV::rank == 1) && (XMV::rank == 2)), "KokkosBlas::nrm2: " "RV and XMV must either have rank 0 and 1 or rank 1 and 2."); - typedef typename Kokkos::Details::InnerProductSpaceTraits< - typename XMV::non_const_value_type>::mag_type mag_type; + typedef typename Kokkos::Details::InnerProductSpaceTraits::mag_type mag_type; static_assert(std::is_same::value, "KokkosBlas::nrm2: R must have the magnitude type of" "the xvectors value_type it is an output argument " @@ -147,33 +131,26 @@ void nrm2(const execution_space& space, const RV& R, const XMV& X, if (X.extent(1) != R.extent(0)) { std::ostringstream os; os << "KokkosBlas::nrm2 (MV): Dimensions of R and X do not match: " - << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " - << X.extent(1); + << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " << X.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } - using UnifiedXLayout = - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; - using UnifiedRVLayout = - typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< - RV, UnifiedXLayout>::array_layout; + using UnifiedXLayout = typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using UnifiedRVLayout = typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout; // Create unmanaged versions of the input Views. RV and XMV may be // rank 1 or rank 2. 
- typedef Kokkos::View > RV_Internal; - typedef Kokkos::View > XMV_Internal; RV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Nrm2::nrm2( - space, R_internal, X_internal, true); + Impl::Nrm2::nrm2(space, R_internal, X_internal, true); } /// \brief R(i,j) = nrm2(X(i,j)) @@ -193,8 +170,7 @@ void nrm2(const execution_space& space, const RV& R, const XMV& X, /// \param R [out] Output View containing results (rank 0 or 1). /// \param X [in] Input View (rank 1 or 2). template -void nrm2(const RV& R, const XMV& X, - typename std::enable_if::value, int>::type = 0) { +void nrm2(const RV& R, const XMV& X, typename std::enable_if::value, int>::type = 0) { nrm2(typename XMV::execution_space{}, R, X); } @@ -202,14 +178,11 @@ void nrm2(const RV& R, const XMV& X, /// Serial nrm2 /// template -KOKKOS_INLINE_FUNCTION typename Kokkos::Details::InnerProductSpaceTraits< - typename XMV::non_const_value_type>::mag_type +KOKKOS_INLINE_FUNCTION typename Kokkos::Details::InnerProductSpaceTraits::mag_type serial_nrm2(const XMV X) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBlas::serial_nrm2: XMV is not a Kokkos::View"); - static_assert(XMV::rank == 1, - "KokkosBlas::serial_nrm2: XMV must have rank 1"); + static_assert(Kokkos::is_view::value, "KokkosBlas::serial_nrm2: XMV is not a Kokkos::View"); + static_assert(XMV::rank == 1, "KokkosBlas::serial_nrm2: XMV must have rank 1"); #endif // KOKKOSKERNELS_DEBUG_LEVEL return Impl::serial_nrm2(X.extent(0), X.data(), X.stride_0()); @@ -219,26 +192,20 @@ template KOKKOS_INLINE_FUNCTION int serial_nrm2(const XMV X, const RV& R) { // Do some compile time check when debug is enabled #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBlas::serial_nrm2: XMV is not a Kokkos::View"); - static_assert(Kokkos::is_view::value, - "KokkosBlas::serial_nrm2: RV is not a Kokkos::View"); - static_assert(std::is_same::value, + static_assert(Kokkos::is_view::value, 
"KokkosBlas::serial_nrm2: XMV is not a Kokkos::View"); + static_assert(Kokkos::is_view::value, "KokkosBlas::serial_nrm2: RV is not a Kokkos::View"); + static_assert(std::is_same::value, "KokkosBlas::serial_nrm2: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert(((RV::rank == 0) && (XMV::rank == 1)) || - ((RV::rank == 1) && (XMV::rank == 2)), + static_assert(((RV::rank == 0) && (XMV::rank == 1)) || ((RV::rank == 1) && (XMV::rank == 2)), "KokkosBlas::serial_nrm2: " "RV and XMV must either have rank 0 and 1 or rank 1 and 2."); - using norm_type = typename Kokkos::Details::InnerProductSpaceTraits< - typename XMV::non_const_value_type>::mag_type; - static_assert( - std::is_same::value, - "KokkosBlas::serial_nrm2: RV must have same value_type as" - " Kokkos::ArithTraits::mag_type"); + using norm_type = typename Kokkos::Details::InnerProductSpaceTraits::mag_type; + static_assert(std::is_same::value, + "KokkosBlas::serial_nrm2: RV must have same value_type as" + " Kokkos::ArithTraits::mag_type"); if (R.extent(0) != X.extent(1)) { Kokkos::printf( @@ -249,8 +216,7 @@ KOKKOS_INLINE_FUNCTION int serial_nrm2(const XMV X, const RV& R) { } #endif // KOKKOSKERNELS_DEBUG_LEVEL - Impl::serial_nrm2(X.extent(0), X.extent(1), X.data(), X.stride_0(), - X.stride_1(), R.data(), R.stride_0()); + Impl::serial_nrm2(X.extent(0), X.extent(1), X.data(), X.stride_0(), X.stride_1(), R.data(), R.stride_0()); return 0; } diff --git a/blas/src/KokkosBlas1_nrm2_squared.hpp b/blas/src/KokkosBlas1_nrm2_squared.hpp index c065efb290..748ece3663 100644 --- a/blas/src/KokkosBlas1_nrm2_squared.hpp +++ b/blas/src/KokkosBlas1_nrm2_squared.hpp @@ -33,46 +33,36 @@ namespace KokkosBlas { /// \param x [in] Input 1-D View. /// /// \return The nrm2 product result; a single value. 
-template < - class execution_space, class XVector, - typename std::enable_if::value, - int>::type = 0> -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type -nrm2_squared(const execution_space& space, const XVector& x) { +template ::value, int>::type = 0> +typename Kokkos::Details::InnerProductSpaceTraits::mag_type nrm2_squared( + const execution_space& space, const XVector& x) { static_assert(Kokkos::is_execution_space::value, "KokkosBlas::nrm2_squared: execution_space must be a valid" " Kokkos execution space"); - static_assert(Kokkos::is_view::value, - "KokkosBlas::nrm2_squared: XVector must be a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrm2_squared: XVector must be accessible" - " from execution_space"); + static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2_squared: XVector must be a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2_squared: XVector must be accessible" + " from execution_space"); static_assert(XVector::rank == 1, "KokkosBlas::nrm2_squared: " "Both Vector inputs must have rank 1."); - typedef typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type mag_type; + typedef typename Kokkos::Details::InnerProductSpaceTraits::mag_type mag_type; - typedef Kokkos::View< - typename XVector::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, Kokkos::MemoryTraits > + typedef Kokkos::View::array_layout, + typename XVector::device_type, Kokkos::MemoryTraits > XVector_Internal; - typedef Kokkos::View > + typedef Kokkos::View > RVector_Internal; mag_type result; RVector_Internal R = RVector_Internal(&result); XVector_Internal X = x; - Impl::Nrm2::nrm2( - space, R, X, false); + Impl::Nrm2::nrm2(space, R, X, false); space.fence(); return result; } @@ -88,9 +78,8 @@ nrm2_squared(const execution_space& space, 
const XVector& x) { /// /// \return The nrm2 product result; a single value. template -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type -nrm2_squared(const XVector& x) { +typename Kokkos::Details::InnerProductSpaceTraits::mag_type nrm2_squared( + const XVector& x) { return nrm2_squared(typename XVector::execution_space{}, x); } @@ -111,9 +100,8 @@ nrm2_squared(const XVector& x) { /// \param R [in] Output View (rank 0 or 1) that holds the result. /// \param X [in] Input View (rank 1 or 2). template -void nrm2_squared( - const execution_space& space, const RV& R, const XMV& X, - typename std::enable_if::value, int>::type = 0) { +void nrm2_squared(const execution_space& space, const RV& R, const XMV& X, + typename std::enable_if::value, int>::type = 0) { static_assert(Kokkos::is_execution_space::value, "KokkosBlas::nrm2_squared: execution_space must be a valid" " Kokkos execution space"); @@ -123,22 +111,17 @@ void nrm2_squared( static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2_squared: " "X is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrm2_squared: XVector must be accessible" - " from execution_space"); - static_assert(std::is_same::value, + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2_squared: XVector must be accessible" + " from execution_space"); + static_assert(std::is_same::value, "KokkosBlas::nrm2_squared: R is const. 
" "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert(((RV::rank == 0) && (XMV::rank == 1)) || - ((RV::rank == 1) && (XMV::rank == 2)), + static_assert(((RV::rank == 0) && (XMV::rank == 1)) || ((RV::rank == 1) && (XMV::rank == 2)), "KokkosBlas::nrm2_squared: " "RV and XMV must either have rank 0 and 1 or rank 1 and 2."); - typedef typename Kokkos::Details::InnerProductSpaceTraits< - typename XMV::non_const_value_type>::mag_type mag_type; + typedef typename Kokkos::Details::InnerProductSpaceTraits::mag_type mag_type; static_assert(std::is_same::value, "KokkosBlas::nrm2: R must have the magnitude type of" "the xvectors value_type it is an output argument " @@ -148,33 +131,26 @@ void nrm2_squared( if (X.extent(1) != R.extent(0)) { std::ostringstream os; os << "KokkosBlas::nrm2 (MV): Dimensions of R and X do not match: " - << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " - << X.extent(1); + << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " << X.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } - using UnifiedXLayout = - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; - using UnifiedRVLayout = - typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< - RV, UnifiedXLayout>::array_layout; + using UnifiedXLayout = typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using UnifiedRVLayout = typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout; // Create unmanaged versions of the input Views. RV and XMV may be // rank 1 or rank 2. 
- typedef Kokkos::View > RV_Internal; - typedef Kokkos::View > XMV_Internal; RV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Nrm2::nrm2( - space, R_internal, X_internal, false); + Impl::Nrm2::nrm2(space, R_internal, X_internal, false); } /// \brief R(i,j) = nrm2(X(i,j)) @@ -190,9 +166,7 @@ void nrm2_squared( /// the same rank as RMV, and its entries must be assignable to /// those of RMV. template -void nrm2_squared( - const RV& R, const XMV& X, - typename std::enable_if::value, int>::type = 0) { +void nrm2_squared(const RV& R, const XMV& X, typename std::enable_if::value, int>::type = 0) { nrm2_squared(typename XMV::execution_space{}, R, X); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas1_nrm2w.hpp b/blas/src/KokkosBlas1_nrm2w.hpp index c5eaa0621b..5fea0c783c 100644 --- a/blas/src/KokkosBlas1_nrm2w.hpp +++ b/blas/src/KokkosBlas1_nrm2w.hpp @@ -36,44 +36,35 @@ namespace KokkosBlas { /// /// \return The nrm2w product result; a single value. template -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type -nrm2w(const execution_space& space, const XVector& x, const XVector& w, - typename std::enable_if< - Kokkos::is_execution_space::value, int>::type = 0) { +typename Kokkos::Details::InnerProductSpaceTraits::mag_type nrm2w( + const execution_space& space, const XVector& x, const XVector& w, + typename std::enable_if::value, int>::type = 0) { static_assert(Kokkos::is_execution_space::value, "KokkosBlas::nrm2w: execution_space must be a valid" " Kokkos execution space."); - static_assert(Kokkos::is_view::value, - "KokkosBlas::nrm2w: XVector must be a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrm2w: XVector must be accessible from execution_space"); + static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2w: XVector must be a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2w: XVector must be accessible 
from execution_space"); static_assert(XVector::rank == 1, "KokkosBlas::nrm2w: " "Both Vector inputs must have rank 1."); - using mag_type = typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type; + using mag_type = typename Kokkos::Details::InnerProductSpaceTraits::mag_type; - using XVector_Internal = Kokkos::View< - typename XVector::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, Kokkos::MemoryTraits >; + using XVector_Internal = Kokkos::View::array_layout, + typename XVector::device_type, Kokkos::MemoryTraits >; using layout_t = typename XVector_Internal::array_layout; using RVector_Internal = - Kokkos::View >; + Kokkos::View >; mag_type result; RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; XVector_Internal W = w; - Impl::Nrm2w::nrm2w( - space, R, X, W, true); + Impl::Nrm2w::nrm2w(space, R, X, W, true); space.fence(); return result; } @@ -90,9 +81,8 @@ nrm2w(const execution_space& space, const XVector& x, const XVector& w, /// /// \return The nrm2w product result; a single value. template -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type -nrm2w(const XVector& x, const XVector& w) { +typename Kokkos::Details::InnerProductSpaceTraits::mag_type nrm2w( + const XVector& x, const XVector& w) { return nrm2w(typename XVector::execution_space{}, x, w); } @@ -114,8 +104,7 @@ nrm2w(const XVector& x, const XVector& w) { /// \param X [in] Input View (rank 1 or 2). /// \param W [in] Input View (rank 1 or 2). 
template -void nrm2w(const execution_space& space, const RV& R, const XMV& X, - const XMV& W, +void nrm2w(const execution_space& space, const RV& R, const XMV& X, const XMV& W, typename std::enable_if::value, int>::type = 0) { static_assert(Kokkos::is_execution_space::value, "KokkosBlas::nrm2w: execution_space must be a valid" @@ -126,22 +115,17 @@ void nrm2w(const execution_space& space, const RV& R, const XMV& X, static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2w: " "X is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrm2w: XMV must be accessible from execution_space"); - static_assert(std::is_same::value, + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2w: XMV must be accessible from execution_space"); + static_assert(std::is_same::value, "KokkosBlas::nrm2w: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert(((RV::rank == 0) && (XMV::rank == 1)) || - ((RV::rank == 1) && (XMV::rank == 2)), + static_assert(((RV::rank == 0) && (XMV::rank == 1)) || ((RV::rank == 1) && (XMV::rank == 2)), "KokkosBlas::nrm2w: " "RV and XMV must either have rank 0 and 1 or rank 1 and 2."); - typedef typename Kokkos::Details::InnerProductSpaceTraits< - typename XMV::non_const_value_type>::mag_type mag_type; + typedef typename Kokkos::Details::InnerProductSpaceTraits::mag_type mag_type; static_assert(std::is_same::value, "KokkosBlas::nrm2w: R must have the magnitude type of" "the xvectors value_type it is an output argument " @@ -151,25 +135,19 @@ void nrm2w(const execution_space& space, const RV& R, const XMV& X, if (X.extent(1) != R.extent(0)) { std::ostringstream os; os << "KokkosBlas::nrm2w (MV): Dimensions of R and X do not match: " - << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " - << X.extent(1); + << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " << X.extent(1); 
KokkosKernels::Impl::throw_runtime_exception(os.str()); } - using UnifiedXLayout = - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; - using UnifiedRVLayout = - typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< - RV, UnifiedXLayout>::array_layout; + using UnifiedXLayout = typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using UnifiedRVLayout = typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout; // Create unmanaged versions of the input Views. RV and XMV may be // rank 1 or rank 2. - typedef Kokkos::View > RV_Internal; - typedef Kokkos::View > XMV_Internal; @@ -177,8 +155,7 @@ void nrm2w(const execution_space& space, const RV& R, const XMV& X, XMV_Internal X_internal = X; XMV_Internal W_internal = W; - Impl::Nrm2w::nrm2w( - space, R_internal, X_internal, W_internal, true); + Impl::Nrm2w::nrm2w(space, R_internal, X_internal, W_internal, true); } /// \brief R(i,j) = nrm2w(X(i,j)) diff --git a/blas/src/KokkosBlas1_nrm2w_squared.hpp b/blas/src/KokkosBlas1_nrm2w_squared.hpp index a1fe10bf1e..375a55c294 100644 --- a/blas/src/KokkosBlas1_nrm2w_squared.hpp +++ b/blas/src/KokkosBlas1_nrm2w_squared.hpp @@ -34,49 +34,38 @@ namespace KokkosBlas { /// \param w [in] Input weights (1-D View). /// /// \return The nrm2w product result; a single value. 
-template < - class execution_space, class XVector, - typename std::enable_if::value, - int>::type = 0> -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type -nrm2w_squared(const execution_space& space, const XVector& x, - const XVector& w) { +template ::value, int>::type = 0> +typename Kokkos::Details::InnerProductSpaceTraits::mag_type nrm2w_squared( + const execution_space& space, const XVector& x, const XVector& w) { static_assert(Kokkos::is_execution_space::value, "KokkosBlas::nrm2w_squared: execution_space must be a valid " "Kokkos execution space."); - static_assert(Kokkos::is_view::value, - "KokkosBlas::nrm2w_squared: XVector must be a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrm2w_squared: XVector must be accessible from " - "execution_space."); + static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2w_squared: XVector must be a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2w_squared: XVector must be accessible from " + "execution_space."); static_assert(XVector::rank == 1, "KokkosBlas::nrm2w_squared: " "Both Vector inputs must have rank 1."); - using mag_type = typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type; + using mag_type = typename Kokkos::Details::InnerProductSpaceTraits::mag_type; - using XVector_Internal = Kokkos::View< - typename XVector::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, Kokkos::MemoryTraits >; + using XVector_Internal = Kokkos::View::array_layout, + typename XVector::device_type, Kokkos::MemoryTraits >; using layout_t = typename XVector_Internal::array_layout; using RVector_Internal = - Kokkos::View >; + Kokkos::View >; mag_type result; RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; XVector_Internal W = w; - Impl::Nrm2w::nrm2w( - 
space, R, X, W, false); + Impl::Nrm2w::nrm2w(space, R, X, W, false); space.fence(); return result; } @@ -93,9 +82,8 @@ nrm2w_squared(const execution_space& space, const XVector& x, /// /// \return The nrm2w product result; a single value. template -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type -nrm2w_squared(const XVector& x, const XVector& w) { +typename Kokkos::Details::InnerProductSpaceTraits::mag_type nrm2w_squared( + const XVector& x, const XVector& w) { return nrm2w_squared(typename XVector::execution_space(), x, w); } @@ -117,9 +105,8 @@ nrm2w_squared(const XVector& x, const XVector& w) { /// \param X [in] Input View (rank 1 or 2). /// \param W [in] Input View (rank 1 or 2). template -void nrm2w_squared( - const execution_space& space, const RV& R, const XMV& X, const XMV& W, - typename std::enable_if::value, int>::type = 0) { +void nrm2w_squared(const execution_space& space, const RV& R, const XMV& X, const XMV& W, + typename std::enable_if::value, int>::type = 0) { static_assert(Kokkos::is_execution_space::value, "KokkosBlas::nrm2w_squared: execution_space must be a valid " "Kokkos execution space."); @@ -129,22 +116,17 @@ void nrm2w_squared( static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2w_squared: " "X is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrm2w_squared: XVector must be accessible from " - "execution_space."); - static_assert(std::is_same::value, + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2w_squared: XVector must be accessible from " + "execution_space."); + static_assert(std::is_same::value, "KokkosBlas::nrm2w_squared: R is const. 
" "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert(((RV::rank == 0) && (XMV::rank == 1)) || - ((RV::rank == 1) && (XMV::rank == 2)), + static_assert(((RV::rank == 0) && (XMV::rank == 1)) || ((RV::rank == 1) && (XMV::rank == 2)), "KokkosBlas::nrm2w_squared: " "RV and XMV must either have rank 0 and 1 or rank 1 and 2."); - using mag_type = typename Kokkos::Details::InnerProductSpaceTraits< - typename XMV::non_const_value_type>::mag_type; + using mag_type = typename Kokkos::Details::InnerProductSpaceTraits::mag_type; static_assert(std::is_same::value, "KokkosBlas::nrm2w: R must have the magnitude type of" "the xvectors value_type it is an output argument " @@ -154,32 +136,25 @@ void nrm2w_squared( if (X.extent(1) != R.extent(0)) { std::ostringstream os; os << "KokkosBlas::nrm2w (MV): Dimensions of R and X do not match: " - << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " - << X.extent(1); + << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " << X.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } - using UnifiedXLayout = - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; - using UnifiedRVLayout = - typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< - RV, UnifiedXLayout>::array_layout; + using UnifiedXLayout = typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using UnifiedRVLayout = typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout; // Create unmanaged versions of the input Views. RV and XMV may be // rank 1 or rank 2. 
- using RV_Internal = Kokkos::View >; - using XMV_Internal = Kokkos::View >; RV_Internal R_internal = R; XMV_Internal X_internal = X; XMV_Internal W_internal = W; - Impl::Nrm2w::nrm2w( - space, R_internal, X_internal, W_internal, false); + Impl::Nrm2w::nrm2w(space, R_internal, X_internal, W_internal, false); } /// \brief R(i,j) = nrm2w(X(i,j)) @@ -199,9 +174,8 @@ void nrm2w_squared( /// \param X [in] Input View (rank 1 or 2). /// \param W [in] Input View (rank 1 or 2). template -void nrm2w_squared( - const RV& R, const XMV& X, const XMV& W, - typename std::enable_if::value, int>::type = 0) { +void nrm2w_squared(const RV& R, const XMV& X, const XMV& W, + typename std::enable_if::value, int>::type = 0) { nrm2w_squared(typename XMV::execution_space{}, R, X, W); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas1_nrminf.hpp b/blas/src/KokkosBlas1_nrminf.hpp index c6f923aefe..ec3a98fa95 100644 --- a/blas/src/KokkosBlas1_nrminf.hpp +++ b/blas/src/KokkosBlas1_nrminf.hpp @@ -33,39 +33,31 @@ namespace KokkosBlas { /// \param x [in] Input 1-D View. /// /// \return The nrminf product result; a single value. 
-template < - class execution_space, class XVector, - typename std::enable_if::value, - int>::type = 0> -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type -nrminf(const execution_space& space, const XVector& x) { - static_assert(Kokkos::is_view::value, - "KokkosBlas::nrminf: XVector must be a Kokkos::View."); +template ::value, int>::type = 0> +typename Kokkos::Details::InnerProductSpaceTraits::mag_type nrminf( + const execution_space& space, const XVector& x) { + static_assert(Kokkos::is_view::value, "KokkosBlas::nrminf: XVector must be a Kokkos::View."); static_assert(XVector::rank == 1, "KokkosBlas::nrminf: " "Both Vector inputs must have rank 1."); - typedef typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type mag_type; + typedef typename Kokkos::Details::InnerProductSpaceTraits::mag_type mag_type; - typedef Kokkos::View< - typename XVector::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, Kokkos::MemoryTraits > + typedef Kokkos::View::array_layout, + typename XVector::device_type, Kokkos::MemoryTraits > XVector_Internal; using layout_t = typename XVector_Internal::array_layout; - typedef Kokkos::View > + typedef Kokkos::View > RVector_Internal; mag_type result; RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; - Impl::NrmInf::nrminf( - space, R, X); + Impl::NrmInf::nrminf(space, R, X); space.fence(); return result; } @@ -78,9 +70,8 @@ nrminf(const execution_space& space, const XVector& x) { /// /// \return The nrminf product result; a single value. 
template -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type -nrminf(const XVector& x) { +typename Kokkos::Details::InnerProductSpaceTraits::mag_type nrminf( + const XVector& x) { return nrminf(typename XVector::execution_space{}, x); } @@ -95,9 +86,8 @@ nrminf(const XVector& x) { /// the same rank as RMV, and its entries must be assignable to /// those of RMV. template -void nrminf( - const execution_space& space, const RV& R, const XMV& X, - typename std::enable_if::value, int>::type = 0) { +void nrminf(const execution_space& space, const RV& R, const XMV& X, + typename std::enable_if::value, int>::type = 0) { static_assert(Kokkos::is_execution_space::value, "KokkosBlas::nrminf: space is not an execution space instance"); static_assert(Kokkos::is_view::value, @@ -106,22 +96,17 @@ void nrminf( static_assert(Kokkos::is_view::value, "KokkosBlas::nrminf: " "X is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrminf: X is not accessible from execution_space"); - static_assert(std::is_same::value, + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrminf: X is not accessible from execution_space"); + static_assert(std::is_same::value, "KokkosBlas::nrminf: R is const. 
" "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert(((RV::rank == 0) && (XMV::rank == 1)) || - ((RV::rank == 1) && (XMV::rank == 2)), + static_assert(((RV::rank == 0) && (XMV::rank == 1)) || ((RV::rank == 1) && (XMV::rank == 2)), "KokkosBlas::nrminf: " "RV and XMV must either have rank 0 and 1 or rank 1 and 2."); - typedef typename Kokkos::Details::InnerProductSpaceTraits< - typename XMV::non_const_value_type>::mag_type mag_type; + typedef typename Kokkos::Details::InnerProductSpaceTraits::mag_type mag_type; static_assert(std::is_same::value, "KokkosBlas::nrminf: R must have the magnitude type of" "the xvectors value_type it is an output argument " @@ -131,37 +116,28 @@ void nrminf( if (X.extent(1) != R.extent(0)) { std::ostringstream os; os << "KokkosBlas::nrminf (MV): Dimensions of R and X do not match: " - << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " - << X.extent(1); + << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " << X.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } - using UnifiedXLayout = - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; - using UnifiedRVLayout = - typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< - RV, UnifiedXLayout>::array_layout; + using UnifiedXLayout = typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using UnifiedRVLayout = typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout; // Create unmanaged versions of the input Views. RV and XMV may be // rank 1 or rank 2. 
- typedef Kokkos::View::type, - UnifiedRVLayout, typename RV::device_type, - Kokkos::MemoryTraits > + typedef Kokkos::View::type, + UnifiedRVLayout, typename RV::device_type, Kokkos::MemoryTraits > RV_Internal; - typedef Kokkos::View< - typename std::conditional::type, - UnifiedXLayout, typename XMV::device_type, - Kokkos::MemoryTraits > + typedef Kokkos::View::type, + UnifiedXLayout, typename XMV::device_type, Kokkos::MemoryTraits > XMV_Internal; RV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::NrmInf::nrminf( - space, R_internal, X_internal); + Impl::NrmInf::nrminf(space, R_internal, X_internal); } /// \brief R(j) = nrminf(X(i,j)) @@ -174,9 +150,7 @@ void nrminf( /// the same rank as RMV, and its entries must be assignable to /// those of RMV. template -void nrminf( - const RV& R, const XMV& X, - typename std::enable_if::value, int>::type = 0) { +void nrminf(const RV& R, const XMV& X, typename std::enable_if::value, int>::type = 0) { nrminf(typename XMV::execution_space{}, R, X); } diff --git a/blas/src/KokkosBlas1_reciprocal.hpp b/blas/src/KokkosBlas1_reciprocal.hpp index ef73d26828..477c885e5e 100644 --- a/blas/src/KokkosBlas1_reciprocal.hpp +++ b/blas/src/KokkosBlas1_reciprocal.hpp @@ -47,19 +47,14 @@ void reciprocal(const execution_space& space, const RMV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::reciprocal: " "R is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::reciprocal: RMV must be accessible from execution_space"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::reciprocal: RMV must be accessible from execution_space"); static_assert(Kokkos::is_view::value, "KokkosBlas::reciprocal: " "X is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::reciprocal: XMV must be accessible from execution_space"); - static_assert(std::is_same::value, + static_assert(Kokkos::SpaceAccessibility::accessible, + 
"KokkosBlas::reciprocal: XMV must be accessible from execution_space"); + static_assert(std::is_same::value, "KokkosBlas::reciprocal: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -74,32 +69,27 @@ void reciprocal(const execution_space& space, const RMV& R, const XMV& X) { if (X.extent(0) != R.extent(0) || X.extent(1) != R.extent(1)) { std::ostringstream os; os << "KokkosBlas::reciprocal (MV): Dimensions of R and X do not match: " - << "R: " << R.extent(0) << " x " << R.extent(1) << ", X: " << X.extent(0) - << " x " << X.extent(1); + << "R: " << R.extent(0) << " x " << R.extent(1) << ", X: " << X.extent(0) << " x " << X.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } // Create unmanaged versions of the input Views. RMV and XMV may be // rank 1 or rank 2. - typedef Kokkos::View< - typename std::conditional::type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename RMV::device_type, Kokkos::MemoryTraits > + typedef Kokkos::View::type, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename RMV::device_type, + Kokkos::MemoryTraits > RMV_Internal; - typedef Kokkos::View< - typename std::conditional::type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XMV::device_type, Kokkos::MemoryTraits > + typedef Kokkos::View::type, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename XMV::device_type, + Kokkos::MemoryTraits > XMV_Internal; RMV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Reciprocal::reciprocal( - space, R_internal, X_internal); + Impl::Reciprocal::reciprocal(space, R_internal, X_internal); } /// \brief R(i,j) = reciprocal(X(i,j)) diff --git a/blas/src/KokkosBlas1_rot.hpp b/blas/src/KokkosBlas1_rot.hpp index d848617b6e..7bc3215604 100644 --- a/blas/src/KokkosBlas1_rot.hpp +++ b/blas/src/KokkosBlas1_rot.hpp @@ -22,58 +22,45 @@ namespace KokkosBlas { template -void 
rot(execution_space const& space, VectorView const& X, VectorView const& Y, - ScalarView const& c, ScalarView const& s) { +void rot(execution_space const& space, VectorView const& X, VectorView const& Y, ScalarView const& c, + ScalarView const& s) { static_assert(Kokkos::is_execution_space::value, "rot: execution_space template parameter is not a Kokkos " "execution space."); - static_assert(VectorView::rank == 1, - "rot: VectorView template parameter needs to be a rank 1 view"); - static_assert(ScalarView::rank == 0, - "rot: ScalarView template parameter needs to be a rank 0 view"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "rot: VectorView template parameter memory space needs to be accessible " - "from " - "execution_space template parameter"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "rot: VectorView template parameter memory space needs to be accessible " - "from " - "execution_space template parameter"); - static_assert( - std::is_same::value, - "rot: VectorView template parameter needs to store non-const values"); + static_assert(VectorView::rank == 1, "rot: VectorView template parameter needs to be a rank 1 view"); + static_assert(ScalarView::rank == 0, "rot: ScalarView template parameter needs to be a rank 0 view"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "rot: VectorView template parameter memory space needs to be accessible " + "from " + "execution_space template parameter"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "rot: VectorView template parameter memory space needs to be accessible " + "from " + "execution_space template parameter"); + static_assert(std::is_same::value, + "rot: VectorView template parameter needs to store non-const values"); - using VectorView_Internal = Kokkos::View< - typename VectorView::non_const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - Kokkos::Device, - Kokkos::MemoryTraits>; + using VectorView_Internal = 
Kokkos::View::array_layout, + Kokkos::Device, + Kokkos::MemoryTraits>; - using ScalarView_Internal = Kokkos::View< - typename ScalarView::non_const_value_type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - Kokkos::Device, - Kokkos::MemoryTraits>; + using ScalarView_Internal = Kokkos::View::array_layout, + Kokkos::Device, + Kokkos::MemoryTraits>; VectorView_Internal X_(X), Y_(Y); ScalarView_Internal c_(c), s_(s); Kokkos::Profiling::pushRegion("KokkosBlas::rot"); - Impl::Rot::rot( - space, X_, Y_, c_, s_); + Impl::Rot::rot(space, X_, Y_, c_, s_); Kokkos::Profiling::popRegion(); } template -void rot(VectorView const& X, VectorView const& Y, ScalarView const& c, - ScalarView const& s) { - const typename VectorView::execution_space space = - typename VectorView::execution_space(); +void rot(VectorView const& X, VectorView const& Y, ScalarView const& c, ScalarView const& s) { + const typename VectorView::execution_space space = typename VectorView::execution_space(); rot(space, X, Y, c, s); } diff --git a/blas/src/KokkosBlas1_rotg.hpp b/blas/src/KokkosBlas1_rotg.hpp index 3b66ae0115..1927bc2df9 100644 --- a/blas/src/KokkosBlas1_rotg.hpp +++ b/blas/src/KokkosBlas1_rotg.hpp @@ -35,40 +35,28 @@ namespace KokkosBlas { /// rotation /// \param s [out] sine value associated with the rotation template -void rotg(execution_space const& space, SViewType const& a, SViewType const& b, - MViewType const& c, SViewType const& s) { - static_assert(SViewType::rank == 0, - "rotg: the inputs need to be rank 0 views"); - static_assert(MViewType::rank == 0, - "rotg: the inputs need to be rank 0 views"); - static_assert( - !Kokkos::ArithTraits::is_complex); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "rotg: execution_space cannot access data in SViewType"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "rotg: execution_space cannot access data in MViewType"); +void rotg(execution_space const& space, SViewType const& a, SViewType const& b, 
MViewType const& c, + SViewType const& s) { + static_assert(SViewType::rank == 0, "rotg: the inputs need to be rank 0 views"); + static_assert(MViewType::rank == 0, "rotg: the inputs need to be rank 0 views"); + static_assert(!Kokkos::ArithTraits::is_complex); + static_assert(Kokkos::SpaceAccessibility::accessible, + "rotg: execution_space cannot access data in SViewType"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "rotg: execution_space cannot access data in MViewType"); using SView_Internal = Kokkos::View< - typename SViewType::value_type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - Kokkos::Device, - Kokkos::MemoryTraits>; + typename SViewType::value_type, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + Kokkos::Device, Kokkos::MemoryTraits>; using MView_Internal = Kokkos::View< - typename MViewType::value_type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - Kokkos::Device, - Kokkos::MemoryTraits>; + typename MViewType::value_type, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + Kokkos::Device, Kokkos::MemoryTraits>; SView_Internal a_(a), b_(b), s_(s); MView_Internal c_(c); Kokkos::Profiling::pushRegion("KokkosBlas::rotg"); - Impl::Rotg::rotg(space, a, b, - c, s); + Impl::Rotg::rotg(space, a, b, c, s); Kokkos::Profiling::popRegion(); } diff --git a/blas/src/KokkosBlas1_rotm.hpp b/blas/src/KokkosBlas1_rotm.hpp index 077d3350fe..6f5442e931 100644 --- a/blas/src/KokkosBlas1_rotm.hpp +++ b/blas/src/KokkosBlas1_rotm.hpp @@ -36,62 +36,45 @@ namespace KokkosBlas { /// \param param [in] output of rotmg contains rotation coefficients /// template -void rotm(execution_space const& space, VectorView const& X, - VectorView const& Y, ParamView const& param) { +void rotm(execution_space const& space, VectorView const& X, VectorView const& Y, ParamView const& param) { static_assert(Kokkos::is_execution_space::value, "rotm: execution_space template parameter is not a Kokkos " "execution 
space."); - static_assert( - VectorView::rank == 1, - "rotm: VectorView template parameter needs to be a rank 1 view"); - static_assert(ParamView::rank == 1, - "rotm: ParamView template parameter needs to be a rank 1 view"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "rotm: VectorView template parameter memory space needs to be accessible " - "from execution_space template parameter"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "rotm: ScalarView template parameter memory space needs to be accessible " - "from execution_space template parameter"); - static_assert( - std::is_same::value, - "rotm: VectorView template parameter needs to store non-const values"); - static_assert( - !Kokkos::ArithTraits::is_complex, - "rotm: VectorView template parameter cannot use complex value_type"); - static_assert( - !Kokkos::ArithTraits::is_complex, - "rotm: ParamView template parameter cannot use complex value_type"); + static_assert(VectorView::rank == 1, "rotm: VectorView template parameter needs to be a rank 1 view"); + static_assert(ParamView::rank == 1, "rotm: ParamView template parameter needs to be a rank 1 view"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "rotm: VectorView template parameter memory space needs to be accessible " + "from execution_space template parameter"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "rotm: ScalarView template parameter memory space needs to be accessible " + "from execution_space template parameter"); + static_assert(std::is_same::value, + "rotm: VectorView template parameter needs to store non-const values"); + static_assert(!Kokkos::ArithTraits::is_complex, + "rotm: VectorView template parameter cannot use complex value_type"); + static_assert(!Kokkos::ArithTraits::is_complex, + "rotm: ParamView template parameter cannot use complex value_type"); - using VectorView_Internal = Kokkos::View< - typename VectorView::non_const_value_type*, - typename 
KokkosKernels::Impl::GetUnifiedLayout::array_layout, - Kokkos::Device, - Kokkos::MemoryTraits>; + using VectorView_Internal = Kokkos::View::array_layout, + Kokkos::Device, + Kokkos::MemoryTraits>; using ParamView_Internal = Kokkos::View< - typename ParamView::const_value_type[5], - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - Kokkos::Device, - Kokkos::MemoryTraits>; + typename ParamView::const_value_type[5], typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + Kokkos::Device, Kokkos::MemoryTraits>; VectorView_Internal X_(X), Y_(Y); ParamView_Internal param_(param); Kokkos::Profiling::pushRegion("KokkosBlas::rotm"); - Impl::Rotm::rotm( - space, X_, Y_, param_); + Impl::Rotm::rotm(space, X_, Y_, param_); Kokkos::Profiling::popRegion(); } template void rotm(VectorView const& X, VectorView const& Y, ParamView const& param) { - const typename VectorView::execution_space space = - typename VectorView::execution_space(); + const typename VectorView::execution_space space = typename VectorView::execution_space(); rotm(space, X, Y, param); } diff --git a/blas/src/KokkosBlas1_rotmg.hpp b/blas/src/KokkosBlas1_rotmg.hpp index 723b0eac1a..a6c629f987 100644 --- a/blas/src/KokkosBlas1_rotmg.hpp +++ b/blas/src/KokkosBlas1_rotmg.hpp @@ -39,46 +39,39 @@ namespace KokkosBlas { /// \param param [out] /// template -void rotmg(execution_space const& space, DXView const& d1, DXView const& d2, - DXView const& x1, YView const& y1, PView const& param) { - static_assert( - Kokkos::SpaceAccessibility::accessible, - "rotmg: execution_space cannot access data in DXView"); +void rotmg(execution_space const& space, DXView const& d1, DXView const& d2, DXView const& x1, YView const& y1, + PView const& param) { + static_assert(Kokkos::SpaceAccessibility::accessible, + "rotmg: execution_space cannot access data in DXView"); - using DXView_Internal = Kokkos::View< - typename DXView::value_type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - 
Kokkos::Device, - Kokkos::MemoryTraits>; + using DXView_Internal = + Kokkos::View::array_layout, + Kokkos::Device, + Kokkos::MemoryTraits>; - using YView_Internal = Kokkos::View< - typename YView::value_type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - Kokkos::Device, - Kokkos::MemoryTraits>; + using YView_Internal = + Kokkos::View::array_layout, + Kokkos::Device, + Kokkos::MemoryTraits>; - using PView_Internal = Kokkos::View< - typename PView::value_type[5], - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - Kokkos::Device, - Kokkos::MemoryTraits>; + using PView_Internal = + Kokkos::View::array_layout, + Kokkos::Device, + Kokkos::MemoryTraits>; DXView_Internal d1_(d1), d2_(d2), x1_(x1); YView_Internal y1_(y1); PView_Internal param_(param); Kokkos::Profiling::pushRegion("KokkosBlas::rotmg"); - Impl::Rotmg::rotmg(space, d1_, d2_, x1_, y1_, param_); + Impl::Rotmg::rotmg(space, d1_, d2_, x1_, y1_, + param_); Kokkos::Profiling::popRegion(); } template -void rotmg(DXView const& d1, DXView const& d2, DXView const& x1, - YView const& y1, PView const& param) { - const typename PView::execution_space space = - typename PView::execution_space(); +void rotmg(DXView const& d1, DXView const& d2, DXView const& x1, YView const& y1, PView const& param) { + const typename PView::execution_space space = typename PView::execution_space(); rotmg(space, d1, d2, x1, y1, param); } diff --git a/blas/src/KokkosBlas1_scal.hpp b/blas/src/KokkosBlas1_scal.hpp index 39c197f352..561c505035 100644 --- a/blas/src/KokkosBlas1_scal.hpp +++ b/blas/src/KokkosBlas1_scal.hpp @@ -44,31 +44,23 @@ namespace KokkosBlas { /// \param a [in] view of type AV, scaling parameter for X. /// \param X [in] input view of type XMV. 
template -void scal(const execution_space& space, const RMV& R, const AV& a, - const XMV& X) { +void scal(const execution_space& space, const RMV& R, const AV& a, const XMV& X) { static_assert(Kokkos::is_execution_space_v, "KokkosBlas::scal: execution_space must be a valid Kokkos " "execution space"); static_assert(Kokkos::is_view::value, "KokkosBlas::scal: " "R is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::scal: RMV must be accessible from execution_space."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::scal: RMV must be accessible from execution_space."); static_assert(Kokkos::is_view::value, "KokkosBlas::scal: " "X is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::scal: XMV must be accessible from execution_space"); - static_assert( - Kokkos::SpaceAccessibility::assignable, - "KokkosBlas::scal: XMV must be assignable to RMV"); - static_assert(std::is_same::value, + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::scal: XMV must be accessible from execution_space"); + static_assert(Kokkos::SpaceAccessibility::assignable, + "KokkosBlas::scal: XMV must be assignable to RMV"); + static_assert(std::is_same::value, "KokkosBlas::scal: R is const. 
" "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -83,36 +75,27 @@ void scal(const execution_space& space, const RMV& R, const AV& a, if (X.extent(0) != R.extent(0) || X.extent(1) != R.extent(1)) { std::ostringstream os; os << "KokkosBlas::scal: Dimensions of R and X do not match: " - << "R: " << R.extent(0) << " x " << R.extent(1) << ", X: " << X.extent(0) - << " x " << X.extent(1); + << "R: " << R.extent(0) << " x " << R.extent(1) << ", X: " << X.extent(0) << " x " << X.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } - using UnifiedRLayout = - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; - using UnifiedXLayout = - typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< - XMV, UnifiedRLayout>::array_layout; + using UnifiedRLayout = typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using UnifiedXLayout = typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout; // Create unmanaged versions of the input Views. RMV and XMV may be // rank 1 or rank 2. AV may be either a rank-1 View, or a scalar // value. 
- using RMV_Internal = Kokkos::View >; - using XMV_Internal = Kokkos::View >; - using AV_Internal = - typename KokkosKernels::Impl::GetUnifiedScalarViewType::type; + using AV_Internal = typename KokkosKernels::Impl::GetUnifiedScalarViewType::type; RMV_Internal R_internal = R; AV_Internal a_internal = a; XMV_Internal X_internal = X; - Impl::Scal::scal( - space, R_internal, a_internal, X_internal); + Impl::Scal::scal(space, R_internal, a_internal, X_internal); } /// \brief Computes R := alpha*X @@ -140,10 +123,8 @@ void scal(const RMV& R, const AV& a, const XMV& X) { struct SerialScale { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType& A) { - return Impl::SerialScaleInternal::invoke( - A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_1()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType& A) { + return Impl::SerialScaleInternal::invoke(A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_1()); } }; @@ -154,11 +135,8 @@ struct SerialScale { template struct TeamScale { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, - const ScalarType alpha, - const AViewType& A) { - return Impl::TeamScaleInternal::invoke(member, A.extent(0), A.extent(1), - alpha, A.data(), A.stride_0(), + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const ScalarType alpha, const AViewType& A) { + return Impl::TeamScaleInternal::invoke(member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_1()); } }; @@ -170,12 +148,9 @@ struct TeamScale { template struct TeamVectorScale { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, - const ScalarType alpha, - const AViewType& A) { - return Impl::TeamVectorScaleInternal::invoke(member, A.extent(0), - A.extent(1), alpha, A.data(), - A.stride_0(), A.stride_1()); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const ScalarType alpha, const 
AViewType& A) { + return Impl::TeamVectorScaleInternal::invoke(member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), + A.stride_1()); } }; diff --git a/blas/src/KokkosBlas1_set.hpp b/blas/src/KokkosBlas1_set.hpp index ea31ff6282..6a6a5e0f22 100644 --- a/blas/src/KokkosBlas1_set.hpp +++ b/blas/src/KokkosBlas1_set.hpp @@ -27,10 +27,8 @@ namespace KokkosBlas { struct SerialSet { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A) { - return Impl::SerialSetInternal::invoke( - A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_1()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A) { + return Impl::SerialSetInternal::invoke(A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_1()); } }; @@ -41,12 +39,8 @@ struct SerialSet { template struct TeamSet { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A) { - return Impl::TeamSetInternal::invoke(member, A.extent(0), A.extent(1), - alpha, A.data(), A.stride_0(), - A.stride_1()); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A) { + return Impl::TeamSetInternal::invoke(member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_1()); } }; @@ -57,11 +51,8 @@ struct TeamSet { template struct TeamVectorSet { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A) { - return Impl::TeamVectorSetInternal::invoke(member, A.extent(0), A.extent(1), - alpha, A.data(), A.stride_0(), + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A) { + return Impl::TeamVectorSetInternal::invoke(member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_1()); } }; diff --git a/blas/src/KokkosBlas1_sum.hpp b/blas/src/KokkosBlas1_sum.hpp index 
88c7b10021..dffd19382e 100644 --- a/blas/src/KokkosBlas1_sum.hpp +++ b/blas/src/KokkosBlas1_sum.hpp @@ -33,40 +33,32 @@ namespace KokkosBlas { /// /// \return The sum product result; a single value. template , - int>::type = 0> -typename XVector::non_const_value_type sum(const execution_space& space, - const XVector& x) { + typename std::enable_if, int>::type = 0> +typename XVector::non_const_value_type sum(const execution_space& space, const XVector& x) { static_assert(Kokkos::is_execution_space_v, "KokkosBlas::sum: execution_space must be a valid Kokkos " "execution space"); - static_assert(Kokkos::is_view::value, - "KokkosBlas::sum: XVector must be a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::sum: XVector must be accessible from execution_space."); + static_assert(Kokkos::is_view::value, "KokkosBlas::sum: XVector must be a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::sum: XVector must be accessible from execution_space."); static_assert(XVector::rank == 1, "KokkosBlas::sum: " "Both Vector inputs must have rank 1."); - using XVector_Internal = Kokkos::View< - typename XVector::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, Kokkos::MemoryTraits >; + using XVector_Internal = Kokkos::View::array_layout, + typename XVector::device_type, Kokkos::MemoryTraits >; using layout_t = typename XVector_Internal::array_layout; - using RVector_Internal = - Kokkos::View >; + using RVector_Internal = Kokkos::View >; typename XVector::non_const_value_type result; RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; - Impl::Sum::sum(space, R, - X); + Impl::Sum::sum(space, R, X); space.fence(); return result; } @@ -113,17 +105,13 @@ void sum(const execution_space& space, const RV& R, const XMV& X, static_assert(Kokkos::is_view::value, "KokkosBlas::sum: " "X is not a Kokkos::View."); - 
static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::sum: XMV must be accessible from execution_space."); - static_assert(std::is_same::value, + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::sum: XMV must be accessible from execution_space."); + static_assert(std::is_same::value, "KokkosBlas::sum: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert(((RV::rank == 0) && (XMV::rank == 1)) || - ((RV::rank == 1) && (XMV::rank == 2)), + static_assert(((RV::rank == 0) && (XMV::rank == 1)) || ((RV::rank == 1) && (XMV::rank == 2)), "KokkosBlas::sum: " "RV and XMV must either have rank 0 and 1 or rank 1 and 2."); @@ -131,33 +119,26 @@ void sum(const execution_space& space, const RV& R, const XMV& X, if (X.extent(1) != R.extent(0)) { std::ostringstream os; os << "KokkosBlas::sum (MV): Dimensions of R and X do not match: " - << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " - << X.extent(1); + << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " << X.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } - using UnifiedXLayout = - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; - using UnifiedRVLayout = - typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< - RV, UnifiedXLayout>::array_layout; + using UnifiedXLayout = typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using UnifiedRVLayout = typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout; // Create unmanaged versions of the input Views. RV and XMV may be // rank 1 or rank 2. 
- typedef Kokkos::View > RV_Internal; - typedef Kokkos::View > XMV_Internal; RV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Sum::sum(space, R_internal, - X_internal); + Impl::Sum::sum(space, R_internal, X_internal); } /// \brief R(j) = sum(X(i,j)) @@ -176,8 +157,7 @@ void sum(const execution_space& space, const RV& R, const XMV& X, /// \param R [out] Output View (rank 0 or 1) containing the results. /// \param X [in] Input View (rank 1 or 2). template -void sum(const RV& R, const XMV& X, - typename std::enable_if::value, int>::type = 0) { +void sum(const RV& R, const XMV& X, typename std::enable_if::value, int>::type = 0) { sum(typename XMV::execution_space{}, R, X); } diff --git a/blas/src/KokkosBlas1_swap.hpp b/blas/src/KokkosBlas1_swap.hpp index 9ddcd106df..30155f5d44 100644 --- a/blas/src/KokkosBlas1_swap.hpp +++ b/blas/src/KokkosBlas1_swap.hpp @@ -42,44 +42,32 @@ namespace KokkosBlas { template void swap(execution_space const& space, XVector const& x, YVector const& y) { // Assert properties of XVector - static_assert(Kokkos::is_view::value, - "KokkosBlas::swap: XVector must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBlas::swap: XVector must be a Kokkos::View."); static_assert(XVector::rank == 1, "KokkosBlas::swap: " "Input vector x must have rank 1."); - static_assert(std::is_same_v, + static_assert(std::is_same_v, "KokkosBlas::swap: XVector must store non const values."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "swap: execution_space cannot access data in XVector"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "swap: execution_space cannot access data in XVector"); // Assert properties of YVector, could probably use a function for this as // XVector and YVector are required to have identical properties... 
- static_assert(Kokkos::is_view::value, - "KokkosBlas::swap: YVector must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBlas::swap: YVector must be a Kokkos::View."); static_assert(YVector::rank == 1, "KokkosBlas::swap: " "Input vector y must have rank 1."); - static_assert(std::is_same_v, + static_assert(std::is_same_v, "KokkosBlas::swap: YVector must store non const values."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "swap: execution_space cannot access data in YVector"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "swap: execution_space cannot access data in YVector"); using XVector_Internal = Kokkos::View< - typename XVector::non_const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - Kokkos::Device, - Kokkos::MemoryTraits >; + typename XVector::non_const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + Kokkos::Device, Kokkos::MemoryTraits >; using YVector_Internal = Kokkos::View< - typename YVector::non_const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - Kokkos::Device, - Kokkos::MemoryTraits >; + typename YVector::non_const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + Kokkos::Device, Kokkos::MemoryTraits >; XVector_Internal X(x); YVector_Internal Y(y); @@ -92,8 +80,7 @@ void swap(execution_space const& space, XVector const& x, YVector const& y) { Kokkos::Profiling::pushRegion("KokkosBlas::swap"); // If X.extent(0) == 0, do nothing if (X.extent(0) != 0) { - Impl::Swap::swap(space, - X, Y); + Impl::Swap::swap(space, X, Y); } Kokkos::Profiling::popRegion(); } @@ -111,8 +98,7 @@ void swap(execution_space const& space, XVector const& x, YVector const& y) { /// executed on the default stream of the execution_space associted with x. 
template void swap(const XVector& x, const YVector& y) { - const typename XVector::execution_space space = - typename XVector::execution_space(); + const typename XVector::execution_space space = typename XVector::execution_space(); swap(space, x, y); } diff --git a/blas/src/KokkosBlas1_team_abs.hpp b/blas/src/KokkosBlas1_team_abs.hpp index 55dcc668db..a7e808c713 100644 --- a/blas/src/KokkosBlas1_team_abs.hpp +++ b/blas/src/KokkosBlas1_team_abs.hpp @@ -23,8 +23,7 @@ namespace KokkosBlas { namespace Experimental { template -void KOKKOS_INLINE_FUNCTION abs(const TeamType& team, const RVector& r, - const XVector& x) { +void KOKKOS_INLINE_FUNCTION abs(const TeamType& team, const RVector& r, const XVector& x) { Impl::TeamAbs::team_abs(team, r, x); } diff --git a/blas/src/KokkosBlas1_team_axpby.hpp b/blas/src/KokkosBlas1_team_axpby.hpp index 374bc42390..1b8734a852 100644 --- a/blas/src/KokkosBlas1_team_axpby.hpp +++ b/blas/src/KokkosBlas1_team_axpby.hpp @@ -23,21 +23,16 @@ namespace KokkosBlas { namespace Experimental { template -void KOKKOS_INLINE_FUNCTION -axpby(const TeamType& team, const typename XVector::non_const_value_type& a, - const XVector& x, const typename YVector::non_const_value_type& b, - const YVector& y) { - return Impl::TeamAXPBY::team_axpby(team, a, x, b, - y); +void KOKKOS_INLINE_FUNCTION axpby(const TeamType& team, const typename XVector::non_const_value_type& a, + const XVector& x, const typename YVector::non_const_value_type& b, const YVector& y) { + return Impl::TeamAXPBY::team_axpby(team, a, x, b, y); } template -void KOKKOS_INLINE_FUNCTION -axpy(const TeamType& team, const typename XVector::non_const_value_type& a, - const XVector& x, const YVector& y) { +void KOKKOS_INLINE_FUNCTION axpy(const TeamType& team, const typename XVector::non_const_value_type& a, + const XVector& x, const YVector& y) { KokkosBlas::Experimental::axpby( - team, a, x, - Kokkos::ArithTraits::one(), y); + team, a, x, Kokkos::ArithTraits::one(), y); } } // namespace 
Experimental diff --git a/blas/src/KokkosBlas1_team_dot.hpp b/blas/src/KokkosBlas1_team_dot.hpp index 25c5c05cfc..53065b6fae 100644 --- a/blas/src/KokkosBlas1_team_dot.hpp +++ b/blas/src/KokkosBlas1_team_dot.hpp @@ -23,9 +23,9 @@ namespace KokkosBlas { namespace Experimental { template -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::dot_type KOKKOS_INLINE_FUNCTION -dot(const TeamType& team, const XVector& x, const YVector& y) { +typename Kokkos::Details::InnerProductSpaceTraits::dot_type + KOKKOS_INLINE_FUNCTION + dot(const TeamType& team, const XVector& x, const YVector& y) { return Impl::TeamDot::team_dot(team, x, y); } diff --git a/blas/src/KokkosBlas1_team_mult.hpp b/blas/src/KokkosBlas1_team_mult.hpp index 2737f835c0..08d9c6813e 100644 --- a/blas/src/KokkosBlas1_team_mult.hpp +++ b/blas/src/KokkosBlas1_team_mult.hpp @@ -23,12 +23,10 @@ namespace KokkosBlas { namespace Experimental { template -void KOKKOS_INLINE_FUNCTION -mult(const TeamType& team, const typename YVector::non_const_value_type& gamma, - const YVector& y, const typename AVector::non_const_value_type& alpha, - const AVector& a, const XVector& x) { - return Impl::TeamMult::team_mult( - team, gamma, y, alpha, a, x); +void KOKKOS_INLINE_FUNCTION mult(const TeamType& team, const typename YVector::non_const_value_type& gamma, + const YVector& y, const typename AVector::non_const_value_type& alpha, + const AVector& a, const XVector& x) { + return Impl::TeamMult::team_mult(team, gamma, y, alpha, a, x); } } // namespace Experimental diff --git a/blas/src/KokkosBlas1_team_nrm2.hpp b/blas/src/KokkosBlas1_team_nrm2.hpp index ee58cd3331..f0ac33f4f2 100644 --- a/blas/src/KokkosBlas1_team_nrm2.hpp +++ b/blas/src/KokkosBlas1_team_nrm2.hpp @@ -23,9 +23,9 @@ namespace KokkosBlas { namespace Experimental { template -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type KOKKOS_INLINE_FUNCTION -nrm2(const TeamType& team, 
const XVector& x) { +typename Kokkos::Details::InnerProductSpaceTraits::mag_type + KOKKOS_INLINE_FUNCTION + nrm2(const TeamType& team, const XVector& x) { return Impl::TeamNrm2::team_nrm2(team, x); } diff --git a/blas/src/KokkosBlas1_team_scal.hpp b/blas/src/KokkosBlas1_team_scal.hpp index b148e165f1..31d0c63b6d 100644 --- a/blas/src/KokkosBlas1_team_scal.hpp +++ b/blas/src/KokkosBlas1_team_scal.hpp @@ -23,9 +23,8 @@ namespace KokkosBlas { namespace Experimental { template -void KOKKOS_INLINE_FUNCTION -scal(const TeamType& team, const RVector& r, - const typename XVector::non_const_value_type& a, const XVector& x) { +void KOKKOS_INLINE_FUNCTION scal(const TeamType& team, const RVector& r, + const typename XVector::non_const_value_type& a, const XVector& x) { return Impl::TeamScal::team_scal(team, r, a, x); } diff --git a/blas/src/KokkosBlas1_team_update.hpp b/blas/src/KokkosBlas1_team_update.hpp index 069932b1e5..587c492c6e 100644 --- a/blas/src/KokkosBlas1_team_update.hpp +++ b/blas/src/KokkosBlas1_team_update.hpp @@ -23,13 +23,11 @@ namespace KokkosBlas { namespace Experimental { template -void KOKKOS_INLINE_FUNCTION -update(const TeamType& team, - const typename XVector::non_const_value_type& alpha, const XVector& x, - const typename YVector::non_const_value_type& beta, const YVector& y, - const typename ZVector::non_const_value_type& gamma, const ZVector& z) { - return Impl::TeamUpdate::team_update( - team, alpha, x, beta, y, gamma, z); +void KOKKOS_INLINE_FUNCTION update(const TeamType& team, const typename XVector::non_const_value_type& alpha, + const XVector& x, const typename YVector::non_const_value_type& beta, + const YVector& y, const typename ZVector::non_const_value_type& gamma, + const ZVector& z) { + return Impl::TeamUpdate::team_update(team, alpha, x, beta, y, gamma, z); } } // namespace Experimental diff --git a/blas/src/KokkosBlas1_update.hpp b/blas/src/KokkosBlas1_update.hpp index 889f9ede32..95d1a2d7e0 100644 --- 
a/blas/src/KokkosBlas1_update.hpp +++ b/blas/src/KokkosBlas1_update.hpp @@ -44,8 +44,7 @@ namespace KokkosBlas { /// \param gamma [in] scaling parameter for Z /// \param Z [in/out] view of type ZMV in which the results will be stored. template -void update(const execution_space& space, - const typename XMV::non_const_value_type& alpha, const XMV& X, +void update(const execution_space& space, const typename XMV::non_const_value_type& alpha, const XMV& X, const typename YMV::non_const_value_type& beta, const YMV& Y, const typename ZMV::non_const_value_type& gamma, const ZMV& Z) { static_assert(Kokkos::is_execution_space_v, @@ -60,20 +59,13 @@ void update(const execution_space& space, static_assert(Kokkos::is_view::value, "KokkosBlas::update: " "Z is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::update: XMV must be accessible from execution_space."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::update: YMV must be accessible from execution_space."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::update: ZMV must be accessible from execution_space."); - static_assert(std::is_same::value, + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::update: XMV must be accessible from execution_space."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::update: YMV must be accessible from execution_space."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::update: ZMV must be accessible from execution_space."); + static_assert(std::is_same::value, "KokkosBlas::update: Z is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -88,37 +80,32 @@ void update(const execution_space& space, "XMV, YMV, and ZMV must either have rank 1 or rank 2."); // Check compatibility of dimensions at run time. 
- if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1) || - X.extent(0) != Z.extent(0) || X.extent(1) != Z.extent(1)) { + if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1) || X.extent(0) != Z.extent(0) || + X.extent(1) != Z.extent(1)) { std::ostringstream os; os << "KokkosBlas::update (MV): Dimensions of X, Y, and Z do not match: " - << "Z: " << Z.extent(0) << " x " << Z.extent(1) << ", X: " << X.extent(0) - << " x " << X.extent(1) << ", Y: " << Y.extent(0) << " x " - << Y.extent(1); + << "Z: " << Z.extent(0) << " x " << Z.extent(1) << ", X: " << X.extent(0) << " x " << X.extent(1) + << ", Y: " << Y.extent(0) << " x " << Y.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } // Create unmanaged versions of the input Views. XMV, YMV, and ZMV // may be rank 1 or rank 2, but they must all have the same rank. - using XMV_Internal = Kokkos::View< - typename std::conditional::type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XMV::device_type, Kokkos::MemoryTraits >; + using XMV_Internal = Kokkos::View::type, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename XMV::device_type, Kokkos::MemoryTraits >; - using YMV_Internal = Kokkos::View< - typename std::conditional::type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename YMV::device_type, Kokkos::MemoryTraits >; + using YMV_Internal = Kokkos::View::type, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename YMV::device_type, Kokkos::MemoryTraits >; - using ZMV_Internal = Kokkos::View< - typename std::conditional::type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename ZMV::device_type, Kokkos::MemoryTraits >; + using ZMV_Internal = Kokkos::View::type, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename ZMV::device_type, Kokkos::MemoryTraits >; XMV_Internal X_internal = X; YMV_Internal Y_internal = Y; @@ -134,9 +121,8 @@ void update(const 
execution_space& space, << endl; #endif // KOKKOSKERNELS_PRINT_DEMANGLED_TYPE_INFO - Impl::Update::update(space, alpha, X_internal, beta, Y_internal, - gamma, Z_internal); + Impl::Update::update(space, alpha, X_internal, beta, + Y_internal, gamma, Z_internal); } /// \brief Compute Z := alpha*X + beta*Y + gamma*Z. diff --git a/blas/src/KokkosBlas2_gemv.hpp b/blas/src/KokkosBlas2_gemv.hpp index 88ffc63810..22d2b7bbbf 100644 --- a/blas/src/KokkosBlas2_gemv.hpp +++ b/blas/src/KokkosBlas2_gemv.hpp @@ -49,56 +49,39 @@ namespace KokkosBlas { /// \param x [in] Input vector, as a 1-D Kokkos::View /// \param beta [in] Input coefficient of y /// \param y [in/out] Output vector, as a nonconst 1-D Kokkos::View -template -void gemv(const ExecutionSpace& space, const char trans[], - typename AViewType::const_value_type& alpha, const AViewType& A, - const XViewType& x, typename YViewType::const_value_type& beta, - const YViewType& y) { +template +void gemv(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, + const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, const YViewType& y) { static_assert(Kokkos::is_execution_space_v, "KokkosBlas::gemv: ExecutionSpace must be a valid Kokkos " "execution space."); - static_assert(Kokkos::is_view::value, - "KokkosBlas::gemv: AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBlas::gemv: XViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBlas::gemv: YViewType must be a Kokkos::View."); - static_assert(static_cast(AViewType::rank) == 2, - "KokkosBlas::gemv: AViewType must have rank 2."); - static_assert(static_cast(XViewType::rank) == 1, - "KokkosBlas::gemv: XViewType must have rank 1."); - static_assert(static_cast(YViewType::rank) == 1, - "KokkosBlas::gemv: YViewType must have rank 1."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::gemv: AViewType must be accessible from 
ExecutionSpace"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::gemv: XViewType must be accessible from ExecutionSpace"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::gemv: YViewType must be accessible from ExecutionSpace"); + static_assert(Kokkos::is_view::value, "KokkosBlas::gemv: AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBlas::gemv: XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBlas::gemv: YViewType must be a Kokkos::View."); + static_assert(static_cast(AViewType::rank) == 2, "KokkosBlas::gemv: AViewType must have rank 2."); + static_assert(static_cast(XViewType::rank) == 1, "KokkosBlas::gemv: XViewType must have rank 1."); + static_assert(static_cast(YViewType::rank) == 1, "KokkosBlas::gemv: YViewType must have rank 1."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::gemv: AViewType must be accessible from ExecutionSpace"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::gemv: XViewType must be accessible from ExecutionSpace"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::gemv: YViewType must be accessible from ExecutionSpace"); // Check compatibility of dimensions at run time. 
if (trans[0] == 'N' || trans[0] == 'n') { if (A.extent(0) != y.extent(0) || A.extent(1) != x.extent(0)) { std::ostringstream os; os << "KokkosBlas::gemv: Dimensions of A, x, and y do not match: " - << "A: " << A.extent(0) << " x " << A.extent(1) - << ", x: " << x.extent(0) << ", y: " << y.extent(0); + << "A: " << A.extent(0) << " x " << A.extent(1) << ", x: " << x.extent(0) << ", y: " << y.extent(0); KokkosKernels::Impl::throw_runtime_exception(os.str()); } - } else if (trans[0] == 'T' || trans[0] == 't' || trans[0] == 'C' || - trans[0] == 'c' || trans[0] == 'H' || trans[0] == 'h') { + } else if (trans[0] == 'T' || trans[0] == 't' || trans[0] == 'C' || trans[0] == 'c' || trans[0] == 'H' || + trans[0] == 'h') { if (A.extent(1) != y.extent(0) || A.extent(0) != x.extent(0)) { std::ostringstream os; os << "KokkosBlas::dot: Dimensions of A, x, and y do not match: " - << "A: " << A.extent(0) << " x " << A.extent(1) - << ", x: " << x.extent(0) << ", y: " << y.extent(0); + << "A: " << A.extent(0) << " x " << A.extent(1) << ", x: " << x.extent(0) << ", y: " << y.extent(0); KokkosKernels::Impl::throw_runtime_exception(os.str()); } } else { @@ -115,21 +98,16 @@ void gemv(const ExecutionSpace& space, const char trans[], // Minimize the number of Impl::GEMV instantiations, by // standardizing on particular View specializations for its template // parameters. 
- typedef Kokkos::View > AVT; typedef Kokkos::View::array_layout, - typename XViewType::device_type, - Kokkos::MemoryTraits > + typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout, + typename XViewType::device_type, Kokkos::MemoryTraits > XVT; typedef Kokkos::View::array_layout, - typename YViewType::device_type, - Kokkos::MemoryTraits > + typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout, + typename YViewType::device_type, Kokkos::MemoryTraits > YVT; // Degenerate case is essentially same as scal - use fallback impl @@ -139,43 +117,32 @@ void gemv(const ExecutionSpace& space, const char trans[], // If A is LayoutRight and we have the BLAS, cuBLAS or rocBLAS TPL, use // fallback because those only support LayoutLeft #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS - useFallback = useFallback || (tolower(*trans) == 'c' && - std::is_same::value && - std::is_same::value); + useFallback = useFallback || + (tolower(*trans) == 'c' && std::is_same::value && + std::is_same::value); #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS - useFallback = - useFallback || - (tolower(*trans) == 'c' && - std::is_same::value && - std::is_same::value); + useFallback = useFallback || + (tolower(*trans) == 'c' && std::is_same::value && + std::is_same::value); #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS - useFallback = useFallback || (tolower(*trans) == 'c' && - std::is_same::value && - std::is_same::value); + useFallback = useFallback || + (tolower(*trans) == 'c' && std::is_same::value && + std::is_same::value); #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL #ifdef KOKKOS_ENABLE_SYCL // oneMKL supports both row-major and column-major of A // but only supports oneapi::mkl::transpose::nontrans op useFallback = - useFallback || !std::is_same_v; + useFallback || !std::is_same_v; #endif #endif if (useFallback) { - const bool eti_spec_avail = - KokkosBlas::Impl::gemv_eti_spec_avail::value; - typedef Impl::GEMV - fallback_impl_type; + const bool eti_spec_avail = 
KokkosBlas::Impl::gemv_eti_spec_avail::value; + typedef Impl::GEMV fallback_impl_type; fallback_impl_type::gemv(space, trans, alpha, A, x, beta, y); } else { typedef Impl::GEMV impl_type; @@ -200,8 +167,7 @@ void gemv(const ExecutionSpace& space, const char trans[], /// \param beta [in] Input coefficient of y /// \param y [in/out] Output vector, as a nonconst 1-D Kokkos::View template -void gemv(const char trans[], typename AViewType::const_value_type& alpha, - const AViewType& A, const XViewType& x, +void gemv(const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, const YViewType& y) { gemv(typename AViewType::execution_space{}, trans, alpha, A, x, beta, y); } @@ -212,46 +178,38 @@ namespace Experimental { /// template struct Gemv { - template - static void KOKKOS_INLINE_FUNCTION - invoke(const MemberType& member, const char trans, const ScalarType& alpha, - const MatrixType& A, const XVector& x, const ScalarType& beta, - const YVector& y); + template + static void KOKKOS_INLINE_FUNCTION invoke(const MemberType& member, const char trans, const ScalarType& alpha, + const MatrixType& A, const XVector& x, const ScalarType& beta, + const YVector& y); }; template struct Gemv { - template - static void KOKKOS_INLINE_FUNCTION - invoke(const MemberType& /*member*/, const char trans, - const ScalarType& alpha, const MatrixType& A, const XVector& x, - const ScalarType& beta, const YVector& y) { + template + static void KOKKOS_INLINE_FUNCTION invoke(const MemberType& /*member*/, const char trans, const ScalarType& alpha, + const MatrixType& A, const XVector& x, const ScalarType& beta, + const YVector& y) { serial_gemv(trans, alpha, A, x, beta, y); } }; template struct Gemv { - template - static void KOKKOS_INLINE_FUNCTION - invoke(const MemberType& member, const char trans, const ScalarType& alpha, - const MatrixType& A, const XVector& x, const ScalarType& beta, - const YVector& y) 
{ + template + static void KOKKOS_INLINE_FUNCTION invoke(const MemberType& member, const char trans, const ScalarType& alpha, + const MatrixType& A, const XVector& x, const ScalarType& beta, + const YVector& y) { team_gemv(member, trans, alpha, A, x, beta, y); } }; template struct Gemv { - template - static void KOKKOS_INLINE_FUNCTION - invoke(const MemberType& member, const char trans, const ScalarType& alpha, - const MatrixType& A, const XVector& x, const ScalarType& beta, - const YVector& y) { + template + static void KOKKOS_INLINE_FUNCTION invoke(const MemberType& member, const char trans, const ScalarType& alpha, + const MatrixType& A, const XVector& x, const ScalarType& beta, + const YVector& y) { teamvector_gemv(member, trans, alpha, A, x, beta, y); } }; diff --git a/blas/src/KokkosBlas2_ger.hpp b/blas/src/KokkosBlas2_ger.hpp index 8650577faf..88786649ba 100644 --- a/blas/src/KokkosBlas2_ger.hpp +++ b/blas/src/KokkosBlas2_ger.hpp @@ -39,54 +39,38 @@ namespace KokkosBlas { /// \param x [in] Input vector, as a 1-D Kokkos::View /// \param y [in] Input vector, as a 1-D Kokkos::View /// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View -template -void ger(const ExecutionSpace& space, const char trans[], - const typename AViewType::const_value_type& alpha, const XViewType& x, - const YViewType& y, const AViewType& A) { - static_assert( - Kokkos::SpaceAccessibility::accessible, - "AViewType memory space must be accessible from ExecutionSpace"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "XViewType memory space must be accessible from ExecutionSpace"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "YViewType memory space must be accessible from ExecutionSpace"); - - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "XViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "YViewType must be a Kokkos::View."); - - 
static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); - static_assert(static_cast(XViewType::rank) == 1, - "XViewType must have rank 1."); - static_assert(static_cast(YViewType::rank) == 1, - "YViewType must have rank 1."); +template +void ger(const ExecutionSpace& space, const char trans[], const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, const AViewType& A) { + static_assert(Kokkos::SpaceAccessibility::accessible, + "AViewType memory space must be accessible from ExecutionSpace"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "XViewType memory space must be accessible from ExecutionSpace"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "YViewType memory space must be accessible from ExecutionSpace"); + + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "YViewType must be a Kokkos::View."); + + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); + static_assert(static_cast(YViewType::rank) == 1, "YViewType must have rank 1."); // Check compatibility of dimensions at run time. 
if ((A.extent(0) != x.extent(0)) || (A.extent(1) != y.extent(0))) { std::ostringstream os; os << "KokkosBlas::ger: Dimensions of A, x, and y do not match: " - << "A is " << A.extent(0) << " by " << A.extent(1) << ", x has size " - << x.extent(0) << ", y has size " << y.extent(0); + << "A is " << A.extent(0) << " by " << A.extent(1) << ", x has size " << x.extent(0) << ", y has size " + << y.extent(0); KokkosKernels::Impl::throw_runtime_exception(os.str()); } - if ((trans[0] == 'T') || (trans[0] == 't') || (trans[0] == 'H') || - (trans[0] == 'h')) { + if ((trans[0] == 'T') || (trans[0] == 't') || (trans[0] == 'H') || (trans[0] == 'h')) { // Ok } else { std::ostringstream os; - os << "KokkosBlas::ger: invalid trans[0] = '" << trans[0] - << "'. It must be equalt to 'T' or 't' or 'H' or 'h'"; + os << "KokkosBlas::ger: invalid trans[0] = '" << trans[0] << "'. It must be equalt to 'T' or 't' or 'H' or 'h'"; KokkosKernels::Impl::throw_runtime_exception(os.str()); } @@ -99,21 +83,16 @@ void ger(const ExecutionSpace& space, const char trans[], // Minimize the number of Impl::GER instantiations, by standardizing // on particular View specializations for its template parameters. 
typedef Kokkos::View::array_layout, - typename XViewType::device_type, - Kokkos::MemoryTraits > + typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout, + typename XViewType::device_type, Kokkos::MemoryTraits > XVT; typedef Kokkos::View::array_layout, - typename YViewType::device_type, - Kokkos::MemoryTraits > + typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout, + typename YViewType::device_type, Kokkos::MemoryTraits > YVT; - typedef Kokkos::View > AVT; @@ -133,12 +112,10 @@ void ger(const ExecutionSpace& space, const char trans[], /// \param y [in] Input vector, as a 1-D Kokkos::View /// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View template -void ger(const char trans[], const typename AViewType::const_value_type& alpha, - const XViewType& x, const YViewType& y, const AViewType& A) { - const typename AViewType::execution_space space = - typename AViewType::execution_space(); - ger( - space, trans, alpha, x, y, A); +void ger(const char trans[], const typename AViewType::const_value_type& alpha, const XViewType& x, const YViewType& y, + const AViewType& A) { + const typename AViewType::execution_space space = typename AViewType::execution_space(); + ger(space, trans, alpha, x, y, A); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas2_serial_gemv.hpp b/blas/src/KokkosBlas2_serial_gemv.hpp index 12dbf61c3a..2b52d6c5a9 100644 --- a/blas/src/KokkosBlas2_serial_gemv.hpp +++ b/blas/src/KokkosBlas2_serial_gemv.hpp @@ -23,13 +23,9 @@ namespace KokkosBlas { namespace Experimental { -template -void KOKKOS_INLINE_FUNCTION serial_gemv(const char trans, - const ScalarType& alpha, - const MatrixType& A, const XVector& x, - const ScalarType& beta, - const YVector& y) { +template +void KOKKOS_INLINE_FUNCTION serial_gemv(const char trans, const ScalarType& alpha, const MatrixType& A, + const XVector& x, const ScalarType& beta, const YVector& y) { if (trans == 'N' || trans == 'n') { using mode = 
KokkosBlas::Trans::NoTranspose; KokkosBlas::SerialGemv::invoke(alpha, A, x, beta, y); @@ -46,11 +42,8 @@ void KOKKOS_INLINE_FUNCTION serial_gemv(const char trans, // default AlgoTag template -void KOKKOS_INLINE_FUNCTION serial_gemv(const char trans, - const ScalarType& alpha, - const MatrixType& A, const XVector& x, - const ScalarType& beta, - const YVector& y) { +void KOKKOS_INLINE_FUNCTION serial_gemv(const char trans, const ScalarType& alpha, const MatrixType& A, + const XVector& x, const ScalarType& beta, const YVector& y) { serial_gemv(trans, alpha, A, x, beta, y); } diff --git a/blas/src/KokkosBlas2_syr.hpp b/blas/src/KokkosBlas2_syr.hpp index 00d1d8b3de..7cb226fd7f 100644 --- a/blas/src/KokkosBlas2_syr.hpp +++ b/blas/src/KokkosBlas2_syr.hpp @@ -64,53 +64,39 @@ namespace KokkosBlas { /// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View template void syr(const ExecutionSpace& space, const char trans[], const char uplo[], - const typename AViewType::const_value_type& alpha, const XViewType& x, - const AViewType& A) { - static_assert( - Kokkos::SpaceAccessibility::accessible, - "AViewType memory space must be accessible from ExecutionSpace"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "XViewType memory space must be accessible from ExecutionSpace"); - - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "XViewType must be a Kokkos::View."); - - static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); - static_assert(static_cast(XViewType::rank) == 1, - "XViewType must have rank 1."); + const typename AViewType::const_value_type& alpha, const XViewType& x, const AViewType& A) { + static_assert(Kokkos::SpaceAccessibility::accessible, + "AViewType memory space must be accessible from ExecutionSpace"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "XViewType memory space must be accessible from ExecutionSpace"); + + 
static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); + + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); // Check compatibility of dimensions at run time. if ((A.extent(0) != x.extent(0)) || (A.extent(1) != x.extent(0))) { std::ostringstream os; os << "KokkosBlas::syr: Dimensions of A, x: " - << "A is " << A.extent(0) << " by " << A.extent(1) << ", x has size " - << x.extent(0); + << "A is " << A.extent(0) << " by " << A.extent(1) << ", x has size " << x.extent(0); KokkosKernels::Impl::throw_runtime_exception(os.str()); } - if ((trans[0] == 'T') || (trans[0] == 't') || (trans[0] == 'H') || - (trans[0] == 'h')) { + if ((trans[0] == 'T') || (trans[0] == 't') || (trans[0] == 'H') || (trans[0] == 'h')) { // Ok } else { std::ostringstream os; - os << "KokkosBlas2::syr(): invalid trans[0] = '" << trans[0] - << "'. It must be equal to 'T' or 't' or 'H' or 'h'"; + os << "KokkosBlas2::syr(): invalid trans[0] = '" << trans[0] << "'. It must be equal to 'T' or 't' or 'H' or 'h'"; KokkosKernels::Impl::throw_runtime_exception(os.str()); } - if ((uplo[0] == 'U') || (uplo[0] == 'u') || (uplo[0] == 'L') || - (uplo[0] == 'l')) { + if ((uplo[0] == 'U') || (uplo[0] == 'u') || (uplo[0] == 'L') || (uplo[0] == 'l')) { // Ok } else { std::ostringstream oss; - oss << "KokkosBlas2::syr(): invalid uplo[0] = " << uplo[0] - << "'. It must be equal to 'U' or 'u' or 'L' or 'l'"; + oss << "KokkosBlas2::syr(): invalid uplo[0] = " << uplo[0] << "'. It must be equal to 'U' or 'u' or 'L' or 'l'"; throw std::runtime_error(oss.str()); } @@ -122,15 +108,11 @@ void syr(const ExecutionSpace& space, const char trans[], const char uplo[], // Minimize the number of Impl::SYR instantiations, by standardizing // on particular View specializations for its template parameters. 
- using XVT = - Kokkos::View::array_layout, - typename XViewType::device_type, - Kokkos::MemoryTraits >; - - using AVT = Kokkos::View::array_layout, + typename XViewType::device_type, Kokkos::MemoryTraits >; + + using AVT = Kokkos::View >; Impl::SYR::syr(space, trans, uplo, alpha, x, A); @@ -172,13 +154,10 @@ void syr(const ExecutionSpace& space, const char trans[], const char uplo[], /// \param x [in] Input vector, as a 1-D Kokkos::View /// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View template -void syr(const char trans[], const char uplo[], - const typename AViewType::const_value_type& alpha, const XViewType& x, +void syr(const char trans[], const char uplo[], const typename AViewType::const_value_type& alpha, const XViewType& x, const AViewType& A) { - const typename AViewType::execution_space space = - typename AViewType::execution_space(); - syr( - space, trans, uplo, alpha, x, A); + const typename AViewType::execution_space space = typename AViewType::execution_space(); + syr(space, trans, uplo, alpha, x, A); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas2_syr2.hpp b/blas/src/KokkosBlas2_syr2.hpp index d86abd31c1..91f4b20dee 100644 --- a/blas/src/KokkosBlas2_syr2.hpp +++ b/blas/src/KokkosBlas2_syr2.hpp @@ -78,67 +78,49 @@ namespace KokkosBlas { /// \param x [in] Input vector, as a 1-D Kokkos::View /// \param y [in] Input vector, as a 1-D Kokkos::View /// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View -template +template void syr2(const ExecutionSpace& space, const char trans[], const char uplo[], - const typename AViewType::const_value_type& alpha, const XViewType& x, - const YViewType& y, const AViewType& A) { - static_assert( - Kokkos::SpaceAccessibility::accessible, - "AViewType memory space must be accessible from ExecutionSpace"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "XViewType memory space must be accessible from ExecutionSpace"); - static_assert( - 
Kokkos::SpaceAccessibility::accessible, - "YViewType memory space must be accessible from ExecutionSpace"); - - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "XViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "YViewType must be a Kokkos::View."); - - static_assert(static_cast(AViewType::rank()) == 2, - "AViewType must have rank 2."); - static_assert(static_cast(XViewType::rank()) == 1, - "XViewType must have rank 1."); - static_assert(static_cast(YViewType::rank()) == 1, - "YViewType must have rank 1."); + const typename AViewType::const_value_type& alpha, const XViewType& x, const YViewType& y, + const AViewType& A) { + static_assert(Kokkos::SpaceAccessibility::accessible, + "AViewType memory space must be accessible from ExecutionSpace"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "XViewType memory space must be accessible from ExecutionSpace"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "YViewType memory space must be accessible from ExecutionSpace"); + + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "YViewType must be a Kokkos::View."); + + static_assert(static_cast(AViewType::rank()) == 2, "AViewType must have rank 2."); + static_assert(static_cast(XViewType::rank()) == 1, "XViewType must have rank 1."); + static_assert(static_cast(YViewType::rank()) == 1, "YViewType must have rank 1."); // Check compatibility of dimensions at run time. 
- if ((A.extent(0) == A.extent(1)) && (A.extent(0) == x.extent(0)) && - (A.extent(0) == y.extent(0))) { + if ((A.extent(0) == A.extent(1)) && (A.extent(0) == x.extent(0)) && (A.extent(0) == y.extent(0))) { // Ok } else { std::ostringstream os; os << "KokkosBlas::syr2: Dimensions of A, x: " - << "A is " << A.extent(0) << " by " << A.extent(1) << ", x has size " - << x.extent(0) << ", y has size " << y.extent(0); + << "A is " << A.extent(0) << " by " << A.extent(1) << ", x has size " << x.extent(0) << ", y has size " + << y.extent(0); KokkosKernels::Impl::throw_runtime_exception(os.str()); } - if ((trans[0] == 'T') || (trans[0] == 't') || (trans[0] == 'H') || - (trans[0] == 'h')) { + if ((trans[0] == 'T') || (trans[0] == 't') || (trans[0] == 'H') || (trans[0] == 'h')) { // Ok } else { std::ostringstream os; - os << "KokkosBlas2::syr2(): invalid trans[0] = '" << trans[0] - << "'. It must be equalt to 'T' or 't' or 'H' or 'h'"; + os << "KokkosBlas2::syr2(): invalid trans[0] = '" << trans[0] << "'. It must be equalt to 'T' or 't' or 'H' or 'h'"; KokkosKernels::Impl::throw_runtime_exception(os.str()); } - if ((uplo[0] == 'U') || (uplo[0] == 'u') || (uplo[0] == 'L') || - (uplo[0] == 'l')) { + if ((uplo[0] == 'U') || (uplo[0] == 'u') || (uplo[0] == 'L') || (uplo[0] == 'l')) { // Ok } else { std::ostringstream oss; - oss << "KokkosBlas2::syr2(): invalid uplo[0] = " << uplo[0] - << "'. It must be equalt to 'U' or 'u' or 'L' or 'l'"; + oss << "KokkosBlas2::syr2(): invalid uplo[0] = " << uplo[0] << "'. It must be equalt to 'U' or 'u' or 'L' or 'l'"; throw std::runtime_error(oss.str()); } @@ -151,26 +133,20 @@ void syr2(const ExecutionSpace& space, const char trans[], const char uplo[], // Minimize the number of Impl::SYR2 instantiations, by standardizing // on particular View specializations for its template parameters. 
typedef Kokkos::View::array_layout, - typename XViewType::device_type, - Kokkos::MemoryTraits > + typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout, + typename XViewType::device_type, Kokkos::MemoryTraits > XVT; typedef Kokkos::View::array_layout, - typename YViewType::device_type, - Kokkos::MemoryTraits > + typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout, + typename YViewType::device_type, Kokkos::MemoryTraits > YVT; - typedef Kokkos::View > AVT; - Impl::SYR2::syr2(space, trans, uplo, alpha, x, - y, A); + Impl::SYR2::syr2(space, trans, uplo, alpha, x, y, A); } /// \brief Rank-1 update (just lower portion or just upper portion) of a @@ -224,13 +200,10 @@ void syr2(const ExecutionSpace& space, const char trans[], const char uplo[], /// \param y [in] Input vector, as a 1-D Kokkos::View /// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View template -void syr2(const char trans[], const char uplo[], - const typename AViewType::const_value_type& alpha, const XViewType& x, +void syr2(const char trans[], const char uplo[], const typename AViewType::const_value_type& alpha, const XViewType& x, const YViewType& y, const AViewType& A) { - const typename AViewType::execution_space space = - typename AViewType::execution_space(); - syr2( - space, trans, uplo, alpha, x, y, A); + const typename AViewType::execution_space space = typename AViewType::execution_space(); + syr2(space, trans, uplo, alpha, x, y, A); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas2_team_gemv.hpp b/blas/src/KokkosBlas2_team_gemv.hpp index 09a1ae2330..a4a6dade2d 100644 --- a/blas/src/KokkosBlas2_team_gemv.hpp +++ b/blas/src/KokkosBlas2_team_gemv.hpp @@ -22,67 +22,48 @@ namespace KokkosBlas { namespace Experimental { -template -void KOKKOS_INLINE_FUNCTION team_gemv(const TeamType& team, const char trans, - const ScalarType& alpha, - const MatrixType& A, const XVector& x, - const ScalarType& beta, - const YVector& y) { +template +void 
KOKKOS_INLINE_FUNCTION team_gemv(const TeamType& team, const char trans, const ScalarType& alpha, + const MatrixType& A, const XVector& x, const ScalarType& beta, const YVector& y) { if (trans == 'N' || trans == 'n') - TeamGemv::invoke(team, alpha, A, x, - beta, y); + TeamGemv::invoke(team, alpha, A, x, beta, y); else if (trans == 'T' || trans == 't') - TeamGemv::invoke(team, alpha, A, x, - beta, y); + TeamGemv::invoke(team, alpha, A, x, beta, y); else if (trans == 'C' || trans == 'c') - TeamGemv::invoke(team, alpha, A, x, - beta, y); + TeamGemv::invoke(team, alpha, A, x, beta, y); else { Kokkos::abort("Matrix mode not supported"); } } // default AlgoTag -template -void KOKKOS_INLINE_FUNCTION team_gemv(const TeamType& team, const char trans, - const ScalarType& alpha, - const MatrixType& A, const XVector& x, - const ScalarType& beta, - const YVector& y) { +template +void KOKKOS_INLINE_FUNCTION team_gemv(const TeamType& team, const char trans, const ScalarType& alpha, + const MatrixType& A, const XVector& x, const ScalarType& beta, const YVector& y) { team_gemv(team, trans, alpha, A, x, beta, y); } -template -void KOKKOS_INLINE_FUNCTION -teamvector_gemv(const TeamType& team, const char trans, const ScalarType& alpha, - const MatrixType& A, const XVector& x, const ScalarType& beta, - const YVector& y) { +template +void KOKKOS_INLINE_FUNCTION teamvector_gemv(const TeamType& team, const char trans, const ScalarType& alpha, + const MatrixType& A, const XVector& x, const ScalarType& beta, + const YVector& y) { if (trans == 'N' || trans == 'n') { - KokkosBlas::TeamVectorGemv::invoke( - team, alpha, A, x, beta, y); + KokkosBlas::TeamVectorGemv::invoke(team, alpha, A, x, beta, y); } else if (trans == 'T' || trans == 't') { - KokkosBlas::TeamVectorGemv::invoke( - team, alpha, A, x, beta, y); + KokkosBlas::TeamVectorGemv::invoke(team, alpha, A, x, beta, y); } else if (trans == 'C' || trans == 'c') { - KokkosBlas::TeamVectorGemv::invoke( - team, alpha, A, x, beta, y); + 
KokkosBlas::TeamVectorGemv::invoke(team, alpha, A, x, beta, y); } else { Kokkos::abort("Matrix mode not supported"); } } // default AlgoTag -template -void KOKKOS_INLINE_FUNCTION -team_vector_gemv(const TeamType& team, const char trans, - const ScalarType& alpha, const MatrixType& A, const XVector& x, - const ScalarType& beta, const YVector& y) { - teamvector_gemv(team, trans, alpha, A, x, - beta, y); +template +void KOKKOS_INLINE_FUNCTION team_vector_gemv(const TeamType& team, const char trans, const ScalarType& alpha, + const MatrixType& A, const XVector& x, const ScalarType& beta, + const YVector& y) { + teamvector_gemv(team, trans, alpha, A, x, beta, y); } } // namespace Experimental diff --git a/blas/src/KokkosBlas3_gemm.hpp b/blas/src/KokkosBlas3_gemm.hpp index febd39b149..b0bff7ea71 100644 --- a/blas/src/KokkosBlas3_gemm.hpp +++ b/blas/src/KokkosBlas3_gemm.hpp @@ -38,31 +38,21 @@ namespace Impl { // This case must be intercepted here rather than impl in order to call TPL // GEMV instead of TPL GEMM. This codepath was measured to be profitable with // cuBLAS. 
-template +template bool gemv_based_gemm( - const execution_space& space, const char transA[], const char transB[], - typename AViewType::const_value_type& alpha, const AViewType& A, - const BViewType& B, typename CViewType::const_value_type& beta, - const CViewType& C, - typename std::enable_if::value && - !std::is_same::value>::type* = + const execution_space& space, const char transA[], const char transB[], typename AViewType::const_value_type& alpha, + const AViewType& A, const BViewType& B, typename CViewType::const_value_type& beta, const CViewType& C, + typename std::enable_if::value && + !std::is_same::value>::type* = nullptr) { - if (toupper(transA[0]) == 'N' && toupper(transB[0]) == 'N' && - B.extent(1) == size_t(1)) { + if (toupper(transA[0]) == 'N' && toupper(transB[0]) == 'N' && B.extent(1) == size_t(1)) { // since B/C both have a single column and are not LayoutStride, // can create a raw contiguous rank-1 vector from them rather than using // subview. - Kokkos::View> Bvec(B.data(), B.extent(0)); - Kokkos::View> Cvec(C.data(), C.extent(0)); KokkosBlas::gemv(space, "N", alpha, A, Bvec, beta, Cvec); @@ -76,15 +66,11 @@ bool gemv_based_gemm( // tests. 
template bool gemv_based_gemm( - const typename CViewType::execution_space& /*space*/, - const char /*transA*/[], const char /*transB*/[], - typename AViewType::const_value_type& /*alpha*/, const AViewType& /*A*/, - const BViewType& /*B*/, typename CViewType::const_value_type& /*beta*/, - const CViewType& /*C*/, - typename std::enable_if::value || - std::is_same::value>::type* = + const typename CViewType::execution_space& /*space*/, const char /*transA*/[], const char /*transB*/[], + typename AViewType::const_value_type& /*alpha*/, const AViewType& /*A*/, const BViewType& /*B*/, + typename CViewType::const_value_type& /*beta*/, const CViewType& /*C*/, + typename std::enable_if::value || + std::is_same::value>::type* = nullptr) { return false; } @@ -108,52 +94,35 @@ bool gemv_based_gemm( /// \param B [in] Input matrix, as a 2-D Kokkos::View /// \param beta [in] Input coefficient of C /// \param C [in/out] Output vector, as a nonconst 2-D Kokkos::View -template -void gemm(const execution_space& space, const char transA[], - const char transB[], typename AViewType::const_value_type& alpha, - const AViewType& A, const BViewType& B, +template +void gemm(const execution_space& space, const char transA[], const char transB[], + typename AViewType::const_value_type& alpha, const AViewType& A, const BViewType& B, typename CViewType::const_value_type& beta, const CViewType& C) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) static_assert(Kokkos::is_execution_space_v, "KokkosBlas::gemm: execution_space must be a valid Kokkos " "execution space"); - static_assert(Kokkos::is_view::value, - "KokkosBlas::gemm: AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBlas::gemm: BViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBlas::gemm: CViewType must be a Kokkos::View."); - static_assert(static_cast(AViewType::rank) == 2, - "KokkosBlas::gemm: AViewType must have rank 2."); - static_assert(static_cast(BViewType::rank) == 
2, - "KokkosBlas::gemm: BViewType must have rank 2."); - static_assert(static_cast(CViewType::rank) == 2, - "KokkosBlas::gemm: CViewType must have rank 2."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::gemm: AViewType must be accessible from execution_space"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::gemm: BViewType must be accessible from execution_space"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::gemm: CViewType must be accessible from execution_space"); + static_assert(Kokkos::is_view::value, "KokkosBlas::gemm: AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBlas::gemm: BViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBlas::gemm: CViewType must be a Kokkos::View."); + static_assert(static_cast(AViewType::rank) == 2, "KokkosBlas::gemm: AViewType must have rank 2."); + static_assert(static_cast(BViewType::rank) == 2, "KokkosBlas::gemm: BViewType must have rank 2."); + static_assert(static_cast(CViewType::rank) == 2, "KokkosBlas::gemm: CViewType must have rank 2."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::gemm: AViewType must be accessible from execution_space"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::gemm: BViewType must be accessible from execution_space"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::gemm: CViewType must be accessible from execution_space"); // Check validity of transpose argument - bool valid_transA = (transA[0] == 'N') || (transA[0] == 'n') || - (transA[0] == 'T') || (transA[0] == 't') || + bool valid_transA = (transA[0] == 'N') || (transA[0] == 'n') || (transA[0] == 'T') || (transA[0] == 't') || (transA[0] == 'C') || (transA[0] == 'c'); - bool valid_transB = (transB[0] == 'N') || (transB[0] == 'n') || - (transB[0] == 'T') || (transB[0] == 't') || + bool valid_transB = (transB[0] == 'N') || 
(transB[0] == 'n') || (transB[0] == 'T') || (transB[0] == 't') || (transB[0] == 'C') || (transB[0] == 'c'); if (!(valid_transA && valid_transB)) { std::ostringstream os; - os << "KokkosBlas::gemm: transA[0] = '" << transA[0] << " transB[0] = '" - << transB[0] << "'. " + os << "KokkosBlas::gemm: transA[0] = '" << transA[0] << " transB[0] = '" << transB[0] << "'. " << "Valid values include 'N' or 'n' (No transpose), 'T' or 't' " "(Transpose), " "and 'C' or 'c' (Conjugate transpose)."; @@ -172,13 +141,11 @@ void gemm(const execution_space& space, const char transA[], int64_t C0 = C.extent(0); int64_t C1 = C.extent(1); - if (((A_t ? A1 : A0) != C0) || ((B_t ? B_0 : B1) != C1) || - ((A_t ? A0 : A1) != (B_t ? B1 : B_0))) { + if (((A_t ? A1 : A0) != C0) || ((B_t ? B_0 : B1) != C1) || ((A_t ? A0 : A1) != (B_t ? B1 : B_0))) { std::ostringstream os; os << "KokkosBlas::gemm: Dimensions of A, B, and C do not match: " - << "transA: " << transA[0] << " transB: " << transB[0] - << " A: " << A.extent(0) << " x " << A.extent(1) << " B: " << B.extent(0) - << " x " << B.extent(1) << " C: " << C.extent(0) << " x " << C.extent(1); + << "transA: " << transA[0] << " transB: " << transB[0] << " A: " << A.extent(0) << " x " << A.extent(1) + << " B: " << B.extent(0) << " x " << B.extent(1) << " C: " << C.extent(0) << " x " << C.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } #endif // KOKKOSKERNELS_DEBUG_LEVEL > 0 @@ -195,24 +162,19 @@ void gemm(const execution_space& space, const char transA[], } // Check if gemv code path is allowed and profitable, and if so run it. - if (Impl::gemv_based_gemm(space, transA, transB, alpha, A, B, beta, C)) - return; + if (Impl::gemv_based_gemm(space, transA, transB, alpha, A, B, beta, C)) return; // Minimize the number of Impl::GEMM instantiations, by // standardizing on particular View specializations for its template // parameters. 
- typedef Kokkos::View< - typename AViewType::const_value_type**, typename AViewType::array_layout, - typename AViewType::device_type, Kokkos::MemoryTraits> + typedef Kokkos::View> AVT; - typedef Kokkos::View< - typename BViewType::const_value_type**, typename BViewType::array_layout, - typename BViewType::device_type, Kokkos::MemoryTraits> + typedef Kokkos::View> BVT; - typedef Kokkos::View> + typedef Kokkos::View> CVT; typedef Impl::GEMM impl_type; impl_type::gemm(space, transA, transB, alpha, A, B, beta, C); @@ -236,12 +198,9 @@ void gemm(const execution_space& space, const char transA[], /// \param beta [in] Input coefficient of C /// \param C [in/out] Output vector, as a nonconst 2-D Kokkos::View template -void gemm(const char transA[], const char transB[], - typename AViewType::const_value_type& alpha, const AViewType& A, - const BViewType& B, typename CViewType::const_value_type& beta, - const CViewType& C) { - gemm(typename CViewType::execution_space{}, transA, transB, alpha, A, B, beta, - C); +void gemm(const char transA[], const char transB[], typename AViewType::const_value_type& alpha, const AViewType& A, + const BViewType& B, typename CViewType::const_value_type& beta, const CViewType& C) { + gemm(typename CViewType::execution_space{}, transA, transB, alpha, A, B, beta, C); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas3_trmm.hpp b/blas/src/KokkosBlas3_trmm.hpp index bdc86d4d9e..9da47b7160 100644 --- a/blas/src/KokkosBlas3_trmm.hpp +++ b/blas/src/KokkosBlas3_trmm.hpp @@ -64,29 +64,19 @@ namespace KokkosBlas { /// On entry, M-by-N matrix /// On exit, overwritten with the solution template -void trmm(const execution_space& space, const char side[], const char uplo[], - const char trans[], const char diag[], - typename BViewType::const_value_type& alpha, const AViewType& A, - const BViewType& B) { - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "BViewType must be a 
Kokkos::View."); - static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); - static_assert(static_cast(BViewType::rank) == 2, - "BViewType must have rank 2."); +void trmm(const execution_space& space, const char side[], const char uplo[], const char trans[], const char diag[], + typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) { + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "BViewType must be a Kokkos::View."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + static_assert(static_cast(BViewType::rank) == 2, "BViewType must have rank 2."); // Check validity of indicator argument - bool valid_side = (side[0] == 'L') || (side[0] == 'l') || (side[0] == 'R') || - (side[0] == 'r'); - bool valid_uplo = (uplo[0] == 'U') || (uplo[0] == 'u') || (uplo[0] == 'L') || - (uplo[0] == 'l'); - bool valid_trans = (trans[0] == 'N') || (trans[0] == 'n') || - (trans[0] == 'T') || (trans[0] == 't') || + bool valid_side = (side[0] == 'L') || (side[0] == 'l') || (side[0] == 'R') || (side[0] == 'r'); + bool valid_uplo = (uplo[0] == 'U') || (uplo[0] == 'u') || (uplo[0] == 'L') || (uplo[0] == 'l'); + bool valid_trans = (trans[0] == 'N') || (trans[0] == 'n') || (trans[0] == 'T') || (trans[0] == 't') || (trans[0] == 'C') || (trans[0] == 'c'); - bool valid_diag = (diag[0] == 'U') || (diag[0] == 'u') || (diag[0] == 'N') || - (diag[0] == 'n'); + bool valid_diag = (diag[0] == 'U') || (diag[0] == 'u') || (diag[0] == 'N') || (diag[0] == 'n'); if (!valid_side) { std::ostringstream os; os << "KokkosBlas::trmm: side = '" << side[0] << "'. " @@ -133,27 +123,20 @@ void trmm(const execution_space& space, const char side[], const char uplo[], if (A_m != A_n || (is_A_lower_triangle ? 
B_m : B_n) != A_n) { std::ostringstream os; os << "KokkosBlas::trmm: Dimensions of A and B do not match: " - << "side: " << side[0] << " A: " << A.extent(0) << " x " << A.extent(1) - << " B: " << B.extent(0) << " x " << B.extent(1); + << "side: " << side[0] << " A: " << A.extent(0) << " x " << A.extent(1) << " B: " << B.extent(0) << " x " + << B.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } // Create A matrix view type alias - using AViewInternalType = - Kokkos::View >; + using AViewInternalType = Kokkos::View >; // Crease B matrix view type alias - using BViewInternalType = - Kokkos::View >; + using BViewInternalType = Kokkos::View >; - KokkosBlas::Impl::TRMM::trmm(space, side, uplo, trans, - diag, alpha, A, B); + KokkosBlas::Impl::TRMM::trmm(space, side, uplo, trans, diag, + alpha, A, B); } /// \brief Solve triangular linear system with multiple RHSs: @@ -186,11 +169,9 @@ void trmm(const execution_space& space, const char side[], const char uplo[], /// On entry, M-by-N matrix /// On exit, overwritten with the solution template -void trmm(const char side[], const char uplo[], const char trans[], - const char diag[], typename BViewType::const_value_type& alpha, - const AViewType& A, const BViewType& B) { - trmm(typename AViewType::execution_space{}, side, uplo, trans, diag, alpha, A, - B); +void trmm(const char side[], const char uplo[], const char trans[], const char diag[], + typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) { + trmm(typename AViewType::execution_space{}, side, uplo, trans, diag, alpha, A, B); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas3_trsm.hpp b/blas/src/KokkosBlas3_trsm.hpp index 890b2ff6aa..fd0123174e 100644 --- a/blas/src/KokkosBlas3_trsm.hpp +++ b/blas/src/KokkosBlas3_trsm.hpp @@ -60,29 +60,19 @@ namespace KokkosBlas { /// On entry, M-by-N matrix of multile RHS /// On exit, overwritten with the solution X template -void trsm(const execution_space& space, const char 
side[], const char uplo[], - const char trans[], const char diag[], - typename BViewType::const_value_type& alpha, const AViewType& A, - const BViewType& B) { - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "BViewType must be a Kokkos::View."); - static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); - static_assert(static_cast(BViewType::rank) == 2, - "BViewType must have rank 2."); +void trsm(const execution_space& space, const char side[], const char uplo[], const char trans[], const char diag[], + typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) { + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "BViewType must be a Kokkos::View."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + static_assert(static_cast(BViewType::rank) == 2, "BViewType must have rank 2."); // Check validity of indicator argument - bool valid_side = (side[0] == 'L') || (side[0] == 'l') || (side[0] == 'R') || - (side[0] == 'r'); - bool valid_uplo = (uplo[0] == 'U') || (uplo[0] == 'u') || (uplo[0] == 'L') || - (uplo[0] == 'l'); - bool valid_trans = (trans[0] == 'N') || (trans[0] == 'n') || - (trans[0] == 'T') || (trans[0] == 't') || + bool valid_side = (side[0] == 'L') || (side[0] == 'l') || (side[0] == 'R') || (side[0] == 'r'); + bool valid_uplo = (uplo[0] == 'U') || (uplo[0] == 'u') || (uplo[0] == 'L') || (uplo[0] == 'l'); + bool valid_trans = (trans[0] == 'N') || (trans[0] == 'n') || (trans[0] == 'T') || (trans[0] == 't') || (trans[0] == 'C') || (trans[0] == 'c'); - bool valid_diag = (diag[0] == 'U') || (diag[0] == 'u') || (diag[0] == 'N') || - (diag[0] == 'n'); + bool valid_diag = (diag[0] == 'U') || (diag[0] == 'u') || (diag[0] == 'N') || (diag[0] == 'n'); if (!valid_side) { std::ostringstream os; os << "KokkosBlas::trsm: side = '" << side[0] << "'. 
" @@ -125,30 +115,23 @@ void trsm(const execution_space& space, const char side[], const char uplo[], if ((A0 != A1) || ((A_s ? B0 : B1) != A1)) { std::ostringstream os; os << "KokkosBlas::trsm: Dimensions of A and B do not match: " - << "side: " << side[0] << " A: " << A.extent(0) << " x " << A.extent(1) - << " B: " << B.extent(0) << " x " << B.extent(1); + << "side: " << side[0] << " A: " << A.extent(0) << " x " << A.extent(1) << " B: " << B.extent(0) << " x " + << B.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } // Return if degenerated matrices are provided - if ((A.extent(0) == 0) || (A.extent(1) == 0) || (B.extent(0) == 0) || - (B.extent(1) == 0)) - return; + if ((A.extent(0) == 0) || (A.extent(1) == 0) || (B.extent(0) == 0) || (B.extent(1) == 0)) return; // Minimize the number of Impl::TRSM instantiations, by // standardizing on particular View specializations for its template // parameters. - using AVT = Kokkos::View >; - using BVT = Kokkos::View >; + using AVT = Kokkos::View >; + using BVT = Kokkos::View >; - KokkosBlas::Impl::TRSM::trsm( - space, side, uplo, trans, diag, alpha, A, B); + KokkosBlas::Impl::TRSM::trsm(space, side, uplo, trans, diag, alpha, A, B); } /// \brief Solve triangular linear system with multiple RHSs: @@ -179,11 +162,9 @@ void trsm(const execution_space& space, const char side[], const char uplo[], /// On entry, M-by-N matrix of multile RHS /// On exit, overwritten with the solution X template -void trsm(const char side[], const char uplo[], const char trans[], - const char diag[], typename BViewType::const_value_type& alpha, - const AViewType& A, const BViewType& B) { - trsm(typename AViewType::execution_space{}, side, uplo, trans, diag, alpha, A, - B); +void trsm(const char side[], const char uplo[], const char trans[], const char diag[], + typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) { + trsm(typename AViewType::execution_space{}, side, uplo, trans, diag, alpha, A, B); 
} } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas_trtri.hpp b/blas/src/KokkosBlas_trtri.hpp index d9771e3a16..34ca96b2d4 100644 --- a/blas/src/KokkosBlas_trtri.hpp +++ b/blas/src/KokkosBlas_trtri.hpp @@ -43,8 +43,7 @@ namespace KokkosBlas { // and the inversion could not be completed. // source: https://software.intel.com/en-us/mkl-developer-reference-c-trtri template -[[deprecated]] int trtri(const char uplo[], const char diag[], - const AViewType& A) { +[[deprecated]] int trtri(const char uplo[], const char diag[], const AViewType& A) { return KokkosLapack::trtri(uplo, diag, A); } diff --git a/blas/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp index e2b04e300d..1ed52d35b8 100644 --- a/blas/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp @@ -20,8 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct axpby_tpl_spec_avail { enum : bool { value = false }; }; @@ -34,54 +33,44 @@ namespace Impl { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ - struct axpby_tpl_spec_avail< \ - ExecSpace, SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ + template \ + struct axpby_tpl_spec_avail< \ + ExecSpace, SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, - Kokkos::HostSpace) 
-KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) #endif // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ - struct axpby_tpl_spec_avail< \ - ExecSpace, SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ + template \ + struct axpby_tpl_spec_avail< \ + ExecSpace, SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) #endif } // namespace Impl diff --git 
a/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp index 65154b9985..5ab29e632f 100644 --- a/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp @@ -27,8 +27,7 @@ inline void axpby_print_specialization() { printf( "KokkosBlas1::axpby<> TPL Blas specialization for < %s , %s , %s , %s " ">\n", - typeid(AV).name(), typeid(XMV).name(), typeid(BV).name(), - typeid(YMV).name()); + typeid(AV).name(), typeid(XMV).name(), typeid(BV).name(), typeid(YMV).name()); #endif } } // namespace @@ -40,158 +39,132 @@ inline void axpby_print_specialization() { namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DAXPBY_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Axpby< \ - ExecSpace, double, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - double, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef double AV; \ - typedef double BV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YV; \ - \ - static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ - const BV& beta, const YV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_BLAS,double]"); \ - if ((X.extent(0) < INT_MAX) && (beta == 1.0)) { \ - axpby_print_specialization(); \ - int N = X.extent(0); \ - int one = 1; \ - HostBlas::axpy(N, alpha, X.data(), one, Y.data(), one); \ - } else \ - Axpby::axpby(space, alpha, X, beta, Y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DAXPBY_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Axpby< \ + ExecSpace, double, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + double, \ + Kokkos::View, Kokkos::MemoryTraits >, 1, \ + true, ETI_SPEC_AVAIL> { \ + typedef double AV; \ + typedef double BV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YV; \ + 
\ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_BLAS,double]"); \ + if ((X.extent(0) < INT_MAX) && (beta == 1.0)) { \ + axpby_print_specialization(); \ + int N = X.extent(0); \ + int one = 1; \ + HostBlas::axpy(N, alpha, X.data(), one, Y.data(), one); \ + } else \ + Axpby::axpby(space, alpha, X, beta, Y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_SAXPBY_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Axpby< \ - ExecSpace, float, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - float, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef float AV; \ - typedef float BV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YV; \ - \ - static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ - const BV& beta, const YV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_BLAS,float]"); \ - if ((X.extent(0) < INT_MAX) && (beta == 1.0f)) { \ - axpby_print_specialization(); \ - int N = X.extent(0); \ - int one = 1; \ - HostBlas::axpy(N, alpha, X.data(), one, Y.data(), one); \ - } else \ - Axpby::axpby(space, alpha, X, beta, Y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SAXPBY_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Axpby< \ + ExecSpace, float, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + float, \ + Kokkos::View, Kokkos::MemoryTraits >, 1, \ + true, ETI_SPEC_AVAIL> { \ + typedef float AV; \ + typedef float BV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YV; \ + \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_BLAS,float]"); \ + if ((X.extent(0) < INT_MAX) && 
(beta == 1.0f)) { \ + axpby_print_specialization(); \ + int N = X.extent(0); \ + int one = 1; \ + HostBlas::axpy(N, alpha, X.data(), one, Y.data(), one); \ + } else \ + Axpby::axpby(space, alpha, X, beta, Y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_ZAXPBY_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Axpby, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::complex, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex AV; \ - typedef Kokkos::complex BV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - YV; \ - \ - static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ - const BV& beta, const YV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::axpby[TPL_BLAS,complex]"); \ - if ((X.extent(0) < INT_MAX) && (beta == 1.0f)) { \ - axpby_print_specialization(); \ - int N = X.extent(0); \ - int one = 1; \ - const std::complex alpha_val = alpha; \ - HostBlas >::axpy( \ - N, alpha_val, \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(Y.data()), one); \ - } else \ - Axpby::axpby(space, alpha, X, beta, Y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZAXPBY_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Axpby, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::complex, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex AV; \ + typedef Kokkos::complex BV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > \ + YV; \ + \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const 
BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_BLAS,complex]"); \ + if ((X.extent(0) < INT_MAX) && (beta == 1.0f)) { \ + axpby_print_specialization(); \ + int N = X.extent(0); \ + int one = 1; \ + const std::complex alpha_val = alpha; \ + HostBlas >::axpy(N, alpha_val, reinterpret_cast*>(X.data()), \ + one, reinterpret_cast*>(Y.data()), one); \ + } else \ + Axpby::axpby(space, alpha, X, beta, Y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CAXPBY_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Axpby, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::complex, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex AV; \ - typedef Kokkos::complex BV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - YV; \ - \ - static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ - const BV& beta, const YV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::axpby[TPL_BLAS,complex]"); \ - if ((X.extent(0) < INT_MAX) && (beta == 1.0f)) { \ - axpby_print_specialization(); \ - int N = X.extent(0); \ - int one = 1; \ - const std::complex alpha_val = alpha; \ - HostBlas >::axpy( \ - N, alpha_val, \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(Y.data()), one); \ - } else \ - Axpby::axpby(space, alpha, X, beta, Y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CAXPBY_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Axpby, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::complex, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex AV; \ + typedef Kokkos::complex BV; \ + typedef Kokkos::View*, LAYOUT, 
Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > \ + YV; \ + \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_BLAS,complex]"); \ + if ((X.extent(0) < INT_MAX) && (beta == 1.0f)) { \ + axpby_print_specialization(); \ + int N = X.extent(0); \ + int one = 1; \ + const std::complex alpha_val = alpha; \ + HostBlas >::axpy(N, alpha_val, reinterpret_cast*>(X.data()), \ + one, reinterpret_cast*>(Y.data()), one); \ + } else \ + Axpby::axpby(space, alpha, X, beta, Y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSBLAS1_DAXPBY_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) @@ -222,186 +195,152 @@ KOKKOSBLAS1_CAXPBY_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Axpby< \ - ExecSpace, double, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - double, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef double AV; \ - typedef double BV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YV; \ - typedef typename XV::size_type size_type; \ - \ - static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ - const BV& beta, const YV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_CUBLAS,double]"); \ - const size_type numElems = X.extent(0); \ - if ((numElems < static_cast(INT_MAX)) && (beta == 1.0)) { \ - axpby_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ 
- cublasDaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else \ - Axpby::axpby(space, alpha, X, beta, Y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Axpby< \ + ExecSpace, double, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + double, \ + Kokkos::View, Kokkos::MemoryTraits >, 1, \ + true, ETI_SPEC_AVAIL> { \ + typedef double AV; \ + typedef double BV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YV; \ + typedef typename XV::size_type size_type; \ + \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_CUBLAS,double]"); \ + const size_type numElems = X.extent(0); \ + if ((numElems < static_cast(INT_MAX)) && (beta == 1.0)) { \ + axpby_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else \ + Axpby::axpby(space, alpha, X, beta, Y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_SAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Axpby< \ - ExecSpace, float, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - float, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef float AV; \ - typedef float BV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YV; \ - typedef typename XV::size_type 
size_type; \ - \ - static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ - const BV& beta, const YV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_CUBLAS,float]"); \ - const size_type numElems = X.extent(0); \ - if ((numElems < static_cast(INT_MAX)) && (beta == 1.0f)) { \ - axpby_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else \ - Axpby::axpby(space, alpha, X, beta, Y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Axpby< \ + ExecSpace, float, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + float, \ + Kokkos::View, Kokkos::MemoryTraits >, 1, \ + true, ETI_SPEC_AVAIL> { \ + typedef float AV; \ + typedef float BV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YV; \ + typedef typename XV::size_type size_type; \ + \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_CUBLAS,float]"); \ + const size_type numElems = X.extent(0); \ + if ((numElems < static_cast(INT_MAX)) && (beta == 1.0f)) { \ + axpby_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one)); \ 
+ KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else \ + Axpby::axpby(space, alpha, X, beta, Y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_ZAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Axpby, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::complex, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex AV; \ - typedef Kokkos::complex BV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - YV; \ - typedef typename XV::size_type size_type; \ - \ - static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ - const BV& beta, const YV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::axpby[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if ((numElems < static_cast(INT_MAX)) && (beta == 1.0f)) { \ - axpby_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZaxpy( \ - s.handle, N, reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else \ - Axpby::axpby(space, alpha, X, beta, Y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Axpby, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::complex, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef 
Kokkos::complex AV; \ + typedef Kokkos::complex BV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > \ + YV; \ + typedef typename XV::size_type size_type; \ + \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_CUBLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if ((numElems < static_cast(INT_MAX)) && (beta == 1.0f)) { \ + axpby_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZaxpy(s.handle, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else \ + Axpby::axpby(space, alpha, X, beta, Y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Axpby, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::complex, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex AV; \ - typedef Kokkos::complex BV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - YV; \ - typedef typename XV::size_type size_type; \ - \ - static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ - const BV& beta, const YV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::axpby[TPL_CUBLAS,complex]"); \ - const size_type numElems = 
X.extent(0); \ - if ((numElems < static_cast(INT_MAX)) && (beta == 1.0f)) { \ - axpby_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCaxpy( \ - s.handle, N, reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else \ - Axpby::axpby(space, alpha, X, beta, Y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Axpby, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::complex, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex AV; \ + typedef Kokkos::complex BV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > \ + YV; \ + typedef typename XV::size_type size_type; \ + \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_CUBLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if ((numElems < static_cast(INT_MAX)) && (beta == 1.0f)) { \ + axpby_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCaxpy(s.handle, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), 
one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else \ + Axpby::axpby(space, alpha, X, beta, Y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSBLAS1_DAXPBY_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) diff --git a/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp index 13cc2a6f92..8d5f1b939b 100644 --- a/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp @@ -20,8 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct dot_tpl_spec_avail { enum : bool { value = false }; }; @@ -34,24 +33,20 @@ namespace Impl { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS // double -#define KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ - struct dot_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ + template \ + struct dot_tpl_spec_avail >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, 1> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, - Kokkos::HostSpace) +KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::HostSpace) // TODO: we met difficuties in FindTPLMKL.cmake to set the BLAS library properly // such that the test in CheckHostBlasReturnComplex.cmake could not be @@ -59,33 +54,28 @@ KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, // This resulted in segfault in dot() 
with MKL and complex. // So we just temporarily disable it until FindTPLMKL.cmake is fixed. #if !defined(KOKKOSKERNELS_ENABLE_TPL_MKL) -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HostSpace) +KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) #endif #endif -#define KOKKOSBLAS1_DOT_TPL_SPEC(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ - template <> \ - struct dot_tpl_spec_avail< \ - EXECSPACE, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_DOT_TPL_SPEC(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template <> \ + struct dot_tpl_spec_avail >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, 1> { \ + enum : bool { value = true }; \ }; -#define KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(LAYOUT, EXECSPACE, MEMSPACE) \ - KOKKOSBLAS1_DOT_TPL_SPEC(float, LAYOUT, EXECSPACE, MEMSPACE) \ - KOKKOSBLAS1_DOT_TPL_SPEC(double, LAYOUT, EXECSPACE, MEMSPACE) \ - KOKKOSBLAS1_DOT_TPL_SPEC(Kokkos::complex, LAYOUT, EXECSPACE, \ - MEMSPACE) \ +#define KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(LAYOUT, EXECSPACE, MEMSPACE) \ + KOKKOSBLAS1_DOT_TPL_SPEC(float, LAYOUT, EXECSPACE, MEMSPACE) \ + KOKKOSBLAS1_DOT_TPL_SPEC(double, LAYOUT, EXECSPACE, MEMSPACE) \ + KOKKOSBLAS1_DOT_TPL_SPEC(Kokkos::complex, LAYOUT, EXECSPACE, MEMSPACE) \ KOKKOSBLAS1_DOT_TPL_SPEC(Kokkos::complex, LAYOUT, EXECSPACE, MEMSPACE) #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS @@ -100,13 +90,11 @@ KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::Cuda, #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) 
+KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) #endif #if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(KOKKOS_ENABLE_SYCL) -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::Experimental::SYCL, - Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace) #endif } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp index 247957b2c8..fa9d5fafce 100644 --- a/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp @@ -24,8 +24,8 @@ namespace { template inline void dot_print_specialization() { #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION - printf("KokkosBlas1::dot<> TPL Blas specialization for < %s , %s , %s >\n", - typeid(RV).name(), typeid(XV).name(), typeid(YV).name()); + printf("KokkosBlas1::dot<> TPL Blas specialization for < %s , %s , %s >\n", typeid(RV).name(), typeid(XV).name(), + typeid(YV).name()); #endif } } // namespace @@ -39,59 +39,44 @@ inline void dot_print_specialization() { namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ - MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Dot >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void dot(const ExecSpace& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_BLAS," + \ - Kokkos::ArithTraits::name() + \ - "]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - dot_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - R() = 
HostBlas::dot( \ - N, reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one); \ - } else { \ - Dot::dot(space, R, \ - X, Y); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Dot >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void dot(const ExecSpace& space, RV& R, const XV& X, const XV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_BLAS," + Kokkos::ArithTraits::name() + "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + dot_print_specialization(); \ + int N = numElems; \ + int one = 1; \ + R() = HostBlas::dot(N, reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one); \ + } else { \ + Dot::dot(space, R, X, Y); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS_EXT(ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, float, float, \ - Kokkos::HostSpace, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, double, double, \ - Kokkos::HostSpace, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS( \ - Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ - Kokkos::HostSpace, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS( \ - Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ - Kokkos::HostSpace, ETI_SPEC_AVAIL) +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, float, float, Kokkos::HostSpace, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, double, double, Kokkos::HostSpace, ETI_SPEC_AVAIL) \ + 
KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ + Kokkos::HostSpace, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ + Kokkos::HostSpace, ETI_SPEC_AVAIL) KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS_EXT(true) KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS_EXT(false) @@ -108,69 +93,51 @@ KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS_EXT(false) namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ - EXECSPACE, MEMSPACE, TPL_DOT, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Dot >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_CUBLAS," + \ - Kokkos::ArithTraits::name() + \ - "]"); \ - const size_type numElems = X.extent(0); \ - /* TODO: CUDA-12's 64-bit indices allow larger numElems */ \ - if (numElems <= \ - static_cast(std::numeric_limits::max())) { \ - dot_print_specialization(); \ - const int N = static_cast(numElems); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - TPL_DOT(s.handle, N, reinterpret_cast(X.data()), \ - 1, reinterpret_cast(Y.data()), 1, \ - reinterpret_cast(&R()))); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - Dot::dot(space, R, \ - X, Y); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, EXECSPACE, MEMSPACE, TPL_DOT, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Dot >, \ + 
Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_CUBLAS," + Kokkos::ArithTraits::name() + "]"); \ + const size_type numElems = X.extent(0); \ + /* TODO: CUDA-12's 64-bit indices allow larger numElems */ \ + if (numElems <= static_cast(std::numeric_limits::max())) { \ + dot_print_specialization(); \ + const int N = static_cast(numElems); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(TPL_DOT(s.handle, N, reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1, \ + reinterpret_cast(&R()))); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + Dot::dot(space, R, X, Y); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS_EXT(ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, float, float, \ - Kokkos::Cuda, Kokkos::CudaSpace, \ - cublasSdot, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, double, double, \ - Kokkos::Cuda, Kokkos::CudaSpace, \ - cublasDdot, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS( \ - Kokkos::LayoutLeft, Kokkos::complex, cuComplex, Kokkos::Cuda, \ - Kokkos::CudaSpace, cublasCdotc, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS( \ - Kokkos::LayoutLeft, Kokkos::complex, cuDoubleComplex, \ - Kokkos::Cuda, Kokkos::CudaSpace, cublasZdotc, ETI_SPEC_AVAIL) +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, float, 
float, Kokkos::Cuda, Kokkos::CudaSpace, cublasSdot, \ + ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, double, double, Kokkos::Cuda, Kokkos::CudaSpace, \ + cublasDdot, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::complex, cuComplex, Kokkos::Cuda, \ + Kokkos::CudaSpace, cublasCdotc, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::complex, cuDoubleComplex, Kokkos::Cuda, \ + Kokkos::CudaSpace, cublasZdotc, ETI_SPEC_AVAIL) KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS_EXT(true) KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS_EXT(false) @@ -185,68 +152,50 @@ KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS_EXT(false) namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ - EXECSPACE, MEMSPACE, TPL_DOT, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Dot >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_ROCBLAS," + \ - Kokkos::ArithTraits::name() + \ - "]"); \ - const size_type numElems = X.extent(0); \ - if (numElems <= \ - static_cast(std::numeric_limits::max())) { \ - dot_print_specialization(); \ - const rocblas_int N = static_cast(numElems); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - TPL_DOT(s.handle, N, reinterpret_cast(X.data()), \ - 1, reinterpret_cast(Y.data()), 1, \ - reinterpret_cast(&R()))); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - 
Dot::dot(space, R, \ - X, Y); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, EXECSPACE, MEMSPACE, TPL_DOT, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Dot >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_ROCBLAS," + Kokkos::ArithTraits::name() + "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems <= static_cast(std::numeric_limits::max())) { \ + dot_print_specialization(); \ + const rocblas_int N = static_cast(numElems); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(TPL_DOT(s.handle, N, reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1, \ + reinterpret_cast(&R()))); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + Dot::dot(space, R, X, Y); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS_EXT(ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, float, float, \ - Kokkos::HIP, Kokkos::HIPSpace, \ - rocblas_sdot, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, double, double, \ - Kokkos::HIP, Kokkos::HIPSpace, \ - rocblas_ddot, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS( \ - Kokkos::LayoutLeft, Kokkos::complex, rocblas_float_complex, \ - Kokkos::HIP, Kokkos::HIPSpace, rocblas_cdotc, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS( \ - Kokkos::LayoutLeft, Kokkos::complex, 
rocblas_double_complex, \ - Kokkos::HIP, Kokkos::HIPSpace, rocblas_zdotc, ETI_SPEC_AVAIL) +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, float, float, Kokkos::HIP, Kokkos::HIPSpace, rocblas_sdot, \ + ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, double, double, Kokkos::HIP, Kokkos::HIPSpace, \ + rocblas_ddot, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::complex, rocblas_float_complex, \ + Kokkos::HIP, Kokkos::HIPSpace, rocblas_cdotc, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::complex, rocblas_double_complex, \ + Kokkos::HIP, Kokkos::HIPSpace, rocblas_zdotc, ETI_SPEC_AVAIL) KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS_EXT(true) KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS_EXT(false) @@ -262,67 +211,50 @@ KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS_EXT(false) namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ - EXECSPACE, MEMSPACE, TPL_DOT, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Dot >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void dot(const EXECSPACE& exec, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_ONEMKL," + \ - Kokkos::ArithTraits::name() + \ - "]"); \ - const size_type numElems = X.extent(0); \ - if (numElems <= \ - static_cast(std::numeric_limits::max())) { \ - dot_print_specialization(); \ - const std::int64_t N = static_cast(numElems); \ - TPL_DOT(exec.sycl_queue(), N, \ - reinterpret_cast(X.data()), 1, \ - reinterpret_cast(Y.data()), 1, \ - reinterpret_cast(&R())); \ - } else { \ - Dot::dot(exec, R, \ - X, Y); \ - } \ - 
Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL(LAYOUT, KOKKOS_TYPE, TPL_TYPE, EXECSPACE, MEMSPACE, TPL_DOT, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Dot >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void dot(const EXECSPACE& exec, RV& R, const XV& X, const XV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_ONEMKL," + Kokkos::ArithTraits::name() + "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems <= static_cast(std::numeric_limits::max())) { \ + dot_print_specialization(); \ + const std::int64_t N = static_cast(numElems); \ + TPL_DOT(exec.sycl_queue(), N, reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1, reinterpret_cast(&R())); \ + } else { \ + Dot::dot(exec, R, X, Y); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL_EXT(ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL( \ - Kokkos::LayoutLeft, float, float, Kokkos::Experimental::SYCL, \ - Kokkos::Experimental::SYCLDeviceUSMSpace, \ - oneapi::mkl::blas::row_major::dot, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL( \ - Kokkos::LayoutLeft, double, double, Kokkos::Experimental::SYCL, \ - Kokkos::Experimental::SYCLDeviceUSMSpace, \ - oneapi::mkl::blas::row_major::dot, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL( \ - Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ - Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace, \ - oneapi::mkl::blas::row_major::dotc, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL( \ - Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ - Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace, \ - oneapi::mkl::blas::row_major::dotc, ETI_SPEC_AVAIL) 
+#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL(Kokkos::LayoutLeft, float, float, Kokkos::Experimental::SYCL, \ + Kokkos::Experimental::SYCLDeviceUSMSpace, oneapi::mkl::blas::row_major::dot, \ + ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL(Kokkos::LayoutLeft, double, double, Kokkos::Experimental::SYCL, \ + Kokkos::Experimental::SYCLDeviceUSMSpace, oneapi::mkl::blas::row_major::dot, \ + ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL(Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ + Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::dotc, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL(Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ + Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::dotc, ETI_SPEC_AVAIL) KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL_EXT(true) KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL_EXT(false) diff --git a/blas/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp index 616c26c87a..36a5e5171f 100644 --- a/blas/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp @@ -33,145 +33,96 @@ namespace Impl { // Generic Host side BLAS (could be MKL or whatever) #if defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) // double -#define KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS(INDEX_TYPE, SCALAR, LAYOUT, \ - MEMSPACE) \ - template \ - struct iamax_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS(INDEX_TYPE, SCALAR, LAYOUT, MEMSPACE) \ + template \ + struct iamax_tpl_spec_avail< \ + ExecSpace, Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS(unsigned long, double, 
Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS(unsigned long, float, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS(unsigned long, Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS(unsigned long, Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS(unsigned long, double, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS(unsigned long, float, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS(unsigned long, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS(unsigned long, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) #endif // cuBLAS #if defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) // double -#define KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(INDEX_TYPE, SCALAR, LAYOUT, \ - MEMSPACE) \ - template <> \ - struct iamax_tpl_spec_avail< \ - Kokkos::Cuda, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ - }; \ - template <> \ - struct iamax_tpl_spec_avail< \ - Kokkos::Cuda, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(INDEX_TYPE, SCALAR, LAYOUT, MEMSPACE) \ + template <> \ + struct iamax_tpl_spec_avail< \ + Kokkos::Cuda, Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ + }; \ + template <> \ + struct iamax_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned long, double, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, double, - Kokkos::LayoutLeft, Kokkos::CudaSpace) 
-KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned long, float, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, float, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned long, Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned long, Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned long, double, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, double, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned long, float, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, float, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned long, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned long, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned long, double, - Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, double, - Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned long, float, - Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, float, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned 
long, Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned long, Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, Kokkos::complex, - Kokkos::LayoutLeft, +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned long, double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned long, float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned long, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned long, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) #endif // rocBLAS #if defined(KOKKOSKERNELS_ENABLE_TPL_ROCBLAS) -#define KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(INDEX_TYPE, SCALAR, LAYOUT, \ - MEMSPACE) \ - template <> \ - struct iamax_tpl_spec_avail< \ - Kokkos::HIP, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ - }; \ - template <> \ - struct iamax_tpl_spec_avail< \ - Kokkos::HIP, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(INDEX_TYPE, SCALAR, LAYOUT, MEMSPACE) \ + template <> \ + struct iamax_tpl_spec_avail< \ + Kokkos::HIP, Kokkos::View >, \ + 
Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ + }; \ + template <> \ + struct iamax_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned long, double, - Kokkos::LayoutLeft, Kokkos::HIPSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned int, double, - Kokkos::LayoutLeft, Kokkos::HIPSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned long, float, - Kokkos::LayoutLeft, Kokkos::HIPSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned int, float, - Kokkos::LayoutLeft, Kokkos::HIPSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned long, Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned int, Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned long, Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned int, Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned long, double, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned int, double, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned long, float, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned int, float, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned long, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned int, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned long, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned int, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) #endif diff 
--git a/blas/tpls/KokkosBlas1_iamax_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_iamax_tpl_spec_decl.hpp index 913ec5a151..c85de4d186 100644 --- a/blas/tpls/KokkosBlas1_iamax_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_iamax_tpl_spec_decl.hpp @@ -23,15 +23,12 @@ template inline void iamax_print_specialization() { #if defined(KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION) #if defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) - printf("KokkosBlas1::iamax<> TPL cuBLAS specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XV).name()); + printf("KokkosBlas1::iamax<> TPL cuBLAS specialization for < %s , %s >\n", typeid(RV).name(), typeid(XV).name()); #elif defined(KOKKOSKERNELS_ENABLE_TPL_ROCBLAS) - printf("KokkosBlas1::iamax<> TPL rocBLAS specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XV).name()); + printf("KokkosBlas1::iamax<> TPL rocBLAS specialization for < %s , %s >\n", typeid(RV).name(), typeid(XV).name()); #else #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS - printf("KokkosBlas1::iamax<> TPL Blas specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XV).name()); + printf("KokkosBlas1::iamax<> TPL Blas specialization for < %s , %s >\n", typeid(RV).name(), typeid(XV).name()); #endif #endif #endif @@ -46,90 +43,63 @@ inline void iamax_print_specialization() { namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_BLAS( \ - SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Iamax >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void iamax(const ExecSpace& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::iamax[TPL_BLAS," #SCALAR_TYPE \ - "]"); \ - const size_type numElems = X.extent(0); \ - if (numElems == 0) { \ - R() = 0; \ - return; \ - } \ - if (numElems < static_cast(INT_MAX)) { 
\ - iamax_print_specialization(); \ - int N = static_cast(numElems); \ - const int XST = X.stride(0); \ - const int LDX = (XST == 0) ? 1 : XST; \ - int idx = HostBlas::iamax( \ - N, reinterpret_cast(X.data()), LDX); \ - R() = static_cast(idx); \ - } else { \ - Iamax::iamax(space, R, \ - X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_BLAS(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Iamax >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void iamax(const ExecSpace& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::iamax[TPL_BLAS," #SCALAR_TYPE "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems == 0) { \ + R() = 0; \ + return; \ + } \ + if (numElems < static_cast(INT_MAX)) { \ + iamax_print_specialization(); \ + int N = static_cast(numElems); \ + const int XST = X.stride(0); \ + const int LDX = (XST == 0) ? 
1 : XST; \ + int idx = HostBlas::iamax(N, reinterpret_cast(X.data()), LDX); \ + R() = static_cast(idx); \ + } else { \ + Iamax::iamax(space, R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_BLAS(double, double, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) - -#define KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_BLAS(float, float, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) - -#define KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::complex, \ - std::complex, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL) - -#define KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::complex, \ - std::complex, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) - -KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) - -KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) - -KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) - -KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) +#define KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_BLAS(double, double, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) + +#define KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_BLAS(float, float, LAYOUT, MEMSPACE, 
ETI_SPEC_AVAIL) + +#define KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::complex, std::complex, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) + +#define KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::complex, std::complex, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) + +KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) + +KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) + +KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) + +KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) } // namespace Impl } // namespace KokkosBlas @@ -145,227 +115,155 @@ namespace Impl { using CUBLAS_DEVICE_TYPE = Kokkos::Device; #if defined(KOKKOS_ENABLE_CUDA_UVM) -using CUBLASUVM_DEVICE_TYPE = - Kokkos::Device; +using CUBLASUVM_DEVICE_TYPE = Kokkos::Device; #endif -#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER( \ - SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, EXEC_SPACE, \ - MEMSPACE, ETI_SPEC_AVAIL, RET_DEVICE_TYPE, CUBLAS_PTR_MODE_1, \ - CUBLAS_PTR_MODE_2) \ - template <> \ - struct Iamax >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void iamax(const EXEC_SPACE& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::iamax[TPL_CUBLAS," #SCALAR_TYPE "]"); \ - 
const size_type numElems = X.extent(0); \ - if (numElems == 0) { \ - Kokkos::deep_copy(R, 0); \ - return; \ - } \ - if (numElems < static_cast(INT_MAX)) { \ - iamax_print_specialization(); \ - const int N = static_cast(numElems); \ - const int XST = X.stride(0); \ - const int LDX = (XST == 0) ? 1 : XST; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - cublasPointerMode_t prevPtrMode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasGetPointerMode(s.handle, &prevPtrMode)); \ - if (prevPtrMode == CUBLAS_PTR_MODE_2) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(s.handle, CUBLAS_PTR_MODE_1)); \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN( \ - s.handle, N, reinterpret_cast(X.data()), \ - LDX, reinterpret_cast(R.data()))); \ - if (prevPtrMode == CUBLAS_PTR_MODE_2) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(s.handle, CUBLAS_PTR_MODE_2)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } \ - } else { \ - Iamax::iamax(space, R, \ - X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER(SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, \ + EXEC_SPACE, MEMSPACE, ETI_SPEC_AVAIL, RET_DEVICE_TYPE, \ + CUBLAS_PTR_MODE_1, CUBLAS_PTR_MODE_2) \ + template <> \ + struct Iamax >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void iamax(const EXEC_SPACE& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::iamax[TPL_CUBLAS," #SCALAR_TYPE "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems == 0) { \ + Kokkos::deep_copy(R, 0); \ + return; \ + } \ + if (numElems < static_cast(INT_MAX)) { \ + 
iamax_print_specialization(); \ + const int N = static_cast(numElems); \ + const int XST = X.stride(0); \ + const int LDX = (XST == 0) ? 1 : XST; \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + cublasPointerMode_t prevPtrMode; \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(s.handle, &prevPtrMode)); \ + if (prevPtrMode == CUBLAS_PTR_MODE_2) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, CUBLAS_PTR_MODE_1)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN(s.handle, N, reinterpret_cast(X.data()), LDX, \ + reinterpret_cast(R.data()))); \ + if (prevPtrMode == CUBLAS_PTR_MODE_2) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, CUBLAS_PTR_MODE_2)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } \ + } else { \ + Iamax::iamax(space, R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS(SCALAR_TYPE, CUDA_SCALAR_TYPE, \ - CUBLAS_FN, INDEX_TYPE, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER( \ - SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, \ - Kokkos::Cuda, MEMSPACE, ETI_SPEC_AVAIL, Kokkos::HostSpace, \ - CUBLAS_POINTER_MODE_HOST, CUBLAS_POINTER_MODE_DEVICE) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER( \ - SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, \ - Kokkos::Cuda, MEMSPACE, ETI_SPEC_AVAIL, CUBLAS_DEVICE_TYPE, \ - CUBLAS_POINTER_MODE_DEVICE, CUBLAS_POINTER_MODE_HOST) +#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS(SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, \ + MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER(SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, \ + Kokkos::Cuda, MEMSPACE, ETI_SPEC_AVAIL, Kokkos::HostSpace, \ + CUBLAS_POINTER_MODE_HOST, CUBLAS_POINTER_MODE_DEVICE) 
\ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER(SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, \ + Kokkos::Cuda, MEMSPACE, ETI_SPEC_AVAIL, CUBLAS_DEVICE_TYPE, \ + CUBLAS_POINTER_MODE_DEVICE, CUBLAS_POINTER_MODE_HOST) #if defined(KOKKOS_ENABLE_CUDA_UVM) -#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_UVM( \ - SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER( \ - SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, \ - Kokkos::Cuda, MEMSPACE, ETI_SPEC_AVAIL, Kokkos::HostSpace, \ - CUBLAS_POINTER_MODE_HOST, CUBLAS_POINTER_MODE_DEVICE) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER( \ - SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, \ - Kokkos::Cuda, MEMSPACE, ETI_SPEC_AVAIL, CUBLASUVM_DEVICE_TYPE, \ - CUBLAS_POINTER_MODE_DEVICE, CUBLAS_POINTER_MODE_HOST) +#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, \ + MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER(SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, \ + Kokkos::Cuda, MEMSPACE, ETI_SPEC_AVAIL, Kokkos::HostSpace, \ + CUBLAS_POINTER_MODE_HOST, CUBLAS_POINTER_MODE_DEVICE) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER(SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, \ + Kokkos::Cuda, MEMSPACE, ETI_SPEC_AVAIL, CUBLASUVM_DEVICE_TYPE, \ + CUBLAS_POINTER_MODE_DEVICE, CUBLAS_POINTER_MODE_HOST) #endif -#define KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS(double, double, cublasIdamax, \ - INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) - -#define KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS(float, float, cublasIsamax, \ - INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) - -#define 
KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS( \ - Kokkos::complex, cuDoubleComplex, cublasIzamax, INDEX_TYPE, \ - LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) - -#define KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, cuComplex, \ - cublasIcamax, INDEX_TYPE, LAYOUT, \ +#define KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS(double, double, cublasIdamax, INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) + +#define KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS(float, float, cublasIsamax, INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) + +#define KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, cuDoubleComplex, cublasIzamax, INDEX_TYPE, LAYOUT, \ + MEMSPACE, ETI_SPEC_AVAIL) + +#define KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, cuComplex, cublasIcamax, INDEX_TYPE, LAYOUT, \ MEMSPACE, ETI_SPEC_AVAIL) #if defined(KOKKOS_ENABLE_CUDA_UVM) -#define KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(INDEX_TYPE, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(double, double, cublasIdamax, \ - INDEX_TYPE, LAYOUT, MEMSPACE, \ +#define KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(double, double, cublasIdamax, INDEX_TYPE, LAYOUT, MEMSPACE, \ ETI_SPEC_AVAIL) -#define KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(INDEX_TYPE, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(float, float, 
cublasIsamax, \ - INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) +#define KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(float, float, cublasIsamax, INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) + +#define KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(Kokkos::complex, cuDoubleComplex, cublasIzamax, INDEX_TYPE, \ + LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) -#define KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(INDEX_TYPE, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_UVM( \ - Kokkos::complex, cuDoubleComplex, cublasIzamax, INDEX_TYPE, \ - LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) - -#define KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(INDEX_TYPE, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_UVM( \ - Kokkos::complex, cuComplex, cublasIcamax, INDEX_TYPE, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL) +#define KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(Kokkos::complex, cuComplex, cublasIcamax, INDEX_TYPE, LAYOUT, \ + MEMSPACE, ETI_SPEC_AVAIL) #endif -KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, - 
Kokkos::CudaSpace, true) -KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) +KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + 
+KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) #if defined(KOKKOS_ENABLE_CUDA_UVM) -KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) - 
-KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) + 
+KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) #endif } // namespace Impl @@ -382,144 +280,100 @@ namespace Impl { using ROCBLAS_DEVICE_TYPE = Kokkos::Device; -#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS_WRAPPER( \ - SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, INDEX_TYPE, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL, RET_DEVICE_TYPE, ROCBLAS_PTR_MODE_1, \ - ROCBLAS_PTR_MODE_2) \ - template <> \ - struct Iamax >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using execution_space = Kokkos::HIP; \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void iamax(const execution_space& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::iamax[TPL_ROCBLAS," #SCALAR_TYPE "]"); \ - const size_type numElems = X.extent(0); \ - if (numElems == 0) { \ - Kokkos::deep_copy(R, 0); \ - return; \ - } \ - if (numElems < static_cast(INT_MAX)) { \ - iamax_print_specialization(); \ - const int N = static_cast(numElems); \ - const int XST = X.stride(0); \ - const int LDX = (XST == 0) ? 
1 : XST; \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - rocblas_pointer_mode prevPtrMode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_get_pointer_mode(s.handle, &prevPtrMode)); \ - if (prevPtrMode == ROCBLAS_PTR_MODE_2) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, ROCBLAS_PTR_MODE_1)); \ - } \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - ROCBLAS_FN(s.handle, N, \ - reinterpret_cast(X.data()), \ - LDX, reinterpret_cast(R.data()))); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - if (prevPtrMode == ROCBLAS_PTR_MODE_2) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, ROCBLAS_PTR_MODE_2)); \ - } \ - } else { \ - Iamax::iamax(space, \ - R, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS_WRAPPER(SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, INDEX_TYPE, \ + LAYOUT, MEMSPACE, ETI_SPEC_AVAIL, RET_DEVICE_TYPE, \ + ROCBLAS_PTR_MODE_1, ROCBLAS_PTR_MODE_2) \ + template <> \ + struct Iamax >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = Kokkos::HIP; \ + typedef Kokkos::View > RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void iamax(const execution_space& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::iamax[TPL_ROCBLAS," #SCALAR_TYPE "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems == 0) { \ + Kokkos::deep_copy(R, 0); \ + return; \ + } \ + if (numElems < static_cast(INT_MAX)) { \ + iamax_print_specialization(); \ + const int N = static_cast(numElems); \ + const int XST = X.stride(0); \ + const int LDX = (XST == 0) ? 
1 : XST; \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + rocblas_pointer_mode prevPtrMode; \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(s.handle, &prevPtrMode)); \ + if (prevPtrMode == ROCBLAS_PTR_MODE_2) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, ROCBLAS_PTR_MODE_1)); \ + } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(ROCBLAS_FN(s.handle, N, reinterpret_cast(X.data()), \ + LDX, reinterpret_cast(R.data()))); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + if (prevPtrMode == ROCBLAS_PTR_MODE_2) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, ROCBLAS_PTR_MODE_2)); \ + } \ + } else { \ + Iamax::iamax(space, R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS( \ - SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, INDEX_TYPE, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS_WRAPPER( \ - SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, INDEX_TYPE, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL, Kokkos::HostSpace, rocblas_pointer_mode_host, \ - rocblas_pointer_mode_device) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS_WRAPPER( \ - SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, INDEX_TYPE, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL, ROCBLAS_DEVICE_TYPE, \ - rocblas_pointer_mode_device, rocblas_pointer_mode_host) - -#define KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_ROCBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS(double, double, rocblas_idamax, \ - INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) - -#define KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_ROCBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS(float, float, rocblas_isamax, \ - INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) - -#define 
KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_ROCBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS( \ - Kokkos::complex, rocblas_double_complex, rocblas_izamax, \ - INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) - -#define KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_ROCBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS( \ - Kokkos::complex, rocblas_float_complex, rocblas_icamax, \ - INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) - -KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::HIPSpace, false) - 
-KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::HIPSpace, false) +#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS(SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, INDEX_TYPE, LAYOUT, \ + MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS_WRAPPER(SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, INDEX_TYPE, LAYOUT, \ + MEMSPACE, ETI_SPEC_AVAIL, Kokkos::HostSpace, \ + rocblas_pointer_mode_host, rocblas_pointer_mode_device) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS_WRAPPER(SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, INDEX_TYPE, LAYOUT, \ + MEMSPACE, ETI_SPEC_AVAIL, ROCBLAS_DEVICE_TYPE, \ + rocblas_pointer_mode_device, rocblas_pointer_mode_host) + +#define KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_ROCBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS(double, double, rocblas_idamax, INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) + +#define KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_ROCBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS(float, float, rocblas_isamax, INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) + +#define KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_ROCBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS(Kokkos::complex, rocblas_double_complex, rocblas_izamax, \ + INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) + +#define KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_ROCBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS(Kokkos::complex, rocblas_float_complex, rocblas_icamax, INDEX_TYPE, \ + LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) + +KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::HIPSpace, false) + 
+KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::HIPSpace, false) + +KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::HIPSpace, false) + +KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::HIPSpace, false) + +KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::HIPSpace, false) + +KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::HIPSpace, false) + +KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::HIPSpace, false) + +KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::HIPSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas1_mult_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_mult_tpl_spec_avail.hpp index 8d3fc0f4d2..3924e0da21 100644 --- a/blas/tpls/KokkosBlas1_mult_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_mult_tpl_spec_avail.hpp @@ -20,8 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct mult_tpl_spec_avail { enum : bool { value = false }; }; diff --git a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp 
b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp index 8f79c8a58d..6de384380e 100644 --- a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp @@ -33,83 +33,62 @@ namespace Impl { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS // double -#define KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ - struct nrm1_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View< \ - typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ + template \ + struct nrm1_tpl_spec_avail::mag_type, LAYOUT, \ + Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) #endif // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS // double -#define KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ - struct nrm1_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View< \ - typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ - LAYOUT, 
Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ + template \ + struct nrm1_tpl_spec_avail::mag_type, LAYOUT, \ + Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) #endif // rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -#define KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ - struct nrm1_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View< \ - typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, MEMSPACE) \ + template \ + struct nrm1_tpl_spec_avail::mag_type, LAYOUT, \ + Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, - Kokkos::HIPSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(float, 
Kokkos::LayoutLeft, - Kokkos::HIPSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) #endif // KOKKOSKERNELS_ENABLE_TPL_ROCBLAS @@ -118,30 +97,23 @@ KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, #if defined(KOKKOS_ENABLE_SYCL) -#define KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL(SCALAR, LAYOUT, MEMSPACE) \ - template \ - struct nrm1_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View< \ - typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL(SCALAR, LAYOUT, MEMSPACE) \ + template \ + struct nrm1_tpl_spec_avail::mag_type, LAYOUT, \ + Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL( - double, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLDeviceUSMSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL( - float, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLDeviceUSMSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL( - Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCLDeviceUSMSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL( - Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL(double, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLDeviceUSMSpace) 
+KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL(float, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Experimental::SYCLDeviceUSMSpace) #endif // KOKKOS_ENABLE_SYCL #endif // KOKKOSKERNELS_ENABLE_TPL_MKL diff --git a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp index 12a240db6b..378fbc936f 100644 --- a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp @@ -24,8 +24,7 @@ namespace { template inline void nrm1_print_specialization() { #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION - printf("KokkosBlas1::nrm1<> TPL Blas specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XV).name()); + printf("KokkosBlas1::nrm1<> TPL Blas specialization for < %s , %s >\n", typeid(RV).name(), typeid(XV).name()); #endif } } // namespace @@ -39,87 +38,64 @@ inline void nrm1_print_specialization() { namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(SCALAR, LAYOUT, EXECSPACE, \ - MEMSPACE) \ - template <> \ - struct Nrm1< \ - EXECSPACE, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::HostSpace, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 1, true, \ - nrm1_eti_spec_avail< \ - EXECSPACE, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::HostSpace, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>>::value> { \ - using mag_type = typename Kokkos::ArithTraits::mag_type; \ - using RV = Kokkos::View>; \ - using XV = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using size_type = typename XV::size_type; \ - \ - static void nrm1(const EXECSPACE& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_BLAS," #SCALAR "]"); \ - const size_type numElems = X.extent(0); \ - 
if (numElems < static_cast(INT_MAX)) { \ - nrm1_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - if constexpr (Kokkos::ArithTraits::is_complex) { \ - R() = HostBlas>::asum( \ - N, reinterpret_cast*>(X.data()), \ - one); \ - } else { \ - R() = HostBlas::asum(N, X.data(), one); \ - } \ - } else { \ - Nrm1::value>::nrm1(space, R, \ - X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template <> \ + struct Nrm1::mag_type, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 1, true, \ + nrm1_eti_spec_avail::mag_type, LAYOUT, \ + Kokkos::HostSpace, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using mag_type = typename Kokkos::ArithTraits::mag_type; \ + using RV = Kokkos::View>; \ + using XV = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using size_type = typename XV::size_type; \ + \ + static void nrm1(const EXECSPACE& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_BLAS," #SCALAR "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + nrm1_print_specialization(); \ + int N = numElems; \ + int one = 1; \ + if constexpr (Kokkos::ArithTraits::is_complex) { \ + R() = HostBlas>::asum(N, reinterpret_cast*>(X.data()), \ + one); \ + } else { \ + R() = HostBlas::asum(N, X.data(), one); \ + } \ + } else { \ + Nrm1::value>::nrm1(space, R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; #if defined(KOKKOS_ENABLE_SERIAL) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Serial, Kokkos::HostSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - 
Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) #endif #if defined(KOKKOS_ENABLE_OPENMP) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP, Kokkos::HostSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) #endif #if defined(KOKKOS_ENABLE_THREADS) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Threads, - Kokkos::HostSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Threads, - Kokkos::HostSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Threads, Kokkos::HostSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Threads, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Threads, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Threads, 
Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Threads, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Threads, Kokkos::HostSpace) #endif } // namespace Impl @@ -135,99 +111,74 @@ namespace KokkosBlas { namespace Impl { template -void cublasAsumWrapper(const ExecutionSpace& space, RViewType& R, - const XViewType& X) { +void cublasAsumWrapper(const ExecutionSpace& space, RViewType& R, const XViewType& X) { using XScalar = typename XViewType::non_const_value_type; nrm1_print_specialization(); - const int N = static_cast(X.extent(0)); - constexpr int one = 1; - KokkosBlas::Impl::CudaBlasSingleton& s = - KokkosBlas::Impl::CudaBlasSingleton::singleton(); + const int N = static_cast(X.extent(0)); + constexpr int one = 1; + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); if constexpr (std::is_same_v) { - KOKKOS_CUBLAS_SAFE_CALL_IMPL( - cublasSasum(s.handle, N, X.data(), one, R.data())); + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSasum(s.handle, N, X.data(), one, R.data())); } if constexpr (std::is_same_v) { - KOKKOS_CUBLAS_SAFE_CALL_IMPL( - cublasDasum(s.handle, N, X.data(), one, R.data())); + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDasum(s.handle, N, X.data(), one, R.data())); } if constexpr (std::is_same_v>) { KOKKOS_CUBLAS_SAFE_CALL_IMPL( - cublasScasum(s.handle, N, reinterpret_cast(X.data()), - one, R.data())); + cublasScasum(s.handle, N, reinterpret_cast(X.data()), one, R.data())); } if constexpr (std::is_same_v>) { - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDzasum( - s.handle, N, reinterpret_cast(X.data()), one, - R.data())); + KOKKOS_CUBLAS_SAFE_CALL_IMPL( + cublasDzasum(s.handle, N, reinterpret_cast(X.data()), one, R.data())); } KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); } -#define KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(SCALAR, LAYOUT, 
MEMSPACE) \ - template <> \ - struct Nrm1< \ - Kokkos::Cuda, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::HostSpace, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 1, true, \ - nrm1_eti_spec_avail< \ - Kokkos::Cuda, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::HostSpace, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>>::value> { \ - using execution_space = Kokkos::Cuda; \ - using RV = Kokkos::View::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits>; \ - using XV = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using size_type = typename XV::size_type; \ - \ - static void nrm1(const execution_space& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_CUBLAS," #SCALAR \ - "]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - cublasAsumWrapper(space, R, X); \ - } else { \ - Nrm1::value>::nrm1(space, R, \ - X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ + template <> \ + struct Nrm1::mag_type, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 1, true, \ + nrm1_eti_spec_avail::mag_type, LAYOUT, \ + Kokkos::HostSpace, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using execution_space = Kokkos::Cuda; \ + using RV = Kokkos::View::mag_type, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits>; \ + using XV = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using size_type = typename XV::size_type; \ + \ + static void nrm1(const execution_space& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_CUBLAS," #SCALAR "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + cublasAsumWrapper(space, R, X); \ + } else { \ + Nrm1::value>::nrm1(space, R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ 
+ } \ }; -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(float, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(double, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) #if defined(KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(float, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(double, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) #endif } // namespace Impl @@ -242,89 +193,67 @@ namespace KokkosBlas { namespace Impl { template -void rocblasAsumWrapper(const ExecutionSpace& space, RViewType& R, - const XViewType& X) { +void rocblasAsumWrapper(const ExecutionSpace& space, RViewType& R, const XViewType& X) { using XScalar = typename XViewType::non_const_value_type; nrm1_print_specialization(); - const int N = static_cast(X.extent(0)); - constexpr int one = 1; - 
KokkosBlas::Impl::RocBlasSingleton& s = - KokkosBlas::Impl::RocBlasSingleton::singleton(); + const int N = static_cast(X.extent(0)); + constexpr int one = 1; + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( - rocblas_set_stream(s.handle, space.hip_stream())); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); if constexpr (std::is_same_v) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( - rocblas_sasum(s.handle, N, X.data(), one, R.data())); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_sasum(s.handle, N, X.data(), one, R.data())); } if constexpr (std::is_same_v) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( - rocblas_dasum(s.handle, N, X.data(), one, R.data())); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dasum(s.handle, N, X.data(), one, R.data())); } if constexpr (std::is_same_v>) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_scasum( - s.handle, N, reinterpret_cast(X.data()), - one, R.data())); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + rocblas_scasum(s.handle, N, reinterpret_cast(X.data()), one, R.data())); } if constexpr (std::is_same_v>) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dzasum( - s.handle, N, reinterpret_cast(X.data()), - one, R.data())); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + rocblas_dzasum(s.handle, N, reinterpret_cast(X.data()), one, R.data())); } KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); } -#define KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(SCALAR, LAYOUT, MEMSPACE) \ - template <> \ - struct Nrm1< \ - Kokkos::HIP, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::HostSpace, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 1, true, \ - nrm1_eti_spec_avail< \ - Kokkos::HIP, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::HostSpace, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>>::value> { \ - using RV = Kokkos::View::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits>; \ - using XV = Kokkos::View, \ - 
Kokkos::MemoryTraits>; \ - using size_type = typename XV::size_type; \ - \ - static void nrm1(const Kokkos::HIP& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_ROCBLAS," #SCALAR \ - "]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - rocblasAsumWrapper(space, R, X); \ - } else { \ - Nrm1::value>::nrm1(space, R, \ - X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(SCALAR, LAYOUT, MEMSPACE) \ + template <> \ + struct Nrm1::mag_type, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 1, true, \ + nrm1_eti_spec_avail::mag_type, LAYOUT, \ + Kokkos::HostSpace, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using RV = Kokkos::View::mag_type, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits>; \ + using XV = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using size_type = typename XV::size_type; \ + \ + static void nrm1(const Kokkos::HIP& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_ROCBLAS," #SCALAR "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + rocblasAsumWrapper(space, R, X); \ + } else { \ + Nrm1::value>::nrm1(space, R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(float, Kokkos::LayoutLeft, - Kokkos::HIPSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(double, Kokkos::LayoutLeft, - Kokkos::HIPSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::complex, 
Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) } // namespace Impl } // namespace KokkosBlas @@ -343,8 +272,7 @@ namespace KokkosBlas { namespace Impl { template -void onemklAsumWrapper(const ExecutionSpace& space, RViewType& R, - const XViewType& X) { +void onemklAsumWrapper(const ExecutionSpace& space, RViewType& R, const XViewType& X) { using XScalar = typename XViewType::non_const_value_type; using KAT_X = Kokkos::ArithTraits; using layout_t = typename XViewType::array_layout; @@ -352,100 +280,75 @@ void onemklAsumWrapper(const ExecutionSpace& space, RViewType& R, const std::int64_t N = static_cast(X.extent(0)); // Create temp view on device to store the result - Kokkos::View::mag_type, - typename XViewType::memory_space> - res("sycl asum result"); + Kokkos::View::mag_type, typename XViewType::memory_space> res( + "sycl asum result"); // Decide to call row_major or column_major function if constexpr (std::is_same_v) { if constexpr (KAT_X::is_complex) { - oneapi::mkl::blas::row_major::asum( - space.sycl_queue(), N, - reinterpret_cast*>( - X.data()), - 1, res.data()); - } else { - oneapi::mkl::blas::row_major::asum(space.sycl_queue(), N, X.data(), 1, + oneapi::mkl::blas::row_major::asum(space.sycl_queue(), N, + reinterpret_cast*>(X.data()), 1, res.data()); + } else { + oneapi::mkl::blas::row_major::asum(space.sycl_queue(), N, X.data(), 1, res.data()); } } else { if constexpr (KAT_X::is_complex) { - oneapi::mkl::blas::column_major::asum( - space.sycl_queue(), N, - reinterpret_cast*>( - X.data()), - 1, res.data()); + oneapi::mkl::blas::column_major::asum(space.sycl_queue(), N, + reinterpret_cast*>(X.data()), + 1, res.data()); } else { - oneapi::mkl::blas::column_major::asum(space.sycl_queue(), X.extent_int(0), - X.data(), 1, res.data()); + oneapi::mkl::blas::column_major::asum(space.sycl_queue(), X.extent_int(0), X.data(), 1, res.data()); } } // Bring result back to host 
Kokkos::deep_copy(space, R, res); } -#define KOKKOSBLAS1_NRM1_ONEMKL(SCALAR, LAYOUT, MEMSPACE) \ - template <> \ - struct Nrm1< \ - Kokkos::Experimental::SYCL, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::HostSpace, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 1, true, \ - nrm1_eti_spec_avail< \ - Kokkos::Experimental::SYCL, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::HostSpace, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>>::value> { \ - using execution_space = Kokkos::Experimental::SYCL; \ - using RV = Kokkos::View::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits>; \ - using XV = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using size_type = typename XV::size_type; \ - \ - static void nrm1(const execution_space& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_ONEMKL," #SCALAR \ - "]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - onemklAsumWrapper(space, R, X); \ - } else { \ - Nrm1::value>::nrm1(space, R, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_NRM1_ONEMKL(SCALAR, LAYOUT, MEMSPACE) \ + template <> \ + struct Nrm1< \ + Kokkos::Experimental::SYCL, \ + Kokkos::View::mag_type, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 1, true, \ + nrm1_eti_spec_avail::mag_type, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using execution_space = Kokkos::Experimental::SYCL; \ + using RV = Kokkos::View::mag_type, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits>; \ + using XV = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using size_type = typename XV::size_type; \ + \ + static void nrm1(const execution_space& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_ONEMKL," #SCALAR "]"); \ + const size_type numElems = 
X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + onemklAsumWrapper(space, R, X); \ + } else { \ + Nrm1::value>::nrm1( \ + space, R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_NRM1_ONEMKL(float, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCLDeviceUSMSpace) -KOKKOSBLAS1_NRM1_ONEMKL(double, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCLDeviceUSMSpace) -KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCLDeviceUSMSpace) -KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSBLAS1_NRM1_ONEMKL(float, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSBLAS1_NRM1_ONEMKL(double, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLDeviceUSMSpace) #if defined(KOKKOSKERNELS_INST_MEMSPACE_SYCLSHAREDSPACE) -KOKKOSBLAS1_NRM1_ONEMKL(float, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCLSharedUSMSpace) -KOKKOSBLAS1_NRM1_ONEMKL(double, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCLSharedUSMSpace) -KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCLSharedUSMSpace) -KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCLSharedUSMSpace) +KOKKOSBLAS1_NRM1_ONEMKL(float, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLSharedUSMSpace) +KOKKOSBLAS1_NRM1_ONEMKL(double, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLSharedUSMSpace) +KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLSharedUSMSpace) +KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLSharedUSMSpace) #endif } // namespace Impl diff --git a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp 
index 7bc55becc0..b7b70b5edb 100644 --- a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp @@ -32,60 +32,47 @@ namespace Impl { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS // double -#define KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ - struct nrm2_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View< \ - typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ + template \ + struct nrm2_tpl_spec_avail::mag_type, LAYOUT, \ + Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HostSpace) +KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) #endif -#define KOKKOSBLAS1_NRM2_TPL_SPEC(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ - template <> \ - struct nrm2_tpl_spec_avail< \ - EXECSPACE, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define 
KOKKOSBLAS1_NRM2_TPL_SPEC(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template <> \ + struct nrm2_tpl_spec_avail::mag_type, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -#define KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL(LAYOUT, EXECSPACE, MEMSPACE) \ - KOKKOSBLAS1_NRM2_TPL_SPEC(float, LAYOUT, EXECSPACE, MEMSPACE) \ - KOKKOSBLAS1_NRM2_TPL_SPEC(double, LAYOUT, EXECSPACE, MEMSPACE) \ - KOKKOSBLAS1_NRM2_TPL_SPEC(Kokkos::complex, LAYOUT, EXECSPACE, \ - MEMSPACE) \ - KOKKOSBLAS1_NRM2_TPL_SPEC(Kokkos::complex, LAYOUT, EXECSPACE, \ - MEMSPACE) +#define KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL(LAYOUT, EXECSPACE, MEMSPACE) \ + KOKKOSBLAS1_NRM2_TPL_SPEC(float, LAYOUT, EXECSPACE, MEMSPACE) \ + KOKKOSBLAS1_NRM2_TPL_SPEC(double, LAYOUT, EXECSPACE, MEMSPACE) \ + KOKKOSBLAS1_NRM2_TPL_SPEC(Kokkos::complex, LAYOUT, EXECSPACE, MEMSPACE) \ + KOKKOSBLAS1_NRM2_TPL_SPEC(Kokkos::complex, LAYOUT, EXECSPACE, MEMSPACE) #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) +KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) +KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) #endif #if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(KOKKOS_ENABLE_SYCL) diff --git a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp index ef45238405..b1e4cd58b9 100644 --- a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp @@ -24,8 +24,7 @@ namespace { template inline void nrm2_print_specialization() { #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION - printf("KokkosBlas1::nrm2<> TPL Blas specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XV).name()); + 
printf("KokkosBlas1::nrm2<> TPL Blas specialization for < %s , %s >\n", typeid(RV).name(), typeid(XV).name()); #endif } } // namespace @@ -39,175 +38,131 @@ inline void nrm2_print_specialization() { namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Nrm2< \ - ExecSpace, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm2(const ExecSpace& space, RV& R, const XV& X, \ - const bool& take_sqrt) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_BLAS,double]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm2_print_specialization(); \ - int N = numElems; \ - int int_one = 1; \ - R() = HostBlas::nrm2(N, X.data(), int_one); \ - if (!take_sqrt) R() = R() * R(); \ - } else { \ - Nrm2::nrm2(space, R, X, \ - take_sqrt); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Nrm2 >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void nrm2(const ExecSpace& space, RV& R, const XV& X, const bool& take_sqrt) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_BLAS,double]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + nrm2_print_specialization(); \ + int N = numElems; \ + int int_one = 1; \ + R() = HostBlas::nrm2(N, X.data(), int_one); \ + if (!take_sqrt) R() = R() * R(); \ + } else { \ + Nrm2::nrm2(space, R, X, take_sqrt); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define 
KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Nrm2< \ - ExecSpace, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm2(const ExecSpace& space, RV& R, const XV& X, \ - const bool& take_sqrt) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_BLAS,float]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm2_print_specialization(); \ - int N = numElems; \ - int int_one = 1; \ - R() = HostBlas::nrm2(N, X.data(), int_one); \ - if (!take_sqrt) R() = R() * R(); \ - } else { \ - Nrm2::nrm2(space, R, X, \ - take_sqrt); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Nrm2 >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void nrm2(const ExecSpace& space, RV& R, const XV& X, const bool& take_sqrt) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_BLAS,float]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + nrm2_print_specialization(); \ + int N = numElems; \ + int int_one = 1; \ + R() = HostBlas::nrm2(N, X.data(), int_one); \ + if (!take_sqrt) R() = R() * R(); \ + } else { \ + Nrm2::nrm2(space, R, X, take_sqrt); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Nrm2 >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef 
Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm2(const ExecSpace& space, RV& R, const XV& X, \ - const bool& take_sqrt) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrm2[TPL_BLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm2_print_specialization(); \ - int N = numElems; \ - int int_one = 1; \ - R() = HostBlas >::nrm2( \ - N, reinterpret_cast*>(X.data()), \ - int_one); \ - if (!take_sqrt) R() = R() * R(); \ - } else { \ - Nrm2::nrm2(space, R, X, \ - take_sqrt); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Nrm2 >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > RV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void nrm2(const ExecSpace& space, RV& R, const XV& X, const bool& take_sqrt) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_BLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + nrm2_print_specialization(); \ + int N = numElems; \ + int int_one = 1; \ + R() = HostBlas >::nrm2(N, reinterpret_cast*>(X.data()), \ + int_one); \ + if (!take_sqrt) R() = R() * R(); \ + } else { \ + Nrm2::nrm2(space, R, X, take_sqrt); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Nrm2 >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; 
\ - \ - static void nrm2(const ExecSpace& space, RV& R, const XV& X, \ - const bool& take_sqrt) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrm2[TPL_BLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm2_print_specialization(); \ - int N = numElems; \ - int int_one = 1; \ - R() = HostBlas >::nrm2( \ - N, reinterpret_cast*>(X.data()), \ - int_one); \ - if (!take_sqrt) R() = R() * R(); \ - } else { \ - Nrm2::nrm2(space, R, X, \ - take_sqrt); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Nrm2 >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > RV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void nrm2(const ExecSpace& space, RV& R, const XV& X, const bool& take_sqrt) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_BLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + nrm2_print_specialization(); \ + int N = numElems; \ + int int_one = 1; \ + R() = \ + HostBlas >::nrm2(N, reinterpret_cast*>(X.data()), int_one); \ + if (!take_sqrt) R() = R() * R(); \ + } else { \ + Nrm2::nrm2(space, R, X, take_sqrt); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) +KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) 
+KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) +KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) +KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) } // namespace Impl } // namespace KokkosBlas @@ -220,66 +175,48 @@ KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ - EXECSPACE, MEMSPACE, TPL_NRM2, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Nrm2::mag_type, LAYOUT, \ - Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using RT = Kokkos::ArithTraits::mag_type; \ - using RV = Kokkos::View >; \ - using XV = Kokkos::View, \ - Kokkos::MemoryTraits >; \ - using size_type = typename XV::size_type; \ - \ - static void nrm2(const EXECSPACE& space, RV& R, const XV& X, \ - const bool& take_sqrt) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_CUBLAS," + \ - Kokkos::ArithTraits::name() + \ - "]"); \ - const size_type numElems = X.extent(0); \ - if (numElems <= \ - static_cast(std::numeric_limits::max())) { \ - nrm2_print_specialization(); \ - const int N = static_cast(numElems); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - 
KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - TPL_NRM2(s.handle, N, reinterpret_cast(X.data()), \ - 1, &R())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - if (!take_sqrt) R() = R() * R(); \ - } else { \ - Nrm2::nrm2(space, R, X, \ - take_sqrt); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, EXECSPACE, MEMSPACE, TPL_NRM2, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Nrm2::mag_type, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + using RT = Kokkos::ArithTraits::mag_type; \ + using RV = Kokkos::View >; \ + using XV = Kokkos::View, \ + Kokkos::MemoryTraits >; \ + using size_type = typename XV::size_type; \ + \ + static void nrm2(const EXECSPACE& space, RV& R, const XV& X, const bool& take_sqrt) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_CUBLAS," + Kokkos::ArithTraits::name() + "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems <= static_cast(std::numeric_limits::max())) { \ + nrm2_print_specialization(); \ + const int N = static_cast(numElems); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(TPL_NRM2(s.handle, N, reinterpret_cast(X.data()), 1, &R())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + if (!take_sqrt) R() = R() * R(); \ + } else { \ + Nrm2::nrm2(space, R, X, take_sqrt); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS_EXT(ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, float, float, \ - Kokkos::Cuda, Kokkos::CudaSpace, \ - cublasSnrm2, 
ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, double, double, \ - Kokkos::Cuda, Kokkos::CudaSpace, \ - cublasDnrm2, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS( \ - Kokkos::LayoutLeft, Kokkos::complex, cuComplex, Kokkos::Cuda, \ - Kokkos::CudaSpace, cublasScnrm2, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS( \ - Kokkos::LayoutLeft, Kokkos::complex, cuDoubleComplex, \ - Kokkos::Cuda, Kokkos::CudaSpace, cublasDznrm2, ETI_SPEC_AVAIL) +#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, float, float, Kokkos::Cuda, Kokkos::CudaSpace, \ + cublasSnrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, double, double, Kokkos::Cuda, Kokkos::CudaSpace, \ + cublasDnrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::complex, cuComplex, Kokkos::Cuda, \ + Kokkos::CudaSpace, cublasScnrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::complex, cuDoubleComplex, Kokkos::Cuda, \ + Kokkos::CudaSpace, cublasDznrm2, ETI_SPEC_AVAIL) KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS_EXT(true) KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS_EXT(false) @@ -295,66 +232,48 @@ KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS_EXT(false) namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ - EXECSPACE, MEMSPACE, TPL_NRM2, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Nrm2::mag_type, LAYOUT, \ - Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using RT = Kokkos::ArithTraits::mag_type; \ - using RV = Kokkos::View >; \ - using XV = Kokkos::View, \ - Kokkos::MemoryTraits >; \ - using size_type = typename XV::size_type; \ - \ - static void nrm2(const EXECSPACE& space, RV& R, const XV& X, \ - const bool& take_sqrt) { \ - 
Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_ROCBLAS," + \ - Kokkos::ArithTraits::name() + \ - "]"); \ - const size_type numElems = X.extent(0); \ - if (numElems <= \ - static_cast(std::numeric_limits::max())) { \ - nrm2_print_specialization(); \ - const rocblas_int N = static_cast(numElems); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - TPL_NRM2(s.handle, N, reinterpret_cast(X.data()), \ - 1, &R())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - if (!take_sqrt) R() = R() * R(); \ - } else { \ - Nrm2::nrm2(space, R, X, \ - take_sqrt); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, EXECSPACE, MEMSPACE, TPL_NRM2, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Nrm2::mag_type, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + using RT = Kokkos::ArithTraits::mag_type; \ + using RV = Kokkos::View >; \ + using XV = Kokkos::View, \ + Kokkos::MemoryTraits >; \ + using size_type = typename XV::size_type; \ + \ + static void nrm2(const EXECSPACE& space, RV& R, const XV& X, const bool& take_sqrt) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_ROCBLAS," + Kokkos::ArithTraits::name() + "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems <= static_cast(std::numeric_limits::max())) { \ + nrm2_print_specialization(); \ + const rocblas_int N = static_cast(numElems); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(TPL_NRM2(s.handle, N, reinterpret_cast(X.data()), 1, &R())); \ + 
KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + if (!take_sqrt) R() = R() * R(); \ + } else { \ + Nrm2::nrm2(space, R, X, take_sqrt); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS_EXT(ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, float, float, \ - Kokkos::HIP, Kokkos::HIPSpace, \ - rocblas_snrm2, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, double, double, \ - Kokkos::HIP, Kokkos::HIPSpace, \ - rocblas_dnrm2, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS( \ - Kokkos::LayoutLeft, Kokkos::complex, rocblas_float_complex, \ - Kokkos::HIP, Kokkos::HIPSpace, rocblas_scnrm2, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS( \ - Kokkos::LayoutLeft, Kokkos::complex, rocblas_double_complex, \ - Kokkos::HIP, Kokkos::HIPSpace, rocblas_dznrm2, ETI_SPEC_AVAIL) +#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, float, float, Kokkos::HIP, Kokkos::HIPSpace, \ + rocblas_snrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, double, double, Kokkos::HIP, Kokkos::HIPSpace, \ + rocblas_dnrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::complex, rocblas_float_complex, \ + Kokkos::HIP, Kokkos::HIPSpace, rocblas_scnrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::complex, rocblas_double_complex, \ + Kokkos::HIP, Kokkos::HIPSpace, rocblas_dznrm2, ETI_SPEC_AVAIL) KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS_EXT(true) KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS_EXT(false) @@ -372,64 +291,49 @@ KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS_EXT(false) namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ - EXECSPACE, MEMSPACE, TPL_NRM2, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct 
Nrm2::mag_type, LAYOUT, \ - Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using RT = Kokkos::ArithTraits::mag_type; \ - using RV = Kokkos::View >; \ - using XV = Kokkos::View, \ - Kokkos::MemoryTraits >; \ - using size_type = typename XV::size_type; \ - \ - static void nrm2(const EXECSPACE& space, RV& R, const XV& X, \ - const bool& take_sqrt) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_ONEMKL," + \ - Kokkos::ArithTraits::name() + \ - "]"); \ - const size_type numElems = X.extent(0); \ - if (numElems <= \ - static_cast(std::numeric_limits::max())) { \ - nrm2_print_specialization(); \ - const std::int64_t N = static_cast(numElems); \ - TPL_NRM2(space.sycl_queue(), N, \ - reinterpret_cast(X.data()), 1, &R()); \ - if (!take_sqrt) R() = R() * R(); \ - } else { \ - Nrm2::nrm2(space, R, X, \ - take_sqrt); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL(LAYOUT, KOKKOS_TYPE, TPL_TYPE, EXECSPACE, MEMSPACE, TPL_NRM2, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Nrm2::mag_type, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + using RT = Kokkos::ArithTraits::mag_type; \ + using RV = Kokkos::View >; \ + using XV = Kokkos::View, \ + Kokkos::MemoryTraits >; \ + using size_type = typename XV::size_type; \ + \ + static void nrm2(const EXECSPACE& space, RV& R, const XV& X, const bool& take_sqrt) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_ONEMKL," + Kokkos::ArithTraits::name() + "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems <= static_cast(std::numeric_limits::max())) { \ + nrm2_print_specialization(); \ + const std::int64_t N = static_cast(numElems); \ + TPL_NRM2(space.sycl_queue(), N, reinterpret_cast(X.data()), 1, &R()); \ + if (!take_sqrt) R() = R() * R(); \ + } else { \ + Nrm2::nrm2(space, R, X, take_sqrt); \ + } \ 
+ Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL_EXT(ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL( \ - Kokkos::LayoutLeft, float, float, Kokkos::Experimental::SYCL, \ - Kokkos::Experimental::SYCLDeviceUSMSpace, \ - oneapi::mkl::blas::row_major::nrm2, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL( \ - Kokkos::LayoutLeft, double, double, Kokkos::Experimental::SYCL, \ - Kokkos::Experimental::SYCLDeviceUSMSpace, \ - oneapi::mkl::blas::row_major::nrm2, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL( \ - Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ - Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace, \ - oneapi::mkl::blas::row_major::nrm2, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL( \ - Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ - Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace, \ - oneapi::mkl::blas::row_major::nrm2, ETI_SPEC_AVAIL) +#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL(Kokkos::LayoutLeft, float, float, Kokkos::Experimental::SYCL, \ + Kokkos::Experimental::SYCLDeviceUSMSpace, oneapi::mkl::blas::row_major::nrm2, \ + ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL(Kokkos::LayoutLeft, double, double, Kokkos::Experimental::SYCL, \ + Kokkos::Experimental::SYCLDeviceUSMSpace, oneapi::mkl::blas::row_major::nrm2, \ + ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL(Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ + Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::nrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL(Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ + Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::nrm2, ETI_SPEC_AVAIL) KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL_EXT(true) 
KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL_EXT(false) diff --git a/blas/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp index 88591fbf0c..27647eed11 100644 --- a/blas/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp @@ -33,28 +33,21 @@ namespace Impl { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS // double -#define KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ - struct nrminf_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View< \ - typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ + template \ + struct nrminf_tpl_spec_avail::mag_type, \ + LAYOUT, Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) #endif diff --git a/blas/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp index 17ec54e057..0b2081fc27 100644 --- 
a/blas/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp @@ -24,8 +24,7 @@ namespace { template inline void nrminf_print_specialization() { #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION - printf("KokkosBlas1::nrminf<> TPL Blas specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XV).name()); + printf("KokkosBlas1::nrminf<> TPL Blas specialization for < %s , %s >\n", typeid(RV).name(), typeid(XV).name()); #endif } } // namespace @@ -39,201 +38,152 @@ inline void nrminf_print_specialization() { namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DNRMINF_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct NrmInf< \ - ExecSpace, \ - Kokkos::View>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View> \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XV; \ - typedef typename XV::size_type size_type; \ - typedef Kokkos::Details::InnerProductSpaceTraits IPT; \ - \ - static void nrminf(const ExecSpace& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_BLAS,double]"); \ - const size_type numElems = X.extent(0); \ - if (numElems == 0) { \ - R() = 0.0; \ - return; \ - } \ - if (numElems < static_cast(INT_MAX)) { \ - nrminf_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - int idx = HostBlas::iamax(N, X.data(), one) - 1; \ - R() = IPT::norm(X(idx)); \ - } else { \ - NrmInf::nrminf(space, R, \ - X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DNRMINF_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct NrmInf>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View> RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XV; \ + typedef typename XV::size_type size_type; \ + typedef Kokkos::Details::InnerProductSpaceTraits IPT; \ + \ + static void nrminf(const ExecSpace& 
space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_BLAS,double]"); \ + const size_type numElems = X.extent(0); \ + if (numElems == 0) { \ + R() = 0.0; \ + return; \ + } \ + if (numElems < static_cast(INT_MAX)) { \ + nrminf_print_specialization(); \ + int N = numElems; \ + int one = 1; \ + int idx = HostBlas::iamax(N, X.data(), one) - 1; \ + R() = IPT::norm(X(idx)); \ + } else { \ + NrmInf::nrminf(space, R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_SNRMINF_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct NrmInf< \ - ExecSpace, \ - Kokkos::View>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View> \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XV; \ - typedef typename XV::size_type size_type; \ - typedef Kokkos::Details::InnerProductSpaceTraits IPT; \ - \ - static void nrminf(const ExecSpace& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_BLAS,float]"); \ - const size_type numElems = X.extent(0); \ - if (numElems == 0) { \ - R() = 0.0f; \ - return; \ - } \ - if (numElems < static_cast(INT_MAX)) { \ - nrminf_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - int idx = HostBlas::iamax(N, X.data(), one) - 1; \ - R() = IPT::norm(X(idx)); \ - } else { \ - NrmInf::nrminf(space, R, \ - X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SNRMINF_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct NrmInf>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View> RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XV; \ + typedef typename XV::size_type size_type; \ + typedef Kokkos::Details::InnerProductSpaceTraits IPT; \ + \ + static void nrminf(const ExecSpace& space, RV& R, const XV& X) { \ + 
Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_BLAS,float]"); \ + const size_type numElems = X.extent(0); \ + if (numElems == 0) { \ + R() = 0.0f; \ + return; \ + } \ + if (numElems < static_cast(INT_MAX)) { \ + nrminf_print_specialization(); \ + int N = numElems; \ + int one = 1; \ + int idx = HostBlas::iamax(N, X.data(), one) - 1; \ + R() = IPT::norm(X(idx)); \ + } else { \ + NrmInf::nrminf(space, R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_ZNRMINF_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct NrmInf>, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View> \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits> \ - XV; \ - typedef typename XV::size_type size_type; \ - typedef Kokkos::Details::InnerProductSpaceTraits> \ - IPT; \ - \ - static void nrminf(const ExecSpace& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrminf[TPL_BLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems == 0) { \ - R() = 0.0; \ - return; \ - } \ - if (numElems < static_cast(INT_MAX)) { \ - nrminf_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - int idx = \ - HostBlas>::iamax( \ - N, reinterpret_cast*>(X.data()), \ - one) - \ - 1; \ - R() = IPT::norm(X(idx)); \ - } else { \ - NrmInf::nrminf(space, R, \ - X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZNRMINF_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct NrmInf>, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View> RV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits> \ + XV; \ + typedef typename XV::size_type size_type; \ + typedef Kokkos::Details::InnerProductSpaceTraits> IPT; \ + \ + static void nrminf(const ExecSpace& 
space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_BLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if (numElems == 0) { \ + R() = 0.0; \ + return; \ + } \ + if (numElems < static_cast(INT_MAX)) { \ + nrminf_print_specialization(); \ + int N = numElems; \ + int one = 1; \ + int idx = \ + HostBlas>::iamax(N, reinterpret_cast*>(X.data()), one) - \ + 1; \ + R() = IPT::norm(X(idx)); \ + } else { \ + NrmInf::nrminf(space, R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CNRMINF_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct NrmInf>, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View> \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits> \ - XV; \ - typedef typename XV::size_type size_type; \ - typedef Kokkos::Details::InnerProductSpaceTraits> \ - IPT; \ - \ - static void nrminf(const ExecSpace& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrminf[TPL_BLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems == 0) { \ - R() = 0.0f; \ - return; \ - } \ - if (numElems < static_cast(INT_MAX)) { \ - nrminf_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - int idx = \ - HostBlas>::iamax( \ - N, reinterpret_cast*>(X.data()), \ - one) - \ - 1; \ - R() = IPT::norm(X(idx)); \ - } else { \ - NrmInf::nrminf(space, R, \ - X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CNRMINF_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct NrmInf>, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View> RV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits> \ + XV; \ + typedef typename XV::size_type size_type; \ + typedef 
Kokkos::Details::InnerProductSpaceTraits> IPT; \ + \ + static void nrminf(const ExecSpace& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_BLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if (numElems == 0) { \ + R() = 0.0f; \ + return; \ + } \ + if (numElems < static_cast(INT_MAX)) { \ + nrminf_print_specialization(); \ + int N = numElems; \ + int one = 1; \ + int idx = \ + HostBlas>::iamax(N, reinterpret_cast*>(X.data()), one) - 1; \ + R() = IPT::norm(X(idx)); \ + } else { \ + NrmInf::nrminf(space, R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_DNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_DNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) - -KOKKOSBLAS1_SNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_SNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) - -KOKKOSBLAS1_ZNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_ZNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) - -KOKKOSBLAS1_CNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_CNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) +KOKKOSBLAS1_DNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_DNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) + +KOKKOSBLAS1_SNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_SNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) + +KOKKOSBLAS1_ZNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_ZNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) + +KOKKOSBLAS1_CNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_CNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, 
Kokkos::HostSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas1_rot_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_rot_tpl_spec_avail.hpp index 59f1715e54..fee65fce14 100644 --- a/blas/tpls/KokkosBlas1_rot_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_rot_tpl_spec_avail.hpp @@ -32,62 +32,46 @@ namespace Impl { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXECSPACE) \ - template <> \ - struct rot_tpl_spec_avail< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXECSPACE) \ + template <> \ + struct rot_tpl_spec_avail, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; #ifdef KOKKOS_ENABLE_SERIAL KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial) KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial) -KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Serial) -KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Serial) +KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial) +KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial) #endif #ifdef KOKKOS_ENABLE_OPENMP KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP) KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP) -KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP) -KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP) +KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP) 
+KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP) #endif #endif // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXECSPACE, \ - MEMSPACE) \ - template <> \ - struct rot_tpl_spec_avail< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template <> \ + struct rot_tpl_spec_avail< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) +KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) #endif // rocBLAS diff --git a/blas/tpls/KokkosBlas1_rot_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_rot_tpl_spec_decl.hpp index 8c83f9a096..404c5c0e3b 100644 --- a/blas/tpls/KokkosBlas1_rot_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_rot_tpl_spec_decl.hpp @@ -24,9 +24,8 @@ namespace { template inline void rot_print_specialization() { #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION - printf("KokkosBlas::rot<> TPL Blas specialization for < %s, %s, 
%s >\n", - typeid(VectorView).name(), typeid(ScalarView).name(), - typeid(ExecutionSpace).name); + printf("KokkosBlas::rot<> TPL Blas specialization for < %s, %s, %s >\n", typeid(VectorView).name(), + typeid(ScalarView).name(), typeid(ExecutionSpace).name); #endif } } // namespace @@ -40,110 +39,76 @@ inline void rot_print_specialization() { namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DROT_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct Rot, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using VectorView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using ScalarView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rot(EXECSPACE const& /*space*/, VectorView const& X, \ - VectorView const& Y, ScalarView const& c, \ - ScalarView const& s) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_BLAS,double]"); \ - HostBlas::rot(X.extent_int(0), X.data(), 1, Y.data(), 1, \ - c.data(), s.data()); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DROT_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rot, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using VectorView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using ScalarView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + static void rot(EXECSPACE const& /*space*/, VectorView const& X, VectorView const& Y, ScalarView const& c, \ + ScalarView const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_BLAS,double]"); \ + HostBlas::rot(X.extent_int(0), X.data(), 1, Y.data(), 1, c.data(), s.data()); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_SROT_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct Rot, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using VectorView = \ - 
Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using ScalarView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rot(EXECSPACE const& /*space*/, VectorView const& X, \ - VectorView const& Y, ScalarView const& c, \ - ScalarView const& s) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_BLAS,float]"); \ - HostBlas::rot(X.extent_int(0), X.data(), 1, Y.data(), 1, \ - c.data(), s.data()); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SROT_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rot, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using VectorView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using ScalarView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + static void rot(EXECSPACE const& /*space*/, VectorView const& X, VectorView const& Y, ScalarView const& c, \ + ScalarView const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_BLAS,float]"); \ + HostBlas::rot(X.extent_int(0), X.data(), 1, Y.data(), 1, c.data(), s.data()); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Rot, EXECSPACE, MEMSPACE, true, \ - ETI_SPEC_AVAIL> { \ - using VectorView = \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using ScalarView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rot(EXECSPACE const& /*space*/, VectorView const& X, \ - VectorView const& Y, ScalarView const& c, \ - ScalarView const& s) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::rot[TPL_BLAS,complex]"); \ - HostBlas>::rot( \ - X.extent_int(0), reinterpret_cast*>(X.data()), \ - 1, reinterpret_cast*>(Y.data()), 1, c.data(), \ - s.data()); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Rot, EXECSPACE, MEMSPACE, true, 
ETI_SPEC_AVAIL> { \ + using VectorView = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using ScalarView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + static void rot(EXECSPACE const& /*space*/, VectorView const& X, VectorView const& Y, ScalarView const& c, \ + ScalarView const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_BLAS,complex]"); \ + HostBlas>::rot(X.extent_int(0), reinterpret_cast*>(X.data()), 1, \ + reinterpret_cast*>(Y.data()), 1, c.data(), s.data()); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CROT_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Rot, EXECSPACE, MEMSPACE, true, \ - ETI_SPEC_AVAIL> { \ - using VectorView = \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using ScalarView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rot(EXECSPACE const& /*space*/, VectorView const& X, \ - VectorView const& Y, ScalarView const& c, \ - ScalarView const& s) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::rot[TPL_BLAS,complex]"); \ - HostBlas>::rot( \ - X.extent_int(0), reinterpret_cast*>(X.data()), \ - 1, reinterpret_cast*>(Y.data()), 1, c.data(), \ - s.data()); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CROT_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Rot, EXECSPACE, MEMSPACE, true, ETI_SPEC_AVAIL> { \ + using VectorView = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using ScalarView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + static void rot(EXECSPACE const& /*space*/, VectorView const& X, VectorView const& Y, ScalarView const& c, \ + ScalarView const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_BLAS,complex]"); \ + HostBlas>::rot(X.extent_int(0), reinterpret_cast*>(X.data()), 1, \ + reinterpret_cast*>(Y.data()), 1, c.data(), s.data()); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #ifdef KOKKOS_ENABLE_SERIAL @@ 
-186,230 +151,149 @@ KOKKOSBLAS1_CROT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, false) namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rot< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using VectorView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using ScalarView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rot(EXECSPACE const& space, VectorView const& X, \ - VectorView const& Y, ScalarView const& c, \ - ScalarView const& s) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_CUBLAS,double]"); \ - rot_print_specialization(); \ - KokkosBlas::Impl::CudaBlasSingleton& singleton = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(singleton.handle, space.cuda_stream())); \ - cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - cublasDrot(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1, \ - c.data(), s.data()); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(singleton.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rot< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using VectorView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using ScalarView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rot(EXECSPACE const& space, VectorView const& X, VectorView const& Y, ScalarView const& c, \ + ScalarView const& s) { \ + 
Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_CUBLAS,double]"); \ + rot_print_specialization(); \ + KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + cublasPointerMode_t pointer_mode; \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + cublasDrot(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1, c.data(), s.data()); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rot, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using VectorView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using ScalarView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rot(EXECSPACE const& space, VectorView const& X, \ - VectorView const& Y, ScalarView const& c, \ - ScalarView const& s) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_CUBLAS,float]"); \ - rot_print_specialization(); \ - KokkosBlas::Impl::CudaBlasSingleton& singleton = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(singleton.handle, space.cuda_stream())); \ - cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - cublasSrot(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1, \ - c.data(), s.data()); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(singleton.handle, pointer_mode)); \ - 
Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rot< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, true, \ + ETI_SPEC_AVAIL> { \ + using VectorView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using ScalarView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rot(EXECSPACE const& space, VectorView const& X, VectorView const& Y, ScalarView const& c, \ + ScalarView const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_CUBLAS,float]"); \ + rot_print_specialization(); \ + KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + cublasPointerMode_t pointer_mode; \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + cublasSrot(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1, c.data(), s.data()); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rot*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using VectorView = Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using ScalarView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rot(EXECSPACE const& space, VectorView const& X, \ - VectorView const& Y, ScalarView const& c, \ - ScalarView const& s) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::rot[TPL_CUBLAS,complex]"); \ - rot_print_specialization(); \ - 
KokkosBlas::Impl::CudaBlasSingleton& singleton = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(singleton.handle, space.cuda_stream())); \ - cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - cublasZdrot(singleton.handle, X.extent_int(0), \ - reinterpret_cast(X.data()), 1, \ - reinterpret_cast(Y.data()), 1, c.data(), \ - s.data()); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(singleton.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rot< \ + EXECSPACE, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using VectorView = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using ScalarView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rot(EXECSPACE const& space, VectorView const& X, VectorView const& Y, ScalarView const& c, \ + ScalarView const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_CUBLAS,complex]"); \ + rot_print_specialization(); \ + KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + cublasPointerMode_t pointer_mode; \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + cublasZdrot(singleton.handle, X.extent_int(0), reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1, c.data(), s.data()); \ + 
KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rot*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using VectorView = Kokkos::View, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using ScalarView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rot(EXECSPACE const& space, VectorView const& X, \ - VectorView const& Y, ScalarView const& c, \ - ScalarView const& s) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::rot[TPL_CUBLAS,complex]"); \ - rot_print_specialization(); \ - KokkosBlas::Impl::CudaBlasSingleton& singleton = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(singleton.handle, space.cuda_stream())); \ - cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - cublasCsrot(singleton.handle, X.extent_int(0), \ - reinterpret_cast(X.data()), 1, \ - reinterpret_cast(Y.data()), 1, c.data(), \ - s.data()); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(singleton.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rot< \ + EXECSPACE, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, true, \ + ETI_SPEC_AVAIL> { \ + using VectorView = Kokkos::View, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using ScalarView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rot(EXECSPACE const& space, 
VectorView const& X, VectorView const& Y, ScalarView const& c, \ + ScalarView const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_CUBLAS,complex]"); \ + rot_print_specialization(); \ + KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + cublasPointerMode_t pointer_mode; \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + cublasCsrot(singleton.handle, X.extent_int(0), reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1, c.data(), s.data()); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, 
Kokkos::CudaSpace, true) +KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) 
+KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, 
Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) } // namespace Impl } // namespace KokkosBlas #endif // KOKKOSKERNELS_ENABLE_TPL_CUBLAS diff --git a/blas/tpls/KokkosBlas1_rotg_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_rotg_tpl_spec_avail.hpp index ea94ff04dc..f8b8184b80 100644 --- a/blas/tpls/KokkosBlas1_rotg_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_rotg_tpl_spec_avail.hpp @@ -32,157 +32,90 @@ namespace Impl { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, 
EXECSPACE, \ - MEMSPACE) \ - template <> \ - struct rotg_tpl_spec_avail< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template <> \ + struct rotg_tpl_spec_avail< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; #ifdef KOKKOS_ENABLE_SERIAL -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, - Kokkos::Serial, Kokkos::HostSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Serial, Kokkos::HostSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) 
+KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) #endif #ifdef KOKKOS_ENABLE_OPENMP -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, - Kokkos::OpenMP, Kokkos::HostSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP, Kokkos::HostSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) 
+KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) #endif #endif // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXECSPACE, \ - MEMSPACE) \ - template <> \ - struct rotg_tpl_spec_avail< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template <> \ + struct rotg_tpl_spec_avail< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, - Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) 
+KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, - Kokkos::Cuda, Kokkos::CudaUVMSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) 
+KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) #endif // rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -#define KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXECSPACE, \ - MEMSPACE) \ - template <> \ - struct rotg_tpl_spec_avail< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template <> \ + struct rotg_tpl_spec_avail< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, - Kokkos::HIP, Kokkos::HIPSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) 
+KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) #endif } // namespace Impl diff --git a/blas/tpls/KokkosBlas1_rotg_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_rotg_tpl_spec_decl.hpp index ee6a6c8c04..e6583d5ae3 100644 --- a/blas/tpls/KokkosBlas1_rotg_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_rotg_tpl_spec_decl.hpp @@ -24,8 +24,8 @@ namespace { template inline void rotg_print_specialization() { #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION - printf("KokkosBlas1::rotg<> TPL Blas specialization for < %s, %s >\n", - typeid(Scalar).name(), typeid(ExecutionSpace).name); + printf("KokkosBlas1::rotg<> TPL Blas specialization for < %s, %s >\n", typeid(Scalar).name(), + typeid(ExecutionSpace).name); #endif } } // namespace @@ -39,184 +39,130 @@ inline void rotg_print_specialization() { namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotg< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using SViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using MViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rotg(EXECSPACE const, SViewType const& a, SViewType const& b, \ - 
MViewType const& c, SViewType const& s) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_BLAS,double]"); \ - HostBlas::rotg(a.data(), b.data(), c.data(), s.data()); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotg< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using SViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using MViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rotg(EXECSPACE const, SViewType const& a, SViewType const& b, MViewType const& c, \ + SViewType const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_BLAS,double]"); \ + HostBlas::rotg(a.data(), b.data(), c.data(), s.data()); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotg, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using SViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using MViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rotg(EXECSPACE const, SViewType const& a, SViewType const& b, \ - MViewType const& c, SViewType const& s) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_BLAS,float]"); \ - HostBlas::rotg(a.data(), b.data(), c.data(), s.data()); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotg< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, true, \ + ETI_SPEC_AVAIL> { \ + using SViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using MViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rotg(EXECSPACE const, SViewType const& a, SViewType 
const& b, MViewType const& c, \ + SViewType const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_BLAS,float]"); \ + HostBlas::rotg(a.data(), b.data(), c.data(), s.data()); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotg< \ - EXECSPACE, \ - Kokkos::View, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using SViewType = Kokkos::View, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using MViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rotg(EXECSPACE const, SViewType const& a, SViewType const& b, \ - MViewType const& c, SViewType const& s) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::rotg[TPL_BLAS,complex]"); \ - HostBlas>::rotg( \ - reinterpret_cast*>(a.data()), \ - reinterpret_cast*>(b.data()), c.data(), \ - reinterpret_cast*>(s.data())); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotg< \ + EXECSPACE, \ + Kokkos::View, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using SViewType = Kokkos::View, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using MViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rotg(EXECSPACE const, SViewType const& a, SViewType const& b, MViewType const& c, \ + SViewType const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_BLAS,complex]"); \ + HostBlas>::rotg(reinterpret_cast*>(a.data()), \ + reinterpret_cast*>(b.data()), c.data(), \ + reinterpret_cast*>(s.data())); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotg, LAYOUT, \ 
- Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using SViewType = Kokkos::View, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using MViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rotg(EXECSPACE const, SViewType const& a, SViewType const& b, \ - MViewType const& c, SViewType const& s) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::rotg[TPL_BLAS,complex]"); \ - HostBlas>::rotg( \ - reinterpret_cast*>(a.data()), \ - reinterpret_cast*>(b.data()), c.data(), \ - reinterpret_cast*>(s.data())); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotg< \ + EXECSPACE, \ + Kokkos::View, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, true, \ + ETI_SPEC_AVAIL> { \ + using SViewType = Kokkos::View, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using MViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rotg(EXECSPACE const, SViewType const& a, SViewType const& b, MViewType const& c, \ + SViewType const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_BLAS,complex]"); \ + HostBlas>::rotg(reinterpret_cast*>(a.data()), \ + reinterpret_cast*>(b.data()), c.data(), \ + reinterpret_cast*>(s.data())); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #ifdef KOKKOS_ENABLE_SERIAL -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, true) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace, true) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, false) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace, false) - -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, true) 
-KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace, true) -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, false) -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace, false) - -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, false) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace, false) - -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, true) -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace, true) -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, false) -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace, false) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) + +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) + +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, 
Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) + +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) #endif #ifdef KOKKOS_ENABLE_OPENMP -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, true) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace, true) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, false) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace, false) - -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, true) -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace, true) -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, false) -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace, false) - -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, false) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace, 
false) - -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, true) -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace, true) -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, false) -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace, false) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) + +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) + +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) + +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, 
Kokkos::HostSpace, false) #endif } // namespace Impl @@ -231,231 +177,151 @@ KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotg< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using SViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using MViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rotg(EXECSPACE const& space, SViewType const& a, \ - SViewType const& b, MViewType const& c, \ - SViewType const& s) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_CUBLAS,double]"); \ - rotg_print_specialization(); \ - KokkosBlas::Impl::CudaBlasSingleton& singleton = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(singleton.handle, space.cuda_stream())); \ - cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDrotg(singleton.handle, a.data(), \ - b.data(), c.data(), s.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(singleton.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotg< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using SViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using MViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rotg(EXECSPACE const& space, SViewType const& a, SViewType const& b, MViewType const& c, 
\ + SViewType const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_CUBLAS,double]"); \ + rotg_print_specialization(); \ + KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + cublasPointerMode_t pointer_mode; \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDrotg(singleton.handle, a.data(), b.data(), c.data(), s.data())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotg, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using SViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using MViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rotg(EXECSPACE const& space, SViewType const& a, \ - SViewType const& b, MViewType const& c, \ - SViewType const& s) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_CUBLAS,float]"); \ - rotg_print_specialization(); \ - KokkosBlas::Impl::CudaBlasSingleton& singleton = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(singleton.handle, space.cuda_stream())); \ - cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSrotg(singleton.handle, a.data(), \ - b.data(), c.data(), s.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - 
cublasSetPointerMode(singleton.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotg< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, true, \ + ETI_SPEC_AVAIL> { \ + using SViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using MViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rotg(EXECSPACE const& space, SViewType const& a, SViewType const& b, MViewType const& c, \ + SViewType const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_CUBLAS,float]"); \ + rotg_print_specialization(); \ + KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + cublasPointerMode_t pointer_mode; \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSrotg(singleton.handle, a.data(), b.data(), c.data(), s.data())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotg< \ - EXECSPACE, \ - Kokkos::View, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using SViewType = Kokkos::View, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using MViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rotg(EXECSPACE const& space, SViewType const& a, \ - SViewType const& b, MViewType const& c, \ - SViewType const& s) { \ - Kokkos::Profiling::pushRegion( \ - 
"KokkosBlas::rotg[TPL_CUBLAS,complex]"); \ - rotg_print_specialization, EXECSPACE>(); \ - KokkosBlas::Impl::CudaBlasSingleton& singleton = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(singleton.handle, space.cuda_stream())); \ - cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZrotg( \ - singleton.handle, reinterpret_cast(a.data()), \ - reinterpret_cast(b.data()), c.data(), \ - reinterpret_cast(s.data()))); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(singleton.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotg< \ + EXECSPACE, \ + Kokkos::View, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using SViewType = Kokkos::View, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using MViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rotg(EXECSPACE const& space, SViewType const& a, SViewType const& b, MViewType const& c, \ + SViewType const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_CUBLAS,complex]"); \ + rotg_print_specialization, EXECSPACE>(); \ + KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + cublasPointerMode_t pointer_mode; \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + 
KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZrotg(singleton.handle, reinterpret_cast(a.data()), \ + reinterpret_cast(b.data()), c.data(), \ + reinterpret_cast(s.data()))); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotg, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using SViewType = Kokkos::View, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using MViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rotg(EXECSPACE const& space, SViewType const& a, \ - SViewType const& b, MViewType const& c, \ - SViewType const& s) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::rotg[TPL_CUBLAS,complex]"); \ - rotg_print_specialization, EXECSPACE>(); \ - KokkosBlas::Impl::CudaBlasSingleton& singleton = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(singleton.handle, space.cuda_stream())); \ - cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCrotg( \ - singleton.handle, reinterpret_cast(a.data()), \ - reinterpret_cast(b.data()), c.data(), \ - reinterpret_cast(s.data()))); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(singleton.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotg< \ + EXECSPACE, \ + Kokkos::View, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, true, \ + 
ETI_SPEC_AVAIL> { \ + using SViewType = Kokkos::View, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using MViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rotg(EXECSPACE const& space, SViewType const& a, SViewType const& b, MViewType const& c, \ + SViewType const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_CUBLAS,complex]"); \ + rotg_print_specialization, EXECSPACE>(); \ + KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + cublasPointerMode_t pointer_mode; \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCrotg(singleton.handle, reinterpret_cast(a.data()), \ + reinterpret_cast(b.data()), c.data(), \ + reinterpret_cast(s.data()))); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) - 
-KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) 
-KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) 
+KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) } // namespace Impl } // namespace KokkosBlas @@ -469,201 +335,137 @@ KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, namespace KokkosBlas 
{ namespace Impl { -#define KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotg< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using SViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using MViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rotg(EXECSPACE const& space, SViewType const& a, \ - SViewType const& b, MViewType const& c, \ - SViewType const& s) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_ROCBLAS,double]"); \ - rotg_print_specialization(); \ - KokkosBlas::Impl::RocBlasSingleton& singleton = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(singleton.handle, space.hip_stream())); \ - rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode( \ - singleton.handle, rocblas_pointer_mode_device)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_drotg( \ - singleton.handle, a.data(), b.data(), c.data(), s.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotg< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using SViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using MViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rotg(EXECSPACE const& space, SViewType const& a, SViewType const& b, MViewType const& c, \ + SViewType const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_ROCBLAS,double]"); \ + rotg_print_specialization(); \ + 
KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + rocblas_pointer_mode pointer_mode; \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, rocblas_pointer_mode_device)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_drotg(singleton.handle, a.data(), b.data(), c.data(), s.data())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotg, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using SViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using MViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rotg(EXECSPACE const& space, SViewType const& a, \ - SViewType const& b, MViewType const& c, \ - SViewType const& s) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_ROCBLAS,float]"); \ - rotg_print_specialization(); \ - KokkosBlas::Impl::RocBlasSingleton& singleton = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(singleton.handle, space.hip_stream())); \ - rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode( \ - singleton.handle, rocblas_pointer_mode_device)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_srotg( \ - singleton.handle, a.data(), b.data(), c.data(), s.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ 
+#define KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotg< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, true, \ + ETI_SPEC_AVAIL> { \ + using SViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using MViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rotg(EXECSPACE const& space, SViewType const& a, SViewType const& b, MViewType const& c, \ + SViewType const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_ROCBLAS,float]"); \ + rotg_print_specialization(); \ + KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + rocblas_pointer_mode pointer_mode; \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, rocblas_pointer_mode_device)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_srotg(singleton.handle, a.data(), b.data(), c.data(), s.data())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotg< \ - EXECSPACE, \ - Kokkos::View, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using SViewType = Kokkos::View, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using MViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rotg(EXECSPACE const& space, SViewType const& a, \ - SViewType const& b, MViewType const& c, \ - SViewType const& s) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::rotg[TPL_ROCBLAS,complex]"); \ - rotg_print_specialization, 
EXECSPACE>(); \ - KokkosBlas::Impl::RocBlasSingleton& singleton = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(singleton.handle, space.hip_stream())); \ - rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode( \ - singleton.handle, rocblas_pointer_mode_device)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zrotg( \ - singleton.handle, \ - reinterpret_cast(a.data()), \ - reinterpret_cast(b.data()), c.data(), \ - reinterpret_cast(s.data()))); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotg< \ + EXECSPACE, \ + Kokkos::View, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using SViewType = Kokkos::View, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using MViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rotg(EXECSPACE const& space, SViewType const& a, SViewType const& b, MViewType const& c, \ + SViewType const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_ROCBLAS,complex]"); \ + rotg_print_specialization, EXECSPACE>(); \ + KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + rocblas_pointer_mode pointer_mode; \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, rocblas_pointer_mode_device)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zrotg(singleton.handle, \ + 
reinterpret_cast(a.data()), \ + reinterpret_cast(b.data()), c.data(), \ + reinterpret_cast(s.data()))); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotg, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using SViewType = Kokkos::View, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using MViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rotg(EXECSPACE const& space, SViewType const& a, \ - SViewType const& b, MViewType const& c, \ - SViewType const& s) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::rotg[TPL_ROCBLAS,complex]"); \ - rotg_print_specialization, EXECSPACE>(); \ - KokkosBlas::Impl::RocBlasSingleton& singleton = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(singleton.handle, space.hip_stream())); \ - rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode( \ - singleton.handle, rocblas_pointer_mode_device)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_crotg( \ - singleton.handle, \ - reinterpret_cast(a.data()), \ - reinterpret_cast(b.data()), c.data(), \ - reinterpret_cast(s.data()))); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotg< \ + EXECSPACE, \ + Kokkos::View, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, true, \ + ETI_SPEC_AVAIL> { \ + using SViewType = 
Kokkos::View, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using MViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rotg(EXECSPACE const& space, SViewType const& a, SViewType const& b, MViewType const& c, \ + SViewType const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_ROCBLAS,complex]"); \ + rotg_print_specialization, EXECSPACE>(); \ + KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + rocblas_pointer_mode pointer_mode; \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, rocblas_pointer_mode_device)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_crotg(singleton.handle, \ + reinterpret_cast(a.data()), \ + reinterpret_cast(b.data()), c.data(), \ + reinterpret_cast(s.data()))); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, false) - 
-KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, false) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) + +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) + +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, 
false) + +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas1_rotm_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_rotm_tpl_spec_avail.hpp index 2a1ee21cc6..84e7452e65 100644 --- a/blas/tpls/KokkosBlas1_rotm_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_rotm_tpl_spec_avail.hpp @@ -34,90 +34,65 @@ namespace Impl { // ARMPL is disabled as it does not detect some corner // cases correctly which leads to failing unit-tests #if defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) -#define KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct rotm_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct rotm_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; #ifdef KOKKOS_ENABLE_SERIAL -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, - Kokkos::Serial, Kokkos::HostSpace) -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace) +KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) 
+KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) #endif #ifdef KOKKOS_ENABLE_OPENMP -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, - Kokkos::OpenMP, Kokkos::HostSpace) -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace) +KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) #endif #endif // KOKKOSKERNELS_ENABLE_TPL_BLAS // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct rotm_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct rotm_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - 
Kokkos::CudaSpace) -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, - Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) +KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) #endif // rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -#define KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct rotm_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct rotm_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, - Kokkos::HIP, Kokkos::HIPSpace) -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) +KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) 
+KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) #endif } // namespace Impl diff --git a/blas/tpls/KokkosBlas1_rotm_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_rotm_tpl_spec_decl.hpp index ce8826e1ee..7bde6d0835 100644 --- a/blas/tpls/KokkosBlas1_rotm_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_rotm_tpl_spec_decl.hpp @@ -24,8 +24,7 @@ namespace { template inline void rotm_print_specialization() { #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION - printf("KokkosBlas1::rotm<> TPL Blas specialization for < %s >\n", - typeid(Scalar).name()); + printf("KokkosBlas1::rotm<> TPL Blas specialization for < %s >\n", typeid(Scalar).name()); #endif } } // namespace @@ -39,68 +38,45 @@ inline void rotm_print_specialization() { namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct Rotm< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using VectorView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using ParamView = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rotm(EXEC_SPACE const& /* space */, VectorView& X, \ - VectorView& Y, ParamView& param) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotm[TPL_BLAS,SCALAR]"); \ - HostBlas::rotm(X.extent(0), X.data(), 1, Y.data(), 1, \ - param.data()); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotm< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using VectorView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using ParamView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + static void rotm(EXEC_SPACE const& /* space */, VectorView& X, VectorView& Y, ParamView& 
param) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotm[TPL_BLAS,SCALAR]"); \ + HostBlas::rotm(X.extent(0), X.data(), 1, Y.data(), 1, param.data()); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #ifdef KOKKOS_ENABLE_SERIAL -KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, false) -KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace, false) -KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, false) -KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) #endif #ifdef KOKKOS_ENABLE_OPENMP 
-KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, false) -KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace, false) -KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, false) -KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) #endif } // namespace Impl @@ -115,101 +91,69 @@ KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - 
template <> \ - struct Rotm< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using VectorView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using ParamView = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void rotm(EXEC_SPACE const& space, VectorView const& X, \ - VectorView const& Y, ParamView const& param) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotm[TPL_CUBLAS,double]"); \ - rotm_print_specialization(); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasGetPointerMode(s.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(s.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDrotm( \ - s.handle, X.extent(0), X.data(), 1, Y.data(), 1, param.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(s.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotm< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using VectorView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using ParamView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void rotm(EXEC_SPACE const& space, VectorView const& X, VectorView const& Y, ParamView const& param) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotm[TPL_CUBLAS,double]"); \ + rotm_print_specialization(); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + cublasPointerMode_t 
pointer_mode; \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(s.handle, &pointer_mode)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDrotm(s.handle, X.extent(0), X.data(), 1, Y.data(), 1, param.data())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) - -#define KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotm< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using VectorView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using ParamView = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void rotm(EXEC_SPACE const& space, VectorView const& X, \ - VectorView const& Y, ParamView const& param) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotm[TPL_CUBLAS,float]"); \ - rotm_print_specialization(); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasGetPointerMode(s.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(s.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSrotm( \ - s.handle, X.extent(0), X.data(), 1, Y.data(), 1, param.data())); \ - 
KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(s.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) + +#define KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotm< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using VectorView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using ParamView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void rotm(EXEC_SPACE const& space, VectorView const& X, VectorView const& Y, ParamView const& param) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotm[TPL_CUBLAS,float]"); \ + rotm_print_specialization(); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + cublasPointerMode_t pointer_mode; \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(s.handle, &pointer_mode)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSrotm(s.handle, X.extent(0), X.data(), 1, Y.data(), 1, param.data())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) 
-KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) +KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) } // namespace Impl } // namespace KokkosBlas @@ -223,103 +167,71 @@ KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotm< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using VectorView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using PView = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void rotm(EXEC_SPACE const& space, VectorView const& X, \ - VectorView const& Y, PView const& param) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotm[TPL_ROCBLAS,double]"); \ - rotm_print_specialization(); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_drotm(s.handle, static_cast(X.extent(0)), X.data(), 1, \ - Y.data(), 1, param.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( 
\ - rocblas_set_pointer_mode(s.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotm< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using VectorView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using PView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void rotm(EXEC_SPACE const& space, VectorView const& X, VectorView const& Y, PView const& param) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotm[TPL_ROCBLAS,double]"); \ + rotm_print_specialization(); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + rocblas_pointer_mode pointer_mode; \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_drotm(s.handle, static_cast(X.extent(0)), X.data(), 1, Y.data(), 1, param.data())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) -KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, false) - -#define KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotm< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - 
Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using VectorView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using PView = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void rotm(EXEC_SPACE const& space, VectorView const& X, \ - VectorView const& Y, PView const& param) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotm[TPL_ROCBLAS,float]"); \ - rotm_print_specialization(); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_srotm(s.handle, static_cast(X.extent(0)), X.data(), 1, \ - Y.data(), 1, param.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) + +#define KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotm< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using VectorView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using PView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void rotm(EXEC_SPACE const& space, VectorView const& X, VectorView const& Y, PView const& param) { \ + 
Kokkos::Profiling::pushRegion("KokkosBlas::rotm[TPL_ROCBLAS,float]"); \ + rotm_print_specialization(); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + rocblas_pointer_mode pointer_mode; \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_srotm(s.handle, static_cast(X.extent(0)), X.data(), 1, Y.data(), 1, param.data())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) -KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, false) +KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas1_rotmg_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_rotmg_tpl_spec_avail.hpp index d4db1143f9..3a2925fd49 100644 --- a/blas/tpls/KokkosBlas1_rotmg_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_rotmg_tpl_spec_avail.hpp @@ -33,88 +33,66 @@ namespace Impl { // Generic Host side BLAS (could be MKL or whatever) // ARMPL is disabled as it does not detect 
some corner // cases correctly which leads to failing unit-tests -#if defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) && \ - !defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) -#define KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct rotmg_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#if defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) && !defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) +#define KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct rotmg_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; #ifdef KOKKOS_ENABLE_SERIAL -KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::Serial, Kokkos::HostSpace) -KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, - Kokkos::Serial, Kokkos::HostSpace) -KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, - Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) #endif #ifdef KOKKOS_ENABLE_OPENMP -KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::OpenMP, Kokkos::HostSpace) -KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) 
-KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, - Kokkos::OpenMP, Kokkos::HostSpace) -KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, - Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) #endif #endif // KOKKOSKERNELS_ENABLE_TPL_BLAS // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct rotmg_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct rotmg_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, - Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, - Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, - Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) 
+KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) #endif // rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -#define KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct rotmg_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct rotmg_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; // Turning off use of rocBLAS as it returns false results in some of the diff --git a/blas/tpls/KokkosBlas1_rotmg_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_rotmg_tpl_spec_decl.hpp index e911294df4..0271cfd981 100644 --- a/blas/tpls/KokkosBlas1_rotmg_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_rotmg_tpl_spec_decl.hpp @@ -24,8 +24,7 @@ namespace { template inline void rotmg_print_specialization() { #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION - printf("KokkosBlas1::rotmg<> TPL Blas specialization for < %s >\n", - typeid(Scalar).name()); + printf("KokkosBlas1::rotmg<> TPL Blas specialization for < %s >\n", typeid(Scalar).name()); #endif } } // namespace @@ -33,80 +32,54 @@ inline void rotmg_print_specialization() { } // namespace KokkosBlas // Generic Host side BLAS (could be MKL or whatever) -#if defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) && \ - !defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) +#if defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) && !defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) #include "KokkosBlas_Host_tpl.hpp" namespace KokkosBlas { namespace Impl { -#define 
KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct Rotmg< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using DXView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using YView = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using PView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rotmg(EXEC_SPACE const& /* space */, DXView& d1, DXView& d2, \ - DXView& x1, YView& y1, PView& param) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotmg[TPL_BLAS,double]"); \ - HostBlas::rotmg(d1.data(), d2.data(), x1.data(), y1.data(), \ - param.data()); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotmg< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using DXView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using YView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using PView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + static void rotmg(EXEC_SPACE const& /* space */, DXView& d1, DXView& d2, DXView& x1, YView& y1, PView& param) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotmg[TPL_BLAS,double]"); \ + HostBlas::rotmg(d1.data(), d2.data(), x1.data(), y1.data(), param.data()); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #ifdef KOKKOS_ENABLE_SERIAL -KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, false) -KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, - Kokkos::Serial, Kokkos::HostSpace, true) 
-KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, - Kokkos::Serial, Kokkos::HostSpace, false) -KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, false) -KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) #endif #ifdef KOKKOS_ENABLE_OPENMP -KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, false) -KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, - Kokkos::OpenMP, Kokkos::HostSpace, true) -KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, - Kokkos::OpenMP, Kokkos::HostSpace, false) -KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, true) 
-KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, false) -KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) #endif } // namespace Impl @@ -121,114 +94,77 @@ KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotmg< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using DXView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using YView = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using PView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void rotmg(EXEC_SPACE const& space, DXView const& d1, \ - DXView const& d2, DXView const& x1, YView const& y1, \ - PView const& 
param) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotmg[TPL_CUBLAS,double]"); \ - rotmg_print_specialization(); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasGetPointerMode(s.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(s.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDrotmg(s.handle, d1.data(), \ - d2.data(), x1.data(), \ - y1.data(), param.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(s.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotmg< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using DXView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using YView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using PView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void rotmg(EXEC_SPACE const& space, DXView const& d1, DXView const& d2, DXView const& x1, YView const& y1, \ + PView const& param) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotmg[TPL_CUBLAS,double]"); \ + rotmg_print_specialization(); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + cublasPointerMode_t pointer_mode; \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(s.handle, &pointer_mode)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDrotmg(s.handle, d1.data(), d2.data(), x1.data(), 
y1.data(), param.data())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) +KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) -#define KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotmg< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using DXView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using YView = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using PView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void rotmg(EXEC_SPACE const& space, DXView const& d1, \ - DXView const& d2, DXView const& x1, YView const& y1, \ - PView const& param) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotmg[TPL_CUBLAS,float]"); \ - rotmg_print_specialization(); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - 
cublasGetPointerMode(s.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(s.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSrotmg(s.handle, d1.data(), \ - d2.data(), x1.data(), \ - y1.data(), param.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(s.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotmg< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using DXView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using YView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using PView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void rotmg(EXEC_SPACE const& space, DXView const& d1, DXView const& d2, DXView const& x1, YView const& y1, \ + PView const& param) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotmg[TPL_CUBLAS,float]"); \ + rotmg_print_specialization(); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + cublasPointerMode_t pointer_mode; \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(s.handle, &pointer_mode)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSrotmg(s.handle, d1.data(), d2.data(), x1.data(), y1.data(), param.data())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) 
-KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) +KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) } // namespace Impl } // namespace KokkosBlas @@ -242,114 +178,79 @@ KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXEC_SPACE, \ - MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct Rotmg< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using DXView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using YView = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using PView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void rotmg(EXEC_SPACE const& space, DXView const& d1, \ - DXView const& d2, DXView const& x1, YView const& y1, \ - PView const& param) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotmg[TPL_ROCBLAS,double]"); \ - rotmg_print_specialization(); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ - 
KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_drotmg(s.handle, d1.data(), \ - d2.data(), x1.data(), \ - y1.data(), param.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotmg< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using DXView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using YView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using PView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void rotmg(EXEC_SPACE const& space, DXView const& d1, DXView const& d2, DXView const& x1, YView const& y1, \ + PView const& param) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotmg[TPL_ROCBLAS,double]"); \ + rotmg_print_specialization(); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + rocblas_pointer_mode pointer_mode; \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_drotmg(s.handle, d1.data(), d2.data(), x1.data(), y1.data(), param.data())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) 
-KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, false) +KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) -#define KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXEC_SPACE, \ - MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct Rotmg< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using DXView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using YView = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using PView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void rotmg(EXEC_SPACE const& space, DXView const& d1, \ - DXView const& d2, DXView const& x1, YView const& y1, \ - PView const& param) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotmg[TPL_ROCBLAS,float]"); \ - rotmg_print_specialization(); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_srotmg(s.handle, d1.data(), \ - d2.data(), x1.data(), \ - y1.data(), param.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define 
KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotmg< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using DXView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using YView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using PView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void rotmg(EXEC_SPACE const& space, DXView const& d1, DXView const& d2, DXView const& x1, YView const& y1, \ + PView const& param) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotmg[TPL_ROCBLAS,float]"); \ + rotmg_print_specialization(); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + rocblas_pointer_mode pointer_mode; \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_srotmg(s.handle, d1.data(), d2.data(), x1.data(), y1.data(), param.data())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) -KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, false) +KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) 
+KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp index 5c5a6008ec..b5efa5c3a4 100644 --- a/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp @@ -20,8 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct scal_tpl_spec_avail { enum : bool { value = false }; }; @@ -34,98 +33,71 @@ namespace Impl { // Generic Host side BLAS (could be MKL or whatever) #if defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) // double -#define KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ - struct scal_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ + template \ + struct scal_tpl_spec_avail< \ + ExecSpace, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HostSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, 
Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) #endif // cuBLAS #if defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) // double -#define KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXECSPACE, \ - MEMSPACE) \ - template <> \ - struct scal_tpl_spec_avail< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template <> \ + struct scal_tpl_spec_avail< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) 
-KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) #endif // rocBLAS #if defined(KOKKOSKERNELS_ENABLE_TPL_ROCBLAS) -#define KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXECSPACE, \ - MEMSPACE) \ - template <> \ - struct scal_tpl_spec_avail< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template <> \ + struct scal_tpl_spec_avail< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) 
+KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) #endif diff --git a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp index da11555f7b..7083e28730 100644 --- a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp @@ -24,8 +24,8 @@ namespace { template inline void scal_print_specialization() { #if defined(KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION) - printf("KokkosBlas1::scal<> TPL Blas specialization for < %s , %s , %s >\n", - typeid(RV).name(), typeid(AS).name(), typeid(XV).name()); + printf("KokkosBlas1::scal<> TPL Blas specialization for < %s , %s , %s >\n", typeid(RV).name(), typeid(AS).name(), + typeid(XV).name()); #endif } } // namespace @@ -38,87 +38,63 @@ inline void scal_print_specialization() { namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_BLAS(SCALAR_TYPE, BASE_SCALAR_TYPE, \ - LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Scal< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR_TYPE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - RV; \ - typedef SCALAR_TYPE AS; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void scal(const ExecSpace& space, const RV& R, const AS& alpha, \ - const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::scal[TPL_BLAS," #SCALAR_TYPE \ - "]"); \ - const size_type numElems = X.extent(0); \ - if ((numElems < static_cast(INT_MAX)) && \ - (R.data() == X.data())) { \ - scal_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - const BASE_SCALAR_TYPE alpha_b = static_cast(alpha); \ - HostBlas::scal( \ - N, alpha_b, reinterpret_cast(R.data()), one); \ - } else { \ - Scal::scal(space, R, \ - alpha, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ 
+#define KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_BLAS(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Scal, \ + Kokkos::MemoryTraits >, \ + SCALAR_TYPE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + RV; \ + typedef SCALAR_TYPE AS; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void scal(const ExecSpace& space, const RV& R, const AS& alpha, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::scal[TPL_BLAS," #SCALAR_TYPE "]"); \ + const size_type numElems = X.extent(0); \ + if ((numElems < static_cast(INT_MAX)) && (R.data() == X.data())) { \ + scal_print_specialization(); \ + int N = numElems; \ + int one = 1; \ + const BASE_SCALAR_TYPE alpha_b = static_cast(alpha); \ + HostBlas::scal(N, alpha_b, reinterpret_cast(R.data()), one); \ + } else { \ + Scal::scal(space, R, alpha, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_BLAS(double, double, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) + KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_BLAS(double, double, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) #define KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_BLAS(float, float, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) + KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_BLAS(float, float, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) #define KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_BLAS(Kokkos::complex, \ - std::complex, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) + KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_BLAS(Kokkos::complex, std::complex, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) #define KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - 
KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_BLAS(Kokkos::complex, \ - std::complex, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) - -KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) - -KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) - -KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) - -KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) + KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_BLAS(Kokkos::complex, std::complex, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) + +KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) + +KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) + +KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) + +KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) } // namespace Impl } // namespace KokkosBlas @@ -132,117 +108,81 @@ KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_CUBLAS(SCALAR_TYPE, CUDA_SCALAR_TYPE, \ - CUBLAS_FN, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct Scal< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 
SCALAR_TYPE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - RV; \ - typedef SCALAR_TYPE AS; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void scal(const ExecSpace& space, const RV& R, const AS& alpha, \ - const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::scal[TPL_CUBLAS," #SCALAR_TYPE "]"); \ - const size_type numElems = X.extent(0); \ - if ((numElems < static_cast(INT_MAX)) && \ - (R.data() == X.data())) { \ - scal_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN( \ - s.handle, N, reinterpret_cast(&alpha), \ - reinterpret_cast(R.data()), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - Scal::scal(space, R, \ - alpha, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_CUBLAS(SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, LAYOUT, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct Scal, \ + Kokkos::MemoryTraits >, \ + SCALAR_TYPE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + RV; \ + typedef SCALAR_TYPE AS; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void scal(const ExecSpace& space, const RV& R, const AS& alpha, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::scal[TPL_CUBLAS," #SCALAR_TYPE "]"); \ + const size_type numElems = X.extent(0); \ + if ((numElems < static_cast(INT_MAX)) && (R.data() == X.data())) { \ + scal_print_specialization(); \ + const int N = 
static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN(s.handle, N, reinterpret_cast(&alpha), \ + reinterpret_cast(R.data()), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + Scal::scal(space, R, alpha, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_CUBLAS(double, double, cublasDscal, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL) - -#define KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_CUBLAS(float, float, cublasSscal, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL) - -#define KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, \ - cuDoubleComplex, cublasZscal, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL) - -#define KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, cuComplex, \ - cublasCscal, LAYOUT, MEMSPACE, \ +#define KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_CUBLAS(double, double, cublasDscal, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) + +#define KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_CUBLAS(float, float, cublasSscal, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) + +#define KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, cuDoubleComplex, cublasZscal, LAYOUT, MEMSPACE, \ ETI_SPEC_AVAIL) -KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - true) 
-KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - false) - -KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - true) -KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - false) - -KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - true) -KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - false) - -KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - true) -KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - false) - -KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, - false) - -KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, - false) - -KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, - false) - -KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, - false) +#define KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, cuComplex, cublasCscal, LAYOUT, MEMSPACE, \ + ETI_SPEC_AVAIL) + +KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) 
+KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) } // namespace Impl } // namespace KokkosBlas @@ -256,105 +196,73 @@ KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS( \ - SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Scal< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR_TYPE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using execution_space = EXECSPACE; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - RV; \ - typedef SCALAR_TYPE AS; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void scal(const execution_space& space, const RV& R, \ - const AS& alpha, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::scal[TPL_ROCBLAS," #SCALAR_TYPE "]"); \ - const size_type numElems = X.extent(0); \ - if ((numElems < 
static_cast(INT_MAX)) && \ - (R.data() == X.data())) { \ - scal_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(ROCBLAS_FN( \ - s.handle, N, reinterpret_cast(&alpha), \ - reinterpret_cast(R.data()), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, pointer_mode)); \ - } else { \ - Scal::scal(space, R, \ - alpha, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS(SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, LAYOUT, EXECSPACE, \ + MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Scal, \ + Kokkos::MemoryTraits >, \ + SCALAR_TYPE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = EXECSPACE; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + RV; \ + typedef SCALAR_TYPE AS; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void scal(const execution_space& space, const RV& R, const AS& alpha, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::scal[TPL_ROCBLAS," #SCALAR_TYPE "]"); \ + const size_type numElems = X.extent(0); \ + if ((numElems < static_cast(INT_MAX)) && (R.data() == X.data())) { \ + scal_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, 
space.hip_stream())); \ + rocblas_pointer_mode pointer_mode; \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(ROCBLAS_FN(s.handle, N, reinterpret_cast(&alpha), \ + reinterpret_cast(R.data()), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, pointer_mode)); \ + } else { \ + Scal::scal(space, R, alpha, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS(double, double, rocblas_dscal, \ - LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) +#define KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS(double, double, rocblas_dscal, LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) + +#define KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS(float, float, rocblas_sscal, LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) -#define KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS(float, float, rocblas_sscal, LAYOUT, \ +#define KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::complex, rocblas_double_complex, rocblas_zscal, LAYOUT, \ EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) -#define KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS( \ - Kokkos::complex, rocblas_double_complex, rocblas_zscal, LAYOUT, \ - EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) - -#define KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - 
KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS( \ - Kokkos::complex, rocblas_float_complex, rocblas_cscal, LAYOUT, \ - EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) - -KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) +#define KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::complex, rocblas_float_complex, rocblas_cscal, LAYOUT, \ + EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) + +KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) + +KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) + +KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) + +KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) 
} // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas1_swap_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_swap_tpl_spec_avail.hpp index 14ecce2740..de1fa19cb3 100644 --- a/blas/tpls/KokkosBlas1_swap_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_swap_tpl_spec_avail.hpp @@ -34,132 +34,83 @@ namespace Impl { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXECSPACE) \ - template <> \ - struct swap_tpl_spec_avail< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXECSPACE) \ + template <> \ + struct swap_tpl_spec_avail, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; #ifdef KOKKOS_ENABLE_SERIAL KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial) KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Serial) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Serial) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial) #endif #ifdef KOKKOS_ENABLE_OPENMP KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP) KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::OpenMP) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP) 
+KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP) #endif #endif // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXECSPACE, \ - MEMSPACE) \ - template <> \ - struct swap_tpl_spec_avail< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template <> \ + struct swap_tpl_spec_avail< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, - Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) 
+KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, - Kokkos::Cuda, Kokkos::CudaUVMSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) 
+KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) #endif // rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -#define KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXECSPACE, \ - MEMSPACE) \ - template <> \ - struct swap_tpl_spec_avail< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template <> \ + struct swap_tpl_spec_avail< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, - Kokkos::HIP, Kokkos::HIPSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HIP, - 
Kokkos::HIPSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) #endif } // namespace Impl diff --git a/blas/tpls/KokkosBlas1_swap_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_swap_tpl_spec_decl.hpp index 555c942c12..e74b498c33 100644 --- a/blas/tpls/KokkosBlas1_swap_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_swap_tpl_spec_decl.hpp @@ -26,9 +26,8 @@ namespace { template inline void swap_print_specialization() { #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION - printf("KokkosBlas::swap<> TPL Blas specialization for < %s, %s, %s >\n", - typeid(XVector).name(), typeid(YVector).name(), - typeid(ExecutionSpace).name); + printf("KokkosBlas::swap<> TPL Blas specialization for < %s, %s, %s >\n", typeid(XVector).name(), + typeid(YVector).name(), typeid(ExecutionSpace).name); #endif } } // namespace @@ -42,110 +41,82 @@ inline void swap_print_specialization() { namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Swap, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using XVector = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void swap(EXECSPACE const& /*space*/, XVector const& X, \ - YVector const& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_BLAS,double]"); \ - HostBlas::swap(X.extent_int(0), X.data(), 1, Y.data(), 1); \ - Kokkos::Profiling::popRegion(); \ - 
} \ +#define KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Swap, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using XVector = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + static void swap(EXECSPACE const& /*space*/, XVector const& X, YVector const& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_BLAS,double]"); \ + HostBlas::swap(X.extent_int(0), X.data(), 1, Y.data(), 1); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Swap, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using XVector = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void swap(EXECSPACE const& /*space*/, XVector const& X, \ - YVector const& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_BLAS,float]"); \ - HostBlas::swap(X.extent_int(0), X.data(), 1, Y.data(), 1); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Swap, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using XVector = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + static void swap(EXECSPACE const& /*space*/, XVector const& X, YVector const& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_BLAS,float]"); \ + HostBlas::swap(X.extent_int(0), X.data(), 1, Y.data(), 1); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Swap*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - 
Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using XVector = Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - static void swap(EXECSPACE const& /*space*/, XVector const& X, \ - YVector const& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::swap[TPL_BLAS,complex]"); \ - HostBlas>::swap( \ - X.extent_int(0), reinterpret_cast*>(X.data()), \ - 1, reinterpret_cast*>(Y.data()), 1); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Swap*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using XVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + static void swap(EXECSPACE const& /*space*/, XVector const& X, YVector const& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_BLAS,complex]"); \ + HostBlas>::swap(X.extent_int(0), reinterpret_cast*>(X.data()), 1, \ + reinterpret_cast*>(Y.data()), 1); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Swap*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using XVector = Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - static void swap(EXECSPACE const& /*space*/, XVector const& X, \ - YVector const& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::swap[TPL_BLAS,complex]"); \ - 
HostBlas>::swap( \ - X.extent_int(0), reinterpret_cast*>(X.data()), \ - 1, reinterpret_cast*>(Y.data()), 1); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Swap*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using XVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + static void swap(EXECSPACE const& /*space*/, XVector const& X, YVector const& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_BLAS,complex]"); \ + HostBlas>::swap(X.extent_int(0), reinterpret_cast*>(X.data()), 1, \ + reinterpret_cast*>(Y.data()), 1); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #ifdef KOKKOS_ENABLE_SERIAL @@ -188,201 +159,131 @@ KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, false) namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Swap< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using XVector = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using YVector = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void swap(EXECSPACE const& space, XVector const& X, \ - YVector const& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_CUBLAS,double]"); \ - swap_print_specialization(); \ - KokkosBlas::Impl::CudaBlasSingleton& singleton = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(singleton.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDswap( \ - singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1)); \ - 
Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Swap< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using XVector = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using YVector = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void swap(EXECSPACE const& space, XVector const& X, YVector const& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_CUBLAS,double]"); \ + swap_print_specialization(); \ + KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDswap(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Swap< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using XVector = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using YVector = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void swap(EXECSPACE const& space, XVector const& X, \ - YVector const& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_CUBLAS,float]"); \ - swap_print_specialization(); \ - KokkosBlas::Impl::CudaBlasSingleton& singleton = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(singleton.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSswap( \ - singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + 
template <> \ + struct Swap< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using XVector = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using YVector = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void swap(EXECSPACE const& space, XVector const& X, YVector const& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_CUBLAS,float]"); \ + swap_print_specialization(); \ + KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSswap(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Swap*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using XVector = Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - static void swap(EXECSPACE const& space, XVector const& X, \ - YVector const& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::swap[TPL_CUBLAS,complex]"); \ - swap_print_specialization(); \ - KokkosBlas::Impl::CudaBlasSingleton& singleton = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(singleton.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasZswap(singleton.handle, X.extent_int(0), \ - reinterpret_cast(X.data()), 1, \ - reinterpret_cast(Y.data()), 1)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + 
struct Swap*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using XVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + static void swap(EXECSPACE const& space, XVector const& X, YVector const& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_CUBLAS,complex]"); \ + swap_print_specialization(); \ + KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZswap(singleton.handle, X.extent_int(0), \ + reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Swap*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using XVector = Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - static void swap(EXECSPACE const& space, XVector const& X, \ - YVector const& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::swap[TPL_CUBLAS,complex]"); \ - swap_print_specialization(); \ - KokkosBlas::Impl::CudaBlasSingleton& singleton = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(singleton.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasCswap(singleton.handle, X.extent_int(0), \ - reinterpret_cast(X.data()), 1, \ - reinterpret_cast(Y.data()), 1)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define 
KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Swap*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using XVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + static void swap(EXECSPACE const& space, XVector const& X, YVector const& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_CUBLAS,complex]"); \ + swap_print_specialization(); \ + KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCswap(singleton.handle, X.extent_int(0), \ + reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) 
-KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) 
-KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) 
+KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) } // namespace Impl } // namespace KokkosBlas #endif // KOKKOSKERNELS_ENABLE_TPL_CUBLAS @@ -394,169 +295,115 @@ KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, namespace KokkosBlas { namespace Impl { -#define 
KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Swap< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using XVector = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using YVector = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void swap(EXECSPACE const& space, XVector const& X, \ - YVector const& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_ROCBLAS,double]"); \ - swap_print_specialization(); \ - KokkosBlas::Impl::RocBlasSingleton& singleton = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(singleton.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dswap( \ - singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Swap< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using XVector = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using YVector = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void swap(EXECSPACE const& space, XVector const& X, YVector const& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_ROCBLAS,double]"); \ + swap_print_specialization(); \ + KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dswap(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Swap< \ - 
EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using XVector = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using YVector = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void swap(EXECSPACE const& space, XVector const& X, \ - YVector const& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_ROCBLAS,float]"); \ - swap_print_specialization(); \ - KokkosBlas::Impl::RocBlasSingleton& singleton = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(singleton.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_sswap( \ - singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Swap< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using XVector = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using YVector = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void swap(EXECSPACE const& space, XVector const& X, YVector const& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_ROCBLAS,float]"); \ + swap_print_specialization(); \ + KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_sswap(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Swap*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - 
true, ETI_SPEC_AVAIL> { \ - using XVector = Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - static void swap(EXECSPACE const& space, XVector const& X, \ - YVector const& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::swap[TPL_ROCBLAS,complex_double]"); \ - swap_print_specialization(); \ - KokkosBlas::Impl::RocBlasSingleton& singleton = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(singleton.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zswap( \ - singleton.handle, X.extent_int(0), \ - reinterpret_cast(X.data()), 1, \ - reinterpret_cast(Y.data()), 1)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Swap*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using XVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + static void swap(EXECSPACE const& space, XVector const& X, YVector const& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_ROCBLAS,complex_double]"); \ + swap_print_specialization(); \ + KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zswap(singleton.handle, X.extent_int(0), \ + reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Swap*, 
LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using XVector = Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - static void swap(EXECSPACE const& space, XVector const& X, \ - YVector const& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::swap[TPL_ROCBLAS,complex_float]"); \ - swap_print_specialization(); \ - KokkosBlas::Impl::RocBlasSingleton& singleton = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(singleton.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cswap( \ - singleton.handle, X.extent_int(0), \ - reinterpret_cast(X.data()), 1, \ - reinterpret_cast(Y.data()), 1)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Swap*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using XVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + static void swap(EXECSPACE const& space, XVector const& X, YVector const& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_ROCBLAS,complex_float]"); \ + swap_print_specialization(); \ + KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cswap(singleton.handle, X.extent_int(0), \ + reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; 
-KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) -KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) -KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) -KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) -KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, false) +KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, 
Kokkos::HIPSpace, false) + +KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) + +KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) + +KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) } // namespace Impl } // namespace KokkosBlas #endif // KOKKOSKERNELS_ENABLE_TPL_ROCBLAS diff --git a/blas/tpls/KokkosBlas1_update_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_update_tpl_spec_avail.hpp index 88a60e6d19..55e1383ed7 100644 --- a/blas/tpls/KokkosBlas1_update_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_update_tpl_spec_avail.hpp @@ -20,8 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct update_tpl_spec_avail { enum : bool { value = false }; }; diff --git a/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp index 661393e445..679a5ddace 100644 --- a/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp +++ 
b/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp @@ -28,46 +28,34 @@ struct gemv_tpl_spec_avail { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUTA, LAYOUTX, \ - LAYOUTY, MEMSPACE) \ - template \ - struct gemv_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUTA, LAYOUTX, LAYOUTY, MEMSPACE) \ + template \ + struct gemv_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, 
Kokkos::HostSpace) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) #endif @@ -75,20 +63,16 @@ KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUTA, LAYOUTX, \ - LAYOUTY, MEMSPACE) \ - template \ - struct gemv_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUTA, LAYOUTX, LAYOUTY, MEMSPACE) \ + template \ + struct gemv_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; // Note BMK: We use the same layout for A, X and Y because the GEMV @@ -96,30 +80,22 @@ KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, // So this TPL version will match any layout combination, as long // as none are LayoutStride. 
-KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) #endif @@ -127,35 +103,27 @@ KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, // 
rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -#define KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT) \ - template \ - struct gemv_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT) \ + template \ + struct gemv_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft) +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft) +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight) +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight) +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight) #endif @@ -163,38 +131,31 @@ KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, #if defined(KOKKOS_ENABLE_SYCL) -#define KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(SCALAR, LAYOUT) \ - template \ - struct gemv_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define 
KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(SCALAR, LAYOUT) \ + template \ + struct gemv_tpl_spec_avail< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(double, Kokkos::LayoutLeft) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(float, Kokkos::LayoutLeft) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(Kokkos::complex, - Kokkos::LayoutLeft) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(Kokkos::complex, - Kokkos::LayoutLeft) +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft) +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(double, Kokkos::LayoutRight) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(float, Kokkos::LayoutRight) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(Kokkos::complex, - Kokkos::LayoutRight) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(Kokkos::complex, - Kokkos::LayoutRight) +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(Kokkos::complex, Kokkos::LayoutRight) +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(Kokkos::complex, Kokkos::LayoutRight) #endif diff --git a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp index 07d9476b66..fcc5762f57 100644 --- a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp @@ -43,215 +43,157 @@ namespace Impl { transa = 'C'; \ } -#define KOKKOSBLAS2_DGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct GEMV< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - 
Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& /* space */, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_BLAS,double]"); \ - KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA); \ - HostBlas::gemv(transa, M, N, alpha, A.data(), LDA, X.data(), \ - one, beta, Y.data(), one); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& /* space */, const char trans[], typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_BLAS,double]"); \ + KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA); \ + HostBlas::gemv(transa, M, N, alpha, A.data(), LDA, X.data(), one, beta, Y.data(), one); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct GEMV< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits 
> \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& /* space */, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_BLAS,float]"); \ - KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA); \ - HostBlas::gemv(transa, M, N, alpha, A.data(), LDA, X.data(), one, \ - beta, Y.data(), one); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& /* space */, const char trans[], typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_BLAS,float]"); \ + KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA); \ + HostBlas::gemv(transa, M, N, alpha, A.data(), LDA, X.data(), one, beta, Y.data(), one); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct GEMV**, LAYOUTA, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUTX, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUTY, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - 
typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& /* space */, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gemv[TPL_BLAS,complex]"); \ - KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA); \ - const std::complex alpha_val = alpha, beta_val = beta; \ - HostBlas >::gemv( \ - transa, M, N, alpha_val, \ - reinterpret_cast*>(A.data()), LDA, \ - reinterpret_cast*>(X.data()), one, \ - beta_val, reinterpret_cast*>(Y.data()), one); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV**, LAYOUTA, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUTX, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUTY, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& /* space */, const char trans[], typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_BLAS,complex]"); \ + KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA); \ + const std::complex alpha_val = alpha, beta_val = beta; \ + HostBlas >::gemv(transa, M, N, alpha_val, \ + reinterpret_cast*>(A.data()), LDA, \ + reinterpret_cast*>(X.data()), one, beta_val, \ + 
reinterpret_cast*>(Y.data()), one); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct GEMV**, LAYOUTA, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUTX, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUTY, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& /* space */, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gemv[TPL_BLAS,complex]"); \ - KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA); \ - const std::complex alpha_val = alpha, beta_val = beta; \ - HostBlas >::gemv( \ - transa, M, N, alpha_val, \ - reinterpret_cast*>(A.data()), LDA, \ - reinterpret_cast*>(X.data()), one, \ - beta_val, reinterpret_cast*>(Y.data()), one); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV**, LAYOUTA, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUTX, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUTY, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& /* space */, const char trans[], typename 
AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_BLAS,complex]"); \ + KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA); \ + const std::complex alpha_val = alpha, beta_val = beta; \ + HostBlas >::gemv(transa, M, N, alpha_val, \ + reinterpret_cast*>(A.data()), LDA, \ + reinterpret_cast*>(X.data()), one, beta_val, \ + reinterpret_cast*>(Y.data()), one); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS2_DGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS2_DGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS2_DGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, true) -KOKKOSBLAS2_DGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, false) - -KOKKOSBLAS2_SGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS2_SGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS2_SGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, true) -KOKKOSBLAS2_SGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, false) - -KOKKOSBLAS2_ZGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS2_ZGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS2_ZGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, true) -KOKKOSBLAS2_ZGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, false) - -KOKKOSBLAS2_CGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, 
Kokkos::HostSpace, true) -KOKKOSBLAS2_CGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS2_CGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, true) -KOKKOSBLAS2_CGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, false) +KOKKOSBLAS2_DGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS2_DGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS2_DGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS2_DGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) + +KOKKOSBLAS2_SGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS2_SGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS2_SGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS2_SGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) + +KOKKOSBLAS2_ZGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS2_ZGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) + +KOKKOSBLAS2_CGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS2_CGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS2_CGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, 
Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS2_CGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) } // namespace Impl } // namespace KokkosBlas @@ -284,238 +226,169 @@ namespace Impl { transa = CUBLAS_OP_C; \ } -#define KOKKOSBLAS2_DGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct GEMV< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_CUBLAS,double]"); \ - KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDgemv(s.handle, transa, M, N, &alpha, \ - A.data(), LDA, X.data(), one, \ - &beta, Y.data(), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + 
typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_CUBLAS,double]"); \ + KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasDgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, X.data(), one, &beta, Y.data(), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct GEMV< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_CUBLAS,float]"); \ - KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - 
KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSgemv(s.handle, transa, M, N, &alpha, \ - A.data(), LDA, X.data(), one, \ - &beta, Y.data(), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_CUBLAS,float]"); \ + KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, X.data(), one, &beta, Y.data(), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct GEMV**, LAYOUTA, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUTX, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUTY, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits 
> \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gemv[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasZgemv(s.handle, transa, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(&beta), \ - reinterpret_cast(Y.data()), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV**, LAYOUTA, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUTX, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUTY, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_CUBLAS,complex]"); \ + 
KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgemv( \ + s.handle, transa, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, reinterpret_cast(X.data()), \ + one, reinterpret_cast(&beta), reinterpret_cast(Y.data()), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct GEMV**, LAYOUTA, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUTX, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUTY, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gemv[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgemv( \ - s.handle, transa, M, N, reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(&beta), \ - reinterpret_cast(Y.data()), one)); \ - 
KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV**, LAYOUTA, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUTX, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUTY, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasCgemv(s.handle, transa, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, reinterpret_cast(X.data()), \ + one, reinterpret_cast(&beta), reinterpret_cast(Y.data()), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS2_DGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS2_DGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS2_DGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace, true) -KOKKOSBLAS2_DGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - 
Kokkos::LayoutRight, Kokkos::CudaSpace, false) - -KOKKOSBLAS2_SGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS2_SGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS2_SGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace, true) -KOKKOSBLAS2_SGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace, false) - -KOKKOSBLAS2_ZGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS2_ZGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS2_ZGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace, true) -KOKKOSBLAS2_ZGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace, false) - -KOKKOSBLAS2_CGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS2_CGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS2_CGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace, true) -KOKKOSBLAS2_CGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace, false) +KOKKOSBLAS2_DGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS2_DGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS2_DGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS2_DGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) + +KOKKOSBLAS2_SGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) 
+KOKKOSBLAS2_SGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS2_SGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS2_SGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) + +KOKKOSBLAS2_ZGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS2_ZGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS2_ZGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS2_ZGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) + +KOKKOSBLAS2_CGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS2_CGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS2_CGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS2_CGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) } // namespace Impl } // namespace KokkosBlas @@ -548,198 +421,152 @@ namespace Impl { transa = rocblas_operation_conjugate_transpose; \ } -#define KOKKOSBLAS2_DGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GEMV< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& space, const char trans[], \ - typename 
AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,double]"); \ - KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_dgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, \ - X.data(), one, &beta, Y.data(), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,double]"); \ + KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_dgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, X.data(), one, &beta, Y.data(), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + 
Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GEMV< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,float]"); \ - KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_sgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, \ - X.data(), one, &beta, Y.data(), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const 
AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,float]"); \ + KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_sgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, X.data(), one, &beta, Y.data(), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GEMV**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gemv[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgemv( \ - s.handle, transa, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(X.data()), one, \ - 
reinterpret_cast(&beta), \ - reinterpret_cast(Y.data()), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgemv(s.handle, transa, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(&beta), \ + reinterpret_cast(Y.data()), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GEMV**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef 
Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gemv[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgemv( \ - s.handle, transa, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(&beta), \ - reinterpret_cast(Y.data()), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + 
Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgemv(s.handle, transa, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(&beta), \ + reinterpret_cast(Y.data()), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) @@ -782,8 +609,7 @@ inline oneapi::mkl::transpose mode_kk_to_onemkl(char mode_kk) { case 'C': return oneapi::mkl::transpose::conjtrans; default:; } - throw std::invalid_argument( - "Invalid mode for oneMKL (should be one of N, T, C)"); + throw std::invalid_argument("Invalid mode for oneMKL (should be one of N, T, C)"); } template @@ -797,78 +623,58 @@ struct kokkos_to_std_type_map { using type = std::complex::mag_type>; }; -#define KOKKOSBLAS2_GEMV_ONEMKL(SCALAR, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GEMV< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - using device_type = Kokkos::Device; \ - using mem_traits = Kokkos::MemoryTraits; \ - using AViewType = \ - Kokkos::View; \ - using XViewType = \ - Kokkos::View; \ - using YViewType = Kokkos::View; \ - \ - static void gemv(const ExecSpace& exec, const char kk_trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - if (beta == Kokkos::ArithTraits::zero()) { \ - Kokkos::deep_copy(Y, Kokkos::ArithTraits::zero()); \ - } \ - 
\ - bool row_major = std::is_same::value; \ - const std::int64_t M = A.extent(0); \ - const std::int64_t N = A.extent(1); \ - oneapi::mkl::transpose trans = mode_kk_to_onemkl(kk_trans[0]); \ - const std::int64_t LDA = row_major ? A.stride(0) : A.stride(1); \ - std::string label = "KokkosBlas::gemv[TPL_ONEMKL," + \ - Kokkos::ArithTraits::name() + "]"; \ - \ - Kokkos::Profiling::pushRegion(label); \ - using mag_type = kokkos_to_std_type_map< \ - SCALAR, Kokkos::ArithTraits::is_complex>::type; \ - const mag_type* a = reinterpret_cast(A.data()); \ - const mag_type* x = reinterpret_cast(X.data()); \ - mag_type* y = reinterpret_cast(Y.data()); \ - if (row_major) { \ - oneapi::mkl::blas::row_major::gemv(exec.sycl_queue(), trans, M, N, \ - alpha, a, LDA, x, 1, beta, y, 1); \ - } else { \ - oneapi::mkl::blas::column_major::gemv( \ - exec.sycl_queue(), trans, M, N, alpha, a, LDA, x, 1, beta, y, 1); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_GEMV_ONEMKL(SCALAR, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + using device_type = Kokkos::Device; \ + using mem_traits = Kokkos::MemoryTraits; \ + using AViewType = Kokkos::View; \ + using XViewType = Kokkos::View; \ + using YViewType = Kokkos::View; \ + \ + static void gemv(const ExecSpace& exec, const char kk_trans[], typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + if (beta == Kokkos::ArithTraits::zero()) { \ + Kokkos::deep_copy(Y, Kokkos::ArithTraits::zero()); \ + } \ + \ + bool row_major = std::is_same::value; \ + const std::int64_t M = A.extent(0); \ + const std::int64_t N = A.extent(1); \ + oneapi::mkl::transpose trans = mode_kk_to_onemkl(kk_trans[0]); \ + const std::int64_t LDA = row_major ? 
A.stride(0) : A.stride(1); \ + std::string label = "KokkosBlas::gemv[TPL_ONEMKL," + Kokkos::ArithTraits::name() + "]"; \ + \ + Kokkos::Profiling::pushRegion(label); \ + using mag_type = kokkos_to_std_type_map::is_complex>::type; \ + const mag_type* a = reinterpret_cast(A.data()); \ + const mag_type* x = reinterpret_cast(X.data()); \ + mag_type* y = reinterpret_cast(Y.data()); \ + if (row_major) { \ + oneapi::mkl::blas::row_major::gemv(exec.sycl_queue(), trans, M, N, alpha, a, LDA, x, 1, beta, y, 1); \ + } else { \ + oneapi::mkl::blas::column_major::gemv(exec.sycl_queue(), trans, M, N, alpha, a, LDA, x, 1, beta, y, 1); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS2_GEMV_ONEMKL(float, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCLDeviceUSMSpace, true) -KOKKOSBLAS2_GEMV_ONEMKL(float, Kokkos::LayoutRight, - Kokkos::Experimental::SYCLDeviceUSMSpace, true) -KOKKOSBLAS2_GEMV_ONEMKL(double, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCLDeviceUSMSpace, true) -KOKKOSBLAS2_GEMV_ONEMKL(double, Kokkos::LayoutRight, - Kokkos::Experimental::SYCLDeviceUSMSpace, true) -KOKKOSBLAS2_GEMV_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCLDeviceUSMSpace, true) -KOKKOSBLAS2_GEMV_ONEMKL(Kokkos::complex, Kokkos::LayoutRight, - Kokkos::Experimental::SYCLDeviceUSMSpace, true) -KOKKOSBLAS2_GEMV_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCLDeviceUSMSpace, true) -KOKKOSBLAS2_GEMV_ONEMKL(Kokkos::complex, Kokkos::LayoutRight, - Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS2_GEMV_ONEMKL(float, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS2_GEMV_ONEMKL(float, Kokkos::LayoutRight, Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS2_GEMV_ONEMKL(double, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS2_GEMV_ONEMKL(double, Kokkos::LayoutRight, Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS2_GEMV_ONEMKL(Kokkos::complex, 
Kokkos::LayoutLeft, Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS2_GEMV_ONEMKL(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS2_GEMV_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS2_GEMV_ONEMKL(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Experimental::SYCLDeviceUSMSpace, true) } // namespace Impl } // namespace KokkosBlas #endif diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp index 3013689f34..b6156c2d3a 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp @@ -28,62 +28,40 @@ struct ger_tpl_spec_avail { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct ger_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct ger_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; #ifdef KOKKOS_ENABLE_SERIAL -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Serial, Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Serial, Kokkos::HostSpace) - -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, - 
Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, - Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) + +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) #endif #ifdef KOKKOS_ENABLE_OPENMP -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP, Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP, Kokkos::HostSpace) - -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace) 
-KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, - Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) + +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) #endif #endif @@ -91,112 +69,68 @@ KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct ger_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct ger_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; // We use the same layout for X, Y and Abecause the GER interface will // switch the layouts of X and Y to that of A. So this TPL version will // match any layout combination, as long as none are LayoutStride. 
-KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) - -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) - -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) - -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) 
+KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) + +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) + +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) + +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) #endif // rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct ger_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> 
\ + struct ger_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) - -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) + +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) #endif } // namespace Impl diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp 
b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp index bc1a10f61e..680df7c464 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp @@ -30,308 +30,225 @@ namespace Impl { constexpr int one = 1; \ const int LDA = A_is_lr ? A.stride(0) : A.stride(1); -#define KOKKOSBLAS2_DGER_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct GER< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - \ - static void ger(const EXEC_SPACE& /* space */ \ - , \ - const char /*trans*/[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,double]"); \ - KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ - if (A_is_ll) { \ - HostBlas::ger(M, N, alpha, X.data(), one, Y.data(), one, \ - A.data(), LDA); \ - } else { \ - HostBlas::ger(M, N, alpha, Y.data(), one, X.data(), one, \ - A.data(), LDA); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DGER_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& /* space */ \ + , \ + const char /*trans*/[], 
typename AViewType::const_value_type& alpha, const XViewType& X, \ + const YViewType& Y, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,double]"); \ + KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ + if (A_is_ll) { \ + HostBlas::ger(M, N, alpha, X.data(), one, Y.data(), one, A.data(), LDA); \ + } else { \ + HostBlas::ger(M, N, alpha, Y.data(), one, X.data(), one, A.data(), LDA); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SGER_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct GER< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - \ - static void ger(const EXEC_SPACE& /* space */ \ - , \ - const char /*trans*/[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,float]"); \ - KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ - if (A_is_ll) { \ - HostBlas::ger(M, N, alpha, X.data(), one, Y.data(), one, \ - A.data(), LDA); \ - } else { \ - HostBlas::ger(M, N, alpha, Y.data(), one, X.data(), one, \ - A.data(), LDA); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SGER_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + 
typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& /* space */ \ + , \ + const char /*trans*/[], typename AViewType::const_value_type& alpha, const XViewType& X, \ + const YViewType& Y, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,float]"); \ + KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ + if (A_is_ll) { \ + HostBlas::ger(M, N, alpha, X.data(), one, Y.data(), one, A.data(), LDA); \ + } else { \ + HostBlas::ger(M, N, alpha, Y.data(), one, X.data(), one, A.data(), LDA); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZGER_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct GER*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - \ - static void ger(const EXEC_SPACE& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::ger[TPL_BLAS,complex"); \ - KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ - const std::complex alpha_val = \ - static_cast>(alpha); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (A_is_ll) { \ - if (justTranspose) { \ - HostBlas>::geru( \ - M, N, alpha_val, \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(Y.data()), one, \ - reinterpret_cast*>(A.data()), LDA); \ - } else { \ - HostBlas>::gerc( \ - M, N, alpha_val, \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(Y.data()), one, \ - reinterpret_cast*>(A.data()), 
LDA); \ - } \ - } else { \ - if (justTranspose) { \ - HostBlas>::geru( \ - M, N, alpha_val, \ - reinterpret_cast*>(Y.data()), one, \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(A.data()), LDA); \ - } else { \ - /* blasgerc() + ~A_ll => call kokkos-kernels' implementation */ \ - GER::ger(space, trans, alpha, X, Y, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZGER_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,complex"); \ + KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ + const std::complex alpha_val = static_cast>(alpha); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (A_is_ll) { \ + if (justTranspose) { \ + HostBlas>::geru(M, N, alpha_val, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + HostBlas>::gerc(M, N, alpha_val, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } \ + } else { \ + if (justTranspose) { \ + HostBlas>::geru(M, N, alpha_val, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + /* blasgerc() + ~A_ll => call 
kokkos-kernels' implementation */ \ + GER::ger(space, trans, alpha, X, Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CGER_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct GER*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - \ - static void ger(const EXEC_SPACE& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::ger[TPL_BLAS,complex"); \ - KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ - const std::complex alpha_val = \ - static_cast>(alpha); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (A_is_ll) { \ - if (justTranspose) { \ - HostBlas>::geru( \ - M, N, alpha_val, \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(Y.data()), one, \ - reinterpret_cast*>(A.data()), LDA); \ - } else { \ - HostBlas>::gerc( \ - M, N, alpha_val, \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(Y.data()), one, \ - reinterpret_cast*>(A.data()), LDA); \ - } \ - } else { \ - if (justTranspose) { \ - HostBlas>::geru( \ - M, N, alpha_val, \ - reinterpret_cast*>(Y.data()), one, \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(A.data()), LDA); \ - } else { \ - /* blasgerc() + ~A_ll => call kokkos-kernels' implementation */ \ - GER::ger(space, trans, alpha, X, Y, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CGER_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + 
template <> \ + struct GER*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,complex"); \ + KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ + const std::complex alpha_val = static_cast>(alpha); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (A_is_ll) { \ + if (justTranspose) { \ + HostBlas>::geru(M, N, alpha_val, reinterpret_cast*>(X.data()), \ + one, reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + HostBlas>::gerc(M, N, alpha_val, reinterpret_cast*>(X.data()), \ + one, reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } \ + } else { \ + if (justTranspose) { \ + HostBlas>::geru(M, N, alpha_val, reinterpret_cast*>(Y.data()), \ + one, reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + /* blasgerc() + ~A_ll => call kokkos-kernels' implementation */ \ + GER::ger(space, trans, alpha, X, Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; #ifdef KOKKOS_ENABLE_SERIAL -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - false) -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, 
Kokkos::HostSpace, - false) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - false) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - false) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - false) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - false) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - false) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - 
true) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - false) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) #endif #ifdef KOKKOS_ENABLE_OPENMP -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - false) -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - false) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - false) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - false) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, 
Kokkos::HostSpace, - false) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - false) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - false) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - false) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) #endif } // namespace Impl diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp index 3f80144f62..fdb09d1c91 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp @@ -30,324 +30,231 @@ namespace Impl { constexpr int one = 1; \ const int LDA = A_is_lr ? 
A.stride(0) : A.stride(1); -#define KOKKOSBLAS2_DGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct GER< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void ger(const EXEC_SPACE& space, const char /*trans*/[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,double]"); \ - KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDger(s.handle, M, N, &alpha, \ - X.data(), one, Y.data(), one, \ - A.data(), LDA)); \ - } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDger(s.handle, M, N, &alpha, \ - Y.data(), one, X.data(), one, \ - A.data(), LDA)); \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const 
EXEC_SPACE& space, const char /*trans*/[], typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,double]"); \ + KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDger(s.handle, M, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDger(s.handle, M, N, &alpha, Y.data(), one, X.data(), one, A.data(), LDA)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct GER< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void ger(const EXEC_SPACE& space, const char /*trans*/[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,float]"); \ - KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSger(s.handle, M, N, &alpha, \ - X.data(), one, Y.data(), one, \ - A.data(), 
LDA)); \ - } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSger(s.handle, M, N, &alpha, \ - Y.data(), one, X.data(), one, \ - A.data(), LDA)); \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char /*trans*/[], typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,float]"); \ + KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSger(s.handle, M, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSger(s.handle, M, N, &alpha, Y.data(), one, X.data(), one, A.data(), LDA)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct GER*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - 
typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void ger(const EXEC_SPACE& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::ger[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - if (A_is_ll) { \ - if (justTranspose) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgeru( \ - s.handle, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgerc( \ - s.handle, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - } \ - } else { \ - if (justTranspose) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgeru( \ - s.handle, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - } else { \ - /* cublasZgerc() + ~A_ll => call kokkos-kernels' implementation */ \ - GER::ger(space, trans, alpha, X, Y, A); \ - } \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + 
Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + if (justTranspose) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgeru(s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgerc(s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } \ + } else { \ + if (justTranspose) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgeru(s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + /* cublasZgerc() + ~A_ll => call kokkos-kernels' implementation */ \ + GER::ger(space, trans, alpha, X, Y, A); \ + } \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct GER*, LAYOUT, \ - 
Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void ger(const EXEC_SPACE& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::ger[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - if (A_is_ll) { \ - if (justTranspose) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgeru( \ - s.handle, M, N, reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgerc( \ - s.handle, M, N, reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - } \ - } else { \ - if (justTranspose) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgeru( \ - s.handle, M, N, reinterpret_cast(&alpha), \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - } else { \ - /* cublasCgerc() + ~A_ll => call kokkos-kernels' implementation */ \ - GER::ger(space, trans, alpha, X, Y, A); \ - } \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ 
+#define KOKKOSBLAS2_CGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + if (justTranspose) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgeru(s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgerc(s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } \ + } else { \ + if (justTranspose) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgeru(s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + /* cublasCgerc() + ~A_ll => call kokkos-kernels' implementation */ \ + GER::ger(space, trans, alpha, X, Y, A); \ + } \ + } \ + 
KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - false) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - false) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - false) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - false) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, 
Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - false) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - false) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) 
+KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - false) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - false) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp index c21b61befa..26a0da5864 100644 --- 
a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp @@ -30,295 +30,221 @@ namespace Impl { constexpr int one = 1; \ const int LDA = A_is_lr ? A.stride(0) : A.stride(1); -#define KOKKOSBLAS2_DGER_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct GER< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void ger(const EXEC_SPACE& space, const char /*trans*/[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,double]"); \ - KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - if (A_is_ll) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dger(s.handle, M, N, &alpha, \ - X.data(), one, Y.data(), \ - one, A.data(), LDA)); \ - } else { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dger(s.handle, M, N, &alpha, \ - Y.data(), one, X.data(), \ - one, A.data(), LDA)); \ - } \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DGER_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef 
Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char /*trans*/[], typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,double]"); \ + KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + if (A_is_ll) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_dger(s.handle, M, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ + } else { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_dger(s.handle, M, N, &alpha, Y.data(), one, X.data(), one, A.data(), LDA)); \ + } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SGER_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct GER< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void ger(const EXEC_SPACE& space, const char /*trans*/[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,float]"); \ - KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - 
KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - if (A_is_ll) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_sger(s.handle, M, N, &alpha, \ - X.data(), one, Y.data(), \ - one, A.data(), LDA)); \ - } else { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_sger(s.handle, M, N, &alpha, \ - Y.data(), one, X.data(), \ - one, A.data(), LDA)); \ - } \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SGER_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char /*trans*/[], typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,float]"); \ + KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + if (A_is_ll) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_sger(s.handle, M, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ + } else { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_sger(s.handle, M, N, &alpha, Y.data(), one, X.data(), one, A.data(), LDA)); \ + } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZGER_ROCBLAS(LAYOUT, 
EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct GER*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void ger(const EXEC_SPACE& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::ger[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - if (A_is_ll) { \ - if (justTranspose) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgeru( \ - s.handle, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - } else { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgerc( \ - s.handle, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - } \ - } else { \ - if (justTranspose) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgeru( \ - s.handle, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - } else { \ - /* rocblas_zgerc() + ~A_ll => call k-kernels' implementation */ \ - GER::ger(space, trans, alpha, X, Y, A); \ - } \ - } \ 
- KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZGER_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + if (A_is_ll) { \ + if (justTranspose) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgeru(s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgerc(s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } \ + } else { \ + if (justTranspose) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgeru(s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + /* rocblas_zgerc() + 
~A_ll => call k-kernels' implementation */ \ + GER::ger(space, trans, alpha, X, Y, A); \ + } \ + } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CGER_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct GER*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void ger(const EXEC_SPACE& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::ger[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - if (A_is_ll) { \ - if (justTranspose) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgeru( \ - s.handle, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - } else { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgerc( \ - s.handle, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - } \ - } else { \ - if (justTranspose) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgeru( \ - s.handle, M, N, \ - 
reinterpret_cast(&alpha), \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - } else { \ - /* rocblas_cgerc() + ~A_ll => call k-kernels' implementation */ \ - GER::ger(space, trans, alpha, X, Y, A); \ - } \ - } \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CGER_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + if (A_is_ll) { \ + if (justTranspose) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgeru(s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgerc(s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } \ + 
} else { \ + if (justTranspose) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgeru(s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + /* rocblas_cgerc() + ~A_ll => call k-kernels' implementation */ \ + GER::ger(space, trans, alpha, X, Y, A); \ + } \ + } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - false) -KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - false) +KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) -KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - false) -KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - false) +KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) -KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, 
Kokkos::HIPSpace, - false) -KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - false) +KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) -KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - false) -KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - false) +KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp index 6f6a7a2e9f..d894433540 100644 --- a/blas/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp @@ -35,14 +35,12 @@ namespace Impl { // Note: using GEMM because there is no GEMV in MKL compact routines -#define __IMPL_KK_MKL_DGEMM_COMPACT(SCALAR, MKL_ROUTINE) \ - inline void kk_mkl_gemm_compact( \ - MKL_LAYOUT layout, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb, \ - MKL_INT m, MKL_INT n, MKL_INT k, SCALAR alpha, const SCALAR *a, \ - MKL_INT ldap, const SCALAR *b, MKL_INT ldbp, SCALAR beta, SCALAR *c, \ - MKL_INT ldcp, MKL_COMPACT_PACK format, MKL_INT nm) { 
\ - MKL_ROUTINE(layout, transa, transb, m, n, k, alpha, a, ldap, b, ldbp, \ - beta, c, ldcp, format, nm); \ +#define __IMPL_KK_MKL_DGEMM_COMPACT(SCALAR, MKL_ROUTINE) \ + inline void kk_mkl_gemm_compact(MKL_LAYOUT layout, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb, MKL_INT m, MKL_INT n, \ + MKL_INT k, SCALAR alpha, const SCALAR *a, MKL_INT ldap, const SCALAR *b, \ + MKL_INT ldbp, SCALAR beta, SCALAR *c, MKL_INT ldcp, MKL_COMPACT_PACK format, \ + MKL_INT nm) { \ + MKL_ROUTINE(layout, transa, transb, m, n, k, alpha, a, ldap, b, ldbp, beta, c, ldcp, format, nm); \ } __IMPL_KK_MKL_DGEMM_COMPACT(double, mkl_dgemm_compact) @@ -81,23 +79,17 @@ inline MKL_COMPACT_PACK mkl_compact_format() { return MKL_COMPACT_AVX512; } -template -void kk_mkl_gemv(MKL_TRANSPOSE trans, const ScalarType alpha, - const AViewType &A, const xViewType &x, const ScalarType beta, - const yViewType &y) { +template +void kk_mkl_gemv(MKL_TRANSPOSE trans, const ScalarType alpha, const AViewType &A, const xViewType &x, + const ScalarType beta, const yViewType &y) { typedef typename yViewType::value_type vector_type; - static_assert(KokkosBatched::is_vector::value, - "value type is not vector type"); + static_assert(KokkosBatched::is_vector::value, "value type is not vector type"); using value_type = typename vector_type::value_type; - static_assert(std::is_same::value && - std::is_same::value, + static_assert(std::is_same::value && + std::is_same::value, "scalar type mismatch"); - if (A.stride_0() != 1 && A.stride_1() != 1 && x.stride_0() != 1 && - y.stride_0() != 1) { + if (A.stride_0() != 1 && A.stride_1() != 1 && x.stride_0() != 1 && y.stride_0() != 1) { Kokkos::abort("Strided inputs are not supported in MKL gemv/gemm"); } @@ -107,21 +99,18 @@ void kk_mkl_gemv(MKL_TRANSPOSE trans, const ScalarType alpha, const int n = 1; const int k = A.extent_int(transposed ? 0 : 1); - const bool col_major = A.stride_0() == 1; - const MKL_LAYOUT layout = col_major ? 
MKL_COL_MAJOR : MKL_ROW_MAJOR; - const MKL_INT A_ld = KOKKOSKERNELS_MACRO_MAX(1, A.extent(col_major ? 0 : 1)); - const MKL_COMPACT_PACK format = - Impl::mkl_compact_format(); + const bool col_major = A.stride_0() == 1; + const MKL_LAYOUT layout = col_major ? MKL_COL_MAJOR : MKL_ROW_MAJOR; + const MKL_INT A_ld = KOKKOSKERNELS_MACRO_MAX(1, A.extent(col_major ? 0 : 1)); + const MKL_COMPACT_PACK format = Impl::mkl_compact_format(); // cast away simd-vector pointers auto A_data = reinterpret_cast(A.data()); auto x_data = reinterpret_cast(x.data()); auto y_data = reinterpret_cast(y.data()); - Impl::kk_mkl_gemm_compact(layout, trans, MKL_NOTRANS, m, n, k, - (value_type)alpha, A_data, A_ld, x_data, 1, - (value_type)beta, y_data, 1, format, - (MKL_INT)vector_type::vector_length); + Impl::kk_mkl_gemm_compact(layout, trans, MKL_NOTRANS, m, n, k, (value_type)alpha, A_data, A_ld, x_data, 1, + (value_type)beta, y_data, 1, format, (MKL_INT)vector_type::vector_length); } } // namespace Impl @@ -131,12 +120,9 @@ void kk_mkl_gemv(MKL_TRANSPOSE trans, const ScalarType alpha, /// template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemv::invoke( - const ScalarType alpha, const AViewType &A, const xViewType &x, - const ScalarType beta, const yViewType &y) { +template +KOKKOS_INLINE_FUNCTION int SerialGemv::invoke( + const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { Impl::kk_mkl_gemv(MKL_NOTRANS, alpha, A, x, beta, y); return 0; } @@ -146,12 +132,9 @@ SerialGemv::invoke( /// template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemv::invoke( - const ScalarType alpha, const AViewType &A, const xViewType &x, - const ScalarType beta, const yViewType &y) { +template +KOKKOS_INLINE_FUNCTION int SerialGemv::invoke( + const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { Impl::kk_mkl_gemv(MKL_TRANS, alpha, A, x, beta, y); return 0; } @@ -161,12 +144,9 @@ SerialGemv::invoke( 
/// template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemv::invoke( - const ScalarType alpha, const AViewType &A, const xViewType &x, - const ScalarType beta, const yViewType &y) { +template +KOKKOS_INLINE_FUNCTION int SerialGemv::invoke( + const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { Impl::kk_mkl_gemv(MKL_CONJTRANS, alpha, A, x, beta, y); return 0; } diff --git a/blas/tpls/KokkosBlas2_syr2_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_syr2_tpl_spec_avail.hpp index 59fb154d35..2c3cdc990e 100644 --- a/blas/tpls/KokkosBlas2_syr2_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_syr2_tpl_spec_avail.hpp @@ -28,66 +28,40 @@ struct syr2_tpl_spec_avail { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct syr2_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct syr2_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; #ifdef KOKKOS_ENABLE_SERIAL -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Serial, Kokkos::HostSpace) - -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, - Kokkos::Serial, 
Kokkos::HostSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) #endif #ifdef KOKKOS_ENABLE_OPENMP -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP, Kokkos::HostSpace) - -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, - Kokkos::OpenMP, Kokkos::HostSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace) 
-KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) #endif #endif @@ -95,108 +69,64 @@ KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct syr2_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct syr2_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - 
Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) - -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) - -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, - Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) - -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, - Kokkos::Cuda, Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, 
Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) #endif // rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -#define KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct syr2_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct syr2_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, 
Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) - -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, - Kokkos::HIP, Kokkos::HIPSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) #endif } // namespace Impl diff --git a/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_blas.hpp index f22e800bc5..4aa32b5b0e 100644 --- a/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_blas.hpp @@ -29,286 +29,216 @@ namespace Impl { constexpr int one = 1; \ const int 
LDA = A_is_lr ? A.stride(0) : A.stride(1); -#define KOKKOSBLAS2_DSYR2_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR2< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - \ - static void syr2(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_BLAS,double]"); \ - KOKKOSBLAS2_SYR2_DETERMINE_ARGS(LAYOUT); \ - if (A_is_ll) { \ - HostBlas::syr2(uplo[0], N, alpha, X.data(), one, Y.data(), \ - one, A.data(), LDA); \ - } else { \ - /* blasDsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DSYR2_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + 
Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_BLAS,double]"); \ + KOKKOSBLAS2_SYR2_DETERMINE_ARGS(LAYOUT); \ + if (A_is_ll) { \ + HostBlas::syr2(uplo[0], N, alpha, X.data(), one, Y.data(), one, A.data(), LDA); \ + } else { \ + /* blasDsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SSYR2_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR2< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - \ - static void syr2(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_BLAS,float]"); \ - KOKKOSBLAS2_SYR2_DETERMINE_ARGS(LAYOUT); \ - if (A_is_ll) { \ - HostBlas::syr2(uplo[0], N, alpha, X.data(), one, Y.data(), \ - one, A.data(), LDA); \ - } else { \ - /* blasSsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SSYR2_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + 
Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_BLAS,float]"); \ + KOKKOSBLAS2_SYR2_DETERMINE_ARGS(LAYOUT); \ + if (A_is_ll) { \ + HostBlas::syr2(uplo[0], N, alpha, X.data(), one, Y.data(), one, A.data(), LDA); \ + } else { \ + /* blasSsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZSYR2_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR2*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - \ - static void syr2(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::syr2[TPL_BLAS,complex"); \ - KOKKOSBLAS2_SYR2_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - /* No blasZsyr2() => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } else { \ - if (A_is_ll) { \ - HostBlas>::her2( \ - uplo[0], N, alpha, \ - reinterpret_cast*>(X.data()), 
one, \ - reinterpret_cast*>(Y.data()), one, \ - reinterpret_cast*>(A.data()), LDA); \ - } else { \ - /* blasZher2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZSYR2_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_BLAS,complex"); \ + KOKKOSBLAS2_SYR2_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + /* No blasZsyr2() => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } else { \ + if (A_is_ll) { \ + HostBlas>::her2(uplo[0], N, alpha, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + /* blasZher2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CSYR2_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR2*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View*, LAYOUT, \ - 
Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - \ - static void syr2(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::syr2[TPL_BLAS,complex"); \ - KOKKOSBLAS2_SYR2_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - /* No blasCsyr2() => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } else { \ - if (A_is_ll) { \ - HostBlas>::her2( \ - uplo[0], N, alpha, \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(Y.data()), one, \ - reinterpret_cast*>(A.data()), LDA); \ - } else { \ - /* blasCher2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CSYR2_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, const char trans[], 
const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_BLAS,complex"); \ + KOKKOSBLAS2_SYR2_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + /* No blasCsyr2() => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } else { \ + if (A_is_ll) { \ + HostBlas>::her2(uplo[0], N, alpha, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + /* blasCher2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; #ifdef KOKKOS_ENABLE_SERIAL -KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - false) -KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - false) - -KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - false) -KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - false) - -KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - false) -KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - false) - -KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, 
Kokkos::HostSpace, - true) -KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - false) -KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - false) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) + +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) + +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) + +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) #endif #ifdef KOKKOS_ENABLE_OPENMP -KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - false) -KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - true) 
-KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - false) - -KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - false) -KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - false) - -KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - false) -KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - false) - -KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - false) -KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - false) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) + +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) + +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, 
Kokkos::HostSpace, false) +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) + +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) #endif } // namespace Impl diff --git a/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_cublas.hpp index ca98fedf0d..4dd95aa79a 100644 --- a/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_cublas.hpp +++ b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_cublas.hpp @@ -22,349 +22,257 @@ namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uploChar) \ - bool A_is_ll = std::is_same::value; \ - bool A_is_lr = std::is_same::value; \ - const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ - constexpr int one = 1; \ - const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ - cublasFillMode_t fillMode = (uploChar == 'L' || uploChar == 'l') \ - ? CUBLAS_FILL_MODE_LOWER \ - : CUBLAS_FILL_MODE_UPPER; +#define KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uploChar) \ + bool A_is_ll = std::is_same::value; \ + bool A_is_lr = std::is_same::value; \ + const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ + constexpr int one = 1; \ + const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ + cublasFillMode_t fillMode = (uploChar == 'L' || uploChar == 'l') ? 
CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; -#define KOKKOSBLAS2_DSYR2_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct SYR2< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr2(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_CUBLAS,double]"); \ - KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - if (A_is_ll) { \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasDsyr2(s.handle, fillMode, N, &alpha, X.data(), one, \ - Y.data(), one, A.data(), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - /* cublasDsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DSYR2_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ 
+ typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_CUBLAS,double]"); \ + KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasDsyr2(s.handle, fillMode, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasDsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SSYR2_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct SYR2< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr2(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_CUBLAS,float]"); \ - KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - if (A_is_ll) { \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - 
KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSsyr2(s.handle, fillMode, N, &alpha, X.data(), one, \ - Y.data(), one, A.data(), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - /* cublasSsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SSYR2_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_CUBLAS,float]"); \ + KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSsyr2(s.handle, fillMode, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasSsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define 
KOKKOSBLAS2_ZSYR2_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct SYR2*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr2(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::syr2[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - if (A_is_ll) { \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZsyr2( \ - s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - /* cublasZsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - } else { \ - if (A_is_ll) { \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZher2( \ - s.handle, fillMode, N, \ - 
reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - /* cublasZher2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZSYR2_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZsyr2(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasZsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + 
SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } \ + } else { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZher2(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasZher2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CSYR2_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct SYR2*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr2(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::syr2[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - if (A_is_ll) { \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, 
space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasCsyr2(s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - /* cublasCsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - } else { \ - if (A_is_ll) { \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasCher2(s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - /* cublasCher2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CSYR2_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + 
Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCsyr2(s.handle, fillMode, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasCsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } \ + } else { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCher2(s.handle, fillMode, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasCher2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - false) -KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - false) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) 
+KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) -KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) -KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - false) -KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - false) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) -KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) -KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutRight, 
Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - false) -KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - false) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) -KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) -KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - false) 
-KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - false) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) -KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) -KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp index 869c065af2..84085224ac 100644 --- a/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp +++ b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp @@ -28,307 +28,233 @@ namespace Impl { const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ constexpr int one = 1; \ const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ - rocblas_fill fillMode = (uploChar == 'L' || uploChar == 'l') \ - ? rocblas_fill_lower \ - : rocblas_fill_upper; + rocblas_fill fillMode = (uploChar == 'L' || uploChar == 'l') ? 
rocblas_fill_lower : rocblas_fill_upper; -#define KOKKOSBLAS2_DSYR2_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct SYR2< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr2(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_ROCBLAS,double]"); \ - KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - if (A_is_ll) { \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_dsyr2(s.handle, fillMode, N, &alpha, X.data(), one, \ - Y.data(), one, A.data(), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - /* rocblas_dsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DSYR2_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + 
YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_ROCBLAS,double]"); \ + KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_dsyr2(s.handle, fillMode, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_dsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SSYR2_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct SYR2< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr2(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_ROCBLAS,float]"); \ - KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - if (A_is_ll) { \ - 
KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_ssyr2(s.handle, fillMode, N, &alpha, X.data(), one, \ - Y.data(), one, A.data(), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - /* rocblas_ssyr2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SSYR2_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_ROCBLAS,float]"); \ + KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_ssyr2(s.handle, fillMode, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_ssyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + 
Y, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZSYR2_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct SYR2*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr2(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::syr2[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - if (A_is_ll) { \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zsyr2( \ - s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - /* rocblas_zsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - } else { \ - if (A_is_ll && (alpha.imag() == 0.)) { \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, 
space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zher2( \ - s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - /* rocblas_zher2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZSYR2_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zsyr2(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + 
KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_zsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zher2(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_zher2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CSYR2_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct SYR2*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr2(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::syr2[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { 
\ - if (A_is_ll) { \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_csyr2( \ - s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - /* rocblas_csyr2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - } else { \ - if (A_is_ll && (alpha.imag() == 0.)) { \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cher2( \ - s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - /* rocblas_cher2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CSYR2_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename 
AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_csyr2(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_csyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cher2(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_cher2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS2_DSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_DSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - false) -KOKKOSBLAS2_DSYR2_ROCBLAS(Kokkos::LayoutRight, 
Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_DSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - false) +KOKKOSBLAS2_DSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_DSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS2_DSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_DSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) -KOKKOSBLAS2_SSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_SSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - false) -KOKKOSBLAS2_SSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_SSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - false) +KOKKOSBLAS2_SSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_SSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS2_SSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_SSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) -KOKKOSBLAS2_ZSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_ZSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - false) -KOKKOSBLAS2_ZSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_ZSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - false) +KOKKOSBLAS2_ZSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_ZSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS2_ZSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_ZSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) -KOKKOSBLAS2_CSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_CSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, 
Kokkos::HIPSpace, - false) -KOKKOSBLAS2_CSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_CSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - false) +KOKKOSBLAS2_CSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_CSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS2_CSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_CSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp index f537b3854a..e1eb94e425 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp @@ -28,59 +28,38 @@ struct syr_tpl_spec_avail { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct syr_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct syr_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; #ifdef KOKKOS_ENABLE_SERIAL -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Serial, Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Serial, Kokkos::HostSpace) - -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, 
Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, - Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) #endif #ifdef KOKKOS_ENABLE_OPENMP -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP, Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP, Kokkos::HostSpace) - -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::OpenMP, - 
Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, - Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) #endif #endif @@ -88,102 +67,60 @@ KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct syr_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct syr_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) 
-KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) - -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) - -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) - -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) 
+KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) #endif // rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct syr_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct syr_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) 
-KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) - -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) #endif } // namespace Impl diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp index fc8fb949d7..5b0eb0ec52 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp @@ -29,254 +29,186 @@ namespace Impl { constexpr int one = 1; \ const int LDA = A_is_lr ? 
A.stride(0) : A.stride(1); -#define KOKKOSBLAS2_DSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,double]"); \ - KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ - if (A_is_ll) { \ - HostBlas::syr(uplo[0], N, alpha, X.data(), one, A.data(), \ - LDA); \ - } else { \ - /* blasDsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,double]"); \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ + if (A_is_ll) { \ + HostBlas::syr(uplo[0], N, alpha, X.data(), one, A.data(), LDA); \ + } else { \ + /* blasDsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + 
Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,float]"); \ - KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ - if (A_is_ll) { \ - HostBlas::syr(uplo[0], N, alpha, X.data(), one, A.data(), \ - LDA); \ - } else { \ - /* blasSsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,float]"); \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ + if (A_is_ll) { \ + HostBlas::syr(uplo[0], N, alpha, X.data(), one, A.data(), LDA); \ + } else { \ + /* blasSsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + 
Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::syr[TPL_BLAS,complex"); \ - KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - /* No blasZsyr() => call kokkos-kernels' implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } else { \ - if (A_is_ll) { \ - HostBlas>::her( \ - uplo[0], N, alpha.real(), \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(A.data()), LDA); \ - } else { \ - /* blasZher() + [~A_ll or ~real alpha] => call kokkos-kernels' \ - * implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const 
XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + /* No blasZsyr() => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } else { \ + if (A_is_ll) { \ + HostBlas>::her(uplo[0], N, alpha.real(), \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + /* blasZher() + [~A_ll or ~real alpha] => call kokkos-kernels' \ + * implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::syr[TPL_BLAS,complex"); \ - KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - /* No blasCsyr() => call kokkos-kernels' implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } else { \ - if (A_is_ll && (alpha.imag() == 0.)) { \ - HostBlas>::her( \ - uplo[0], N, alpha.real(), \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(A.data()), LDA); \ - } else { \ - /* blasCher() + [~A_ll or ~real alpha] => call kokkos-kernels' \ - * implementation */ \ - SYR::syr( \ - 
space, trans, uplo, alpha, X, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + /* No blasCsyr() => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + HostBlas>::her(uplo[0], N, alpha.real(), \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + /* blasCher() + [~A_ll or ~real alpha] => call kokkos-kernels' \ + * implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; #ifdef KOKKOS_ENABLE_SERIAL -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - false) -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - false) - -KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - false) 
-KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - false) - -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - false) -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - false) - -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - false) -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - false) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) + +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) + +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) + +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) 
+KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) #endif #ifdef KOKKOS_ENABLE_OPENMP -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - false) -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - false) - -KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - false) -KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - false) - -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - false) -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - false) - -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - false) -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - false) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, 
Kokkos::OpenMP, Kokkos::HostSpace, false) + +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) + +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) + +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) #endif } // namespace Impl diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp index dad3c93dbc..43b177d9a5 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp @@ -22,309 +22,224 @@ namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uploChar) \ - bool A_is_ll = std::is_same::value; \ - bool A_is_lr = std::is_same::value; \ - const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ - constexpr int one = 1; \ - const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ - cublasFillMode_t fillMode = (uploChar == 'L' || uploChar == 'l') \ - ? 
CUBLAS_FILL_MODE_LOWER \ - : CUBLAS_FILL_MODE_UPPER; - -#define KOKKOSBLAS2_DSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,double]"); \ - KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - if (A_is_ll) { \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDsyr( \ - s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - /* cublasDsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uploChar) \ + bool A_is_ll = std::is_same::value; \ + bool A_is_lr = std::is_same::value; \ + const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ + constexpr int one = 1; \ + const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ + cublasFillMode_t fillMode = (uploChar == 'L' || uploChar == 'l') ? 
CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; + +#define KOKKOSBLAS2_DSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,double]"); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDsyr(s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasDsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const AViewType& A) { \ - 
Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,float]"); \ - KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - if (A_is_ll) { \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSsyr( \ - s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - /* cublasSsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,float]"); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSsyr(s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasSsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; 
-#define KOKKOSBLAS2_ZSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::syr[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - if (A_is_ll) { \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZsyr( \ - s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - /* cublasZsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } \ - } else { \ - if (A_is_ll && (alpha.imag() == 0.)) { \ - const double alpha_val = alpha.real(); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZher( \ - s.handle, fillMode, N, &alpha_val, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else 
{ \ - /* cublasZher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ - * implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZsyr(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasZsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + const double alpha_val = alpha.real(); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZher(s.handle, fillMode, N, &alpha_val, \ + 
reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasZher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ + * implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::syr[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - if (A_is_ll) { \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasCsyr(s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - /* cublasCsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } \ - } else { \ - if (A_is_ll && (alpha.imag() == 0.)) { \ - const float alpha_val = alpha.real(); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - 
KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasCher(s.handle, fillMode, N, &alpha_val, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - /* cublasCher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ - * implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCsyr(s.handle, fillMode, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasCsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, 
A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + const float alpha_val = alpha.real(); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCher(s.handle, fillMode, N, &alpha_val, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasCher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ + * implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - false) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - false) - -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) - -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - false) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - false) - -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) 
-KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) - -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - false) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - false) - -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) - -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - false) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - false) - -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) + +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, 
Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) + +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) + +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) 
+KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) + +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp index cf02e9e207..59c99c1225 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp @@ -28,283 +28,205 @@ namespace Impl { const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ constexpr int one = 1; \ const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ - rocblas_fill fillMode = (uploChar == 'L' || uploChar == 'l') \ - ? rocblas_fill_lower \ - : rocblas_fill_upper; + rocblas_fill fillMode = (uploChar == 'L' || uploChar == 'l') ? 
rocblas_fill_lower : rocblas_fill_upper; -#define KOKKOSBLAS2_DSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,double]"); \ - KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - if (A_is_ll) { \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dsyr( \ - s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - /* rocblas_dsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char 
uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,double]"); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dsyr(s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_dsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,float]"); \ - KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - if (A_is_ll) { \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ - 
KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_ssyr( \ - s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - /* rocblas_ssyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,float]"); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_ssyr(s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_ssyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct SYR*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ 
- Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - if (A_is_ll) { \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zsyr( \ - s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - /* rocblas_zsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } \ - } else { \ - if (A_is_ll && (alpha.imag() == 0.)) { \ - const double alpha_val = alpha.real(); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zher( \ - s.handle, fillMode, N, &alpha_val, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - 
} else { \ - /* rocblas_zher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ - * implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zsyr(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_zsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + const double alpha_val = alpha.real(); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + 
KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zher(s.handle, fillMode, N, &alpha_val, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_zher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ + * implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct SYR*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - if (A_is_ll) { \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_csyr( \ - s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), 
LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - /* rocblas_csyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } \ - } else { \ - if (A_is_ll && (alpha.imag() == 0.)) { \ - const float alpha_val = alpha.real(); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cher( \ - s.handle, fillMode, N, &alpha_val, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - /* rocblas_cher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ - * implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = 
KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_csyr(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_csyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + const float alpha_val = alpha.real(); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cher(s.handle, fillMode, N, &alpha_val, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_cher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ + * implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - false) -KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - false) +KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutRight, 
Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) -KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - false) -KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - false) +KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) -KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - false) -KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - false) +KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) -KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - false) -KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - false) +KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) 
+KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp index 8e96898b10..0dd3ef81e9 100644 --- a/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp @@ -28,46 +28,34 @@ struct gemm_tpl_spec_avail { // Generic Host side BLAS (could be MKL or whatever) #if defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) -#define KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUTA, LAYOUTB, \ - LAYOUTC, MEMSPACE) \ - template \ - struct gemm_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUTA, LAYOUTB, LAYOUTC, MEMSPACE) \ + template \ + struct gemm_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, 
Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) #endif @@ -75,111 +63,78 @@ KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, // cuBLAS #if defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) -#define KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUTA, LAYOUTB, \ - LAYOUTC, MEMSPACE) \ - template \ - struct gemm_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUTA, LAYOUTB, LAYOUTC, MEMSPACE) \ + template \ + struct gemm_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, - 
Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(double, 
Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, + Kokkos::LayoutRight, Kokkos::CudaUVMSpace) +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, + Kokkos::LayoutRight, Kokkos::CudaUVMSpace) #endif // rocBLAS #if defined(KOKKOSKERNELS_ENABLE_TPL_ROCBLAS) -#define KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, 
LAYOUT, MEMSPACE) \ - template \ - struct gemm_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, MEMSPACE) \ + template \ + struct gemm_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, - Kokkos::HIPSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, - Kokkos::HIPSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace) - -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, - Kokkos::HIPSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, - Kokkos::HIPSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HIPSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HIPSpace) +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) + +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, Kokkos::HIPSpace) +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIPSpace) +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HIPSpace) +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, 
Kokkos::LayoutRight, Kokkos::HIPSpace) #endif } // namespace Impl diff --git a/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp index 68bf2708ec..52123a9daf 100644 --- a/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp @@ -23,130 +23,92 @@ namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS3_XGEMM_BLAS(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, \ - LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GEMM, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef SCALAR_TYPE SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - CViewType; \ - \ - static void gemm(const ExecSpace& /* space*/, const char transA[], \ - const char transB[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const BViewType& B, \ - typename CViewType::const_value_type& beta, \ - const CViewType& C) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gemm[TPL_BLAS," #SCALAR_TYPE \ - "]"); \ - const bool A_t = (transA[0] != 'N') && (transA[0] != 'n'); \ - const KK_INT M = C.extent(0); \ - const KK_INT N = C.extent(1); \ - const KK_INT K = A.extent(A_t ? 0 : 1); \ - \ - bool A_is_lr = std::is_same::value; \ - bool B_is_lr = std::is_same::value; \ - bool C_is_lr = std::is_same::value; \ - \ - const KK_INT AST = A_is_lr ? A.stride(0) : A.stride(1), \ - LDA = AST == 0 ? 1 : AST; \ - const KK_INT BST = B_is_lr ? B.stride(0) : B.stride(1), \ - LDB = BST == 0 ? 1 : BST; \ - const KK_INT CST = C_is_lr ? C.stride(0) : C.stride(1), \ - LDC = CST == 0 ? 
1 : CST; \ - \ - const BASE_SCALAR_TYPE alpha_val = alpha, beta_val = beta; \ - if (!A_is_lr && !B_is_lr && !C_is_lr) \ - HostBlas::gemm( \ - transA[0], transB[0], M, N, K, alpha_val, \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB, \ - beta_val, reinterpret_cast(C.data()), LDC); \ - if (A_is_lr && B_is_lr && C_is_lr) \ - HostBlas::gemm( \ - transB[0], transA[0], N, M, K, alpha_val, \ - reinterpret_cast(B.data()), LDB, \ - reinterpret_cast(A.data()), LDA, \ - beta_val, reinterpret_cast(C.data()), LDC); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS3_XGEMM_BLAS(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMM, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef SCALAR_TYPE SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + CViewType; \ + \ + static void gemm(const ExecSpace& /* space*/, const char transA[], const char transB[], \ + typename AViewType::const_value_type& alpha, const AViewType& A, const BViewType& B, \ + typename CViewType::const_value_type& beta, const CViewType& C) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemm[TPL_BLAS," #SCALAR_TYPE "]"); \ + const bool A_t = (transA[0] != 'N') && (transA[0] != 'n'); \ + const KK_INT M = C.extent(0); \ + const KK_INT N = C.extent(1); \ + const KK_INT K = A.extent(A_t ? 0 : 1); \ + \ + bool A_is_lr = std::is_same::value; \ + bool B_is_lr = std::is_same::value; \ + bool C_is_lr = std::is_same::value; \ + \ + const KK_INT AST = A_is_lr ? A.stride(0) : A.stride(1), LDA = AST == 0 ? 1 : AST; \ + const KK_INT BST = B_is_lr ? B.stride(0) : B.stride(1), LDB = BST == 0 ? 1 : BST; \ + const KK_INT CST = C_is_lr ? C.stride(0) : C.stride(1), LDC = CST == 0 ? 
1 : CST; \ + \ + const BASE_SCALAR_TYPE alpha_val = alpha, beta_val = beta; \ + if (!A_is_lr && !B_is_lr && !C_is_lr) \ + HostBlas::gemm(transA[0], transB[0], M, N, K, alpha_val, \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(B.data()), LDB, beta_val, \ + reinterpret_cast(C.data()), LDC); \ + if (A_is_lr && B_is_lr && C_is_lr) \ + HostBlas::gemm(transB[0], transA[0], N, M, K, alpha_val, \ + reinterpret_cast(B.data()), LDB, \ + reinterpret_cast(A.data()), LDA, beta_val, \ + reinterpret_cast(C.data()), LDC); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS3_DGEMM_BLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_XGEMM_BLAS(double, double, LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ +#define KOKKOSBLAS3_DGEMM_BLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS3_XGEMM_BLAS(double, double, LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) + +#define KOKKOSBLAS3_SGEMM_BLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS3_XGEMM_BLAS(float, float, LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) + +#define KOKKOSBLAS3_ZGEMM_BLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS3_XGEMM_BLAS(Kokkos::complex, std::complex, LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ ETI_SPEC_AVAIL) -#define KOKKOSBLAS3_SGEMM_BLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_XGEMM_BLAS(float, float, LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ +#define KOKKOSBLAS3_CGEMM_BLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS3_XGEMM_BLAS(Kokkos::complex, std::complex, LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ ETI_SPEC_AVAIL) -#define KOKKOSBLAS3_ZGEMM_BLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_XGEMM_BLAS(Kokkos::complex, std::complex, \ - LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) - -#define KOKKOSBLAS3_CGEMM_BLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - 
KOKKOSBLAS3_XGEMM_BLAS(Kokkos::complex, std::complex, LAYOUTA, \ - LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) - -KOKKOSBLAS3_DGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS3_DGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS3_DGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, true) -KOKKOSBLAS3_DGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, false) - -KOKKOSBLAS3_SGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS3_SGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS3_SGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, true) -KOKKOSBLAS3_SGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, false) - -KOKKOSBLAS3_ZGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS3_ZGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS3_ZGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, true) -KOKKOSBLAS3_ZGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, false) - -KOKKOSBLAS3_CGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS3_CGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS3_CGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, true) -KOKKOSBLAS3_CGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, false) +KOKKOSBLAS3_DGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, 
Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS3_DGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS3_DGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS3_DGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) + +KOKKOSBLAS3_SGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS3_SGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS3_SGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS3_SGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) + +KOKKOSBLAS3_ZGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS3_ZGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS3_ZGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS3_ZGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) + +KOKKOSBLAS3_CGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS3_CGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS3_CGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS3_CGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) } // namespace Impl } // namespace KokkosBlas @@ -160,195 +122,131 @@ KOKKOSBLAS3_CGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS3_XGEMM_CUBLAS(SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, \ - LAYOUTA, LAYOUTB, LAYOUTC, 
MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct GEMM, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef SCALAR_TYPE SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - CViewType; \ - \ - static void gemm(const ExecSpace& space, const char transA[], \ - const char transB[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const BViewType& B, \ - typename CViewType::const_value_type& beta, \ - const CViewType& C) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gemm[TPL_CUBLAS," #SCALAR_TYPE "]"); \ - const bool A_t = (transA[0] != 'N') && (transA[0] != 'n'); \ - const int M = static_cast(C.extent(0)); \ - const int N = static_cast(C.extent(1)); \ - const int K = static_cast(A.extent(A_t ? 0 : 1)); \ - \ - bool A_is_lr = std::is_same::value; \ - bool B_is_lr = std::is_same::value; \ - bool C_is_lr = std::is_same::value; \ - \ - const int AST = A_is_lr ? A.stride(0) : A.stride(1), \ - LDA = AST == 0 ? 1 : AST; \ - const int BST = B_is_lr ? B.stride(0) : B.stride(1), \ - LDB = BST == 0 ? 1 : BST; \ - const int CST = C_is_lr ? C.stride(0) : C.stride(1), \ - LDC = CST == 0 ? 1 : CST; \ - \ - cublasOperation_t transa = trans_mode_kk_to_cublas(transA); \ - cublasOperation_t transb = trans_mode_kk_to_cublas(transB); \ - \ - constexpr int numDotsLayoutLeftThreshold = 1600; \ - constexpr int numDotsLayoutRightThreshold = 100; \ - if ((!A_is_lr && transa != CUBLAS_OP_N && transb == CUBLAS_OP_N && \ - M * N < numDotsLayoutLeftThreshold) || \ - (A_is_lr && transa != CUBLAS_OP_N && transb == CUBLAS_OP_N && \ - M * N < numDotsLayoutRightThreshold)) { \ - DotBasedGEMM gemm( \ - alpha, A, B, beta, C); \ - bool conjT = (std::is_same::value || \ - std::is_same::value) \ - ? 
false \ - : (transa == CUBLAS_OP_C ? true : false); \ - gemm.run(space, conjT); \ - } else { \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - if (!A_is_lr && !B_is_lr && !C_is_lr) \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN( \ - s.handle, transa, transb, M, N, K, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB, \ - reinterpret_cast(&beta), \ - reinterpret_cast(C.data()), LDC)); \ - if (A_is_lr && B_is_lr && C_is_lr) \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN( \ - s.handle, transb, transa, N, M, K, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(B.data()), LDB, \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(&beta), \ - reinterpret_cast(C.data()), LDC)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS3_XGEMM_CUBLAS(SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct GEMM, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef SCALAR_TYPE SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + CViewType; \ + \ + static void gemm(const ExecSpace& space, const char transA[], const char transB[], \ + typename AViewType::const_value_type& alpha, const AViewType& A, const BViewType& B, \ + typename CViewType::const_value_type& beta, const CViewType& C) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemm[TPL_CUBLAS," #SCALAR_TYPE "]"); \ + const bool A_t = (transA[0] != 'N') && (transA[0] != 'n'); \ + const int M = static_cast(C.extent(0)); \ + const int N = 
static_cast(C.extent(1)); \ + const int K = static_cast(A.extent(A_t ? 0 : 1)); \ + \ + bool A_is_lr = std::is_same::value; \ + bool B_is_lr = std::is_same::value; \ + bool C_is_lr = std::is_same::value; \ + \ + const int AST = A_is_lr ? A.stride(0) : A.stride(1), LDA = AST == 0 ? 1 : AST; \ + const int BST = B_is_lr ? B.stride(0) : B.stride(1), LDB = BST == 0 ? 1 : BST; \ + const int CST = C_is_lr ? C.stride(0) : C.stride(1), LDC = CST == 0 ? 1 : CST; \ + \ + cublasOperation_t transa = trans_mode_kk_to_cublas(transA); \ + cublasOperation_t transb = trans_mode_kk_to_cublas(transB); \ + \ + constexpr int numDotsLayoutLeftThreshold = 1600; \ + constexpr int numDotsLayoutRightThreshold = 100; \ + if ((!A_is_lr && transa != CUBLAS_OP_N && transb == CUBLAS_OP_N && M * N < numDotsLayoutLeftThreshold) || \ + (A_is_lr && transa != CUBLAS_OP_N && transb == CUBLAS_OP_N && M * N < numDotsLayoutRightThreshold)) { \ + DotBasedGEMM gemm(alpha, A, B, beta, C); \ + bool conjT = (std::is_same::value || std::is_same::value) \ + ? false \ + : (transa == CUBLAS_OP_C ? 
true : false); \ + gemm.run(space, conjT); \ + } else { \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + if (!A_is_lr && !B_is_lr && !C_is_lr) \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN( \ + s.handle, transa, transb, M, N, K, reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(B.data()), LDB, \ + reinterpret_cast(&beta), reinterpret_cast(C.data()), LDC)); \ + if (A_is_lr && B_is_lr && C_is_lr) \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN( \ + s.handle, transb, transa, N, M, K, reinterpret_cast(&alpha), \ + reinterpret_cast(B.data()), LDB, \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(&beta), reinterpret_cast(C.data()), LDC)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS3_DGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_XGEMM_CUBLAS(double, double, cublasDgemm, LAYOUTA, LAYOUTB, \ - LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) +#define KOKKOSBLAS3_DGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS3_XGEMM_CUBLAS(double, double, cublasDgemm, LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) -#define KOKKOSBLAS3_SGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_XGEMM_CUBLAS(float, float, cublasSgemm, LAYOUTA, LAYOUTB, \ - LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) +#define KOKKOSBLAS3_SGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS3_XGEMM_CUBLAS(float, float, cublasSgemm, LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) -#define KOKKOSBLAS3_ZGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_XGEMM_CUBLAS(Kokkos::complex, cuDoubleComplex, \ - cublasZgemm, LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ - ETI_SPEC_AVAIL) +#define 
KOKKOSBLAS3_ZGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS3_XGEMM_CUBLAS(Kokkos::complex, cuDoubleComplex, cublasZgemm, LAYOUTA, LAYOUTB, LAYOUTC, \ + MEM_SPACE, ETI_SPEC_AVAIL) -#define KOKKOSBLAS3_CGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_XGEMM_CUBLAS(Kokkos::complex, cuComplex, cublasCgemm, \ - LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ +#define KOKKOSBLAS3_CGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS3_XGEMM_CUBLAS(Kokkos::complex, cuComplex, cublasCgemm, LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ ETI_SPEC_AVAIL) -KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace, true) -KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace, false) - -KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace, true) -KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - 
Kokkos::LayoutRight, Kokkos::CudaSpace, false) - -KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace, true) -KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace, false) - -KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace, true) -KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace, false) - -KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - 
Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) + +KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) + +KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, 
false) +KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) + +KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) + +KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) 
+KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) } // namespace Impl } // namespace KokkosBlas @@ -362,120 +260,93 @@ KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS3_XGEMM_ROCBLAS(SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, \ - ROCBLAS_FN, LAYOUT, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct GEMM, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef SCALAR_TYPE SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - CViewType; \ - \ - static void gemm(const typename CViewType::execution_space& space, \ - const char transA[], const char transB[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const BViewType& B, \ - typename CViewType::const_value_type& beta, \ - const CViewType& C) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gemm[TPL_ROCBLAS," #SCALAR_TYPE "]"); \ - \ - const bool A_t = (transA[0] != 'N') && (transA[0] != 'n'); \ - const int M = static_cast(C.extent(0)); \ - const int N = static_cast(C.extent(1)); \ - const int K = static_cast(A.extent(A_t ? 0 : 1)); \ - \ - bool is_lr = std::is_same::value; \ - \ - const int AST = is_lr ? A.stride(0) : A.stride(1), \ - LDA = AST == 0 ? 1 : AST; \ - const int BST = is_lr ? B.stride(0) : B.stride(1), \ - LDB = BST == 0 ? 1 : BST; \ - const int CST = is_lr ? C.stride(0) : C.stride(1), \ - LDC = CST == 0 ? 
1 : CST; \ - \ - rocblas_operation transa = trans_mode_kk_to_rocblas(transA); \ - rocblas_operation transb = trans_mode_kk_to_rocblas(transB); \ - \ - constexpr int numDotsLayoutLeftThreshold = 1600; \ - constexpr int numDotsLayoutRightThreshold = 100; \ - if ((!is_lr && transa != rocblas_operation_none && \ - transb == rocblas_operation_none && \ - M * N < numDotsLayoutLeftThreshold) || \ - (is_lr && transa != rocblas_operation_none && \ - transb == rocblas_operation_none && \ - M * N < numDotsLayoutRightThreshold)) { \ - DotBasedGEMM gemm( \ - alpha, A, B, beta, C); \ - bool conjT = \ - (std::is_same::value || \ - std::is_same::value) \ - ? false \ - : (transa == rocblas_operation_conjugate_transpose ? true \ - : false); \ - gemm.run(space, conjT); \ - } else { \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - if (!is_lr) \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(ROCBLAS_FN( \ - s.handle, transa, transb, M, N, K, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB, \ - reinterpret_cast(&beta), \ - reinterpret_cast(C.data()), LDC)); \ - else \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(ROCBLAS_FN( \ - s.handle, transb, transa, N, M, K, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(B.data()), LDB, \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(&beta), \ - reinterpret_cast(C.data()), LDC)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS3_XGEMM_ROCBLAS(SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMM, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef SCALAR_TYPE SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > 
\ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + CViewType; \ + \ + static void gemm(const typename CViewType::execution_space& space, const char transA[], const char transB[], \ + typename AViewType::const_value_type& alpha, const AViewType& A, const BViewType& B, \ + typename CViewType::const_value_type& beta, const CViewType& C) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemm[TPL_ROCBLAS," #SCALAR_TYPE "]"); \ + \ + const bool A_t = (transA[0] != 'N') && (transA[0] != 'n'); \ + const int M = static_cast(C.extent(0)); \ + const int N = static_cast(C.extent(1)); \ + const int K = static_cast(A.extent(A_t ? 0 : 1)); \ + \ + bool is_lr = std::is_same::value; \ + \ + const int AST = is_lr ? A.stride(0) : A.stride(1), LDA = AST == 0 ? 1 : AST; \ + const int BST = is_lr ? B.stride(0) : B.stride(1), LDB = BST == 0 ? 1 : BST; \ + const int CST = is_lr ? C.stride(0) : C.stride(1), LDC = CST == 0 ? 1 : CST; \ + \ + rocblas_operation transa = trans_mode_kk_to_rocblas(transA); \ + rocblas_operation transb = trans_mode_kk_to_rocblas(transB); \ + \ + constexpr int numDotsLayoutLeftThreshold = 1600; \ + constexpr int numDotsLayoutRightThreshold = 100; \ + if ((!is_lr && transa != rocblas_operation_none && transb == rocblas_operation_none && \ + M * N < numDotsLayoutLeftThreshold) || \ + (is_lr && transa != rocblas_operation_none && transb == rocblas_operation_none && \ + M * N < numDotsLayoutRightThreshold)) { \ + DotBasedGEMM gemm(alpha, A, B, beta, C); \ + bool conjT = (std::is_same::value || std::is_same::value) \ + ? false \ + : (transa == rocblas_operation_conjugate_transpose ? 
true : false); \ + gemm.run(space, conjT); \ + } else { \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + if (!is_lr) \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(ROCBLAS_FN(s.handle, transa, transb, M, N, K, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(B.data()), LDB, \ + reinterpret_cast(&beta), \ + reinterpret_cast(C.data()), LDC)); \ + else \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(ROCBLAS_FN(s.handle, transb, transa, N, M, K, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(B.data()), LDB, \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(&beta), \ + reinterpret_cast(C.data()), LDC)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS3_DGEMM_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_XGEMM_ROCBLAS(double, double, rocblas_dgemm, LAYOUT, MEM_SPACE, \ - ETI_SPEC_AVAIL) +#define KOKKOSBLAS3_DGEMM_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS3_XGEMM_ROCBLAS(double, double, rocblas_dgemm, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) -#define KOKKOSBLAS3_SGEMM_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_XGEMM_ROCBLAS(float, float, rocblas_sgemm, LAYOUT, MEM_SPACE, \ - ETI_SPEC_AVAIL) +#define KOKKOSBLAS3_SGEMM_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS3_XGEMM_ROCBLAS(float, float, rocblas_sgemm, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) -#define KOKKOSBLAS3_ZGEMM_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_XGEMM_ROCBLAS(Kokkos::complex, rocblas_double_complex, \ - rocblas_zgemm, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) +#define KOKKOSBLAS3_ZGEMM_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS3_XGEMM_ROCBLAS(Kokkos::complex, rocblas_double_complex, rocblas_zgemm, LAYOUT, MEM_SPACE, \ + ETI_SPEC_AVAIL) -#define KOKKOSBLAS3_CGEMM_ROCBLAS(LAYOUT, MEM_SPACE, 
ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_XGEMM_ROCBLAS(Kokkos::complex, rocblas_float_complex, \ - rocblas_cgemm, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) +#define KOKKOSBLAS3_CGEMM_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS3_XGEMM_ROCBLAS(Kokkos::complex, rocblas_float_complex, rocblas_cgemm, LAYOUT, MEM_SPACE, \ + ETI_SPEC_AVAIL) KOKKOSBLAS3_DGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) KOKKOSBLAS3_DGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) diff --git a/blas/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp index 010b44a154..83e39a240e 100644 --- a/blas/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp @@ -29,38 +29,26 @@ struct trmm_tpl_spec_avail { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUTA, LAYOUTB, \ - MEMSPACE) \ - template \ - struct trmm_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUTA, LAYOUTB, MEMSPACE) \ + template \ + struct trmm_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, 
Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) #endif // KOKKOSKERNELS_ENABLE_TPL_BLAS @@ -68,61 +56,40 @@ KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUTA, LAYOUTB, \ - MEMSPACE) \ - template \ - struct trmm_tpl_spec_avail< \ - Kokkos::Cuda, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUTA, LAYOUTB, MEMSPACE) \ + template \ + struct trmm_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, - 
Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, - Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, - Kokkos::LayoutRight, - 
Kokkos::CudaUVMSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) #endif // KOKKOSKERNELS_ENABLE_TPL_CUBLAS diff --git a/blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp index 53c73f7416..4e68c08dec 100644 --- a/blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp @@ -24,136 +24,103 @@ namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS3_TRMM_BLAS(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, LAYOUTB, \ - MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct TRMM, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef SCALAR_TYPE SCALAR; \ - typedef Kokkos::View, \ - 
Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - \ - static void trmm(const ExecSpace& /*space*/, const char side[], \ - const char uplo[], const char trans[], const char diag[], \ - typename BViewType::const_value_type& alpha, \ - const AViewType& A, const BViewType& B) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::trmm[TPL_BLAS," #SCALAR_TYPE \ - "]"); \ - const int M = static_cast(B.extent(0)); \ - const int N = static_cast(B.extent(1)); \ - \ - bool A_is_layout_left = \ - std::is_same::value; \ - bool B_is_layout_left = \ - std::is_same::value; \ - \ - const int AST = A_is_layout_left ? A.stride(1) : A.stride(0), \ - LDA = (AST == 0) ? 1 : AST; \ - const int BST = B_is_layout_left ? B.stride(1) : B.stride(0), \ - LDB = (BST == 0) ? 1 : BST; \ - \ - char side_; \ - char uplo_; \ - \ - if (A_is_layout_left) { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = 'L'; \ - else \ - side_ = 'R'; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = 'L'; \ - else \ - uplo_ = 'U'; \ - } else { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = 'R'; \ - else \ - side_ = 'L'; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = 'U'; \ - else \ - uplo_ = 'L'; \ - } \ - \ - if (A_is_layout_left) \ - HostBlas::trmm( \ - side_, uplo_, trans[0], diag[0], M, N, alpha, \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB); \ - else \ - HostBlas::trmm( \ - side_, uplo_, trans[0], diag[0], N, M, alpha, \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS3_TRMM_BLAS(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct TRMM, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef SCALAR_TYPE SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, 
\ + Kokkos::MemoryTraits > \ + BViewType; \ + \ + static void trmm(const ExecSpace& /*space*/, const char side[], const char uplo[], const char trans[], \ + const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, \ + const BViewType& B) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::trmm[TPL_BLAS," #SCALAR_TYPE "]"); \ + const int M = static_cast(B.extent(0)); \ + const int N = static_cast(B.extent(1)); \ + \ + bool A_is_layout_left = std::is_same::value; \ + bool B_is_layout_left = std::is_same::value; \ + \ + const int AST = A_is_layout_left ? A.stride(1) : A.stride(0), LDA = (AST == 0) ? 1 : AST; \ + const int BST = B_is_layout_left ? B.stride(1) : B.stride(0), LDB = (BST == 0) ? 1 : BST; \ + \ + char side_; \ + char uplo_; \ + \ + if (A_is_layout_left) { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = 'L'; \ + else \ + side_ = 'R'; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = 'L'; \ + else \ + uplo_ = 'U'; \ + } else { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = 'R'; \ + else \ + side_ = 'L'; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = 'U'; \ + else \ + uplo_ = 'L'; \ + } \ + \ + if (A_is_layout_left) \ + HostBlas::trmm(side_, uplo_, trans[0], diag[0], M, N, alpha, \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(B.data()), LDB); \ + else \ + HostBlas::trmm(side_, uplo_, trans[0], diag[0], N, M, alpha, \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(B.data()), LDB); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define KOKKOSBLAS3_DTRMM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_TRMM_BLAS(double, double, LAYOUTA, LAYOUTB, MEM_SPACE, \ - ETI_SPEC_AVAIL) + KOKKOSBLAS3_TRMM_BLAS(double, double, LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) #define KOKKOSBLAS3_STRMM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_TRMM_BLAS(float, float, LAYOUTA, LAYOUTB, MEM_SPACE, \ - ETI_SPEC_AVAIL) + KOKKOSBLAS3_TRMM_BLAS(float, float, 
LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) #define KOKKOSBLAS3_ZTRMM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_TRMM_BLAS(Kokkos::complex, std::complex, \ - LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) + KOKKOSBLAS3_TRMM_BLAS(Kokkos::complex, std::complex, LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) -#define KOKKOSBLAS3_CTRMM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_TRMM_BLAS(Kokkos::complex, std::complex, LAYOUTA, \ - LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) +#define KOKKOSBLAS3_CTRMM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS3_TRMM_BLAS(Kokkos::complex, std::complex, LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) // Explicitly define the TRMM class for all permutations listed below -KOKKOSBLAS3_DTRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, true) -KOKKOSBLAS3_DTRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, false) -KOKKOSBLAS3_DTRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, true) -KOKKOSBLAS3_DTRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, false) - -KOKKOSBLAS3_STRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, true) -KOKKOSBLAS3_STRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, false) -KOKKOSBLAS3_STRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, true) -KOKKOSBLAS3_STRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, false) - -KOKKOSBLAS3_ZTRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, true) -KOKKOSBLAS3_ZTRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, false) -KOKKOSBLAS3_ZTRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, true) -KOKKOSBLAS3_ZTRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, false) - -KOKKOSBLAS3_CTRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, true) 
-KOKKOSBLAS3_CTRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, false) -KOKKOSBLAS3_CTRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, true) -KOKKOSBLAS3_CTRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, false) +KOKKOSBLAS3_DTRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS3_DTRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS3_DTRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS3_DTRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) + +KOKKOSBLAS3_STRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS3_STRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS3_STRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS3_STRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) + +KOKKOSBLAS3_ZTRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS3_ZTRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS3_ZTRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS3_ZTRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) + +KOKKOSBLAS3_CTRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS3_CTRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS3_CTRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS3_CTRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) } // namespace Impl } // namespace KokkosBlas @@ -166,196 +133,143 @@ KOKKOSBLAS3_CTRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS3_TRMM_CUBLAS(SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, \ - LAYOUTA, 
LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct TRMM, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef SCALAR_TYPE SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - \ - static void trmm(const ExecSpace& space, const char side[], \ - const char uplo[], const char trans[], const char diag[], \ - typename BViewType::const_value_type& alpha, \ - const AViewType& A, const BViewType& B) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::trmm[TPL_CUBLAS," #SCALAR_TYPE "]"); \ - const int M = static_cast(B.extent(0)); \ - const int N = static_cast(B.extent(1)); \ - \ - bool A_is_layout_left = \ - std::is_same::value; \ - bool B_is_layout_left = \ - std::is_same::value; \ - \ - const int AST = A_is_layout_left ? A.stride(1) : A.stride(0), \ - LDA = (AST == 0) ? 1 : AST; \ - const int BST = B_is_layout_left ? B.stride(1) : B.stride(0), \ - LDB = (BST == 0) ? 
1 : BST; \ - \ - cublasSideMode_t side_; \ - cublasFillMode_t uplo_; \ - cublasOperation_t trans_; \ - cublasDiagType_t diag_; \ - \ - if (A_is_layout_left) { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = CUBLAS_SIDE_LEFT; \ - else \ - side_ = CUBLAS_SIDE_RIGHT; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = CUBLAS_FILL_MODE_LOWER; \ - else \ - uplo_ = CUBLAS_FILL_MODE_UPPER; \ - } else { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = CUBLAS_SIDE_RIGHT; \ - else \ - side_ = CUBLAS_SIDE_LEFT; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = CUBLAS_FILL_MODE_UPPER; \ - else \ - uplo_ = CUBLAS_FILL_MODE_LOWER; \ - } \ - \ - if ((trans[0] == 'N') || (trans[0] == 'n')) \ - trans_ = CUBLAS_OP_N; \ - else if ((trans[0] == 'T') || (trans[0] == 't')) \ - trans_ = CUBLAS_OP_T; \ - else \ - trans_ = CUBLAS_OP_C; \ - if ((diag[0] == 'U') || (diag[0] == 'u')) \ - diag_ = CUBLAS_DIAG_UNIT; \ - else \ - diag_ = CUBLAS_DIAG_NON_UNIT; \ - \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - if (A_is_layout_left) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - CUBLAS_FN(s.handle, side_, uplo_, trans_, diag_, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), \ - LDA, reinterpret_cast(B.data()), LDB, \ - reinterpret_cast(B.data()), LDB)); \ - } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - CUBLAS_FN(s.handle, side_, uplo_, trans_, diag_, N, M, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), \ - LDA, reinterpret_cast(B.data()), LDB, \ - reinterpret_cast(B.data()), LDB)); \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS3_TRMM_CUBLAS(SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct TRMM, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + 
Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef SCALAR_TYPE SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + \ + static void trmm(const ExecSpace& space, const char side[], const char uplo[], const char trans[], \ + const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, \ + const BViewType& B) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::trmm[TPL_CUBLAS," #SCALAR_TYPE "]"); \ + const int M = static_cast(B.extent(0)); \ + const int N = static_cast(B.extent(1)); \ + \ + bool A_is_layout_left = std::is_same::value; \ + bool B_is_layout_left = std::is_same::value; \ + \ + const int AST = A_is_layout_left ? A.stride(1) : A.stride(0), LDA = (AST == 0) ? 1 : AST; \ + const int BST = B_is_layout_left ? B.stride(1) : B.stride(0), LDB = (BST == 0) ? 1 : BST; \ + \ + cublasSideMode_t side_; \ + cublasFillMode_t uplo_; \ + cublasOperation_t trans_; \ + cublasDiagType_t diag_; \ + \ + if (A_is_layout_left) { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = CUBLAS_SIDE_LEFT; \ + else \ + side_ = CUBLAS_SIDE_RIGHT; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = CUBLAS_FILL_MODE_LOWER; \ + else \ + uplo_ = CUBLAS_FILL_MODE_UPPER; \ + } else { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = CUBLAS_SIDE_RIGHT; \ + else \ + side_ = CUBLAS_SIDE_LEFT; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = CUBLAS_FILL_MODE_UPPER; \ + else \ + uplo_ = CUBLAS_FILL_MODE_LOWER; \ + } \ + \ + if ((trans[0] == 'N') || (trans[0] == 'n')) \ + trans_ = CUBLAS_OP_N; \ + else if ((trans[0] == 'T') || (trans[0] == 't')) \ + trans_ = CUBLAS_OP_T; \ + else \ + trans_ = CUBLAS_OP_C; \ + if ((diag[0] == 'U') || (diag[0] == 'u')) \ + diag_ = CUBLAS_DIAG_UNIT; \ + else \ + diag_ = CUBLAS_DIAG_NON_UNIT; \ + \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + 
KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_layout_left) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN( \ + s.handle, side_, uplo_, trans_, diag_, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, reinterpret_cast(B.data()), \ + LDB, reinterpret_cast(B.data()), LDB)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN( \ + s.handle, side_, uplo_, trans_, diag_, N, M, reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, reinterpret_cast(B.data()), \ + LDB, reinterpret_cast(B.data()), LDB)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define KOKKOSBLAS3_DTRMM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_TRMM_CUBLAS(double, double, cublasDtrmm, LAYOUTA, LAYOUTB, \ - MEM_SPACE, ETI_SPEC_AVAIL) + KOKKOSBLAS3_TRMM_CUBLAS(double, double, cublasDtrmm, LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) #define KOKKOSBLAS3_STRMM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_TRMM_CUBLAS(float, float, cublasStrmm, LAYOUTA, LAYOUTB, \ - MEM_SPACE, ETI_SPEC_AVAIL) + KOKKOSBLAS3_TRMM_CUBLAS(float, float, cublasStrmm, LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) -#define KOKKOSBLAS3_ZTRMM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_TRMM_CUBLAS(Kokkos::complex, cuDoubleComplex, \ - cublasZtrmm, LAYOUTA, LAYOUTB, MEM_SPACE, \ +#define KOKKOSBLAS3_ZTRMM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS3_TRMM_CUBLAS(Kokkos::complex, cuDoubleComplex, cublasZtrmm, LAYOUTA, LAYOUTB, MEM_SPACE, \ ETI_SPEC_AVAIL) #define KOKKOSBLAS3_CTRMM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_TRMM_CUBLAS(Kokkos::complex, cuComplex, cublasCtrmm, \ - LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) + KOKKOSBLAS3_TRMM_CUBLAS(Kokkos::complex, cuComplex, cublasCtrmm, LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) // Explicitly define the TRMM class for 
all permutations listed below -KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) -KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, false) - -KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) -KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, false) - -KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) -KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, false) - -KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutLeft, 
Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) -KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, false) - -KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) + +KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) 
+KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) + +KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) + +KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) + +KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) 
+KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp index d1836809ec..21289655de 100644 --- a/blas/tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp @@ -29,38 +29,26 @@ struct trsm_tpl_spec_avail { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUTA, LAYOUTB, \ - MEMSPACE) \ - template \ - struct trsm_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUTA, LAYOUTB, MEMSPACE) \ + template \ + struct trsm_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, 
Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) #endif @@ -68,61 +56,40 @@ KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUTA, LAYOUTB, \ - MEMSPACE) \ - template \ - struct trsm_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUTA, LAYOUTB, MEMSPACE) \ + template \ + struct trsm_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) 
-KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, - Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, - Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) 
+KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) #endif diff --git a/blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp index ec36388094..7074a4e0e2 100644 --- a/blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp @@ -23,329 +23,275 @@ namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS3_DTRSM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct TRSM< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - \ - static void trsm(const ExecSpace& /*space*/, const char side[], \ - const char uplo[], const char trans[], const char diag[], \ - typename 
BViewType::const_value_type& alpha, \ - const AViewType& A, const BViewType& B) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_BLAS,double]"); \ - const int M = static_cast(B.extent(0)); \ - const int N = static_cast(B.extent(1)); \ - \ - bool A_is_ll = std::is_same::value; \ - bool B_is_ll = std::is_same::value; \ - \ - const int AST = A_is_ll ? A.stride(1) : A.stride(0), \ - LDA = (AST == 0) ? 1 : AST; \ - const int BST = B_is_ll ? B.stride(1) : B.stride(0), \ - LDB = (BST == 0) ? 1 : BST; \ - \ - char side_; \ - char uplo_; \ - \ - if (A_is_ll) { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = 'L'; \ - else \ - side_ = 'R'; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = 'L'; \ - else \ - uplo_ = 'U'; \ - } else { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = 'R'; \ - else \ - side_ = 'L'; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = 'U'; \ - else \ - uplo_ = 'L'; \ - } \ - \ - if (A_is_ll) \ - HostBlas::trsm(side_, uplo_, trans[0], diag[0], M, N, alpha, \ - A.data(), LDA, B.data(), LDB); \ - else \ - HostBlas::trsm(side_, uplo_, trans[0], diag[0], N, M, alpha, \ - A.data(), LDA, B.data(), LDB); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS3_DTRSM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct TRSM< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + \ + static void trsm(const ExecSpace& /*space*/, const char side[], const char uplo[], const char trans[], \ + const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, \ + const BViewType& B) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_BLAS,double]"); \ + const int M = static_cast(B.extent(0)); \ + const int N = 
static_cast(B.extent(1)); \ + \ + bool A_is_ll = std::is_same::value; \ + bool B_is_ll = std::is_same::value; \ + \ + const int AST = A_is_ll ? A.stride(1) : A.stride(0), LDA = (AST == 0) ? 1 : AST; \ + const int BST = B_is_ll ? B.stride(1) : B.stride(0), LDB = (BST == 0) ? 1 : BST; \ + \ + char side_; \ + char uplo_; \ + \ + if (A_is_ll) { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = 'L'; \ + else \ + side_ = 'R'; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = 'L'; \ + else \ + uplo_ = 'U'; \ + } else { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = 'R'; \ + else \ + side_ = 'L'; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = 'U'; \ + else \ + uplo_ = 'L'; \ + } \ + \ + if (A_is_ll) \ + HostBlas::trsm(side_, uplo_, trans[0], diag[0], M, N, alpha, A.data(), LDA, B.data(), LDB); \ + else \ + HostBlas::trsm(side_, uplo_, trans[0], diag[0], N, M, alpha, A.data(), LDA, B.data(), LDB); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS3_STRSM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct TRSM< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - \ - static void trsm(const ExecSpace& /*space*/, const char side[], \ - const char uplo[], const char trans[], const char diag[], \ - typename BViewType::const_value_type& alpha, \ - const AViewType& A, const BViewType& B) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_BLAS,float]"); \ - const int M = static_cast(B.extent(0)); \ - const int N = static_cast(B.extent(1)); \ - \ - bool A_is_ll = std::is_same::value; \ - bool B_is_ll = std::is_same::value; \ - \ - const int AST = A_is_ll ? A.stride(1) : A.stride(0), \ - LDA = (AST == 0) ? 1 : AST; \ - const int BST = B_is_ll ? 
B.stride(1) : B.stride(0), \ - LDB = (BST == 0) ? 1 : BST; \ - \ - char side_; \ - char uplo_; \ - \ - if (A_is_ll) { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = 'L'; \ - else \ - side_ = 'R'; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = 'L'; \ - else \ - uplo_ = 'U'; \ - } else { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = 'R'; \ - else \ - side_ = 'L'; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = 'U'; \ - else \ - uplo_ = 'L'; \ - } \ - \ - if (A_is_ll) \ - HostBlas::trsm(side_, uplo_, trans[0], diag[0], M, N, alpha, \ - A.data(), LDA, B.data(), LDB); \ - else \ - HostBlas::trsm(side_, uplo_, trans[0], diag[0], N, M, alpha, \ - A.data(), LDA, B.data(), LDB); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS3_STRSM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct TRSM< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + \ + static void trsm(const ExecSpace& /*space*/, const char side[], const char uplo[], const char trans[], \ + const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, \ + const BViewType& B) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_BLAS,float]"); \ + const int M = static_cast(B.extent(0)); \ + const int N = static_cast(B.extent(1)); \ + \ + bool A_is_ll = std::is_same::value; \ + bool B_is_ll = std::is_same::value; \ + \ + const int AST = A_is_ll ? A.stride(1) : A.stride(0), LDA = (AST == 0) ? 1 : AST; \ + const int BST = B_is_ll ? B.stride(1) : B.stride(0), LDB = (BST == 0) ? 
1 : BST; \ + \ + char side_; \ + char uplo_; \ + \ + if (A_is_ll) { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = 'L'; \ + else \ + side_ = 'R'; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = 'L'; \ + else \ + uplo_ = 'U'; \ + } else { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = 'R'; \ + else \ + side_ = 'L'; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = 'U'; \ + else \ + uplo_ = 'L'; \ + } \ + \ + if (A_is_ll) \ + HostBlas::trsm(side_, uplo_, trans[0], diag[0], M, N, alpha, A.data(), LDA, B.data(), LDB); \ + else \ + HostBlas::trsm(side_, uplo_, trans[0], diag[0], N, M, alpha, A.data(), LDA, B.data(), LDB); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS3_ZTRSM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct TRSM**, LAYOUTA, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUTB, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - \ - static void trsm(const ExecSpace& /*space*/, const char side[], \ - const char uplo[], const char trans[], const char diag[], \ - typename BViewType::const_value_type& alpha, \ - const AViewType& A, const BViewType& B) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::trsm[TPL_BLAS,complex]"); \ - const int M = static_cast(B.extent(0)); \ - const int N = static_cast(B.extent(1)); \ - \ - bool A_is_ll = std::is_same::value; \ - bool B_is_ll = std::is_same::value; \ - \ - const int AST = A_is_ll ? A.stride(1) : A.stride(0), \ - LDA = (AST == 0) ? 1 : AST; \ - const int BST = B_is_ll ? B.stride(1) : B.stride(0), \ - LDB = (BST == 0) ? 
1 : BST; \ - \ - char side_; \ - char uplo_; \ - \ - if (A_is_ll) { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = 'L'; \ - else \ - side_ = 'R'; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = 'L'; \ - else \ - uplo_ = 'U'; \ - } else { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = 'R'; \ - else \ - side_ = 'L'; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = 'U'; \ - else \ - uplo_ = 'L'; \ - } \ - \ - const std::complex alpha_val = alpha; \ - if (A_is_ll) \ - HostBlas >::trsm( \ - side_, uplo_, trans[0], diag[0], M, N, alpha_val, \ - reinterpret_cast*>(A.data()), LDA, \ - reinterpret_cast*>(B.data()), LDB); \ - else \ - HostBlas >::trsm( \ - side_, uplo_, trans[0], diag[0], N, M, alpha_val, \ - reinterpret_cast*>(A.data()), LDA, \ - reinterpret_cast*>(B.data()), LDB); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS3_ZTRSM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct TRSM**, LAYOUTA, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUTB, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + \ + static void trsm(const ExecSpace& /*space*/, const char side[], const char uplo[], const char trans[], \ + const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, \ + const BViewType& B) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_BLAS,complex]"); \ + const int M = static_cast(B.extent(0)); \ + const int N = static_cast(B.extent(1)); \ + \ + bool A_is_ll = std::is_same::value; \ + bool B_is_ll = std::is_same::value; \ + \ + const int AST = A_is_ll ? A.stride(1) : A.stride(0), LDA = (AST == 0) ? 1 : AST; \ + const int BST = B_is_ll ? B.stride(1) : B.stride(0), LDB = (BST == 0) ? 
1 : BST; \ + \ + char side_; \ + char uplo_; \ + \ + if (A_is_ll) { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = 'L'; \ + else \ + side_ = 'R'; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = 'L'; \ + else \ + uplo_ = 'U'; \ + } else { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = 'R'; \ + else \ + side_ = 'L'; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = 'U'; \ + else \ + uplo_ = 'L'; \ + } \ + \ + const std::complex alpha_val = alpha; \ + if (A_is_ll) \ + HostBlas >::trsm(side_, uplo_, trans[0], diag[0], M, N, alpha_val, \ + reinterpret_cast*>(A.data()), LDA, \ + reinterpret_cast*>(B.data()), LDB); \ + else \ + HostBlas >::trsm(side_, uplo_, trans[0], diag[0], N, M, alpha_val, \ + reinterpret_cast*>(A.data()), LDA, \ + reinterpret_cast*>(B.data()), LDB); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS3_CTRSM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct TRSM**, LAYOUTA, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUTB, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - \ - static void trsm(const ExecSpace& /*space*/, const char side[], \ - const char uplo[], const char trans[], const char diag[], \ - typename BViewType::const_value_type& alpha, \ - const AViewType& A, const BViewType& B) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::trsm[TPL_BLAS,complex]"); \ - const int M = static_cast(B.extent(0)); \ - const int N = static_cast(B.extent(1)); \ - \ - bool A_is_ll = std::is_same::value; \ - bool B_is_ll = std::is_same::value; \ - \ - const int AST = A_is_ll ? A.stride(1) : A.stride(0), \ - LDA = (AST == 0) ? 1 : AST; \ - const int BST = B_is_ll ? B.stride(1) : B.stride(0), \ - LDB = (BST == 0) ? 
1 : BST; \ - \ - char side_; \ - char uplo_; \ - \ - if (A_is_ll) { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = 'L'; \ - else \ - side_ = 'R'; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = 'L'; \ - else \ - uplo_ = 'U'; \ - } else { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = 'R'; \ - else \ - side_ = 'L'; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = 'U'; \ - else \ - uplo_ = 'L'; \ - } \ - \ - const std::complex alpha_val = alpha; \ - if (A_is_ll) \ - HostBlas >::trsm( \ - side_, uplo_, trans[0], diag[0], M, N, alpha_val, \ - reinterpret_cast*>(A.data()), LDA, \ - reinterpret_cast*>(B.data()), LDB); \ - else \ - HostBlas >::trsm( \ - side_, uplo_, trans[0], diag[0], N, M, alpha_val, \ - reinterpret_cast*>(A.data()), LDA, \ - reinterpret_cast*>(B.data()), LDB); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS3_CTRSM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct TRSM**, LAYOUTA, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUTB, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + \ + static void trsm(const ExecSpace& /*space*/, const char side[], const char uplo[], const char trans[], \ + const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, \ + const BViewType& B) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_BLAS,complex]"); \ + const int M = static_cast(B.extent(0)); \ + const int N = static_cast(B.extent(1)); \ + \ + bool A_is_ll = std::is_same::value; \ + bool B_is_ll = std::is_same::value; \ + \ + const int AST = A_is_ll ? A.stride(1) : A.stride(0), LDA = (AST == 0) ? 1 : AST; \ + const int BST = B_is_ll ? B.stride(1) : B.stride(0), LDB = (BST == 0) ? 
1 : BST; \ + \ + char side_; \ + char uplo_; \ + \ + if (A_is_ll) { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = 'L'; \ + else \ + side_ = 'R'; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = 'L'; \ + else \ + uplo_ = 'U'; \ + } else { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = 'R'; \ + else \ + side_ = 'L'; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = 'U'; \ + else \ + uplo_ = 'L'; \ + } \ + \ + const std::complex alpha_val = alpha; \ + if (A_is_ll) \ + HostBlas >::trsm(side_, uplo_, trans[0], diag[0], M, N, alpha_val, \ + reinterpret_cast*>(A.data()), LDA, \ + reinterpret_cast*>(B.data()), LDB); \ + else \ + HostBlas >::trsm(side_, uplo_, trans[0], diag[0], N, M, alpha_val, \ + reinterpret_cast*>(A.data()), LDA, \ + reinterpret_cast*>(B.data()), LDB); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS3_DTRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, true) -KOKKOSBLAS3_DTRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, false) -KOKKOSBLAS3_DTRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, true) -KOKKOSBLAS3_DTRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, false) +KOKKOSBLAS3_DTRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS3_DTRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS3_DTRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS3_DTRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) -KOKKOSBLAS3_STRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, true) -KOKKOSBLAS3_STRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, false) -KOKKOSBLAS3_STRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, true) -KOKKOSBLAS3_STRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, false) 
+KOKKOSBLAS3_STRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS3_STRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS3_STRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS3_STRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) -KOKKOSBLAS3_ZTRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, true) -KOKKOSBLAS3_ZTRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, false) -KOKKOSBLAS3_ZTRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, true) -KOKKOSBLAS3_ZTRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, false) +KOKKOSBLAS3_ZTRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS3_ZTRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS3_ZTRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS3_ZTRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) -KOKKOSBLAS3_CTRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, true) -KOKKOSBLAS3_CTRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, false) -KOKKOSBLAS3_CTRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, true) -KOKKOSBLAS3_CTRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, false) +KOKKOSBLAS3_CTRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS3_CTRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS3_CTRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS3_CTRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) } // namespace Impl } // namespace KokkosBlas @@ -358,450 +304,370 @@ KOKKOSBLAS3_CTRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, namespace KokkosBlas { namespace Impl 
{ -#define KOKKOSBLAS3_DTRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct TRSM< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - \ - static void trsm(const ExecSpace& space, const char side[], \ - const char uplo[], const char trans[], const char diag[], \ - typename BViewType::const_value_type& alpha, \ - const AViewType& A, const BViewType& B) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_CUBLAS,double]"); \ - const int M = static_cast(B.extent(0)); \ - const int N = static_cast(B.extent(1)); \ - \ - bool A_is_ll = std::is_same::value; \ - bool B_is_ll = std::is_same::value; \ - \ - const int AST = A_is_ll ? A.stride(1) : A.stride(0), \ - LDA = (AST == 0) ? 1 : AST; \ - const int BST = B_is_ll ? B.stride(1) : B.stride(0), \ - LDB = (BST == 0) ? 
1 : BST; \ - \ - cublasSideMode_t side_; \ - cublasFillMode_t uplo_; \ - cublasOperation_t trans_; \ - cublasDiagType_t diag_; \ - \ - if (A_is_ll) { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = CUBLAS_SIDE_LEFT; \ - else \ - side_ = CUBLAS_SIDE_RIGHT; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = CUBLAS_FILL_MODE_LOWER; \ - else \ - uplo_ = CUBLAS_FILL_MODE_UPPER; \ - } else { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = CUBLAS_SIDE_RIGHT; \ - else \ - side_ = CUBLAS_SIDE_LEFT; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = CUBLAS_FILL_MODE_UPPER; \ - else \ - uplo_ = CUBLAS_FILL_MODE_LOWER; \ - } \ - \ - if ((trans[0] == 'N') || (trans[0] == 'n')) \ - trans_ = CUBLAS_OP_N; \ - else if ((trans[0] == 'T') || (trans[0] == 't')) \ - trans_ = CUBLAS_OP_T; \ - else \ - trans_ = CUBLAS_OP_C; \ - if ((diag[0] == 'U') || (diag[0] == 'u')) \ - diag_ = CUBLAS_DIAG_UNIT; \ - else \ - diag_ = CUBLAS_DIAG_NON_UNIT; \ - \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasDtrsm(s.handle, side_, uplo_, trans_, diag_, M, N, &alpha, \ - A.data(), LDA, B.data(), LDB)); \ - } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasDtrsm(s.handle, side_, uplo_, trans_, diag_, N, M, &alpha, \ - A.data(), LDA, B.data(), LDB)); \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS3_DTRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct TRSM< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + \ + 
static void trsm(const ExecSpace& space, const char side[], const char uplo[], const char trans[], \ + const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, \ + const BViewType& B) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_CUBLAS,double]"); \ + const int M = static_cast(B.extent(0)); \ + const int N = static_cast(B.extent(1)); \ + \ + bool A_is_ll = std::is_same::value; \ + bool B_is_ll = std::is_same::value; \ + \ + const int AST = A_is_ll ? A.stride(1) : A.stride(0), LDA = (AST == 0) ? 1 : AST; \ + const int BST = B_is_ll ? B.stride(1) : B.stride(0), LDB = (BST == 0) ? 1 : BST; \ + \ + cublasSideMode_t side_; \ + cublasFillMode_t uplo_; \ + cublasOperation_t trans_; \ + cublasDiagType_t diag_; \ + \ + if (A_is_ll) { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = CUBLAS_SIDE_LEFT; \ + else \ + side_ = CUBLAS_SIDE_RIGHT; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = CUBLAS_FILL_MODE_LOWER; \ + else \ + uplo_ = CUBLAS_FILL_MODE_UPPER; \ + } else { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = CUBLAS_SIDE_RIGHT; \ + else \ + side_ = CUBLAS_SIDE_LEFT; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = CUBLAS_FILL_MODE_UPPER; \ + else \ + uplo_ = CUBLAS_FILL_MODE_LOWER; \ + } \ + \ + if ((trans[0] == 'N') || (trans[0] == 'n')) \ + trans_ = CUBLAS_OP_N; \ + else if ((trans[0] == 'T') || (trans[0] == 't')) \ + trans_ = CUBLAS_OP_T; \ + else \ + trans_ = CUBLAS_OP_C; \ + if ((diag[0] == 'U') || (diag[0] == 'u')) \ + diag_ = CUBLAS_DIAG_UNIT; \ + else \ + diag_ = CUBLAS_DIAG_NON_UNIT; \ + \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasDtrsm(s.handle, side_, uplo_, trans_, diag_, M, N, &alpha, A.data(), LDA, B.data(), LDB)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasDtrsm(s.handle, 
side_, uplo_, trans_, diag_, N, M, &alpha, A.data(), LDA, B.data(), LDB)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS3_STRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct TRSM< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - \ - static void trsm(const ExecSpace& space, const char side[], \ - const char uplo[], const char trans[], const char diag[], \ - typename BViewType::const_value_type& alpha, \ - const AViewType& A, const BViewType& B) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_CUBLAS,float]"); \ - const int M = static_cast(B.extent(0)); \ - const int N = static_cast(B.extent(1)); \ - \ - bool A_is_ll = std::is_same::value; \ - bool B_is_ll = std::is_same::value; \ - \ - const int AST = A_is_ll ? A.stride(1) : A.stride(0), \ - LDA = (AST == 0) ? 1 : AST; \ - const int BST = B_is_ll ? B.stride(1) : B.stride(0), \ - LDB = (BST == 0) ? 
1 : BST; \ - \ - cublasSideMode_t side_; \ - cublasFillMode_t uplo_; \ - cublasOperation_t trans_; \ - cublasDiagType_t diag_; \ - \ - if (A_is_ll) { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = CUBLAS_SIDE_LEFT; \ - else \ - side_ = CUBLAS_SIDE_RIGHT; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = CUBLAS_FILL_MODE_LOWER; \ - else \ - uplo_ = CUBLAS_FILL_MODE_UPPER; \ - } else { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = CUBLAS_SIDE_RIGHT; \ - else \ - side_ = CUBLAS_SIDE_LEFT; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = CUBLAS_FILL_MODE_UPPER; \ - else \ - uplo_ = CUBLAS_FILL_MODE_LOWER; \ - } \ - \ - if ((trans[0] == 'N') || (trans[0] == 'n')) \ - trans_ = CUBLAS_OP_N; \ - else if ((trans[0] == 'T') || (trans[0] == 't')) \ - trans_ = CUBLAS_OP_T; \ - else \ - trans_ = CUBLAS_OP_C; \ - if ((diag[0] == 'U') || (diag[0] == 'u')) \ - diag_ = CUBLAS_DIAG_UNIT; \ - else \ - diag_ = CUBLAS_DIAG_NON_UNIT; \ - \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasStrsm(s.handle, side_, uplo_, trans_, diag_, M, N, &alpha, \ - A.data(), LDA, B.data(), LDB)); \ - } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasStrsm(s.handle, side_, uplo_, trans_, diag_, N, M, &alpha, \ - A.data(), LDA, B.data(), LDB)); \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS3_STRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct TRSM< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + \ + 
static void trsm(const ExecSpace& space, const char side[], const char uplo[], const char trans[], \ + const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, \ + const BViewType& B) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_CUBLAS,float]"); \ + const int M = static_cast(B.extent(0)); \ + const int N = static_cast(B.extent(1)); \ + \ + bool A_is_ll = std::is_same::value; \ + bool B_is_ll = std::is_same::value; \ + \ + const int AST = A_is_ll ? A.stride(1) : A.stride(0), LDA = (AST == 0) ? 1 : AST; \ + const int BST = B_is_ll ? B.stride(1) : B.stride(0), LDB = (BST == 0) ? 1 : BST; \ + \ + cublasSideMode_t side_; \ + cublasFillMode_t uplo_; \ + cublasOperation_t trans_; \ + cublasDiagType_t diag_; \ + \ + if (A_is_ll) { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = CUBLAS_SIDE_LEFT; \ + else \ + side_ = CUBLAS_SIDE_RIGHT; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = CUBLAS_FILL_MODE_LOWER; \ + else \ + uplo_ = CUBLAS_FILL_MODE_UPPER; \ + } else { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = CUBLAS_SIDE_RIGHT; \ + else \ + side_ = CUBLAS_SIDE_LEFT; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = CUBLAS_FILL_MODE_UPPER; \ + else \ + uplo_ = CUBLAS_FILL_MODE_LOWER; \ + } \ + \ + if ((trans[0] == 'N') || (trans[0] == 'n')) \ + trans_ = CUBLAS_OP_N; \ + else if ((trans[0] == 'T') || (trans[0] == 't')) \ + trans_ = CUBLAS_OP_T; \ + else \ + trans_ = CUBLAS_OP_C; \ + if ((diag[0] == 'U') || (diag[0] == 'u')) \ + diag_ = CUBLAS_DIAG_UNIT; \ + else \ + diag_ = CUBLAS_DIAG_NON_UNIT; \ + \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasStrsm(s.handle, side_, uplo_, trans_, diag_, M, N, &alpha, A.data(), LDA, B.data(), LDB)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasStrsm(s.handle, 
side_, uplo_, trans_, diag_, N, M, &alpha, A.data(), LDA, B.data(), LDB)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS3_ZTRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct TRSM**, LAYOUTA, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUTB, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - \ - static void trsm(const ExecSpace& space, const char side[], \ - const char uplo[], const char trans[], const char diag[], \ - typename BViewType::const_value_type& alpha, \ - const AViewType& A, const BViewType& B) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::trsm[TPL_CUBLAS,complex]"); \ - const int M = static_cast(B.extent(0)); \ - const int N = static_cast(B.extent(1)); \ - \ - bool A_is_ll = std::is_same::value; \ - bool B_is_ll = std::is_same::value; \ - \ - const int AST = A_is_ll ? A.stride(1) : A.stride(0), \ - LDA = (AST == 0) ? 1 : AST; \ - const int BST = B_is_ll ? B.stride(1) : B.stride(0), \ - LDB = (BST == 0) ? 
1 : BST; \ - \ - cublasSideMode_t side_; \ - cublasFillMode_t uplo_; \ - cublasOperation_t trans_; \ - cublasDiagType_t diag_; \ - \ - if (A_is_ll) { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = CUBLAS_SIDE_LEFT; \ - else \ - side_ = CUBLAS_SIDE_RIGHT; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = CUBLAS_FILL_MODE_LOWER; \ - else \ - uplo_ = CUBLAS_FILL_MODE_UPPER; \ - } else { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = CUBLAS_SIDE_RIGHT; \ - else \ - side_ = CUBLAS_SIDE_LEFT; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = CUBLAS_FILL_MODE_UPPER; \ - else \ - uplo_ = CUBLAS_FILL_MODE_LOWER; \ - } \ - \ - if ((trans[0] == 'N') || (trans[0] == 'n')) \ - trans_ = CUBLAS_OP_N; \ - else if ((trans[0] == 'T') || (trans[0] == 't')) \ - trans_ = CUBLAS_OP_T; \ - else \ - trans_ = CUBLAS_OP_C; \ - if ((diag[0] == 'U') || (diag[0] == 'u')) \ - diag_ = CUBLAS_DIAG_UNIT; \ - else \ - diag_ = CUBLAS_DIAG_NON_UNIT; \ - \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZtrsm( \ - s.handle, side_, uplo_, trans_, diag_, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB)); \ - } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZtrsm( \ - s.handle, side_, uplo_, trans_, diag_, N, M, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB)); \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS3_ZTRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct TRSM**, LAYOUTA, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUTB, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef 
Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + \ + static void trsm(const ExecSpace& space, const char side[], const char uplo[], const char trans[], \ + const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, \ + const BViewType& B) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_CUBLAS,complex]"); \ + const int M = static_cast(B.extent(0)); \ + const int N = static_cast(B.extent(1)); \ + \ + bool A_is_ll = std::is_same::value; \ + bool B_is_ll = std::is_same::value; \ + \ + const int AST = A_is_ll ? A.stride(1) : A.stride(0), LDA = (AST == 0) ? 1 : AST; \ + const int BST = B_is_ll ? B.stride(1) : B.stride(0), LDB = (BST == 0) ? 1 : BST; \ + \ + cublasSideMode_t side_; \ + cublasFillMode_t uplo_; \ + cublasOperation_t trans_; \ + cublasDiagType_t diag_; \ + \ + if (A_is_ll) { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = CUBLAS_SIDE_LEFT; \ + else \ + side_ = CUBLAS_SIDE_RIGHT; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = CUBLAS_FILL_MODE_LOWER; \ + else \ + uplo_ = CUBLAS_FILL_MODE_UPPER; \ + } else { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = CUBLAS_SIDE_RIGHT; \ + else \ + side_ = CUBLAS_SIDE_LEFT; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = CUBLAS_FILL_MODE_UPPER; \ + else \ + uplo_ = CUBLAS_FILL_MODE_LOWER; \ + } \ + \ + if ((trans[0] == 'N') || (trans[0] == 'n')) \ + trans_ = CUBLAS_OP_N; \ + else if ((trans[0] == 'T') || (trans[0] == 't')) \ + trans_ = CUBLAS_OP_T; \ + else \ + trans_ = CUBLAS_OP_C; \ + if ((diag[0] == 'U') || (diag[0] == 'u')) \ + diag_ = CUBLAS_DIAG_UNIT; \ + else \ + diag_ = CUBLAS_DIAG_NON_UNIT; \ + \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + 
KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZtrsm(s.handle, side_, uplo_, trans_, diag_, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(B.data()), LDB)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZtrsm(s.handle, side_, uplo_, trans_, diag_, N, M, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(B.data()), LDB)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS3_CTRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct TRSM**, LAYOUTA, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUTB, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - \ - static void trsm(const ExecSpace& space, const char side[], \ - const char uplo[], const char trans[], const char diag[], \ - typename BViewType::const_value_type& alpha, \ - const AViewType& A, const BViewType& B) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::trsm[TPL_CUBLAS,complex]"); \ - const int M = static_cast(B.extent(0)); \ - const int N = static_cast(B.extent(1)); \ - \ - bool A_is_ll = std::is_same::value; \ - bool B_is_ll = std::is_same::value; \ - \ - const int AST = A_is_ll ? A.stride(1) : A.stride(0), \ - LDA = (AST == 0) ? 1 : AST; \ - const int BST = B_is_ll ? B.stride(1) : B.stride(0), \ - LDB = (BST == 0) ? 
1 : BST; \ - \ - cublasSideMode_t side_; \ - cublasFillMode_t uplo_; \ - cublasOperation_t trans_; \ - cublasDiagType_t diag_; \ - \ - if (A_is_ll) { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = CUBLAS_SIDE_LEFT; \ - else \ - side_ = CUBLAS_SIDE_RIGHT; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = CUBLAS_FILL_MODE_LOWER; \ - else \ - uplo_ = CUBLAS_FILL_MODE_UPPER; \ - } else { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = CUBLAS_SIDE_RIGHT; \ - else \ - side_ = CUBLAS_SIDE_LEFT; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = CUBLAS_FILL_MODE_UPPER; \ - else \ - uplo_ = CUBLAS_FILL_MODE_LOWER; \ - } \ - \ - if ((trans[0] == 'N') || (trans[0] == 'n')) \ - trans_ = CUBLAS_OP_N; \ - else if ((trans[0] == 'T') || (trans[0] == 't')) \ - trans_ = CUBLAS_OP_T; \ - else \ - trans_ = CUBLAS_OP_C; \ - if ((diag[0] == 'U') || (diag[0] == 'u')) \ - diag_ = CUBLAS_DIAG_UNIT; \ - else \ - diag_ = CUBLAS_DIAG_NON_UNIT; \ - \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasCtrsm(s.handle, side_, uplo_, trans_, diag_, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB)); \ - } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasCtrsm(s.handle, side_, uplo_, trans_, diag_, N, M, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB)); \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS3_CTRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct TRSM**, LAYOUTA, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUTB, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef 
Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + \ + static void trsm(const ExecSpace& space, const char side[], const char uplo[], const char trans[], \ + const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, \ + const BViewType& B) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_CUBLAS,complex]"); \ + const int M = static_cast(B.extent(0)); \ + const int N = static_cast(B.extent(1)); \ + \ + bool A_is_ll = std::is_same::value; \ + bool B_is_ll = std::is_same::value; \ + \ + const int AST = A_is_ll ? A.stride(1) : A.stride(0), LDA = (AST == 0) ? 1 : AST; \ + const int BST = B_is_ll ? B.stride(1) : B.stride(0), LDB = (BST == 0) ? 1 : BST; \ + \ + cublasSideMode_t side_; \ + cublasFillMode_t uplo_; \ + cublasOperation_t trans_; \ + cublasDiagType_t diag_; \ + \ + if (A_is_ll) { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = CUBLAS_SIDE_LEFT; \ + else \ + side_ = CUBLAS_SIDE_RIGHT; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = CUBLAS_FILL_MODE_LOWER; \ + else \ + uplo_ = CUBLAS_FILL_MODE_UPPER; \ + } else { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = CUBLAS_SIDE_RIGHT; \ + else \ + side_ = CUBLAS_SIDE_LEFT; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = CUBLAS_FILL_MODE_UPPER; \ + else \ + uplo_ = CUBLAS_FILL_MODE_LOWER; \ + } \ + \ + if ((trans[0] == 'N') || (trans[0] == 'n')) \ + trans_ = CUBLAS_OP_N; \ + else if ((trans[0] == 'T') || (trans[0] == 't')) \ + trans_ = CUBLAS_OP_T; \ + else \ + trans_ = CUBLAS_OP_C; \ + if ((diag[0] == 'U') || (diag[0] == 'u')) \ + diag_ = CUBLAS_DIAG_UNIT; \ + else \ + diag_ = CUBLAS_DIAG_NON_UNIT; \ + \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + 
KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCtrsm( \ + s.handle, side_, uplo_, trans_, diag_, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, reinterpret_cast(B.data()), LDB)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCtrsm( \ + s.handle, side_, uplo_, trans_, diag_, N, M, reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, reinterpret_cast(B.data()), LDB)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) -KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, false) +KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) -KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutRight, 
Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) -KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, false) +KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) -KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) -KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, false) +KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutLeft, 
Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) -KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) -KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, false) +KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) -KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutLeft, 
Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas_Cuda_tpl.hpp b/blas/tpls/KokkosBlas_Cuda_tpl.hpp index d85785316e..d80e3a23d8 100644 --- a/blas/tpls/KokkosBlas_Cuda_tpl.hpp +++ b/blas/tpls/KokkosBlas_Cuda_tpl.hpp @@ -24,8 +24,7 @@ namespace Impl { CudaBlasSingleton::CudaBlasSingleton() { cublasStatus_t stat = cublasCreate(&handle); - if (stat != CUBLAS_STATUS_SUCCESS) - Kokkos::abort("CUBLAS initialization failed\n"); + if (stat != CUBLAS_STATUS_SUCCESS) Kokkos::abort("CUBLAS initialization failed\n"); Kokkos::push_finalize_hook([&]() { cublasDestroy(handle); }); } diff --git a/blas/tpls/KokkosBlas_Host_tpl.cpp b/blas/tpls/KokkosBlas_Host_tpl.cpp index dc04ca7e67..6989aea34d 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.cpp +++ b/blas/tpls/KokkosBlas_Host_tpl.cpp @@ -34,63 +34,41 @@ void F77_BLAS_MANGLE(sscal, SSCAL)(const KK_INT* N, const float* alpha, /* */ float* x, const KK_INT* x_inc); void F77_BLAS_MANGLE(dscal, DSCAL)(const KK_INT* N, const double* alpha, /* */ double* x, const KK_INT* x_inc); -void F77_BLAS_MANGLE(cscal, - CSCAL)(const KK_INT* N, const std::complex* alpha, - /* */ std::complex* x, const KK_INT* x_inc); -void F77_BLAS_MANGLE(zscal, - ZSCAL)(const KK_INT* N, const std::complex* alpha, - /* */ std::complex* x, const KK_INT* x_inc); +void F77_BLAS_MANGLE(cscal, CSCAL)(const KK_INT* N, const 
std::complex* alpha, + /* */ std::complex* x, const KK_INT* x_inc); +void F77_BLAS_MANGLE(zscal, ZSCAL)(const KK_INT* N, const std::complex* alpha, + /* */ std::complex* x, const KK_INT* x_inc); /// /// max /// -KK_INT F77_BLAS_MANGLE(isamax, ISAMAX)(const KK_INT* N, const float* x, - const KK_INT* x_inc); -KK_INT F77_BLAS_MANGLE(idamax, IDAMAX)(const KK_INT* N, const double* x, - const KK_INT* x_inc); -KK_INT F77_BLAS_MANGLE(icamax, ICAMAX)(const KK_INT* N, - const std::complex* x, - const KK_INT* x_inc); -KK_INT F77_BLAS_MANGLE(izamax, IZAMAX)(const KK_INT* N, - const std::complex* x, - const KK_INT* x_inc); +KK_INT F77_BLAS_MANGLE(isamax, ISAMAX)(const KK_INT* N, const float* x, const KK_INT* x_inc); +KK_INT F77_BLAS_MANGLE(idamax, IDAMAX)(const KK_INT* N, const double* x, const KK_INT* x_inc); +KK_INT F77_BLAS_MANGLE(icamax, ICAMAX)(const KK_INT* N, const std::complex* x, const KK_INT* x_inc); +KK_INT F77_BLAS_MANGLE(izamax, IZAMAX)(const KK_INT* N, const std::complex* x, const KK_INT* x_inc); /// /// nrm2 /// -float F77_BLAS_MANGLE(snrm2, SNRM2)(const KK_INT* N, const float* x, - const KK_INT* x_inc); -double F77_BLAS_MANGLE(dnrm2, DNRM2)(const KK_INT* N, const double* x, - const KK_INT* x_inc); -float F77_BLAS_MANGLE(scnrm2, SCNRM2)(const KK_INT* N, - const std::complex* x, - const KK_INT* x_inc); -double F77_BLAS_MANGLE(dznrm2, DZNRM2)(const KK_INT* N, - const std::complex* x, - const KK_INT* x_inc); +float F77_BLAS_MANGLE(snrm2, SNRM2)(const KK_INT* N, const float* x, const KK_INT* x_inc); +double F77_BLAS_MANGLE(dnrm2, DNRM2)(const KK_INT* N, const double* x, const KK_INT* x_inc); +float F77_BLAS_MANGLE(scnrm2, SCNRM2)(const KK_INT* N, const std::complex* x, const KK_INT* x_inc); +double F77_BLAS_MANGLE(dznrm2, DZNRM2)(const KK_INT* N, const std::complex* x, const KK_INT* x_inc); /// /// sum /// -float F77_BLAS_MANGLE(sasum, SASUM)(const KK_INT* N, const float* x, - const KK_INT* x_inc); -double F77_BLAS_MANGLE(dasum, DASUM)(const KK_INT* N, const double* 
x, - const KK_INT* x_inc); -float F77_BLAS_MANGLE(scasum, SCASUM)(const KK_INT* N, - const std::complex* x, - const KK_INT* x_inc); -double F77_BLAS_MANGLE(dzasum, DZASUM)(const KK_INT* N, - const std::complex* x, - const KK_INT* x_inc); +float F77_BLAS_MANGLE(sasum, SASUM)(const KK_INT* N, const float* x, const KK_INT* x_inc); +double F77_BLAS_MANGLE(dasum, DASUM)(const KK_INT* N, const double* x, const KK_INT* x_inc); +float F77_BLAS_MANGLE(scasum, SCASUM)(const KK_INT* N, const std::complex* x, const KK_INT* x_inc); +double F77_BLAS_MANGLE(dzasum, DZASUM)(const KK_INT* N, const std::complex* x, const KK_INT* x_inc); /// /// dot /// -float F77_BLAS_MANGLE(sdot, SDOT)(const KK_INT* N, const float* x, - const KK_INT* x_inc, const float* y, +float F77_BLAS_MANGLE(sdot, SDOT)(const KK_INT* N, const float* x, const KK_INT* x_inc, const float* y, const KK_INT* y_inc); -double F77_BLAS_MANGLE(ddot, DDOT)(const KK_INT* N, const double* x, - const KK_INT* x_inc, const double* y, +double F77_BLAS_MANGLE(ddot, DDOT)(const KK_INT* N, const double* x, const KK_INT* x_inc, const double* y, const KK_INT* y_inc); #if defined(KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX) // clang-format off @@ -106,77 +84,49 @@ typedef struct { double vals[2]; } _kk_double2; -_kk_float2 F77_BLAS_MANGLE(cdotu, CDOTU)(const KK_INT* N, - const std::complex* x, - const KK_INT* x_inc, - const std::complex* y, - const KK_INT* y_inc); -_kk_double2 F77_BLAS_MANGLE(zdotu, ZDOTU)(const KK_INT* N, - const std::complex* x, - const KK_INT* x_inc, - const std::complex* y, - const KK_INT* y_inc); -_kk_float2 F77_BLAS_MANGLE(cdotc, CDOTC)(const KK_INT* N, - const std::complex* x, - const KK_INT* x_inc, - const std::complex* y, - const KK_INT* y_inc); -_kk_double2 F77_BLAS_MANGLE(zdotc, ZDOTC)(const KK_INT* N, - const std::complex* x, - const KK_INT* x_inc, - const std::complex* y, - const KK_INT* y_inc); +_kk_float2 F77_BLAS_MANGLE(cdotu, CDOTU)(const KK_INT* N, const std::complex* x, const KK_INT* x_inc, + const 
std::complex* y, const KK_INT* y_inc); +_kk_double2 F77_BLAS_MANGLE(zdotu, ZDOTU)(const KK_INT* N, const std::complex* x, const KK_INT* x_inc, + const std::complex* y, const KK_INT* y_inc); +_kk_float2 F77_BLAS_MANGLE(cdotc, CDOTC)(const KK_INT* N, const std::complex* x, const KK_INT* x_inc, + const std::complex* y, const KK_INT* y_inc); +_kk_double2 F77_BLAS_MANGLE(zdotc, ZDOTC)(const KK_INT* N, const std::complex* x, const KK_INT* x_inc, + const std::complex* y, const KK_INT* y_inc); #else -void F77_BLAS_MANGLE(cdotu, - CDOTU)(std::complex* res, const KK_INT* N, - const std::complex* x, const KK_INT* x_inc, - const std::complex* y, const KK_INT* y_inc); -void F77_BLAS_MANGLE(zdotu, - ZDOTU)(std::complex* res, const KK_INT* N, - const std::complex* x, const KK_INT* x_inc, - const std::complex* y, const KK_INT* y_inc); -void F77_BLAS_MANGLE(cdotc, - CDOTC)(std::complex* res, const KK_INT* N, - const std::complex* x, const KK_INT* x_inc, - const std::complex* y, const KK_INT* y_inc); -void F77_BLAS_MANGLE(zdotc, - ZDOTC)(std::complex* res, const KK_INT* N, - const std::complex* x, const KK_INT* x_inc, - const std::complex* y, const KK_INT* y_inc); +void F77_BLAS_MANGLE(cdotu, CDOTU)(std::complex* res, const KK_INT* N, const std::complex* x, + const KK_INT* x_inc, const std::complex* y, const KK_INT* y_inc); +void F77_BLAS_MANGLE(zdotu, ZDOTU)(std::complex* res, const KK_INT* N, const std::complex* x, + const KK_INT* x_inc, const std::complex* y, const KK_INT* y_inc); +void F77_BLAS_MANGLE(cdotc, CDOTC)(std::complex* res, const KK_INT* N, const std::complex* x, + const KK_INT* x_inc, const std::complex* y, const KK_INT* y_inc); +void F77_BLAS_MANGLE(zdotc, ZDOTC)(std::complex* res, const KK_INT* N, const std::complex* x, + const KK_INT* x_inc, const std::complex* y, const KK_INT* y_inc); #endif /// /// axpy /// -void F77_BLAS_MANGLE(saxpy, SAXPY)(const KK_INT* N, const float* alpha, - const float* x, const KK_INT* x_inc, +void F77_BLAS_MANGLE(saxpy, SAXPY)(const 
KK_INT* N, const float* alpha, const float* x, const KK_INT* x_inc, /* */ float* y, const KK_INT* y_inc); -void F77_BLAS_MANGLE(daxpy, DAXPY)(const KK_INT* N, const double* alpha, - const double* x, const KK_INT* x_inc, +void F77_BLAS_MANGLE(daxpy, DAXPY)(const KK_INT* N, const double* alpha, const double* x, const KK_INT* x_inc, /* */ double* y, const KK_INT* y_inc); -void F77_BLAS_MANGLE(caxpy, - CAXPY)(const KK_INT* N, const std::complex* alpha, - const std::complex* x, const KK_INT* x_inc, - /* */ std::complex* y, const KK_INT* y_inc); -void F77_BLAS_MANGLE(zaxpy, - ZAXPY)(const KK_INT* N, const std::complex* alpha, - const std::complex* x, const KK_INT* x_inc, - /* */ std::complex* y, const KK_INT* y_inc); +void F77_BLAS_MANGLE(caxpy, CAXPY)(const KK_INT* N, const std::complex* alpha, const std::complex* x, + const KK_INT* x_inc, + /* */ std::complex* y, const KK_INT* y_inc); +void F77_BLAS_MANGLE(zaxpy, ZAXPY)(const KK_INT* N, const std::complex* alpha, const std::complex* x, + const KK_INT* x_inc, + /* */ std::complex* y, const KK_INT* y_inc); /// /// rot /// -void F77_BLAS_MANGLE(srot, SROT)(KK_INT const* N, float* X, KK_INT const* incx, - float* Y, KK_INT const* incy, float* c, +void F77_BLAS_MANGLE(srot, SROT)(KK_INT const* N, float* X, KK_INT const* incx, float* Y, KK_INT const* incy, float* c, float* s); -void F77_BLAS_MANGLE(drot, DROT)(KK_INT const* N, double* X, KK_INT const* incx, - double* Y, KK_INT const* incy, double* c, - double* s); -void F77_BLAS_MANGLE(crot, CROT)(KK_INT const* N, std::complex* X, - KK_INT const* incx, std::complex* Y, +void F77_BLAS_MANGLE(drot, DROT)(KK_INT const* N, double* X, KK_INT const* incx, double* Y, KK_INT const* incy, + double* c, double* s); +void F77_BLAS_MANGLE(crot, CROT)(KK_INT const* N, std::complex* X, KK_INT const* incx, std::complex* Y, KK_INT const* incy, float* c, float* s); -void F77_BLAS_MANGLE(zrot, ZROT)(KK_INT const* N, std::complex* X, - KK_INT const* incx, std::complex* Y, +void 
F77_BLAS_MANGLE(zrot, ZROT)(KK_INT const* N, std::complex* X, KK_INT const* incx, std::complex* Y, KK_INT const* incy, double* c, double* s); /// @@ -184,106 +134,73 @@ void F77_BLAS_MANGLE(zrot, ZROT)(KK_INT const* N, std::complex* X, /// void F77_BLAS_MANGLE(srotg, SROTG)(float* a, float* b, float* c, float* s); void F77_BLAS_MANGLE(drotg, DROTG)(double* a, double* b, double* c, double* s); -void F77_BLAS_MANGLE(crotg, CROTG)(std::complex* a, - std::complex* b, float* c, - std::complex* s); -void F77_BLAS_MANGLE(zrotg, ZROTG)(std::complex* a, - std::complex* b, double* c, +void F77_BLAS_MANGLE(crotg, CROTG)(std::complex* a, std::complex* b, float* c, std::complex* s); +void F77_BLAS_MANGLE(zrotg, ZROTG)(std::complex* a, std::complex* b, double* c, std::complex* s); /// /// rotm /// -void F77_BLAS_MANGLE(srotm, SROTM)(const KK_INT* n, float* X, - const KK_INT* incx, float* Y, - const KK_INT* incy, float const* param); -void F77_BLAS_MANGLE(drotm, DROTM)(const KK_INT* n, double* X, - const KK_INT* incx, double* Y, - const KK_INT* incy, double const* param); +void F77_BLAS_MANGLE(srotm, SROTM)(const KK_INT* n, float* X, const KK_INT* incx, float* Y, const KK_INT* incy, + float const* param); +void F77_BLAS_MANGLE(drotm, DROTM)(const KK_INT* n, double* X, const KK_INT* incx, double* Y, const KK_INT* incy, + double const* param); /// /// rotmg /// -void F77_BLAS_MANGLE(srotmg, SROTMG)(float* d1, float* d2, float* x1, - const float* y1, float* param); -void F77_BLAS_MANGLE(drotmg, DROTMG)(double* d1, double* d2, double* x1, - const double* y1, double* param); +void F77_BLAS_MANGLE(srotmg, SROTMG)(float* d1, float* d2, float* x1, const float* y1, float* param); +void F77_BLAS_MANGLE(drotmg, DROTMG)(double* d1, double* d2, double* x1, const double* y1, double* param); /// /// swap /// -void F77_BLAS_MANGLE(sswap, SSWAP)(KK_INT const* N, float* X, - KK_INT const* incx, float* Y, - KK_INT const* incy); -void F77_BLAS_MANGLE(dswap, DSWAP)(KK_INT const* N, double* X, - 
KK_INT const* incx, double* Y, - KK_INT const* incy); -void F77_BLAS_MANGLE(cswap, CSWAP)(KK_INT const* N, std::complex* X, - KK_INT const* incx, std::complex* Y, - KK_INT const* incy); -void F77_BLAS_MANGLE(zswap, ZSWAP)(KK_INT const* N, std::complex* X, - KK_INT const* incx, std::complex* Y, +void F77_BLAS_MANGLE(sswap, SSWAP)(KK_INT const* N, float* X, KK_INT const* incx, float* Y, KK_INT const* incy); +void F77_BLAS_MANGLE(dswap, DSWAP)(KK_INT const* N, double* X, KK_INT const* incx, double* Y, KK_INT const* incy); +void F77_BLAS_MANGLE(cswap, CSWAP)(KK_INT const* N, std::complex* X, KK_INT const* incx, std::complex* Y, KK_INT const* incy); +void F77_BLAS_MANGLE(zswap, ZSWAP)(KK_INT const* N, std::complex* X, KK_INT const* incx, + std::complex* Y, KK_INT const* incy); /// /// Gemv /// -void F77_BLAS_MANGLE(sgemv, SGEMV)(const char*, KK_INT*, KK_INT*, const float*, - const float*, KK_INT*, const float*, KK_INT*, - const float*, +void F77_BLAS_MANGLE(sgemv, SGEMV)(const char*, KK_INT*, KK_INT*, const float*, const float*, KK_INT*, const float*, + KK_INT*, const float*, /* */ float*, KK_INT*); -void F77_BLAS_MANGLE(dgemv, DGEMV)(const char*, KK_INT*, KK_INT*, const double*, - const double*, KK_INT*, const double*, +void F77_BLAS_MANGLE(dgemv, DGEMV)(const char*, KK_INT*, KK_INT*, const double*, const double*, KK_INT*, const double*, KK_INT*, const double*, /* */ double*, KK_INT*); -void F77_BLAS_MANGLE(cgemv, CGEMV)(const char*, KK_INT*, KK_INT*, - const std::complex*, - const std::complex*, KK_INT*, - const std::complex*, KK_INT*, +void F77_BLAS_MANGLE(cgemv, CGEMV)(const char*, KK_INT*, KK_INT*, const std::complex*, + const std::complex*, KK_INT*, const std::complex*, KK_INT*, const std::complex*, /* */ std::complex*, KK_INT*); -void F77_BLAS_MANGLE(zgemv, ZGEMV)(const char*, KK_INT*, KK_INT*, - const std::complex*, - const std::complex*, KK_INT*, - const std::complex*, KK_INT*, +void F77_BLAS_MANGLE(zgemv, ZGEMV)(const char*, KK_INT*, KK_INT*, const 
std::complex*, + const std::complex*, KK_INT*, const std::complex*, KK_INT*, const std::complex*, /* */ std::complex*, KK_INT*); /// /// Ger /// -void F77_BLAS_MANGLE(sger, SGER)(KK_INT*, KK_INT*, const float*, const float*, - KK_INT*, const float*, KK_INT*, float*, - KK_INT*); -void F77_BLAS_MANGLE(dger, DGER)(KK_INT*, KK_INT*, const double*, const double*, - KK_INT*, const double*, KK_INT*, double*, +void F77_BLAS_MANGLE(sger, SGER)(KK_INT*, KK_INT*, const float*, const float*, KK_INT*, const float*, KK_INT*, float*, KK_INT*); -void F77_BLAS_MANGLE(cgeru, CGERU)(KK_INT*, KK_INT*, const std::complex*, - const std::complex*, KK_INT*, - const std::complex*, KK_INT*, - std::complex*, KK_INT*); -void F77_BLAS_MANGLE(zgeru, ZGERU)(KK_INT*, KK_INT*, - const std::complex*, - const std::complex*, KK_INT*, - const std::complex*, KK_INT*, - std::complex*, KK_INT*); -void F77_BLAS_MANGLE(cgerc, CGERC)(KK_INT*, KK_INT*, const std::complex*, - const std::complex*, KK_INT*, - const std::complex*, KK_INT*, - std::complex*, KK_INT*); -void F77_BLAS_MANGLE(zgerc, ZGERC)(KK_INT*, KK_INT*, - const std::complex*, - const std::complex*, KK_INT*, - const std::complex*, KK_INT*, - std::complex*, KK_INT*); +void F77_BLAS_MANGLE(dger, DGER)(KK_INT*, KK_INT*, const double*, const double*, KK_INT*, const double*, KK_INT*, + double*, KK_INT*); +void F77_BLAS_MANGLE(cgeru, CGERU)(KK_INT*, KK_INT*, const std::complex*, const std::complex*, KK_INT*, + const std::complex*, KK_INT*, std::complex*, KK_INT*); +void F77_BLAS_MANGLE(zgeru, ZGERU)(KK_INT*, KK_INT*, const std::complex*, const std::complex*, KK_INT*, + const std::complex*, KK_INT*, std::complex*, KK_INT*); +void F77_BLAS_MANGLE(cgerc, CGERC)(KK_INT*, KK_INT*, const std::complex*, const std::complex*, KK_INT*, + const std::complex*, KK_INT*, std::complex*, KK_INT*); +void F77_BLAS_MANGLE(zgerc, ZGERC)(KK_INT*, KK_INT*, const std::complex*, const std::complex*, KK_INT*, + const std::complex*, KK_INT*, std::complex*, KK_INT*); /// /// Syr 
/// -void F77_BLAS_MANGLE(ssyr, SSYR)(const char*, KK_INT*, const float*, - const float*, KK_INT*, float*, KK_INT*); -void F77_BLAS_MANGLE(dsyr, DSYR)(const char*, KK_INT*, const double*, - const double*, KK_INT*, double*, KK_INT*); +void F77_BLAS_MANGLE(ssyr, SSYR)(const char*, KK_INT*, const float*, const float*, KK_INT*, float*, KK_INT*); +void F77_BLAS_MANGLE(dsyr, DSYR)(const char*, KK_INT*, const double*, const double*, KK_INT*, double*, KK_INT*); // Although there is a cgeru, there is no csyru // Although there is a zgeru, there is no zsyru // Although there is a cgerc, there is no csyrc, but there is cher (see below) @@ -293,21 +210,17 @@ void F77_BLAS_MANGLE(dsyr, DSYR)(const char*, KK_INT*, const double*, /// Her /// -void F77_BLAS_MANGLE(cher, CHER)(const char*, KK_INT*, const float*, - const std::complex*, KK_INT*, +void F77_BLAS_MANGLE(cher, CHER)(const char*, KK_INT*, const float*, const std::complex*, KK_INT*, /* */ std::complex*, KK_INT*); -void F77_BLAS_MANGLE(zher, ZHER)(const char*, KK_INT*, const double*, - const std::complex*, KK_INT*, +void F77_BLAS_MANGLE(zher, ZHER)(const char*, KK_INT*, const double*, const std::complex*, KK_INT*, /* */ std::complex*, KK_INT*); /// /// Syr2 /// -void F77_BLAS_MANGLE(ssyr2, SSYR2)(const char*, KK_INT*, const float*, - const float*, const KK_INT*, const float*, +void F77_BLAS_MANGLE(ssyr2, SSYR2)(const char*, KK_INT*, const float*, const float*, const KK_INT*, const float*, KK_INT*, float*, KK_INT*); -void F77_BLAS_MANGLE(dsyr2, DSYR2)(const char*, KK_INT*, const double*, - const double*, const KK_INT*, const double*, +void F77_BLAS_MANGLE(dsyr2, DSYR2)(const char*, KK_INT*, const double*, const double*, const KK_INT*, const double*, KK_INT*, double*, KK_INT*); // Although there is a cgeru, there is no csyr2u // Although there is a zgeru, there is no zsyr2u @@ -318,58 +231,42 @@ void F77_BLAS_MANGLE(dsyr2, DSYR2)(const char*, KK_INT*, const double*, /// Her2 /// -void F77_BLAS_MANGLE(cher2, CHER2)(const 
char*, KK_INT*, - const std::complex*, - const std::complex*, KK_INT*, - const std::complex*, KK_INT*, +void F77_BLAS_MANGLE(cher2, CHER2)(const char*, KK_INT*, const std::complex*, const std::complex*, + KK_INT*, const std::complex*, KK_INT*, /* */ std::complex*, KK_INT*); -void F77_BLAS_MANGLE(zher2, ZHER2)(const char*, KK_INT*, - const std::complex*, - const std::complex*, KK_INT*, - const std::complex*, KK_INT*, +void F77_BLAS_MANGLE(zher2, ZHER2)(const char*, KK_INT*, const std::complex*, const std::complex*, + KK_INT*, const std::complex*, KK_INT*, /* */ std::complex*, KK_INT*); /// /// Trsv /// -void F77_BLAS_MANGLE(strsv, STRSV)(const char*, const char*, const char*, - KK_INT*, const float*, KK_INT*, +void F77_BLAS_MANGLE(strsv, STRSV)(const char*, const char*, const char*, KK_INT*, const float*, KK_INT*, /* */ float*, KK_INT*); -void F77_BLAS_MANGLE(dtrsv, DTRSV)(const char*, const char*, const char*, - KK_INT*, const double*, KK_INT*, +void F77_BLAS_MANGLE(dtrsv, DTRSV)(const char*, const char*, const char*, KK_INT*, const double*, KK_INT*, /* */ double*, KK_INT*); -void F77_BLAS_MANGLE(ctrsv, CTRSV)(const char*, const char*, const char*, - KK_INT*, const std::complex*, KK_INT*, +void F77_BLAS_MANGLE(ctrsv, CTRSV)(const char*, const char*, const char*, KK_INT*, const std::complex*, KK_INT*, /* */ std::complex*, KK_INT*); -void F77_BLAS_MANGLE(ztrsv, ZTRSV)(const char*, const char*, const char*, - KK_INT*, const std::complex*, - KK_INT*, +void F77_BLAS_MANGLE(ztrsv, ZTRSV)(const char*, const char*, const char*, KK_INT*, const std::complex*, KK_INT*, /* */ std::complex*, KK_INT*); /// /// Gemm /// -void F77_BLAS_MANGLE(sgemm, SGEMM)(const char*, const char*, KK_INT*, KK_INT*, - KK_INT*, const float*, const float*, KK_INT*, - const float*, KK_INT*, const float*, +void F77_BLAS_MANGLE(sgemm, SGEMM)(const char*, const char*, KK_INT*, KK_INT*, KK_INT*, const float*, const float*, + KK_INT*, const float*, KK_INT*, const float*, /* */ float*, KK_INT*); -void 
F77_BLAS_MANGLE(dgemm, DGEMM)(const char*, const char*, KK_INT*, KK_INT*, - KK_INT*, const double*, const double*, - KK_INT*, const double*, KK_INT*, - const double*, +void F77_BLAS_MANGLE(dgemm, DGEMM)(const char*, const char*, KK_INT*, KK_INT*, KK_INT*, const double*, const double*, + KK_INT*, const double*, KK_INT*, const double*, /* */ double*, KK_INT*); -void F77_BLAS_MANGLE(cgemm, CGEMM)(const char*, const char*, KK_INT*, KK_INT*, - KK_INT*, const std::complex*, - const std::complex*, KK_INT*, - const std::complex*, KK_INT*, +void F77_BLAS_MANGLE(cgemm, CGEMM)(const char*, const char*, KK_INT*, KK_INT*, KK_INT*, const std::complex*, + const std::complex*, KK_INT*, const std::complex*, KK_INT*, const std::complex*, /* */ std::complex*, KK_INT*); -void F77_BLAS_MANGLE(zgemm, ZGEMM)(const char*, const char*, KK_INT*, KK_INT*, - KK_INT*, const std::complex*, - const std::complex*, KK_INT*, - const std::complex*, KK_INT*, +void F77_BLAS_MANGLE(zgemm, ZGEMM)(const char*, const char*, KK_INT*, KK_INT*, KK_INT*, const std::complex*, + const std::complex*, KK_INT*, const std::complex*, KK_INT*, const std::complex*, /* */ std::complex*, KK_INT*); @@ -377,69 +274,51 @@ void F77_BLAS_MANGLE(zgemm, ZGEMM)(const char*, const char*, KK_INT*, KK_INT*, /// Herk /// -void F77_BLAS_MANGLE(ssyrk, SSYRK)(const char*, const char*, KK_INT*, KK_INT*, - const float*, const float*, KK_INT*, +void F77_BLAS_MANGLE(ssyrk, SSYRK)(const char*, const char*, KK_INT*, KK_INT*, const float*, const float*, KK_INT*, const float*, /* */ float*, KK_INT*); -void F77_BLAS_MANGLE(dsyrk, DSYRK)(const char*, const char*, KK_INT*, KK_INT*, - const double*, const double*, KK_INT*, +void F77_BLAS_MANGLE(dsyrk, DSYRK)(const char*, const char*, KK_INT*, KK_INT*, const double*, const double*, KK_INT*, const double*, /* */ double*, KK_INT*); -void F77_BLAS_MANGLE(cherk, CHERK)(const char*, const char*, KK_INT*, KK_INT*, - const std::complex*, - const std::complex*, KK_INT*, - const std::complex*, +void 
F77_BLAS_MANGLE(cherk, CHERK)(const char*, const char*, KK_INT*, KK_INT*, const std::complex*, + const std::complex*, KK_INT*, const std::complex*, /* */ std::complex*, KK_INT*); -void F77_BLAS_MANGLE(zherk, ZHERK)(const char*, const char*, KK_INT*, KK_INT*, - const std::complex*, - const std::complex*, KK_INT*, - const std::complex*, +void F77_BLAS_MANGLE(zherk, ZHERK)(const char*, const char*, KK_INT*, KK_INT*, const std::complex*, + const std::complex*, KK_INT*, const std::complex*, /* */ std::complex*, KK_INT*); /// /// Trmm /// -void F77_BLAS_MANGLE(strmm, STRMM)(const char*, const char*, const char*, - const char*, KK_INT*, KK_INT*, const float*, +void F77_BLAS_MANGLE(strmm, STRMM)(const char*, const char*, const char*, const char*, KK_INT*, KK_INT*, const float*, const float*, KK_INT*, /* */ float*, KK_INT*); -void F77_BLAS_MANGLE(dtrmm, DTRMM)(const char*, const char*, const char*, - const char*, KK_INT*, KK_INT*, const double*, +void F77_BLAS_MANGLE(dtrmm, DTRMM)(const char*, const char*, const char*, const char*, KK_INT*, KK_INT*, const double*, const double*, KK_INT*, /* */ double*, KK_INT*); -void F77_BLAS_MANGLE(ctrmm, CTRMM)(const char*, const char*, const char*, - const char*, KK_INT*, KK_INT*, - const std::complex*, - const std::complex*, KK_INT*, +void F77_BLAS_MANGLE(ctrmm, CTRMM)(const char*, const char*, const char*, const char*, KK_INT*, KK_INT*, + const std::complex*, const std::complex*, KK_INT*, /* */ std::complex*, KK_INT*); -void F77_BLAS_MANGLE(ztrmm, ZTRMM)(const char*, const char*, const char*, - const char*, KK_INT*, KK_INT*, - const std::complex*, - const std::complex*, KK_INT*, +void F77_BLAS_MANGLE(ztrmm, ZTRMM)(const char*, const char*, const char*, const char*, KK_INT*, KK_INT*, + const std::complex*, const std::complex*, KK_INT*, /* */ std::complex*, KK_INT*); /// /// Trsm /// -void F77_BLAS_MANGLE(strsm, STRSM)(const char*, const char*, const char*, - const char*, KK_INT*, KK_INT*, const float*, +void F77_BLAS_MANGLE(strsm, 
STRSM)(const char*, const char*, const char*, const char*, KK_INT*, KK_INT*, const float*, const float*, KK_INT*, /* */ float*, KK_INT*); -void F77_BLAS_MANGLE(dtrsm, DTRSM)(const char*, const char*, const char*, - const char*, KK_INT*, KK_INT*, const double*, +void F77_BLAS_MANGLE(dtrsm, DTRSM)(const char*, const char*, const char*, const char*, KK_INT*, KK_INT*, const double*, const double*, KK_INT*, /* */ double*, KK_INT*); -void F77_BLAS_MANGLE(ctrsm, CTRSM)(const char*, const char*, const char*, - const char*, KK_INT*, KK_INT*, - const std::complex*, - const std::complex*, KK_INT*, +void F77_BLAS_MANGLE(ctrsm, CTRSM)(const char*, const char*, const char*, const char*, KK_INT*, KK_INT*, + const std::complex*, const std::complex*, KK_INT*, /* */ std::complex*, KK_INT*); -void F77_BLAS_MANGLE(ztrsm, ZTRSM)(const char*, const char*, const char*, - const char*, KK_INT*, KK_INT*, - const std::complex*, - const std::complex*, KK_INT*, +void F77_BLAS_MANGLE(ztrsm, ZTRSM)(const char*, const char*, const char*, const char*, KK_INT*, KK_INT*, + const std::complex*, const std::complex*, KK_INT*, /* */ std::complex*, KK_INT*); } @@ -447,12 +326,10 @@ void F77_BLAS_MANGLE(sscal, SSCAL)(const KK_INT* N, const float* alpha, /* */ float* x, const KK_INT* x_inc); void F77_BLAS_MANGLE(dscal, DSCAL)(const KK_INT* N, const double* alpha, /* */ double* x, const KK_INT* x_inc); -void F77_BLAS_MANGLE(cscal, - CSCAL)(const KK_INT* N, const std::complex* alpha, - /* */ std::complex* x, const KK_INT* x_inc); -void F77_BLAS_MANGLE(zscal, - ZSCAL)(const KK_INT* N, const std::complex* alpha, - /* */ std::complex* x, const KK_INT* x_inc); +void F77_BLAS_MANGLE(cscal, CSCAL)(const KK_INT* N, const std::complex* alpha, + /* */ std::complex* x, const KK_INT* x_inc); +void F77_BLAS_MANGLE(zscal, ZSCAL)(const KK_INT* N, const std::complex* alpha, + /* */ std::complex* x, const KK_INT* x_inc); #define F77_FUNC_SSCAL F77_BLAS_MANGLE(sscal, SSCAL) #define F77_FUNC_DSCAL F77_BLAS_MANGLE(dscal, 
DSCAL) @@ -581,19 +458,17 @@ float HostBlas::asum(KK_INT n, const float* x, KK_INT x_inc) { return F77_FUNC_SASUM(&n, x, &x_inc); } template <> -float HostBlas::dot(KK_INT n, const float* x, KK_INT x_inc, - const float* y, KK_INT y_inc) { +float HostBlas::dot(KK_INT n, const float* x, KK_INT x_inc, const float* y, KK_INT y_inc) { return F77_FUNC_SDOT(&n, x, &x_inc, y, &y_inc); } template <> -void HostBlas::axpy(KK_INT n, const float alpha, const float* x, - KK_INT x_inc, +void HostBlas::axpy(KK_INT n, const float alpha, const float* x, KK_INT x_inc, /* */ float* y, KK_INT y_inc) { F77_FUNC_SAXPY(&n, &alpha, x, &x_inc, y, &y_inc); } template <> -void HostBlas::rot(KK_INT const N, float* X, KK_INT const incx, float* Y, - KK_INT const incy, float* c, float* s) { +void HostBlas::rot(KK_INT const N, float* X, KK_INT const incx, float* Y, KK_INT const incy, float* c, + float* s) { F77_FUNC_SROT(&N, X, &incx, Y, &incy, c, s); } template <> @@ -601,81 +476,67 @@ void HostBlas::rotg(float* a, float* b, float* c, float* s) { F77_FUNC_SROTG(a, b, c, s); } template <> -void HostBlas::rotm(const KK_INT n, float* X, const KK_INT incx, - float* Y, const KK_INT incy, const float* param) { +void HostBlas::rotm(const KK_INT n, float* X, const KK_INT incx, float* Y, const KK_INT incy, + const float* param) { F77_FUNC_SROTM(&n, X, &incx, Y, &incy, param); } template <> -void HostBlas::rotmg(float* d1, float* d2, float* x1, const float* y1, - float* param) { +void HostBlas::rotmg(float* d1, float* d2, float* x1, const float* y1, float* param) { F77_FUNC_SROTMG(d1, d2, x1, y1, param); } template <> -void HostBlas::swap(KK_INT const N, float* X, KK_INT const incx, - float* Y, KK_INT const incy) { +void HostBlas::swap(KK_INT const N, float* X, KK_INT const incx, float* Y, KK_INT const incy) { F77_FUNC_SSWAP(&N, X, &incx, Y, &incy); } template <> -void HostBlas::gemv(const char trans, KK_INT m, KK_INT n, - const float alpha, const float* a, KK_INT lda, +void HostBlas::gemv(const char 
trans, KK_INT m, KK_INT n, const float alpha, const float* a, KK_INT lda, const float* b, KK_INT ldb, const float beta, /* */ float* c, KK_INT ldc) { F77_FUNC_SGEMV(&trans, &m, &n, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); } template <> -void HostBlas::ger(KK_INT m, KK_INT n, const float alpha, const float* x, - KK_INT incx, const float* y, KK_INT incy, float* a, - KK_INT lda) { +void HostBlas::ger(KK_INT m, KK_INT n, const float alpha, const float* x, KK_INT incx, const float* y, + KK_INT incy, float* a, KK_INT lda) { F77_FUNC_SGER(&m, &n, &alpha, x, &incx, y, &incy, a, &lda); } template <> -void HostBlas::syr(const char uplo, KK_INT n, const float alpha, - const float* x, KK_INT incx, float* a, KK_INT lda) { +void HostBlas::syr(const char uplo, KK_INT n, const float alpha, const float* x, KK_INT incx, float* a, + KK_INT lda) { F77_FUNC_SSYR(&uplo, &n, &alpha, x, &incx, a, &lda); } template <> -void HostBlas::syr2(const char uplo, KK_INT n, const float alpha, - const float* x, KK_INT incx, const float* y, +void HostBlas::syr2(const char uplo, KK_INT n, const float alpha, const float* x, KK_INT incx, const float* y, KK_INT incy, float* a, KK_INT lda) { F77_FUNC_SSYR2(&uplo, &n, &alpha, x, &incx, y, &incy, a, &lda); } template <> -void HostBlas::trsv(const char uplo, const char transa, const char diag, - KK_INT m, const float* a, KK_INT lda, +void HostBlas::trsv(const char uplo, const char transa, const char diag, KK_INT m, const float* a, KK_INT lda, /* */ float* b, KK_INT ldb) { F77_FUNC_STRSV(&uplo, &transa, &diag, &m, a, &lda, b, &ldb); } template <> -void HostBlas::gemm(const char transa, const char transb, KK_INT m, - KK_INT n, KK_INT k, const float alpha, - const float* a, KK_INT lda, const float* b, - KK_INT ldb, const float beta, +void HostBlas::gemm(const char transa, const char transb, KK_INT m, KK_INT n, KK_INT k, const float alpha, + const float* a, KK_INT lda, const float* b, KK_INT ldb, const float beta, /* */ float* c, KK_INT ldc) { - 
F77_FUNC_SGEMM(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, - c, &ldc); + F77_FUNC_SGEMM(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); } template <> -void HostBlas::herk(const char transa, const char transb, KK_INT n, - KK_INT k, const float alpha, const float* a, +void HostBlas::herk(const char transa, const char transb, KK_INT n, KK_INT k, const float alpha, const float* a, KK_INT lda, const float beta, /* */ float* c, KK_INT ldc) { F77_FUNC_SSYRK(&transa, &transb, &n, &k, &alpha, a, &lda, &beta, c, &ldc); } template <> -void HostBlas::trmm(const char side, const char uplo, const char transa, - const char diag, KK_INT m, KK_INT n, +void HostBlas::trmm(const char side, const char uplo, const char transa, const char diag, KK_INT m, KK_INT n, const float alpha, const float* a, KK_INT lda, /* */ float* b, KK_INT ldb) { - F77_FUNC_STRMM(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, - &ldb); + F77_FUNC_STRMM(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb); } template <> -void HostBlas::trsm(const char side, const char uplo, const char transa, - const char diag, KK_INT m, KK_INT n, +void HostBlas::trsm(const char side, const char uplo, const char transa, const char diag, KK_INT m, KK_INT n, const float alpha, const float* a, KK_INT lda, /* */ float* b, KK_INT ldb) { - F77_FUNC_STRSM(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, - &ldb); + F77_FUNC_STRSM(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb); } /// @@ -700,19 +561,17 @@ double HostBlas::asum(KK_INT n, const double* x, KK_INT x_inc) { return F77_FUNC_DASUM(&n, x, &x_inc); } template <> -double HostBlas::dot(KK_INT n, const double* x, KK_INT x_inc, - const double* y, KK_INT y_inc) { +double HostBlas::dot(KK_INT n, const double* x, KK_INT x_inc, const double* y, KK_INT y_inc) { return F77_FUNC_DDOT(&n, x, &x_inc, y, &y_inc); } template <> -void HostBlas::axpy(KK_INT n, const double alpha, const double* x, - KK_INT 
x_inc, +void HostBlas::axpy(KK_INT n, const double alpha, const double* x, KK_INT x_inc, /* */ double* y, KK_INT y_inc) { F77_FUNC_DAXPY(&n, &alpha, x, &x_inc, y, &y_inc); } template <> -void HostBlas::rot(KK_INT const N, double* X, KK_INT const incx, - double* Y, KK_INT const incy, double* c, double* s) { +void HostBlas::rot(KK_INT const N, double* X, KK_INT const incx, double* Y, KK_INT const incy, double* c, + double* s) { F77_FUNC_DROT(&N, X, &incx, Y, &incy, c, s); } template <> @@ -720,82 +579,67 @@ void HostBlas::rotg(double* a, double* b, double* c, double* s) { F77_FUNC_DROTG(a, b, c, s); } template <> -void HostBlas::rotm(const KK_INT n, double* X, const KK_INT incx, - double* Y, const KK_INT incy, const double* param) { +void HostBlas::rotm(const KK_INT n, double* X, const KK_INT incx, double* Y, const KK_INT incy, + const double* param) { F77_FUNC_DROTM(&n, X, &incx, Y, &incy, param); } template <> -void HostBlas::rotmg(double* d1, double* d2, double* x1, - const double* y1, double* param) { +void HostBlas::rotmg(double* d1, double* d2, double* x1, const double* y1, double* param) { F77_FUNC_DROTMG(d1, d2, x1, y1, param); } template <> -void HostBlas::swap(KK_INT const N, double* X, KK_INT const incx, - double* Y, KK_INT const incy) { +void HostBlas::swap(KK_INT const N, double* X, KK_INT const incx, double* Y, KK_INT const incy) { F77_FUNC_DSWAP(&N, X, &incx, Y, &incy); } template <> -void HostBlas::gemv(const char trans, KK_INT m, KK_INT n, - const double alpha, const double* a, KK_INT lda, +void HostBlas::gemv(const char trans, KK_INT m, KK_INT n, const double alpha, const double* a, KK_INT lda, const double* b, KK_INT ldb, const double beta, /* */ double* c, KK_INT ldc) { F77_FUNC_DGEMV(&trans, &m, &n, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); } template <> -void HostBlas::ger(KK_INT m, KK_INT n, const double alpha, - const double* x, KK_INT incx, const double* y, +void HostBlas::ger(KK_INT m, KK_INT n, const double alpha, const double* x, KK_INT 
incx, const double* y, KK_INT incy, double* a, KK_INT lda) { F77_FUNC_DGER(&m, &n, &alpha, x, &incx, y, &incy, a, &lda); } template <> -void HostBlas::syr(const char uplo, KK_INT n, const double alpha, - const double* x, KK_INT incx, double* a, +void HostBlas::syr(const char uplo, KK_INT n, const double alpha, const double* x, KK_INT incx, double* a, KK_INT lda) { F77_FUNC_DSYR(&uplo, &n, &alpha, x, &incx, a, &lda); } template <> -void HostBlas::syr2(const char uplo, KK_INT n, const double alpha, - const double* x, KK_INT incx, const double* y, - KK_INT incy, double* a, KK_INT lda) { +void HostBlas::syr2(const char uplo, KK_INT n, const double alpha, const double* x, KK_INT incx, + const double* y, KK_INT incy, double* a, KK_INT lda) { F77_FUNC_DSYR2(&uplo, &n, &alpha, x, &incx, y, &incy, a, &lda); } template <> -void HostBlas::trsv(const char uplo, const char transa, const char diag, - KK_INT m, const double* a, KK_INT lda, +void HostBlas::trsv(const char uplo, const char transa, const char diag, KK_INT m, const double* a, KK_INT lda, /* */ double* b, KK_INT ldb) { F77_FUNC_DTRSV(&uplo, &transa, &diag, &m, a, &lda, b, &ldb); } template <> -void HostBlas::gemm(const char transa, const char transb, KK_INT m, - KK_INT n, KK_INT k, const double alpha, - const double* a, KK_INT lda, const double* b, - KK_INT ldb, const double beta, +void HostBlas::gemm(const char transa, const char transb, KK_INT m, KK_INT n, KK_INT k, const double alpha, + const double* a, KK_INT lda, const double* b, KK_INT ldb, const double beta, /* */ double* c, KK_INT ldc) { - F77_FUNC_DGEMM(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, - c, &ldc); + F77_FUNC_DGEMM(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); } template <> -void HostBlas::herk(const char transa, const char transb, KK_INT n, - KK_INT k, const double alpha, const double* a, - KK_INT lda, const double beta, +void HostBlas::herk(const char transa, const char transb, KK_INT n, KK_INT k, 
const double alpha, + const double* a, KK_INT lda, const double beta, /* */ double* c, KK_INT ldc) { F77_FUNC_DSYRK(&transa, &transb, &n, &k, &alpha, a, &lda, &beta, c, &ldc); } template <> -void HostBlas::trmm(const char side, const char uplo, const char transa, - const char diag, KK_INT m, KK_INT n, +void HostBlas::trmm(const char side, const char uplo, const char transa, const char diag, KK_INT m, KK_INT n, const double alpha, const double* a, KK_INT lda, /* */ double* b, KK_INT ldb) { - F77_FUNC_DTRMM(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, - &ldb); + F77_FUNC_DTRMM(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb); } template <> -void HostBlas::trsm(const char side, const char uplo, const char transa, - const char diag, KK_INT m, KK_INT n, +void HostBlas::trsm(const char side, const char uplo, const char transa, const char diag, KK_INT m, KK_INT n, const double alpha, const double* a, KK_INT lda, /* */ double* b, KK_INT ldb) { - F77_FUNC_DTRSM(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, - &ldb); + F77_FUNC_DTRSM(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb); } /// @@ -803,34 +647,25 @@ void HostBlas::trsm(const char side, const char uplo, const char transa, /// template <> -void HostBlas >::scal(KK_INT n, - const std::complex alpha, - /* */ std::complex* x, - KK_INT x_inc) { +void HostBlas >::scal(KK_INT n, const std::complex alpha, + /* */ std::complex* x, KK_INT x_inc) { F77_FUNC_CSCAL(&n, &alpha, x, &x_inc); } template <> -KK_INT HostBlas >::iamax(KK_INT n, - const std::complex* x, - KK_INT x_inc) { +KK_INT HostBlas >::iamax(KK_INT n, const std::complex* x, KK_INT x_inc) { return F77_FUNC_ICAMAX(&n, x, &x_inc); } template <> -float HostBlas >::nrm2(KK_INT n, - const std::complex* x, - KK_INT x_inc) { +float HostBlas >::nrm2(KK_INT n, const std::complex* x, KK_INT x_inc) { return F77_FUNC_SCNRM2(&n, x, &x_inc); } template <> -float HostBlas >::asum(KK_INT n, - const std::complex* x, - KK_INT 
x_inc) { +float HostBlas >::asum(KK_INT n, const std::complex* x, KK_INT x_inc) { return F77_FUNC_SCASUM(&n, x, &x_inc); } template <> -std::complex HostBlas >::dot( - KK_INT n, const std::complex* x, KK_INT x_inc, - const std::complex* y, KK_INT y_inc) { +std::complex HostBlas >::dot(KK_INT n, const std::complex* x, KK_INT x_inc, + const std::complex* y, KK_INT y_inc) { #if defined(KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX) _kk_float2 res = F77_FUNC_CDOTC(&n, x, &x_inc, y, &y_inc); return std::complex(res.vals[0], res.vals[1]); @@ -841,131 +676,99 @@ std::complex HostBlas >::dot( #endif } template <> -void HostBlas >::axpy(KK_INT n, - const std::complex alpha, - const std::complex* x, +void HostBlas >::axpy(KK_INT n, const std::complex alpha, const std::complex* x, KK_INT x_inc, - /* */ std::complex* y, - KK_INT y_inc) { + /* */ std::complex* y, KK_INT y_inc) { F77_FUNC_CAXPY(&n, &alpha, x, &x_inc, y, &y_inc); } template <> -void HostBlas >::rot(KK_INT const N, std::complex* X, - KK_INT const incx, - std::complex* Y, - KK_INT const incy, float* c, - float* s) { +void HostBlas >::rot(KK_INT const N, std::complex* X, KK_INT const incx, + std::complex* Y, KK_INT const incy, float* c, float* s) { F77_FUNC_CROT(&N, X, &incx, Y, &incy, c, s); } template <> -void HostBlas >::rotg(std::complex* a, - std::complex* b, float* c, +void HostBlas >::rotg(std::complex* a, std::complex* b, float* c, std::complex* s) { F77_FUNC_CROTG(a, b, c, s); } template <> -void HostBlas >::swap(KK_INT const N, - std::complex* X, - KK_INT const incx, - std::complex* Y, - KK_INT const incy) { +void HostBlas >::swap(KK_INT const N, std::complex* X, KK_INT const incx, + std::complex* Y, KK_INT const incy) { F77_FUNC_CSWAP(&N, X, &incx, Y, &incy); } template <> -void HostBlas >::gemv( - const char trans, KK_INT m, KK_INT n, const std::complex alpha, - const std::complex* a, KK_INT lda, const std::complex* b, - KK_INT ldb, const std::complex beta, - /* */ std::complex* c, KK_INT ldc) { - 
F77_FUNC_CGEMV(&trans, &m, &n, &alpha, (const std::complex*)a, &lda, - (const std::complex*)b, &ldb, &beta, - (std::complex*)c, &ldc); +void HostBlas >::gemv(const char trans, KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* a, KK_INT lda, const std::complex* b, + KK_INT ldb, const std::complex beta, + /* */ std::complex* c, KK_INT ldc) { + F77_FUNC_CGEMV(&trans, &m, &n, &alpha, (const std::complex*)a, &lda, (const std::complex*)b, &ldb, + &beta, (std::complex*)c, &ldc); } template <> -void HostBlas >::geru( - KK_INT m, KK_INT n, const std::complex alpha, - const std::complex* x, KK_INT incx, const std::complex* y, - KK_INT incy, std::complex* a, KK_INT lda) { - F77_FUNC_CGERU(&m, &n, &alpha, (const std::complex*)x, &incx, - (const std::complex*)y, &incy, (std::complex*)a, - &lda); +void HostBlas >::geru(KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* x, KK_INT incx, const std::complex* y, + KK_INT incy, std::complex* a, KK_INT lda) { + F77_FUNC_CGERU(&m, &n, &alpha, (const std::complex*)x, &incx, (const std::complex*)y, &incy, + (std::complex*)a, &lda); } template <> -void HostBlas >::gerc( - KK_INT m, KK_INT n, const std::complex alpha, - const std::complex* x, KK_INT incx, const std::complex* y, - KK_INT incy, std::complex* a, KK_INT lda) { - F77_FUNC_CGERC(&m, &n, &alpha, (const std::complex*)x, &incx, - (const std::complex*)y, &incy, (std::complex*)a, - &lda); +void HostBlas >::gerc(KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* x, KK_INT incx, const std::complex* y, + KK_INT incy, std::complex* a, KK_INT lda) { + F77_FUNC_CGERC(&m, &n, &alpha, (const std::complex*)x, &incx, (const std::complex*)y, &incy, + (std::complex*)a, &lda); } template <> template <> -void HostBlas >::her( - const char uplo, KK_INT n, const float alpha, const std::complex* x, - KK_INT incx, std::complex* a, KK_INT lda) { - F77_FUNC_CHER(&uplo, &n, &alpha, (const std::complex*)x, &incx, - (std::complex*)a, &lda); +void HostBlas 
>::her(const char uplo, KK_INT n, const float alpha, + const std::complex* x, KK_INT incx, std::complex* a, + KK_INT lda) { + F77_FUNC_CHER(&uplo, &n, &alpha, (const std::complex*)x, &incx, (std::complex*)a, &lda); } template <> -void HostBlas >::her2( - const char uplo, KK_INT n, const std::complex alpha, - const std::complex* x, KK_INT incx, const std::complex* y, - KK_INT incy, std::complex* a, KK_INT lda) { - F77_FUNC_CHER2(&uplo, &n, &alpha, (const std::complex*)x, &incx, - (const std::complex*)y, &incy, (std::complex*)a, - &lda); +void HostBlas >::her2(const char uplo, KK_INT n, const std::complex alpha, + const std::complex* x, KK_INT incx, const std::complex* y, + KK_INT incy, std::complex* a, KK_INT lda) { + F77_FUNC_CHER2(&uplo, &n, &alpha, (const std::complex*)x, &incx, (const std::complex*)y, &incy, + (std::complex*)a, &lda); } template <> -void HostBlas >::trsv(const char uplo, const char transa, - const char diag, KK_INT m, - const std::complex* a, - KK_INT lda, - /* */ std::complex* b, - KK_INT ldb) { - F77_FUNC_CTRSV(&uplo, &transa, &diag, &m, (const std::complex*)a, &lda, - (std::complex*)b, &ldb); +void HostBlas >::trsv(const char uplo, const char transa, const char diag, KK_INT m, + const std::complex* a, KK_INT lda, + /* */ std::complex* b, KK_INT ldb) { + F77_FUNC_CTRSV(&uplo, &transa, &diag, &m, (const std::complex*)a, &lda, (std::complex*)b, &ldb); } template <> -void HostBlas >::gemm( - const char transa, const char transb, KK_INT m, KK_INT n, KK_INT k, - const std::complex alpha, const std::complex* a, KK_INT lda, - const std::complex* b, KK_INT ldb, const std::complex beta, - /* */ std::complex* c, KK_INT ldc) { - F77_FUNC_CGEMM(&transa, &transb, &m, &n, &k, &alpha, - (const std::complex*)a, &lda, - (const std::complex*)b, &ldb, &beta, - (std::complex*)c, &ldc); +void HostBlas >::gemm(const char transa, const char transb, KK_INT m, KK_INT n, KK_INT k, + const std::complex alpha, const std::complex* a, KK_INT lda, + const std::complex* b, 
KK_INT ldb, const std::complex beta, + /* */ std::complex* c, KK_INT ldc) { + F77_FUNC_CGEMM(&transa, &transb, &m, &n, &k, &alpha, (const std::complex*)a, &lda, + (const std::complex*)b, &ldb, &beta, (std::complex*)c, &ldc); } template <> -void HostBlas >::herk( - const char transa, const char transb, KK_INT n, KK_INT k, - const std::complex alpha, const std::complex* a, KK_INT lda, - const std::complex beta, - /* */ std::complex* c, KK_INT ldc) { - F77_FUNC_CHERK(&transa, &transb, &n, &k, &alpha, - (const std::complex*)a, &lda, &beta, - (std::complex*)c, &ldc); +void HostBlas >::herk(const char transa, const char transb, KK_INT n, KK_INT k, + const std::complex alpha, const std::complex* a, KK_INT lda, + const std::complex beta, + /* */ std::complex* c, KK_INT ldc) { + F77_FUNC_CHERK(&transa, &transb, &n, &k, &alpha, (const std::complex*)a, &lda, &beta, (std::complex*)c, + &ldc); } template <> -void HostBlas >::trmm( - const char side, const char uplo, const char transa, const char diag, - KK_INT m, KK_INT n, const std::complex alpha, - const std::complex* a, KK_INT lda, - /* */ std::complex* b, KK_INT ldb) { - F77_FUNC_CTRMM(&side, &uplo, &transa, &diag, &m, &n, &alpha, - (const std::complex*)a, &lda, (std::complex*)b, - &ldb); +void HostBlas >::trmm(const char side, const char uplo, const char transa, const char diag, + KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* a, KK_INT lda, + /* */ std::complex* b, KK_INT ldb) { + F77_FUNC_CTRMM(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const std::complex*)a, &lda, + (std::complex*)b, &ldb); } template <> -void HostBlas >::trsm( - const char side, const char uplo, const char transa, const char diag, - KK_INT m, KK_INT n, const std::complex alpha, - const std::complex* a, KK_INT lda, - /* */ std::complex* b, KK_INT ldb) { - F77_FUNC_CTRSM(&side, &uplo, &transa, &diag, &m, &n, &alpha, - (const std::complex*)a, &lda, (std::complex*)b, - &ldb); +void HostBlas >::trsm(const char side, const char uplo, 
const char transa, const char diag, + KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* a, KK_INT lda, + /* */ std::complex* b, KK_INT ldb) { + F77_FUNC_CTRSM(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const std::complex*)a, &lda, + (std::complex*)b, &ldb); } /// @@ -973,34 +776,25 @@ void HostBlas >::trsm( /// template <> -void HostBlas >::scal(KK_INT n, - const std::complex alpha, - /* */ std::complex* x, - KK_INT x_inc) { +void HostBlas >::scal(KK_INT n, const std::complex alpha, + /* */ std::complex* x, KK_INT x_inc) { F77_FUNC_ZSCAL(&n, &alpha, x, &x_inc); } template <> -KK_INT HostBlas >::iamax(KK_INT n, - const std::complex* x, - KK_INT x_inc) { +KK_INT HostBlas >::iamax(KK_INT n, const std::complex* x, KK_INT x_inc) { return F77_FUNC_IZAMAX(&n, x, &x_inc); } template <> -double HostBlas >::nrm2(KK_INT n, - const std::complex* x, - KK_INT x_inc) { +double HostBlas >::nrm2(KK_INT n, const std::complex* x, KK_INT x_inc) { return F77_FUNC_DZNRM2(&n, x, &x_inc); } template <> -double HostBlas >::asum(KK_INT n, - const std::complex* x, - KK_INT x_inc) { +double HostBlas >::asum(KK_INT n, const std::complex* x, KK_INT x_inc) { return F77_FUNC_DZASUM(&n, x, &x_inc); } template <> -std::complex HostBlas >::dot( - KK_INT n, const std::complex* x, KK_INT x_inc, - const std::complex* y, KK_INT y_inc) { +std::complex HostBlas >::dot(KK_INT n, const std::complex* x, KK_INT x_inc, + const std::complex* y, KK_INT y_inc) { #if defined(KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX) _kk_double2 res = F77_FUNC_ZDOTC(&n, x, &x_inc, y, &y_inc); return std::complex(res.vals[0], res.vals[1]); @@ -1011,133 +805,100 @@ std::complex HostBlas >::dot( #endif } template <> -void HostBlas >::axpy(KK_INT n, - const std::complex alpha, - const std::complex* x, +void HostBlas >::axpy(KK_INT n, const std::complex alpha, const std::complex* x, KK_INT x_inc, - /* */ std::complex* y, - KK_INT y_inc) { + /* */ std::complex* y, KK_INT y_inc) { F77_FUNC_ZAXPY(&n, &alpha, x, &x_inc, 
y, &y_inc); } template <> -void HostBlas >::rot( - KK_INT const N, std::complex* X, KK_INT const incx, - std::complex* Y, KK_INT const incy, double* c, double* s) { +void HostBlas >::rot(KK_INT const N, std::complex* X, KK_INT const incx, + std::complex* Y, KK_INT const incy, double* c, double* s) { F77_FUNC_ZROT(&N, X, &incx, Y, &incy, c, s); } template <> -void HostBlas >::rotg(std::complex* a, - std::complex* b, double* c, +void HostBlas >::rotg(std::complex* a, std::complex* b, double* c, std::complex* s) { F77_FUNC_ZROTG(a, b, c, s); } template <> -void HostBlas >::swap(KK_INT const N, - std::complex* X, - KK_INT const incx, - std::complex* Y, - KK_INT const incy) { +void HostBlas >::swap(KK_INT const N, std::complex* X, KK_INT const incx, + std::complex* Y, KK_INT const incy) { F77_FUNC_ZSWAP(&N, X, &incx, Y, &incy); } template <> -void HostBlas >::gemv( - const char trans, KK_INT m, KK_INT n, const std::complex alpha, - const std::complex* a, KK_INT lda, const std::complex* b, - KK_INT ldb, const std::complex beta, - /* */ std::complex* c, KK_INT ldc) { - F77_FUNC_ZGEMV(&trans, &m, &n, &alpha, (const std::complex*)a, &lda, - (const std::complex*)b, &ldb, &beta, - (std::complex*)c, &ldc); +void HostBlas >::gemv(const char trans, KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* a, KK_INT lda, const std::complex* b, + KK_INT ldb, const std::complex beta, + /* */ std::complex* c, KK_INT ldc) { + F77_FUNC_ZGEMV(&trans, &m, &n, &alpha, (const std::complex*)a, &lda, (const std::complex*)b, &ldb, + &beta, (std::complex*)c, &ldc); } template <> -void HostBlas >::geru( - KK_INT m, KK_INT n, const std::complex alpha, - const std::complex* x, KK_INT incx, const std::complex* y, - KK_INT incy, std::complex* a, KK_INT lda) { - F77_FUNC_ZGERU(&m, &n, &alpha, (const std::complex*)x, &incx, - (const std::complex*)y, &incy, +void HostBlas >::geru(KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* x, KK_INT incx, const std::complex* y, + 
KK_INT incy, std::complex* a, KK_INT lda) { + F77_FUNC_ZGERU(&m, &n, &alpha, (const std::complex*)x, &incx, (const std::complex*)y, &incy, (std::complex*)a, &lda); } template <> -void HostBlas >::gerc( - KK_INT m, KK_INT n, const std::complex alpha, - const std::complex* x, KK_INT incx, const std::complex* y, - KK_INT incy, std::complex* a, KK_INT lda) { - F77_FUNC_ZGERC(&m, &n, &alpha, (const std::complex*)x, &incx, - (const std::complex*)y, &incy, +void HostBlas >::gerc(KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* x, KK_INT incx, const std::complex* y, + KK_INT incy, std::complex* a, KK_INT lda) { + F77_FUNC_ZGERC(&m, &n, &alpha, (const std::complex*)x, &incx, (const std::complex*)y, &incy, (std::complex*)a, &lda); } template <> template <> -void HostBlas >::her(const char uplo, KK_INT n, - const double alpha, - const std::complex* x, - KK_INT incx, - std::complex* a, +void HostBlas >::her(const char uplo, KK_INT n, const double alpha, + const std::complex* x, KK_INT incx, std::complex* a, KK_INT lda) { - F77_FUNC_ZHER(&uplo, &n, &alpha, (const std::complex*)x, &incx, - (std::complex*)a, &lda); + F77_FUNC_ZHER(&uplo, &n, &alpha, (const std::complex*)x, &incx, (std::complex*)a, &lda); } template <> -void HostBlas >::her2( - const char uplo, KK_INT n, const std::complex alpha, - const std::complex* x, KK_INT incx, const std::complex* y, - KK_INT incy, std::complex* a, KK_INT lda) { - F77_FUNC_ZHER2(&uplo, &n, &alpha, (const std::complex*)x, &incx, - (const std::complex*)y, &incy, +void HostBlas >::her2(const char uplo, KK_INT n, const std::complex alpha, + const std::complex* x, KK_INT incx, const std::complex* y, + KK_INT incy, std::complex* a, KK_INT lda) { + F77_FUNC_ZHER2(&uplo, &n, &alpha, (const std::complex*)x, &incx, (const std::complex*)y, &incy, (std::complex*)a, &lda); } template <> -void HostBlas >::trsv(const char uplo, const char transa, - const char diag, KK_INT m, - const std::complex* a, - KK_INT lda, - /* */ std::complex* b, 
- KK_INT ldb) { - F77_FUNC_ZTRSV(&uplo, &transa, &diag, &m, (const std::complex*)a, - &lda, (std::complex*)b, &ldb); +void HostBlas >::trsv(const char uplo, const char transa, const char diag, KK_INT m, + const std::complex* a, KK_INT lda, + /* */ std::complex* b, KK_INT ldb) { + F77_FUNC_ZTRSV(&uplo, &transa, &diag, &m, (const std::complex*)a, &lda, (std::complex*)b, &ldb); } template <> -void HostBlas >::gemm( - const char transa, const char transb, KK_INT m, KK_INT n, KK_INT k, - const std::complex alpha, const std::complex* a, KK_INT lda, - const std::complex* b, KK_INT ldb, const std::complex beta, - /* */ std::complex* c, KK_INT ldc) { - F77_FUNC_ZGEMM(&transa, &transb, &m, &n, &k, &alpha, - (const std::complex*)a, &lda, - (const std::complex*)b, &ldb, &beta, - (std::complex*)c, &ldc); +void HostBlas >::gemm(const char transa, const char transb, KK_INT m, KK_INT n, KK_INT k, + const std::complex alpha, const std::complex* a, KK_INT lda, + const std::complex* b, KK_INT ldb, const std::complex beta, + /* */ std::complex* c, KK_INT ldc) { + F77_FUNC_ZGEMM(&transa, &transb, &m, &n, &k, &alpha, (const std::complex*)a, &lda, + (const std::complex*)b, &ldb, &beta, (std::complex*)c, &ldc); } template <> -void HostBlas >::herk( - const char transa, const char transb, KK_INT n, KK_INT k, - const std::complex alpha, const std::complex* a, KK_INT lda, - const std::complex beta, - /* */ std::complex* c, KK_INT ldc) { - F77_FUNC_ZHERK(&transa, &transb, &n, &k, &alpha, - (const std::complex*)a, &lda, &beta, +void HostBlas >::herk(const char transa, const char transb, KK_INT n, KK_INT k, + const std::complex alpha, const std::complex* a, KK_INT lda, + const std::complex beta, + /* */ std::complex* c, KK_INT ldc) { + F77_FUNC_ZHERK(&transa, &transb, &n, &k, &alpha, (const std::complex*)a, &lda, &beta, (std::complex*)c, &ldc); } template <> -void HostBlas >::trmm( - const char side, const char uplo, const char transa, const char diag, - KK_INT m, KK_INT n, const std::complex 
alpha, - const std::complex* a, KK_INT lda, - /* */ std::complex* b, KK_INT ldb) { - F77_FUNC_ZTRMM(&side, &uplo, &transa, &diag, &m, &n, &alpha, - (const std::complex*)a, &lda, (std::complex*)b, - &ldb); -} -template <> -void HostBlas >::trsm( - const char side, const char uplo, const char transa, const char diag, - KK_INT m, KK_INT n, const std::complex alpha, - const std::complex* a, KK_INT lda, - /* */ std::complex* b, KK_INT ldb) { - F77_FUNC_ZTRSM(&side, &uplo, &transa, &diag, &m, &n, &alpha, - (const std::complex*)a, &lda, (std::complex*)b, - &ldb); +void HostBlas >::trmm(const char side, const char uplo, const char transa, const char diag, + KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* a, KK_INT lda, + /* */ std::complex* b, KK_INT ldb) { + F77_FUNC_ZTRMM(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const std::complex*)a, &lda, + (std::complex*)b, &ldb); +} +template <> +void HostBlas >::trsm(const char side, const char uplo, const char transa, const char diag, + KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* a, KK_INT lda, + /* */ std::complex* b, KK_INT ldb) { + F77_FUNC_ZTRSM(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const std::complex*)a, &lda, + (std::complex*)b, &ldb); } } // namespace Impl diff --git a/blas/tpls/KokkosBlas_Host_tpl.hpp b/blas/tpls/KokkosBlas_Host_tpl.hpp index f7fb3d3978..576fde8471 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.hpp +++ b/blas/tpls/KokkosBlas_Host_tpl.hpp @@ -57,66 +57,57 @@ struct HostBlas { static void axpy(KK_INT n, const T alpha, const T *x, KK_INT x_inc, /* */ T *y, KK_INT y_inc); - static void rot(KK_INT const N, T *X, KK_INT const incx, T *Y, - KK_INT const incy, mag_type *c, mag_type *s); + static void rot(KK_INT const N, T *X, KK_INT const incx, T *Y, KK_INT const incy, mag_type *c, mag_type *s); static void rotg(T *a, T *b, mag_type *c, T *s); - static void rotm(const KK_INT n, T *X, const KK_INT incx, T *Y, - const KK_INT incy, T const *param); + static void 
rotm(const KK_INT n, T *X, const KK_INT incx, T *Y, const KK_INT incy, T const *param); static void rotmg(T *d1, T *d2, T *x1, const T *y1, T *param); - static void swap(KK_INT const N, T *X, KK_INT const incx, T *Y, - KK_INT const incy); + static void swap(KK_INT const N, T *X, KK_INT const incx, T *Y, KK_INT const incy); - static void gemv(const char trans, KK_INT m, KK_INT n, const T alpha, - const T *a, KK_INT lda, const T *b, KK_INT ldb, const T beta, + static void gemv(const char trans, KK_INT m, KK_INT n, const T alpha, const T *a, KK_INT lda, const T *b, KK_INT ldb, + const T beta, /* */ T *c, KK_INT ldc); - static void ger(KK_INT m, KK_INT n, const T alpha, const T *x, KK_INT incx, - const T *y, KK_INT incy, T *a, KK_INT lda); + static void ger(KK_INT m, KK_INT n, const T alpha, const T *x, KK_INT incx, const T *y, KK_INT incy, T *a, + KK_INT lda); - static void geru(KK_INT m, KK_INT n, const T alpha, const T *x, KK_INT incx, - const T *y, KK_INT incy, T *a, KK_INT lda); + static void geru(KK_INT m, KK_INT n, const T alpha, const T *x, KK_INT incx, const T *y, KK_INT incy, T *a, + KK_INT lda); - static void gerc(KK_INT m, KK_INT n, const T alpha, const T *x, KK_INT incx, - const T *y, KK_INT incy, T *a, KK_INT lda); + static void gerc(KK_INT m, KK_INT n, const T alpha, const T *x, KK_INT incx, const T *y, KK_INT incy, T *a, + KK_INT lda); - static void syr(const char uplo, KK_INT n, const T alpha, const T *x, - KK_INT incx, T *a, KK_INT lda); + static void syr(const char uplo, KK_INT n, const T alpha, const T *x, KK_INT incx, T *a, KK_INT lda); - static void syr2(const char uplo, KK_INT n, const T alpha, const T *x, - KK_INT incx, const T *y, KK_INT incy, T *a, KK_INT lda); + static void syr2(const char uplo, KK_INT n, const T alpha, const T *x, KK_INT incx, const T *y, KK_INT incy, T *a, + KK_INT lda); template - static void her(const char uplo, KK_INT n, const tAlpha alpha, const T *x, - KK_INT incx, T *a, KK_INT lda); + static void her(const char uplo, 
KK_INT n, const tAlpha alpha, const T *x, KK_INT incx, T *a, KK_INT lda); - static void her2(const char uplo, KK_INT n, const T alpha, const T *x, - KK_INT incx, const T *y, KK_INT incy, T *a, KK_INT lda); + static void her2(const char uplo, KK_INT n, const T alpha, const T *x, KK_INT incx, const T *y, KK_INT incy, T *a, + KK_INT lda); - static void trsv(const char uplo, const char transa, const char diag, - KK_INT m, const T *a, KK_INT lda, + static void trsv(const char uplo, const char transa, const char diag, KK_INT m, const T *a, KK_INT lda, /* */ T *b, KK_INT ldb); - static void gemm(const char transa, const char transb, KK_INT m, KK_INT n, - KK_INT k, const T alpha, const T *a, KK_INT lda, const T *b, - KK_INT ldb, const T beta, + static void gemm(const char transa, const char transb, KK_INT m, KK_INT n, KK_INT k, const T alpha, const T *a, + KK_INT lda, const T *b, KK_INT ldb, const T beta, /* */ T *c, KK_INT ldc); - static void herk(const char transa, const char transb, KK_INT n, KK_INT k, - const T alpha, const T *a, KK_INT lda, const T beta, + static void herk(const char transa, const char transb, KK_INT n, KK_INT k, const T alpha, const T *a, KK_INT lda, + const T beta, /* */ T *c, KK_INT ldc); - static void trmm(const char side, const char uplo, const char transa, - const char diag, KK_INT m, KK_INT n, const T alpha, - const T *a, KK_INT lda, + static void trmm(const char side, const char uplo, const char transa, const char diag, KK_INT m, KK_INT n, + const T alpha, const T *a, KK_INT lda, /* */ T *b, KK_INT ldb); - static void trsm(const char side, const char uplo, const char transa, - const char diag, KK_INT m, KK_INT n, const T alpha, - const T *a, KK_INT lda, + static void trsm(const char side, const char uplo, const char transa, const char diag, KK_INT m, KK_INT n, + const T alpha, const T *a, KK_INT lda, /* */ T *b, KK_INT ldb); }; } // namespace Impl diff --git a/blas/tpls/KokkosBlas_Rocm_tpl.hpp b/blas/tpls/KokkosBlas_Rocm_tpl.hpp index 
6f89d349c9..b5a7dabf6f 100644 --- a/blas/tpls/KokkosBlas_Rocm_tpl.hpp +++ b/blas/tpls/KokkosBlas_Rocm_tpl.hpp @@ -25,8 +25,7 @@ namespace Impl { RocBlasSingleton::RocBlasSingleton() { KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_create_handle(&handle)); - Kokkos::push_finalize_hook( - [&]() { KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_destroy_handle(handle)); }); + Kokkos::push_finalize_hook([&]() { KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_destroy_handle(handle)); }); } RocBlasSingleton& RocBlasSingleton::singleton() { diff --git a/blas/tpls/KokkosBlas_tpl_spec.hpp b/blas/tpls/KokkosBlas_tpl_spec.hpp index 0151c0534f..7f40edf435 100644 --- a/blas/tpls/KokkosBlas_tpl_spec.hpp +++ b/blas/tpls/KokkosBlas_tpl_spec.hpp @@ -32,8 +32,7 @@ struct CudaBlasSingleton { static CudaBlasSingleton& singleton(); }; -inline void cublas_internal_error_throw(cublasStatus_t cublasState, - const char* name, const char* file, +inline void cublas_internal_error_throw(cublasStatus_t cublasState, const char* name, const char* file, const int line) { std::ostringstream out; // out << name << " error( " << cublasGetStatusName(cublasState) @@ -43,9 +42,7 @@ inline void cublas_internal_error_throw(cublasStatus_t cublasState, case CUBLAS_STATUS_NOT_INITIALIZED: out << "CUBLAS_STATUS_NOT_INITIALIZED): the library was not initialized."; break; - case CUBLAS_STATUS_ALLOC_FAILED: - out << "CUBLAS_STATUS_ALLOC_FAILED): the resource allocation failed."; - break; + case CUBLAS_STATUS_ALLOC_FAILED: out << "CUBLAS_STATUS_ALLOC_FAILED): the resource allocation failed."; break; case CUBLAS_STATUS_INVALID_VALUE: out << "CUBLAS_STATUS_INVALID_VALUE): an invalid numerical value was " "used as an argument."; @@ -62,9 +59,7 @@ inline void cublas_internal_error_throw(cublasStatus_t cublasState, out << "CUBLAS_STATUS_EXECUTION_FAILED): the GPU program failed to " "execute."; break; - case CUBLAS_STATUS_INTERNAL_ERROR: - out << "CUBLAS_STATUS_INTERNAL_ERROR): an internal operation failed."; - break; + case 
CUBLAS_STATUS_INTERNAL_ERROR: out << "CUBLAS_STATUS_INTERNAL_ERROR): an internal operation failed."; break; case CUBLAS_STATUS_NOT_SUPPORTED: out << "CUBLAS_STATUS_NOT_SUPPORTED): the feature required is not " "supported."; @@ -77,10 +72,8 @@ inline void cublas_internal_error_throw(cublasStatus_t cublasState, throw std::runtime_error(out.str()); } -inline void cublas_internal_safe_call(cublasStatus_t cublasState, - const char* name, - const char* file = nullptr, - const int line = 0) { +inline void cublas_internal_safe_call(cublasStatus_t cublasState, const char* name, const char* file = nullptr, + const int line = 0) { if (CUBLAS_STATUS_SUCCESS != cublasState) { cublas_internal_error_throw(cublasState, name, file, line); } @@ -89,8 +82,7 @@ inline void cublas_internal_safe_call(cublasStatus_t cublasState, // The macro below defines the interface for the safe cublas calls. // The functions themselves are protected by impl namespace and this // is not meant to be used by external application or libraries. 
-#define KOKKOS_CUBLAS_SAFE_CALL_IMPL(call) \ - KokkosBlas::Impl::cublas_internal_safe_call(call, #call, __FILE__, __LINE__) +#define KOKKOS_CUBLAS_SAFE_CALL_IMPL(call) KokkosBlas::Impl::cublas_internal_safe_call(call, #call, __FILE__, __LINE__) /// \brief This function converts KK transpose mode to cuBLAS transpose mode inline cublasOperation_t trans_mode_kk_to_cublas(const char kkMode[]) { @@ -122,8 +114,7 @@ struct RocBlasSingleton { static RocBlasSingleton& singleton(); }; -inline void rocblas_internal_error_throw(rocblas_status rocblasState, - const char* name, const char* file, +inline void rocblas_internal_error_throw(rocblas_status rocblasState, const char* name, const char* file, const int line) { std::ostringstream out; out << name << " error( "; @@ -132,29 +123,19 @@ inline void rocblas_internal_error_throw(rocblas_status rocblasState, out << "rocblas_status_invalid_handle): handle not initialized, invalid " "or null."; break; - case rocblas_status_not_implemented: - out << "rocblas_status_not_implemented): function is not implemented."; - break; - case rocblas_status_invalid_pointer: - out << "rocblas_status_invalid_pointer): invalid pointer argument."; - break; - case rocblas_status_invalid_size: - out << "rocblas_status_invalid_size): invalid size argument."; - break; + case rocblas_status_not_implemented: out << "rocblas_status_not_implemented): function is not implemented."; break; + case rocblas_status_invalid_pointer: out << "rocblas_status_invalid_pointer): invalid pointer argument."; break; + case rocblas_status_invalid_size: out << "rocblas_status_invalid_size): invalid size argument."; break; case rocblas_status_memory_error: out << "rocblas_status_memory_error): failed internal memory allocation, " "copy or dealloc."; break; - case rocblas_status_internal_error: - out << "rocblas_status_internal_error): other internal library failure."; - break; + case rocblas_status_internal_error: out << "rocblas_status_internal_error): other internal 
library failure."; break; case rocblas_status_perf_degraded: out << "rocblas_status_perf_degraded): performance degraded due to low " "device memory."; break; - case rocblas_status_size_query_mismatch: - out << "unmatched start/stop size query): ."; - break; + case rocblas_status_size_query_mismatch: out << "unmatched start/stop size query): ."; break; case rocblas_status_size_increased: out << "rocblas_status_size_increased): queried device memory size " "increased."; @@ -163,9 +144,7 @@ inline void rocblas_internal_error_throw(rocblas_status rocblasState, out << "rocblas_status_size_unchanged): queried device memory size " "unchanged."; break; - case rocblas_status_invalid_value: - out << "rocblas_status_invalid_value): passed argument not valid."; - break; + case rocblas_status_invalid_value: out << "rocblas_status_invalid_value): passed argument not valid."; break; case rocblas_status_continue: out << "rocblas_status_continue): nothing preventing function to " "proceed."; @@ -182,10 +161,8 @@ inline void rocblas_internal_error_throw(rocblas_status rocblasState, throw std::runtime_error(out.str()); } -inline void rocblas_internal_safe_call(rocblas_status rocblasState, - const char* name, - const char* file = nullptr, - const int line = 0) { +inline void rocblas_internal_safe_call(rocblas_status rocblasState, const char* name, const char* file = nullptr, + const int line = 0) { if (rocblas_status_success != rocblasState) { rocblas_internal_error_throw(rocblasState, name, file, line); } diff --git a/blas/unit_test/Test_Blas1_abs.hpp b/blas/unit_test/Test_Blas1_abs.hpp index 5bf3f55388..eb2d290a6f 100644 --- a/blas/unit_test/Test_Blas1_abs.hpp +++ b/blas/unit_test/Test_Blas1_abs.hpp @@ -32,8 +32,7 @@ void impl_test_abs(int N) { view_stride_adapter y("Y", N); view_stride_adapter org_y("Org_Y", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; @@ -55,8 +54,7 @@ void 
impl_test_abs(int N) { // Copy result to host (h_y is subview of h_b_y) Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(y.h_view(i), AT::abs(x.h_view(i)), - eps * AT::abs(x.h_view(i))); + EXPECT_NEAR_KK(y.h_view(i), AT::abs(x.h_view(i)), eps * AT::abs(x.h_view(i))); } // Run with const input // Reset output @@ -64,8 +62,7 @@ void impl_test_abs(int N) { KokkosBlas::abs(y.d_view, x.d_view_const); Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(y.h_view(i), AT::abs(x.h_view(i)), - eps * AT::abs(x.h_view(i))); + EXPECT_NEAR_KK(y.h_view(i), AT::abs(x.h_view(i)), eps * AT::abs(x.h_view(i))); } } @@ -79,8 +76,7 @@ void impl_test_abs_mv(int N, int K) { view_stride_adapter y("Y", N, K); view_stride_adapter org_y("Org_Y", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; @@ -104,8 +100,7 @@ void impl_test_abs_mv(int N, int K) { Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(y.h_view(i, j), AT::abs(x.h_view(i, j)), - eps * AT::abs(x.h_view(i, j))); + EXPECT_NEAR_KK(y.h_view(i, j), AT::abs(x.h_view(i, j)), eps * AT::abs(x.h_view(i, j))); } } // Test and verify const input @@ -115,8 +110,7 @@ void impl_test_abs_mv(int N, int K) { Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(y.h_view(i, j), AT::abs(x.h_view(i, j)), - eps * AT::abs(x.h_view(i, j))); + EXPECT_NEAR_KK(y.h_view(i, j), AT::abs(x.h_view(i, j)), eps * AT::abs(x.h_view(i, j))); } } } @@ -125,8 +119,7 @@ void impl_test_abs_mv(int N, int K) { template int test_abs() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View 
view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_abs(0); @@ -136,8 +129,7 @@ int test_abs() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_abs(0); @@ -146,8 +138,7 @@ int test_abs() { // Test::impl_test_abs(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_abs(0); @@ -156,8 +147,7 @@ int test_abs() { // Test::impl_test_abs(132231); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_abs(1024); Test::impl_test_abs(1024); #endif @@ -168,8 +158,7 @@ int test_abs() { template int test_abs_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_abs_mv(0, 5); @@ -179,8 +168,7 @@ int test_abs_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_abs_mv(0, 5); @@ -189,8 +177,7 @@ int test_abs_mv() { // Test::impl_test_abs_mv(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) 
&& \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_abs_mv(0, 5); @@ -199,8 +186,7 @@ int test_abs_mv() { // Test::impl_test_abs_mv(132231,5); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_abs_mv(1024, 5); Test::impl_test_abs_mv(1024, 5); #endif @@ -209,8 +195,7 @@ int test_abs_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, abs_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_float"); test_abs(); @@ -224,8 +209,7 @@ TEST_F(TestCategory, abs_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, abs_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_double"); test_abs(); @@ -239,8 +223,7 @@ TEST_F(TestCategory, abs_mv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, abs_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_double"); test_abs, Kokkos::complex, TestDevice>(); @@ -253,9 +236,8 @@ TEST_F(TestCategory, abs_mv_complex_double) { } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if 
defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, abs_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_int"); test_abs(); diff --git a/blas/unit_test/Test_Blas1_asum.hpp b/blas/unit_test/Test_Blas1_asum.hpp index 65b5b2c063..07cf2e6998 100644 --- a/blas/unit_test/Test_Blas1_asum.hpp +++ b/blas/unit_test/Test_Blas1_asum.hpp @@ -28,8 +28,7 @@ void impl_test_asum(int N) { view_stride_adapter a("A", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); @@ -46,8 +45,7 @@ void impl_test_asum(int N) { // parts. // // This is safe; ArithTraits::imag is 0 if T is real. - expected_result += - MAT::abs(AT::real(a.h_view(i))) + MAT::abs(AT::imag(a.h_view(i))); + expected_result += MAT::abs(AT::real(a.h_view(i))) + MAT::abs(AT::imag(a.h_view(i))); } typename AT::mag_type nonconst_result = KokkosBlas::asum(a.d_view); @@ -62,8 +60,7 @@ void impl_test_asum(int N) { template int test_asum() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_asum(0); Test::impl_test_asum(13); @@ -72,8 +69,7 @@ int test_asum() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_asum(0); Test::impl_test_asum(13); @@ -81,8 +77,7 @@ int test_asum() { // Test::impl_test_asum(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_asum(0); Test::impl_test_asum(13); @@ -94,8 +89,7 @@ int test_asum() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, asum_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::asum_float"); test_asum(); @@ -104,8 +98,7 @@ TEST_F(TestCategory, asum_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, asum_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::asum_double"); test_asum(); @@ -114,8 +107,7 @@ TEST_F(TestCategory, asum_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, asum_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::asum_complex_double"); test_asum, TestDevice>(); @@ -123,9 +115,8 @@ TEST_F(TestCategory, asum_complex_double) { } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, asum_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::asum_int"); test_asum(); diff --git a/blas/unit_test/Test_Blas1_axpby.hpp b/blas/unit_test/Test_Blas1_axpby.hpp index 299e18e493..16d6bdc78f 100644 --- a/blas/unit_test/Test_Blas1_axpby.hpp +++ b/blas/unit_test/Test_Blas1_axpby.hpp @@ -34,16 +34,14 @@ void 
impl_test_axpby(int N) { const MagnitudeB eps = Kokkos::ArithTraits::epsilon(); const MagnitudeB max_val = 10; const MagnitudeB max_error = - (static_cast(Kokkos::ArithTraits::abs(a)) + - Kokkos::ArithTraits::abs(b)) * - max_val * eps; + (static_cast(Kokkos::ArithTraits::abs(a)) + Kokkos::ArithTraits::abs(b)) * max_val * + eps; view_stride_adapter x("X", N); view_stride_adapter y("Y", N); view_stride_adapter org_y("Org_Y", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; @@ -58,8 +56,7 @@ void impl_test_axpby(int N) { KokkosBlas::axpby(a, x.d_view, b, y.d_view); Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(static_cast(a * x.h_view(i) + b * org_y.h_view(i)), - y.h_view(i), 2 * max_error); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) + b * org_y.h_view(i)), y.h_view(i), 2 * max_error); } // Re-randomize y @@ -68,8 +65,7 @@ void impl_test_axpby(int N) { KokkosBlas::axpby(a, x.d_view_const, b, y.d_view); Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(static_cast(a * x.h_view(i) + b * org_y.h_view(i)), - y.h_view(i), 2 * max_error); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) + b * org_y.h_view(i)), y.h_view(i), 2 * max_error); } } @@ -88,12 +84,10 @@ void impl_test_axpby_mv(int N, int K) { const MagnitudeB eps = Kokkos::ArithTraits::epsilon(); const MagnitudeB max_val = 10; const MagnitudeB max_error = - (static_cast(Kokkos::ArithTraits::abs(a)) + - Kokkos::ArithTraits::abs(b)) * - max_val * eps; + (static_cast(Kokkos::ArithTraits::abs(a)) + Kokkos::ArithTraits::abs(b)) * max_val * + eps; - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; @@ -114,9 +108,7 @@ void impl_test_axpby_mv(int N, int K) { for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK( - static_cast(a * x.h_view(i, j) + b * 
org_y.h_view(i, j)), - y.h_view(i, j), 2 * max_error); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i, j) + b * org_y.h_view(i, j)), y.h_view(i, j), 2 * max_error); } } @@ -126,9 +118,7 @@ void impl_test_axpby_mv(int N, int K) { for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK( - static_cast(a * x.h_view(i, j) + b * org_y.h_view(i, j)), - y.h_view(i, j), 2 * max_error); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i, j) + b * org_y.h_view(i, j)), y.h_view(i, j), 2 * max_error); } } } @@ -137,8 +127,7 @@ void impl_test_axpby_mv(int N, int K) { template int test_axpby() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_axpby(0); @@ -148,8 +137,7 @@ int test_axpby() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_axpby(0); @@ -158,8 +146,7 @@ int test_axpby() { Test::impl_test_axpby(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_axpby(0); @@ -168,8 +155,7 @@ int test_axpby() { Test::impl_test_axpby(132231); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_axpby(1024); Test::impl_test_axpby(1024); #endif @@ -180,8 +166,7 @@ int 
test_axpby() { template int test_axpby_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_axpby_mv(0, 5); @@ -191,8 +176,7 @@ int test_axpby_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_axpby_mv(0, 5); @@ -201,8 +185,7 @@ int test_axpby_mv() { Test::impl_test_axpby_mv(132231, 5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_axpby_mv(0, 5); @@ -211,8 +194,7 @@ int test_axpby_mv() { Test::impl_test_axpby_mv(132231, 5); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_axpby_mv(1024, 5); Test::impl_test_axpby_mv(1024, 5); #endif @@ -221,8 +203,7 @@ int test_axpby_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpby_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_float"); test_axpby(); @@ -236,8 +217,7 @@ TEST_F(TestCategory, axpby_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpby_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_double"); test_axpby(); @@ -250,8 +230,7 @@ TEST_F(TestCategory, axpby_mv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpby_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_complex_double"); test_axpby, Kokkos::complex, TestDevice>(); @@ -264,9 +243,8 @@ TEST_F(TestCategory, axpby_mv_complex_double) { } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpby_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_int"); test_axpby(); @@ -279,8 +257,7 @@ TEST_F(TestCategory, axpby_mv_int) { } #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, axpby_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_double_int"); test_axpby(); diff --git a/blas/unit_test/Test_Blas1_axpby_unification.hpp b/blas/unit_test/Test_Blas1_axpby_unification.hpp index 6ce7bad0b1..4f9b394c25 100644 --- a/blas/unit_test/Test_Blas1_axpby_unification.hpp +++ b/blas/unit_test/Test_Blas1_axpby_unification.hpp @@ -76,22 +76,16 @@ constexpr bool isRank0() { return false; } -template -void impl_test_axpby_unification_compare( - tA const& a, tX const& x, tB const& b, tY const& y, int N, - bool testWithNanY, - typename 
Kokkos::ArithTraits::mag_type const max_val, - typename Kokkos::ArithTraits::mag_type const max_error, - tScalarA const inputValueA = Kokkos::ArithTraits::zero(), - tScalarB const inputValueB = Kokkos::ArithTraits::zero()) { - using ScalarTypeX = - typename std::remove_const::type; - using ScalarTypeY = - typename std::remove_const::type; - - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); +template +void impl_test_axpby_unification_compare(tA const& a, tX const& x, tB const& b, tY const& y, int N, bool testWithNanY, + typename Kokkos::ArithTraits::mag_type const max_val, + typename Kokkos::ArithTraits::mag_type const max_error, + tScalarA const inputValueA = Kokkos::ArithTraits::zero(), + tScalarB const inputValueB = Kokkos::ArithTraits::zero()) { + using ScalarTypeX = typename std::remove_const::type; + using ScalarTypeY = typename std::remove_const::type; + + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarTypeX randStart, randEnd; @@ -121,8 +115,7 @@ void impl_test_axpby_unification_compare( valueB = b; KokkosBlas::axpby(a, x.d_view, b, y.d_view); } else if constexpr (isRank0()) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { valueB = inputValueB; } else { typename tB::HostMirror h_b("h_B"); @@ -136,8 +129,7 @@ void impl_test_axpby_unification_compare( KokkosBlas::axpby(a, x.d_view, b.d_view, y.d_view); } } else if constexpr (isRank0()) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { valueA = inputValueA; } else { typename tA::HostMirror h_a("h_A"); @@ -148,8 +140,7 @@ void impl_test_axpby_unification_compare( valueB = b; KokkosBlas::axpby(a, x.d_view, b, y.d_view); } else if constexpr (isRank0()) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { valueB = inputValueB; } else { typename tB::HostMirror h_b("h_B"); @@ -169,8 +160,7 @@ void impl_test_axpby_unification_compare( valueB = b; KokkosBlas::axpby(a.d_view, x.d_view, b, y.d_view); } else if constexpr (isRank0()) { - if constexpr 
(std::is_same_v) { + if constexpr (std::is_same_v) { valueB = inputValueB; } else { typename tB::HostMirror h_b("h_B"); @@ -189,9 +179,8 @@ void impl_test_axpby_unification_compare( if (testWithNanY == false) { for (int i(0); i < N; ++i) { - EXPECT_NEAR_KK(static_cast(valueA * x.h_view(i) + - valueB * org_y.h_view(i)), - y.h_view(i), 4. * max_error); + EXPECT_NEAR_KK(static_cast(valueA * x.h_view(i) + valueB * org_y.h_view(i)), y.h_view(i), + 4. * max_error); } } else { // ******************************************************** @@ -220,28 +209,22 @@ void impl_test_axpby_unification_compare( } else { EXPECT_NE(y.h_view(i), Kokkos::ArithTraits::nan()); } - EXPECT_NEAR_KK(static_cast(valueA * x.h_view(i)), - y.h_view(i), 4. * max_error); + EXPECT_NEAR_KK(static_cast(valueA * x.h_view(i)), y.h_view(i), 4. * max_error); } } } -template -void impl_test_axpby_mv_unification_compare( - tA const& a, tX const& x, tB const& b, tY const& y, int N, int K, - bool testWithNanY, - typename Kokkos::ArithTraits::mag_type const max_val, - typename Kokkos::ArithTraits::mag_type const max_error, - tScalarA const inputValueA = Kokkos::ArithTraits::zero(), - tScalarB const inputValueB = Kokkos::ArithTraits::zero()) { - using ScalarTypeX = - typename std::remove_const::type; - using ScalarTypeY = - typename std::remove_const::type; - - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); +template +void impl_test_axpby_mv_unification_compare(tA const& a, tX const& x, tB const& b, tY const& y, int N, int K, + bool testWithNanY, + typename Kokkos::ArithTraits::mag_type const max_val, + typename Kokkos::ArithTraits::mag_type const max_error, + tScalarA const inputValueA = Kokkos::ArithTraits::zero(), + tScalarB const inputValueB = Kokkos::ArithTraits::zero()) { + using ScalarTypeX = typename std::remove_const::type; + using ScalarTypeY = typename std::remove_const::type; + + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarTypeX randStart, randEnd; @@ -284,8 +267,7 @@ void 
impl_test_axpby_mv_unification_compare( valueB = b; KokkosBlas::axpby(a, x.d_view, b, y.d_view); } else if constexpr (isRank0()) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { valueB = inputValueB; } else { typename tB::HostMirror h_b("h_B"); @@ -298,8 +280,7 @@ void impl_test_axpby_mv_unification_compare( KokkosBlas::axpby(a, x.d_view, b.d_view, y.d_view); } } else if constexpr (isRank0()) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { valueA = inputValueA; } else { typename tA::HostMirror h_a("h_A"); @@ -310,8 +291,7 @@ void impl_test_axpby_mv_unification_compare( valueB = b; KokkosBlas::axpby(a, x.d_view, b, y.d_view); } else if constexpr (isRank0()) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { valueB = inputValueB; } else { typename tB::HostMirror h_b("h_B"); @@ -329,8 +309,7 @@ void impl_test_axpby_mv_unification_compare( valueB = b; KokkosBlas::axpby(a.d_view, x.d_view, b, y.d_view); } else if constexpr (isRank0()) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { valueB = inputValueB; } else { typename tB::HostMirror h_b("h_B"); @@ -371,22 +350,18 @@ void impl_test_axpby_mv_unification_compare( << std::endl; #endif vanillaValue = - static_cast(a.h_view(a_k) * x.h_view(i, k) + - b.h_view(b_k) * org_y.h_view(i, k)); + static_cast(a.h_view(a_k) * x.h_view(i, k) + b.h_view(b_k) * org_y.h_view(i, k)); } else { int a_k(a.h_view.extent(0) == 1 ? 0 : k); - vanillaValue = static_cast( - a.h_view(a_k) * x.h_view(i, k) + valueB * org_y.h_view(i, k)); + vanillaValue = static_cast(a.h_view(a_k) * x.h_view(i, k) + valueB * org_y.h_view(i, k)); } } else { if constexpr (bIsRank1) { (void)valueB; // Avoid "set but not used" error int b_k(b.h_view.extent(0) == 1 ? 
0 : k); - vanillaValue = static_cast( - valueA * x.h_view(i, k) + b.h_view(b_k) * org_y.h_view(i, k)); + vanillaValue = static_cast(valueA * x.h_view(i, k) + b.h_view(b_k) * org_y.h_view(i, k)); } else { - vanillaValue = static_cast( - valueA * x.h_view(i, k) + valueB * org_y.h_view(i, k)); + vanillaValue = static_cast(valueA * x.h_view(i, k) + valueB * org_y.h_view(i, k)); } } #if 0 @@ -411,8 +386,7 @@ void impl_test_axpby_mv_unification_compare( if constexpr (aIsRank1) { (void)valueA; // Avoid "set but not used" error int a_k(a.h_view.extent(0) == 1 ? 0 : k); - vanillaValue = - static_cast(a.h_view(a_k) * x.h_view(i, k)); + vanillaValue = static_cast(a.h_view(a_k) * x.h_view(i, k)); #if 0 ScalarTypeY tmp = static_cast(a.h_view(a_k) * x.h_view(i, k) + valueB * org_y.h_view(i, k)); std::cout << "i = " << i @@ -468,9 +442,8 @@ void impl_test_axpby_mv_unification_compare( } } -template +template void impl_test_axpby_unification(int const N) { using ViewTypeAr0 = Kokkos::View; using ViewTypeAr1s_1 = Kokkos::View; @@ -484,10 +457,8 @@ void impl_test_axpby_unification(int const N) { using ViewTypeY = Kokkos::View; - std::array const valuesA{ - -1, Kokkos::ArithTraits::zero(), 1, 3}; - std::array const valuesB{ - -1, Kokkos::ArithTraits::zero(), 1, 5}; + std::array const valuesA{-1, Kokkos::ArithTraits::zero(), 1, 3}; + std::array const valuesB{-1, Kokkos::ArithTraits::zero(), 1, 5}; // eps should probably be based on tScalarB since that is the type // in which the result is computed. 
@@ -495,9 +466,8 @@ void impl_test_axpby_unification(int const N) { MagnitudeB const eps = Kokkos::ArithTraits::epsilon(); MagnitudeB const max_val = 10; MagnitudeB const max_error = - static_cast( - Kokkos::ArithTraits::abs(valuesA[valuesA.size() - 1]) + - Kokkos::ArithTraits::abs(valuesB[valuesB.size() - 1])) * + static_cast(Kokkos::ArithTraits::abs(valuesA[valuesA.size() - 1]) + + Kokkos::ArithTraits::abs(valuesB[valuesB.size() - 1])) * max_val * eps; // ************************************************************ @@ -518,15 +488,13 @@ void impl_test_axpby_unification(int const N) { a = valueA; b = valueB; - impl_test_axpby_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - tScalarB, view_stride_adapter, Device>( - a, x, b, y, N, false, max_val, max_error); + impl_test_axpby_unification_compare, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - tScalarB, view_stride_adapter, Device>( - a, x, b, y, N, true, max_val, max_error); + impl_test_axpby_unification_compare, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, true, max_val, + max_error); } } } @@ -556,14 +524,12 @@ void impl_test_axpby_unification(int const N) { a = valueA; Kokkos::deep_copy(b, valueB); - impl_test_axpby_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - ViewTypeBr0, view_stride_adapter, Device>( - a, x, b, y, N, false, max_val, max_error); + impl_test_axpby_unification_compare, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - ViewTypeBr0, view_stride_adapter, Device>( + impl_test_axpby_unification_compare, tScalarB, + ViewTypeBr0, view_stride_adapter, 
Device>( a, x, b, y, N, true, max_val, max_error); } } @@ -589,16 +555,13 @@ void impl_test_axpby_unification(int const N) { a = valueA; Kokkos::deep_copy(b.d_base, valueB); - impl_test_axpby_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, false, max_val, max_error); + impl_test_axpby_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, false, max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, true, - max_val, max_error); + impl_test_axpby_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, true, max_val, max_error); } } } @@ -622,15 +585,13 @@ void impl_test_axpby_unification(int const N) { a = valueA; Kokkos::deep_copy(b.d_base, valueB); - impl_test_axpby_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, false, max_val, max_error); + impl_test_axpby_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, Device>( + a, x, b, y, N, false, max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, true, max_val, max_error); + impl_test_axpby_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, true, max_val, max_error); } } } @@ -657,15 +618,13 @@ void impl_test_axpby_unification(int const N) { Kokkos::deep_copy(a, valueA); b = valueB; - impl_test_axpby_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - tScalarB, view_stride_adapter, Device>( - a, x, b, y, N, 
false, max_val, max_error); + impl_test_axpby_unification_compare, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - tScalarB, view_stride_adapter, Device>( - a, x, b, y, N, true, max_val, max_error); + impl_test_axpby_unification_compare, tScalarB, + tScalarB, view_stride_adapter, Device>(a, x, b, y, N, true, + max_val, max_error); } } } @@ -678,8 +637,7 @@ void impl_test_axpby_unification(int const N) { #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Starting case 06/16" << std::endl; #endif - if constexpr ((std::is_same_v) || - (std::is_same_v)) { + if constexpr ((std::is_same_v) || (std::is_same_v)) { // Avoid the test, due to compilation errors } else { for (size_t i(0); i < valuesA.size(); ++i) { @@ -694,14 +652,12 @@ void impl_test_axpby_unification(int const N) { Kokkos::deep_copy(a, valueA); Kokkos::deep_copy(b, valueB); - impl_test_axpby_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - ViewTypeBr0, view_stride_adapter, Device>( - a, x, b, y, N, false, max_val, max_error); + impl_test_axpby_unification_compare, tScalarB, + ViewTypeBr0, view_stride_adapter, Device>(a, x, b, y, N, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - ViewTypeBr0, view_stride_adapter, Device>( + impl_test_axpby_unification_compare, tScalarB, + ViewTypeBr0, view_stride_adapter, Device>( a, x, b, y, N, true, max_val, max_error); } } @@ -730,17 +686,13 @@ void impl_test_axpby_unification(int const N) { Kokkos::deep_copy(a, valueA); Kokkos::deep_copy(b.d_base, valueB); - impl_test_axpby_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, false, - max_val, 
max_error); + impl_test_axpby_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, false, max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, true, - max_val, max_error); + impl_test_axpby_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, true, max_val, max_error); } } } @@ -768,16 +720,13 @@ void impl_test_axpby_unification(int const N) { Kokkos::deep_copy(a, valueA); Kokkos::deep_copy(b.d_base, valueB); - impl_test_axpby_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, false, max_val, max_error); + impl_test_axpby_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, false, max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, true, - max_val, max_error); + impl_test_axpby_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, true, max_val, max_error); } } } @@ -802,17 +751,15 @@ void impl_test_axpby_unification(int const N) { Kokkos::deep_copy(a.d_base, valueA); b = valueB; - impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, false, - max_val, max_error); + impl_test_axpby_unification_compare, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - 
view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, true, - max_val, max_error); + impl_test_axpby_unification_compare, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, true, max_val, + max_error); } } } @@ -839,17 +786,15 @@ void impl_test_axpby_unification(int const N) { Kokkos::deep_copy(a.d_base, valueA); Kokkos::deep_copy(b, valueB); - impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, false, - max_val, max_error); + impl_test_axpby_unification_compare, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, true, - max_val, max_error); + impl_test_axpby_unification_compare, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, true, max_val, + max_error); } } } @@ -875,17 +820,14 @@ void impl_test_axpby_unification(int const N) { Kokkos::deep_copy(a.d_base, valueA); Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, true, - max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + 
view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, true, max_val, + max_error); } } } @@ -909,17 +851,15 @@ void impl_test_axpby_unification(int const N) { Kokkos::deep_copy(a.d_base, valueA); Kokkos::deep_copy(b.d_base, valueB); - impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, false, max_val, max_error); + impl_test_axpby_unification_compare, + view_stride_adapter, tScalarB, view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, true, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, true, max_val, + max_error); } } } @@ -943,17 +883,14 @@ void impl_test_axpby_unification(int const N) { Kokkos::deep_copy(a.d_base, valueA); b = valueB; - impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, false, - max_val, max_error); + impl_test_axpby_unification_compare, view_stride_adapter, + tScalarB, tScalarB, view_stride_adapter, Device>( + a, x, b, y, N, false, max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, true, - max_val, max_error); + impl_test_axpby_unification_compare, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, true, max_val, + max_error); } } } @@ -980,17 +917,15 @@ void impl_test_axpby_unification(int const N) { Kokkos::deep_copy(a.d_base, valueA); 
Kokkos::deep_copy(b, valueB); - impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, false, - max_val, max_error); + impl_test_axpby_unification_compare, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, true, - max_val, max_error); + impl_test_axpby_unification_compare, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, true, max_val, + max_error); } } } @@ -1015,18 +950,15 @@ void impl_test_axpby_unification(int const N) { Kokkos::deep_copy(a.d_base, valueA); Kokkos::deep_copy(b.d_base, valueB); - impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, false, max_val, max_error); + impl_test_axpby_unification_compare, view_stride_adapter, + tScalarB, view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, true, - max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, true, max_val, + max_error); } } } @@ -1050,26 +982,22 @@ void impl_test_axpby_unification(int const N) { Kokkos::deep_copy(a.d_base, valueA); Kokkos::deep_copy(b.d_base, valueB); - impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, 
view_stride_adapter, - Device>(a, x, b, y, N, false, max_val, max_error); + impl_test_axpby_unification_compare, view_stride_adapter, + tScalarB, view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, false, max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, true, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, true, max_val, + max_error); } } } } } -template +template void impl_test_axpby_mv_unification(int const N, int const K) { // std::cout << "=========================================" << std::endl; // std::cout << "Entering impl_test_axpby_mv_unification()" @@ -1094,10 +1022,8 @@ void impl_test_axpby_mv_unification(int const N, int const K) { using ViewTypeY = Kokkos::View; - std::array const valuesA{ - -1, Kokkos::ArithTraits::zero(), 1, 3}; - std::array const valuesB{ - -1, Kokkos::ArithTraits::zero(), 1, 5}; + std::array const valuesA{-1, Kokkos::ArithTraits::zero(), 1, 3}; + std::array const valuesB{-1, Kokkos::ArithTraits::zero(), 1, 5}; // eps should probably be based on tScalarB since that is the type // in which the result is computed. 
@@ -1105,9 +1031,8 @@ void impl_test_axpby_mv_unification(int const N, int const K) { MagnitudeB const eps = Kokkos::ArithTraits::epsilon(); MagnitudeB const max_val = 10; MagnitudeB const max_error = - static_cast( - Kokkos::ArithTraits::abs(valuesA[valuesA.size() - 1]) + - Kokkos::ArithTraits::abs(valuesB[valuesB.size() - 1])) * + static_cast(Kokkos::ArithTraits::abs(valuesA[valuesA.size() - 1]) + + Kokkos::ArithTraits::abs(valuesB[valuesB.size() - 1])) * max_val * eps; // ************************************************************ @@ -1128,15 +1053,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { a = valueA; b = valueB; - impl_test_axpby_mv_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - tScalarB, view_stride_adapter, Device>( - a, x, b, y, N, K, false, max_val, max_error); + impl_test_axpby_mv_unification_compare, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_mv_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - tScalarB, view_stride_adapter, Device>( - a, x, b, y, N, K, true, max_val, max_error); + impl_test_axpby_mv_unification_compare, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -1163,14 +1086,12 @@ void impl_test_axpby_mv_unification(int const N, int const K) { a = valueA; Kokkos::deep_copy(b, valueB); - impl_test_axpby_mv_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - ViewTypeBr0, view_stride_adapter, Device>( + impl_test_axpby_mv_unification_compare, tScalarB, + ViewTypeBr0, view_stride_adapter, Device>( a, x, b, y, N, K, false, max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_mv_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - ViewTypeBr0, view_stride_adapter, Device>( + impl_test_axpby_mv_unification_compare, 
tScalarB, + ViewTypeBr0, view_stride_adapter, Device>( a, x, b, y, N, K, true, max_val, max_error); } } @@ -1196,16 +1117,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { a = valueA; Kokkos::deep_copy(b.d_base, valueB); - impl_test_axpby_mv_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + impl_test_axpby_mv_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_mv_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, K, true, - max_val, max_error); + impl_test_axpby_mv_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, true, max_val, max_error); } } } @@ -1239,10 +1157,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { } Kokkos::deep_copy(b.d_base, b.h_base); } - impl_test_axpby_mv_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + impl_test_axpby_mv_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); } } } @@ -1265,15 +1182,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { a = valueA; Kokkos::deep_copy(b.d_base, valueB); - impl_test_axpby_mv_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + impl_test_axpby_mv_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - 
impl_test_axpby_mv_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, true, max_val, max_error); + impl_test_axpby_mv_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, true, max_val, max_error); } } } @@ -1307,10 +1222,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { } Kokkos::deep_copy(b.d_base, b.h_base); } - impl_test_axpby_mv_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + impl_test_axpby_mv_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); } } } @@ -1336,14 +1250,12 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a, valueA); b = valueB; - impl_test_axpby_mv_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - tScalarB, view_stride_adapter, Device>( + impl_test_axpby_mv_unification_compare, tScalarB, + tScalarB, view_stride_adapter, Device>( a, x, b, y, N, K, false, max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_mv_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - tScalarB, view_stride_adapter, Device>( + impl_test_axpby_mv_unification_compare, tScalarB, + tScalarB, view_stride_adapter, Device>( a, x, b, y, N, K, true, max_val, max_error); } } @@ -1357,8 +1269,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Starting case 08/36" << std::endl; #endif - if constexpr ((std::is_same_v) || - (std::is_same_v)) { + if constexpr ((std::is_same_v) || (std::is_same_v)) { // Avoid the test, due to compilation errors } else { for (size_t i(0); i < valuesA.size(); ++i) { @@ -1373,14 +1284,12 @@ void 
impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a, valueA); Kokkos::deep_copy(b, valueB); - impl_test_axpby_mv_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - ViewTypeBr0, view_stride_adapter, Device>( + impl_test_axpby_mv_unification_compare, tScalarB, + ViewTypeBr0, view_stride_adapter, Device>( a, x, b, y, N, K, false, max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_mv_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - ViewTypeBr0, view_stride_adapter, Device>( + impl_test_axpby_mv_unification_compare, tScalarB, + ViewTypeBr0, view_stride_adapter, Device>( a, x, b, y, N, K, true, max_val, max_error); } } @@ -1409,17 +1318,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a, valueA); Kokkos::deep_copy(b.d_base, valueB); - impl_test_axpby_mv_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, K, false, - max_val, max_error); + impl_test_axpby_mv_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_mv_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, K, true, - max_val, max_error); + impl_test_axpby_mv_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, true, max_val, max_error); } } } @@ -1457,11 +1362,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { } Kokkos::deep_copy(b.d_base, b.h_base); } - impl_test_axpby_mv_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, K, false, - max_val, max_error); + 
impl_test_axpby_mv_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); } } } @@ -1488,16 +1391,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a, valueA); Kokkos::deep_copy(b.d_base, valueB); - impl_test_axpby_mv_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + impl_test_axpby_mv_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_mv_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, K, true, - max_val, max_error); + impl_test_axpby_mv_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, true, max_val, max_error); } } } @@ -1535,10 +1435,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { } Kokkos::deep_copy(b.d_base, b.h_base); } - impl_test_axpby_mv_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + impl_test_axpby_mv_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); } } } @@ -1562,17 +1461,15 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a.d_base, valueA); b = valueB; - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, K, false, - max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, 
Device>(a, x, b, y, N, K, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, K, true, - max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -1599,17 +1496,15 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a.d_base, valueA); Kokkos::deep_copy(b, valueB); - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, K, false, - max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, K, true, - max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -1635,17 +1530,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a.d_base, valueA); Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { 
impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, K, true, - max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -1680,10 +1572,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(b.d_base, b.h_base); } impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); } } } @@ -1707,16 +1598,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a.d_base, valueA); Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, true, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -1751,10 +1640,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(b.d_base, b.h_base); } impl_test_axpby_mv_unification_compare< 
- tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, + max_error); } } } @@ -1787,17 +1675,15 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a.d_base, a.h_base); } b = valueB; - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, K, false, - max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, K, true, - max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -1834,17 +1720,15 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a.d_base, a.h_base); } Kokkos::deep_copy(b, valueB); - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, K, false, - max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, K, true, - 
max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -1880,17 +1764,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { } Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, K, true, - max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -1936,10 +1817,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(b.d_base, b.h_base); } impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); } } } @@ -1973,16 +1853,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { } Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, view_stride_adapter, 
view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, true, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -2029,10 +1907,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { } impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, + max_error); } } } @@ -2055,17 +1932,15 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a.d_base, valueA); b = valueB; - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, K, false, - max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, K, true, - max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -2092,17 +1967,15 @@ void impl_test_axpby_mv_unification(int const N, int 
const K) { Kokkos::deep_copy(a.d_base, valueA); Kokkos::deep_copy(b, valueB); - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, K, false, - max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, K, true, - max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -2128,17 +2001,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a.d_base, valueA); Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, K, true, - max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -2173,10 +2043,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(b.d_base, b.h_base); } impl_test_axpby_mv_unification_compare< - tScalarA, 
view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); } } } @@ -2200,16 +2069,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a.d_base, valueA); Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, true, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -2244,10 +2111,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(b.d_base, b.h_base); } impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, + max_error); } } } @@ -2280,17 +2146,15 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a.d_base, a.h_base); } b = valueB; - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - 
view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, K, false, - max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, K, true, - max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -2327,17 +2191,15 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a.d_base, a.h_base); } Kokkos::deep_copy(b, valueB); - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, K, false, - max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, K, true, - max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -2373,17 +2235,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { } Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, 
view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, K, true, - max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -2430,10 +2289,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { } impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); } } } @@ -2467,16 +2325,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { } Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, true, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -2523,10 +2379,9 @@ void 
impl_test_axpby_mv_unification(int const N, int const K) { } impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, + max_error); } } } @@ -2537,130 +2392,103 @@ void impl_test_axpby_mv_unification(int const N, int const K) { } // namespace Test -template +template int test_axpby_unification() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Calling impl_test_axpby_unif(), L-LLL" << std::endl; #endif - Test::impl_test_axpby_unification< - tScalarA, Kokkos::LayoutLeft, tScalarX, Kokkos::LayoutLeft, tScalarB, - Kokkos::LayoutLeft, tScalarY, Kokkos::LayoutLeft, Device>(14); + Test::impl_test_axpby_unification(14); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Calling impl_test_axpby_unif(), L-RRR" << std::endl; #endif - Test::impl_test_axpby_unification< - tScalarA, Kokkos::LayoutRight, tScalarX, Kokkos::LayoutRight, tScalarB, - Kokkos::LayoutRight, tScalarY, Kokkos::LayoutRight, Device>(14); + Test::impl_test_axpby_unification(14); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Calling impl_test_axpby_unif(), L-SSS" << std::endl; #endif - 
Test::impl_test_axpby_unification< - tScalarA, Kokkos::LayoutStride, tScalarX, Kokkos::LayoutStride, tScalarB, - Kokkos::LayoutStride, tScalarY, Kokkos::LayoutStride, Device>(14); + Test::impl_test_axpby_unification(14); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Calling impl_test_axpby_unif(), L-SLL" << std::endl; #endif - Test::impl_test_axpby_unification< - tScalarA, Kokkos::LayoutStride, tScalarX, Kokkos::LayoutStride, tScalarB, - Kokkos::LayoutLeft, tScalarY, Kokkos::LayoutLeft, Device>(14); + Test::impl_test_axpby_unification(14); #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Calling impl_test_axpby_unif(), L-LSS" << std::endl; #endif - Test::impl_test_axpby_unification< - tScalarA, Kokkos::LayoutLeft, tScalarX, Kokkos::LayoutLeft, tScalarB, - Kokkos::LayoutStride, tScalarY, Kokkos::LayoutStride, Device>(14); + Test::impl_test_axpby_unification(14); #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Calling impl_test_axpby_unif(), L-SRS" << std::endl; #endif - Test::impl_test_axpby_unification< - tScalarA, Kokkos::LayoutLeft, tScalarX, Kokkos::LayoutStride, tScalarB, - Kokkos::LayoutRight, tScalarY, Kokkos::LayoutStride, Device>(14); + Test::impl_test_axpby_unification(14); #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Calling impl_test_axpby_unif(), L-LSR" << std::endl; #endif - Test::impl_test_axpby_unification< - tScalarA, Kokkos::LayoutStride, tScalarX, Kokkos::LayoutLeft, tScalarB, - Kokkos::LayoutStride, tScalarY, Kokkos::LayoutRight, Device>(14); + Test::impl_test_axpby_unification(14); #endif return 1; } -template +template int test_axpby_mv_unification() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - Test::impl_test_axpby_mv_unification< - tScalarA, Kokkos::LayoutLeft, tScalarX, 
Kokkos::LayoutLeft, tScalarB, - Kokkos::LayoutLeft, tScalarY, Kokkos::LayoutLeft, Device>( - 14, numVecsAxpbyTest); + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + Test::impl_test_axpby_mv_unification(14, numVecsAxpbyTest); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - Test::impl_test_axpby_mv_unification< - tScalarA, Kokkos::LayoutRight, tScalarX, Kokkos::LayoutRight, tScalarB, - Kokkos::LayoutRight, tScalarY, Kokkos::LayoutRight, Device>( - 14, numVecsAxpbyTest); -#endif - -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - Test::impl_test_axpby_mv_unification< - tScalarA, Kokkos::LayoutStride, tScalarX, Kokkos::LayoutStride, tScalarB, - Kokkos::LayoutStride, tScalarY, Kokkos::LayoutStride, Device>( - 14, numVecsAxpbyTest); -#endif - -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_axpby_mv_unification< - tScalarA, Kokkos::LayoutStride, tScalarX, Kokkos::LayoutStride, tScalarB, - Kokkos::LayoutLeft, tScalarY, Kokkos::LayoutLeft, Device>( - 14, numVecsAxpbyTest); - Test::impl_test_axpby_mv_unification< - tScalarA, Kokkos::LayoutLeft, tScalarX, Kokkos::LayoutLeft, tScalarB, - Kokkos::LayoutStride, tScalarY, Kokkos::LayoutStride, Device>( - 14, numVecsAxpbyTest); - - Test::impl_test_axpby_mv_unification< - tScalarA, Kokkos::LayoutLeft, tScalarX, Kokkos::LayoutStride, tScalarB, - Kokkos::LayoutRight, tScalarY, Kokkos::LayoutStride, Device>( - 14, numVecsAxpbyTest); - - Test::impl_test_axpby_mv_unification< - tScalarA, Kokkos::LayoutStride, tScalarX, Kokkos::LayoutLeft, tScalarB, - Kokkos::LayoutStride, tScalarY, Kokkos::LayoutRight, Device>( - 14, numVecsAxpbyTest); + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + Test::impl_test_axpby_mv_unification(14, + numVecsAxpbyTest); +#endif + +#if 
(!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + Test::impl_test_axpby_mv_unification(14, + numVecsAxpbyTest); +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_axpby_mv_unification(14, numVecsAxpbyTest); + Test::impl_test_axpby_mv_unification(14, + numVecsAxpbyTest); + + Test::impl_test_axpby_mv_unification(14, + numVecsAxpbyTest); + + Test::impl_test_axpby_mv_unification(14, + numVecsAxpbyTest); #endif return 1; } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpby_unification_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_unification_float"); test_axpby_unification(); @@ -2674,44 +2502,36 @@ TEST_F(TestCategory, axpby_mv_unification_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpby_unification_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_unification_double"); test_axpby_unification(); } TEST_F(TestCategory, axpby_mv_unification_double) { - Kokkos::Profiling::pushRegion( - "KokkosBlas::Test::axpby_mv_unification_double"); + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_mv_unification_double"); test_axpby_mv_unification(); Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpby_unification_complex_double) { - Kokkos::Profiling::pushRegion( - 
"KokkosBlas::Test::axpby_unification_complex_double"); - test_axpby_unification, Kokkos::complex, - Kokkos::complex, Kokkos::complex, - TestDevice>(); + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_unification_complex_double"); + test_axpby_unification, Kokkos::complex, Kokkos::complex, + Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, axpby_mv_unification_complex_double) { - Kokkos::Profiling::pushRegion( - "KokkosBlas::Test::axpby_mv_unification_complex_double"); - test_axpby_mv_unification, Kokkos::complex, - Kokkos::complex, Kokkos::complex, - TestDevice>(); + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_mv_unification_complex_double"); + test_axpby_mv_unification, Kokkos::complex, Kokkos::complex, + Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpby_unification_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_unification_int"); test_axpby_unification(); @@ -2724,17 +2544,14 @@ TEST_F(TestCategory, axpby_mv_unification_int) { } #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, axpby_unification_double_int) { - Kokkos::Profiling::pushRegion( - "KokkosBlas::Test::axpby_unification_double_int"); + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_unification_double_int"); test_axpby_unification(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, axpby_double_mv_unification_int) { - Kokkos::Profiling::pushRegion( - "KokkosBlas::Test::axpby_mv_unification_double_int"); + 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_mv_unification_double_int"); test_axpby_mv_unification(); Kokkos::Profiling::popRegion(); } diff --git a/blas/unit_test/Test_Blas1_axpy.hpp b/blas/unit_test/Test_Blas1_axpy.hpp index 76528f4a52..94e4260268 100644 --- a/blas/unit_test/Test_Blas1_axpy.hpp +++ b/blas/unit_test/Test_Blas1_axpy.hpp @@ -31,16 +31,13 @@ void impl_test_axpy(int N) { const MagnitudeB max_val = 10; const MagnitudeB eps = Kokkos::ArithTraits::epsilon(); const MagnitudeB max_error = - (static_cast(Kokkos::ArithTraits::abs(a)) * max_val + - max_val) * - eps; + (static_cast(Kokkos::ArithTraits::abs(a)) * max_val + max_val) * eps; view_stride_adapter x("X", N); view_stride_adapter y("Y", N); view_stride_adapter org_y("Org_Y", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; @@ -88,12 +85,9 @@ void impl_test_axpy_mv(int N, int K) { const MagnitudeB eps = Kokkos::ArithTraits::epsilon(); const MagnitudeB max_val = 10; const MagnitudeB max_error = - (static_cast(Kokkos::ArithTraits::abs(a)) * max_val + - max_val) * - eps; + (static_cast(Kokkos::ArithTraits::abs(a)) * max_val + max_val) * eps; - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; @@ -113,9 +107,7 @@ void impl_test_axpy_mv(int N, int K) { Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK( - static_cast(a * x.h_view(i, j) + org_y.h_view(i, j)), - y.h_view(i, j), 2 * max_error); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i, j) + org_y.h_view(i, j)), y.h_view(i, j), 2 * max_error); } } @@ -125,9 +117,7 @@ void impl_test_axpy_mv(int N, int K) { Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK( - static_cast(a * x.h_view(i, j) + org_y.h_view(i, j)), - y.h_view(i, j), 2 * max_error); + 
EXPECT_NEAR_KK(static_cast(a * x.h_view(i, j) + org_y.h_view(i, j)), y.h_view(i, j), 2 * max_error); } } } @@ -136,8 +126,7 @@ void impl_test_axpy_mv(int N, int K) { template int test_axpy() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_axpy(0); @@ -147,8 +136,7 @@ int test_axpy() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_axpy(0); @@ -157,8 +145,7 @@ int test_axpy() { // Test::impl_test_axpy(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_axpy(0); @@ -167,8 +154,7 @@ int test_axpy() { // Test::impl_test_axpy(132231); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_axpy(1024); Test::impl_test_axpy(1024); #endif @@ -179,8 +165,7 @@ int test_axpy() { template int test_axpy_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_axpy_mv(0, 5); @@ -190,8 +175,7 @@ int test_axpy_mv() { #endif 
#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_axpy_mv(0, 5); @@ -200,8 +184,7 @@ int test_axpy_mv() { // Test::impl_test_axpy_mv(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_axpy_mv(0, 5); @@ -210,8 +193,7 @@ int test_axpy_mv() { // Test::impl_test_axpy_mv(132231,5); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_axpy_mv(1024, 5); Test::impl_test_axpy_mv(1024, 5); #endif @@ -220,8 +202,7 @@ int test_axpy_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpy_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_float"); test_axpy(); @@ -235,8 +216,7 @@ TEST_F(TestCategory, axpy_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpy_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_double"); test_axpy(); @@ -250,8 +230,7 @@ TEST_F(TestCategory, axpy_mv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpy_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_complex_double"); test_axpy, Kokkos::complex, TestDevice>(); @@ -264,9 +243,8 @@ TEST_F(TestCategory, axpy_mv_complex_double) { } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpy_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_int"); test_axpy(); @@ -279,8 +257,7 @@ TEST_F(TestCategory, axpy_mv_int) { } #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, axpy_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_double_int"); test_axpy(); diff --git a/blas/unit_test/Test_Blas1_dot.hpp b/blas/unit_test/Test_Blas1_dot.hpp index 911925476a..3de0fae12d 100644 --- a/blas/unit_test/Test_Blas1_dot.hpp +++ b/blas/unit_test/Test_Blas1_dot.hpp @@ -30,8 +30,7 @@ void impl_test_dot(int N) { view_stride_adapter a("a", N); view_stride_adapter b("b", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; @@ -48,13 +47,11 @@ void impl_test_dot(int N) { Kokkos::deep_copy(b.h_base, b.d_base); ScalarA expected_result = 0; - for (int i = 0; i < N; i++) - expected_result += ats::conj(a.h_view(i)) * b.h_view(i); + for (int i = 0; i < N; i++) expected_result += ats::conj(a.h_view(i)) * b.h_view(i); ScalarA nonconst_nonconst_result = KokkosBlas::dot(a.d_view, b.d_view); - double eps = std::is_same::value ? 
2 * 1e-5 : 1e-7; - EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, - eps * expected_result); + double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; + EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, eps * expected_result); ScalarA const_const_result = KokkosBlas::dot(a.d_view_const, b.d_view_const); EXPECT_NEAR_KK(const_const_result, expected_result, eps * expected_result); @@ -75,8 +72,7 @@ void impl_test_dot_mv(int N, int K) { view_stride_adapter a("A", N, K); view_stride_adapter b("B", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; @@ -95,8 +91,7 @@ void impl_test_dot_mv(int N, int K) { ScalarA* expected_result = new ScalarA[K]; for (int j = 0; j < K; j++) { expected_result[j] = ScalarA(); - for (int i = 0; i < N; i++) - expected_result[j] += ats::conj(a.h_view(i, j)) * b.h_view(i, j); + for (int i = 0; i < N; i++) expected_result[j] += ats::conj(a.h_view(i, j)) * b.h_view(i, j); } double eps = std::is_same::value ? 
2 * 1e-5 : 1e-7; @@ -107,32 +102,28 @@ void impl_test_dot_mv(int N, int K) { Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA nonconst_nonconst_result = r(k); - EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result[k], eps * expected_result[k]); } KokkosBlas::dot(r, a.d_view_const, b.d_view_const); Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA const_const_result = r(k); - EXPECT_NEAR_KK(const_const_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(const_const_result, expected_result[k], eps * expected_result[k]); } KokkosBlas::dot(r, a.d_view, b.d_view_const); Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA non_const_const_result = r(k); - EXPECT_NEAR_KK(non_const_const_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(non_const_const_result, expected_result[k], eps * expected_result[k]); } KokkosBlas::dot(r, a.d_view_const, b.d_view); Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA const_non_const_result = r(k); - EXPECT_NEAR_KK(const_non_const_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(const_non_const_result, expected_result[k], eps * expected_result[k]); } delete[] expected_result; @@ -142,8 +133,7 @@ void impl_test_dot_mv(int N, int K) { template int test_dot() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_dot(0); @@ -153,8 +143,7 @@ int test_dot() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View 
view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_dot(0); @@ -163,8 +152,7 @@ int test_dot() { // Test::impl_test_dot(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_dot(0); @@ -173,8 +161,7 @@ int test_dot() { // Test::impl_test_dot(132231); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_dot(1024); Test::impl_test_dot(1024); #endif @@ -185,8 +172,7 @@ int test_dot() { template int test_dot_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_dot_mv(0, 5); @@ -197,8 +183,7 @@ int test_dot_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_dot_mv(0, 5); @@ -210,8 +195,7 @@ int test_dot_mv() { // Removing the layout stride test as ViewTypeA a("a", N); // is invalid since the view constructor needs a stride object! 
-#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_dot_mv(0, 5); @@ -221,8 +205,7 @@ int test_dot_mv() { // Test::impl_test_dot_mv(132231,5); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_dot_mv(1024, 5); Test::impl_test_dot_mv(1024, 5); #endif @@ -231,8 +214,7 @@ int test_dot_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, dot_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_float"); test_dot(); @@ -246,8 +228,7 @@ TEST_F(TestCategory, dot_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, dot_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_double"); test_dot(); @@ -261,8 +242,7 @@ TEST_F(TestCategory, dot_mv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, dot_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_complex_double"); test_dot, Kokkos::complex, TestDevice>(); @@ -275,9 +255,8 @@ TEST_F(TestCategory, dot_mv_complex_double) { } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, dot_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_int"); test_dot(); diff --git a/blas/unit_test/Test_Blas1_iamax.hpp b/blas/unit_test/Test_Blas1_iamax.hpp index 49f759958a..94ff8b3ebe 100644 --- a/blas/unit_test/Test_Blas1_iamax.hpp +++ b/blas/unit_test/Test_Blas1_iamax.hpp @@ -29,8 +29,7 @@ void impl_test_iamax(int N) { view_stride_adapter a("X", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); @@ -66,11 +65,8 @@ void impl_test_iamax(int N) { { // printf("impl_test_iamax -- return result as a 0-D View on host -- N // %d\n", N); - typedef Kokkos::View - ViewType0D; - ViewType0D r("Iamax::Result 0-D View on host", - typename ViewTypeA::array_layout()); + typedef Kokkos::View ViewType0D; + ViewType0D r("Iamax::Result 0-D View on host", typename ViewTypeA::array_layout()); KokkosBlas::iamax(r, a.d_view); Kokkos::fence(); @@ -85,10 +81,8 @@ void impl_test_iamax(int N) { { // printf("impl_test_iamax -- return result as a 0-D View on device -- N // %d\n", N); - typedef Kokkos::View - ViewType0D; - ViewType0D r("Iamax::Result 0-D View on device", - typename ViewTypeA::array_layout()); + typedef Kokkos::View ViewType0D; + ViewType0D r("Iamax::Result 0-D View on device", typename ViewTypeA::array_layout()); typename ViewType0D::HostMirror h_r = Kokkos::create_mirror_view(r); size_type nonconst_max_loc, const_max_loc; @@ -118,8 +112,7 @@ void impl_test_iamax_mv(int N, int K) { view_stride_adapter a("A", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); @@ -148,11 +141,8 @@ void 
impl_test_iamax_mv(int N, int K) { { // printf("impl_test_iamax_mv -- return results as a 1-D View on host -- N // %d\n", N); - Kokkos::View rcontig( - "Iamax::Result View on host", K); - Kokkos::View - r = rcontig; + Kokkos::View rcontig("Iamax::Result View on host", K); + Kokkos::View r = rcontig; KokkosBlas::iamax(r, a.d_view); Kokkos::fence(); @@ -177,10 +167,8 @@ void impl_test_iamax_mv(int N, int K) { // printf("impl_test_iamax_mv -- return results as a 1-D View on device -- N // %d\n", N); Kokkos::View rcontig("Iamax::Result View on host", K); - Kokkos::View r = - rcontig; - typename Kokkos::View::HostMirror h_r = + Kokkos::View r = rcontig; + typename Kokkos::View::HostMirror h_r = Kokkos::create_mirror_view(rcontig); KokkosBlas::iamax(r, a.d_view); @@ -210,8 +198,7 @@ void impl_test_iamax_mv(int N, int K) { template int test_iamax() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_iamax(0); Test::impl_test_iamax(13); @@ -220,8 +207,7 @@ int test_iamax() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_iamax(0); Test::impl_test_iamax(13); @@ -229,8 +215,7 @@ int test_iamax() { // Test::impl_test_iamax(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_iamax(0); Test::impl_test_iamax(13); @@ -244,8 +229,7 @@ int test_iamax() { template int test_iamax_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - 
(!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_iamax_mv(0, 5); Test::impl_test_iamax_mv(13, 5); @@ -254,8 +238,7 @@ int test_iamax_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_iamax_mv(0, 5); Test::impl_test_iamax_mv(13, 5); @@ -263,8 +246,7 @@ int test_iamax_mv() { // Test::impl_test_iamax_mv(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_iamax_mv(0, 5); Test::impl_test_iamax_mv(13, 5); @@ -276,8 +258,7 @@ int test_iamax_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, iamax_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::iamax_float"); test_iamax(); @@ -291,8 +272,7 @@ TEST_F(TestCategory, iamax_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, iamax_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::iamax_double"); test_iamax(); @@ -306,8 +286,7 @@ TEST_F(TestCategory, iamax_mv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, iamax_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::iamax_complex_double"); test_iamax, TestDevice>(); @@ -320,9 +299,8 @@ TEST_F(TestCategory, iamax_mv_complex_double) { } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, iamax_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::iamax_int"); test_iamax(); diff --git a/blas/unit_test/Test_Blas1_mult.hpp b/blas/unit_test/Test_Blas1_mult.hpp index 6555280f0d..f5755982e7 100644 --- a/blas/unit_test/Test_Blas1_mult.hpp +++ b/blas/unit_test/Test_Blas1_mult.hpp @@ -36,8 +36,7 @@ void impl_test_mult(int N) { view_stride_adapter z("Z", N); view_stride_adapter org_z("Org_Z", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; @@ -63,27 +62,21 @@ void impl_test_mult(int N) { KokkosBlas::mult(b, z.d_view, a, x.d_view, y.d_view); Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(static_cast(a * x.h_view(i) * y.h_view(i) + - b * org_z.h_view(i)), - z.h_view(i), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) * y.h_view(i) + b * org_z.h_view(i)), z.h_view(i), eps); } Kokkos::deep_copy(z.d_base, org_z.h_base); KokkosBlas::mult(b, z.d_view, a, x.d_view, y.d_view_const); Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(static_cast(a * x.h_view(i) * y.h_view(i) + - b * org_z.h_view(i)), - z.h_view(i), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) * y.h_view(i) + b * org_z.h_view(i)), z.h_view(i), eps); } Kokkos::deep_copy(z.d_base, org_z.h_base); 
KokkosBlas::mult(b, z.d_view, a, x.d_view_const, y.d_view_const); Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(static_cast(a * x.h_view(i) * y.h_view(i) + - b * org_z.h_view(i)), - z.h_view(i), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) * y.h_view(i) + b * org_z.h_view(i)), z.h_view(i), eps); } } @@ -99,8 +92,7 @@ void impl_test_mult_mv(int N, int K) { view_stride_adapter z("Z", N, K); view_stride_adapter org_z("Org_Z", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; @@ -131,9 +123,8 @@ void impl_test_mult_mv(int N, int K) { Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(a * x.h_view(i) * y.h_view(i, j) + - b * org_z.h_view(i, j)), - z.h_view(i, j), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) * y.h_view(i, j) + b * org_z.h_view(i, j)), z.h_view(i, j), + eps); } } @@ -142,9 +133,8 @@ void impl_test_mult_mv(int N, int K) { Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(a * x.h_view(i) * y.h_view(i, j) + - b * org_z.h_view(i, j)), - z.h_view(i, j), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) * y.h_view(i, j) + b * org_z.h_view(i, j)), z.h_view(i, j), + eps); } } } @@ -153,58 +143,43 @@ void impl_test_mult_mv(int N, int K) { template int test_mult() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; typedef Kokkos::View view_type_c_ll; - Test::impl_test_mult( - 0); - Test::impl_test_mult( - 13); - Test::impl_test_mult( - 1024); + Test::impl_test_mult(0); + Test::impl_test_mult(13); + Test::impl_test_mult(1024); // 
Test::impl_test_mult(132231); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; typedef Kokkos::View view_type_c_lr; - Test::impl_test_mult( - 0); - Test::impl_test_mult( - 13); - Test::impl_test_mult( - 1024); + Test::impl_test_mult(0); + Test::impl_test_mult(13); + Test::impl_test_mult(1024); // Test::impl_test_mult(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; typedef Kokkos::View view_type_c_ls; - Test::impl_test_mult( - 0); - Test::impl_test_mult( - 13); - Test::impl_test_mult( - 1024); + Test::impl_test_mult(0); + Test::impl_test_mult(13); + Test::impl_test_mult(1024); // Test::impl_test_mult(132231); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_mult( - 1024); - Test::impl_test_mult( - 1024); +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_mult(1024); + Test::impl_test_mult(1024); #endif return 1; @@ -213,66 +188,50 @@ int test_mult() { template int test_mult_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; typedef Kokkos::View view_type_c_ll; - Test::impl_test_mult_mv(0, 5); - Test::impl_test_mult_mv(13, 5); - Test::impl_test_mult_mv(1024, 5); + Test::impl_test_mult_mv(0, 5); + Test::impl_test_mult_mv(13, 5); + 
Test::impl_test_mult_mv(1024, 5); // Test::impl_test_mult_mv(132231,5); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; typedef Kokkos::View view_type_c_lr; - Test::impl_test_mult_mv(0, 5); - Test::impl_test_mult_mv(13, 5); - Test::impl_test_mult_mv(1024, 5); + Test::impl_test_mult_mv(0, 5); + Test::impl_test_mult_mv(13, 5); + Test::impl_test_mult_mv(1024, 5); // Test::impl_test_mult_mv(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; typedef Kokkos::View view_type_c_ls; - Test::impl_test_mult_mv(0, 5); - Test::impl_test_mult_mv(13, 5); - Test::impl_test_mult_mv(1024, 5); + Test::impl_test_mult_mv(0, 5); + Test::impl_test_mult_mv(13, 5); + Test::impl_test_mult_mv(1024, 5); // Test::impl_test_mult_mv(132231,5); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_mult_mv(1024, 5); - Test::impl_test_mult_mv(1024, 5); +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_mult_mv(1024, 5); + Test::impl_test_mult_mv(1024, 5); #endif return 1; } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, mult_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_float"); test_mult(); @@ -286,8 +245,7 @@ TEST_F(TestCategory, mult_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - 
(!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, mult_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_double"); test_mult(); @@ -301,25 +259,21 @@ TEST_F(TestCategory, mult_mv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, mult_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_complex_double"); - test_mult, Kokkos::complex, - Kokkos::complex, TestDevice>(); + test_mult, Kokkos::complex, Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, mult_mv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_mv_complex_double"); - test_mult_mv, Kokkos::complex, - Kokkos::complex, TestDevice>(); + test_mult_mv, Kokkos::complex, Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, mult_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_int"); test_mult(); @@ -332,8 +286,7 @@ TEST_F(TestCategory, mult_mv_int) { } #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, mult_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_double_int"); test_mult(); diff --git a/blas/unit_test/Test_Blas1_nrm1.hpp b/blas/unit_test/Test_Blas1_nrm1.hpp index 24795878d1..3942dafe93 100644 
--- a/blas/unit_test/Test_Blas1_nrm1.hpp +++ b/blas/unit_test/Test_Blas1_nrm1.hpp @@ -29,8 +29,7 @@ void impl_test_nrm1(int N) { view_stride_adapter a("a", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); @@ -38,10 +37,7 @@ void impl_test_nrm1(int N) { Kokkos::deep_copy(a.h_base, a.d_base); - double eps = (std::is_same::mag_type, - float>::value - ? 1e-4 - : 1e-7); + double eps = (std::is_same::mag_type, float>::value ? 1e-4 : 1e-7); mag_type expected_result = 0; for (int i = 0; i < N; i++) { @@ -50,8 +46,7 @@ void impl_test_nrm1(int N) { // parts. See netlib, MKL, and CUBLAS documentation. // // This is safe; ArithTraits::imag is 0 if T is real. - expected_result += - MAT::abs(AT::real(a.h_view(i))) + MAT::abs(AT::imag(a.h_view(i))); + expected_result += MAT::abs(AT::real(a.h_view(i))) + MAT::abs(AT::imag(a.h_view(i))); } mag_type nonconst_result = KokkosBlas::nrm1(a.d_view); @@ -70,8 +65,7 @@ void impl_test_nrm1_mv(int N, int K) { view_stride_adapter a("A", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); @@ -79,18 +73,13 @@ void impl_test_nrm1_mv(int N, int K) { Kokkos::deep_copy(a.h_base, a.d_base); - double eps = (std::is_same::mag_type, - float>::value - ? 1e-4 - : 1e-7); + double eps = (std::is_same::mag_type, float>::value ? 
1e-4 : 1e-7); - Kokkos::View expected_result("Expected Nrm1", - K); + Kokkos::View expected_result("Expected Nrm1", K); for (int k = 0; k < K; k++) { expected_result(k) = MAT::zero(); for (int i = 0; i < N; i++) { - expected_result(k) += MAT::abs(AT::real(a.h_view(i, k))) + - MAT::abs(AT::imag(a.h_view(i, k))); + expected_result(k) += MAT::abs(AT::real(a.h_view(i, k))) + MAT::abs(AT::imag(a.h_view(i, k))); } } @@ -109,8 +98,7 @@ void impl_test_nrm1_mv(int N, int K) { template int test_nrm1() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_nrm1(0); Test::impl_test_nrm1(13); @@ -119,8 +107,7 @@ int test_nrm1() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_nrm1(0); Test::impl_test_nrm1(13); @@ -128,8 +115,7 @@ int test_nrm1() { Test::impl_test_nrm1(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_nrm1(0); Test::impl_test_nrm1(13); @@ -143,8 +129,7 @@ int test_nrm1() { template int test_nrm1_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_nrm1_mv(0, 5); Test::impl_test_nrm1_mv(13, 5); @@ -154,8 +139,7 @@ int test_nrm1_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - 
(!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_nrm1_mv(0, 5); Test::impl_test_nrm1_mv(13, 5); @@ -164,8 +148,7 @@ int test_nrm1_mv() { Test::impl_test_nrm1_mv(132231, 5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_nrm1_mv(0, 5); Test::impl_test_nrm1_mv(13, 5); @@ -178,8 +161,7 @@ int test_nrm1_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm1_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_float"); test_nrm1(); @@ -193,8 +175,7 @@ TEST_F(TestCategory, nrm1_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm1_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_double"); test_nrm1(); @@ -208,8 +189,7 @@ TEST_F(TestCategory, nrm1_mv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm1_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_complex_double"); test_nrm1, TestDevice>(); @@ -222,9 +202,8 @@ TEST_F(TestCategory, nrm1_mv_complex_double) { } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm1_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_int"); test_nrm1(); diff --git a/blas/unit_test/Test_Blas1_nrm2.hpp b/blas/unit_test/Test_Blas1_nrm2.hpp index a9b3f7c10f..556d48f753 100644 --- a/blas/unit_test/Test_Blas1_nrm2.hpp +++ b/blas/unit_test/Test_Blas1_nrm2.hpp @@ -27,8 +27,7 @@ void impl_test_nrm2(int N) { view_stride_adapter a("a", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); @@ -42,8 +41,7 @@ void impl_test_nrm2(int N) { for (int i = 0; i < N; i++) { expected_result += AT::abs(a.h_view(i)) * AT::abs(a.h_view(i)); } - expected_result = - Kokkos::ArithTraits::sqrt(expected_result); + expected_result = Kokkos::ArithTraits::sqrt(expected_result); typename AT::mag_type nonconst_result = KokkosBlas::nrm2(a.d_view); EXPECT_NEAR_KK(nonconst_result, expected_result, eps * expected_result); @@ -59,8 +57,7 @@ void impl_test_nrm2_mv(int N, int K) { view_stride_adapter a("A", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); @@ -74,8 +71,7 @@ void impl_test_nrm2_mv(int N, int K) { for (int i = 0; i < N; i++) { expected_result[j] += AT::abs(a.h_view(i, j)) * AT::abs(a.h_view(i, j)); } - expected_result[j] = - Kokkos::ArithTraits::sqrt(expected_result[j]); + expected_result[j] = Kokkos::ArithTraits::sqrt(expected_result[j]); } double eps = std::is_same::value ? 
2 * 1e-5 : 1e-7; @@ -86,8 +82,7 @@ void impl_test_nrm2_mv(int N, int K) { Kokkos::fence(); for (int k = 0; k < K; k++) { typename AT::mag_type nonconst_result = r(k); - EXPECT_NEAR_KK(nonconst_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(nonconst_result, expected_result[k], eps * expected_result[k]); } KokkosBlas::nrm2(r, a.d_view_const); @@ -104,8 +99,7 @@ void impl_test_nrm2_mv(int N, int K) { template int test_nrm2() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_nrm2(0); Test::impl_test_nrm2(13); @@ -114,8 +108,7 @@ int test_nrm2() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_nrm2(0); Test::impl_test_nrm2(13); @@ -123,8 +116,7 @@ int test_nrm2() { // Test::impl_test_nrm2(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_nrm2(0); Test::impl_test_nrm2(13); @@ -138,8 +130,7 @@ int test_nrm2() { template int test_nrm2_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_nrm2_mv(0, 5); Test::impl_test_nrm2_mv(13, 5); @@ -149,8 +140,7 @@ int test_nrm2_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_nrm2_mv(0, 5); Test::impl_test_nrm2_mv(13, 5); @@ -159,8 +149,7 @@ int test_nrm2_mv() { // Test::impl_test_nrm2_mv(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_nrm2_mv(0, 5); Test::impl_test_nrm2_mv(13, 5); @@ -173,8 +162,7 @@ int test_nrm2_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_float"); test_nrm2(); @@ -188,8 +176,7 @@ TEST_F(TestCategory, nrm2_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_double"); test_nrm2(); @@ -203,8 +190,7 @@ TEST_F(TestCategory, nrm2_mv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_complex_double"); test_nrm2, TestDevice>(); @@ -217,9 +203,8 @@ TEST_F(TestCategory, nrm2_mv_complex_double) { } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_int"); test_nrm2(); diff --git a/blas/unit_test/Test_Blas1_nrm2_squared.hpp b/blas/unit_test/Test_Blas1_nrm2_squared.hpp index 09e4b3d45d..d718626f8e 100644 --- a/blas/unit_test/Test_Blas1_nrm2_squared.hpp +++ b/blas/unit_test/Test_Blas1_nrm2_squared.hpp @@ -27,8 +27,7 @@ void impl_test_nrm2_squared(int N) { view_stride_adapter a("a", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); @@ -57,8 +56,7 @@ void impl_test_nrm2_squared_mv(int N, int K) { view_stride_adapter a("A", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); @@ -84,10 +82,8 @@ void impl_test_nrm2_squared_mv(int N, int K) { Kokkos::fence(); for (int k = 0; k < K; k++) { typename AT::mag_type nonconst_result = r(k); - typename AT::mag_type divisor = - AT::abs(expected_result[k]) == zero ? one : AT::abs(expected_result[k]); - typename AT::mag_type diff = - AT::abs(nonconst_result - expected_result[k]) / divisor; + typename AT::mag_type divisor = AT::abs(expected_result[k]) == zero ? one : AT::abs(expected_result[k]); + typename AT::mag_type diff = AT::abs(nonconst_result - expected_result[k]) / divisor; EXPECT_NEAR_KK(diff, zero, eps); } @@ -95,10 +91,8 @@ void impl_test_nrm2_squared_mv(int N, int K) { Kokkos::fence(); for (int k = 0; k < K; k++) { typename AT::mag_type const_result = r(k); - typename AT::mag_type divisor = - AT::abs(expected_result[k]) == zero ? 
one : AT::abs(expected_result[k]); - typename AT::mag_type diff = - AT::abs(const_result - expected_result[k]) / divisor; + typename AT::mag_type divisor = AT::abs(expected_result[k]) == zero ? one : AT::abs(expected_result[k]); + typename AT::mag_type diff = AT::abs(const_result - expected_result[k]) / divisor; EXPECT_NEAR_KK(diff, zero, eps); } @@ -109,8 +103,7 @@ void impl_test_nrm2_squared_mv(int N, int K) { template int test_nrm2_squared() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_nrm2_squared(0); Test::impl_test_nrm2_squared(13); @@ -119,8 +112,7 @@ int test_nrm2_squared() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_nrm2_squared(0); Test::impl_test_nrm2_squared(13); @@ -128,8 +120,7 @@ int test_nrm2_squared() { // Test::impl_test_nrm2_squared(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_nrm2_squared(0); Test::impl_test_nrm2_squared(13); @@ -143,8 +134,7 @@ int test_nrm2_squared() { template int test_nrm2_squared_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_nrm2_squared_mv(0, 5); Test::impl_test_nrm2_squared_mv(13, 5); @@ -154,8 +144,7 @@ int 
test_nrm2_squared_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_nrm2_squared_mv(0, 5); Test::impl_test_nrm2_squared_mv(13, 5); @@ -164,8 +153,7 @@ int test_nrm2_squared_mv() { // Test::impl_test_nrm2_squared_mv(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_nrm2_squared_mv(0, 5); Test::impl_test_nrm2_squared_mv(13, 5); @@ -178,8 +166,7 @@ int test_nrm2_squared_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2_squared_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_squared_float"); test_nrm2_squared(); @@ -193,8 +180,7 @@ TEST_F(TestCategory, nrm2_squared_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2_squared_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_squared_double"); test_nrm2_squared(); @@ -208,25 +194,21 @@ TEST_F(TestCategory, nrm2_squared_mv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2_squared_complex_double) { - Kokkos::Profiling::pushRegion( - 
"KokkosBlas::Test::nrm2_squared_complex_double"); + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_squared_complex_double"); test_nrm2_squared, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2_squared_mv_complex_double) { - Kokkos::Profiling::pushRegion( - "KokkosBlas::Test::nrm2_squared_mv_complex_double"); + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_squared_mv_complex_double"); test_nrm2_squared_mv, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2_squared_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_squared_int"); test_nrm2_squared(); diff --git a/blas/unit_test/Test_Blas1_nrm2w.hpp b/blas/unit_test/Test_Blas1_nrm2w.hpp index 48d8676fe4..6dcc01bf17 100644 --- a/blas/unit_test/Test_Blas1_nrm2w.hpp +++ b/blas/unit_test/Test_Blas1_nrm2w.hpp @@ -31,11 +31,9 @@ void impl_test_nrm2w(int N) { constexpr MagnitudeA max_val = 10; const MagnitudeA eps = AT::epsilon(); - const MagnitudeA max_error = - max_val * std::sqrt(static_cast(N)) * eps; + const MagnitudeA max_error = max_val * std::sqrt(static_cast(N)) * eps; - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(max_val, randStart, randEnd); @@ -51,8 +49,7 @@ void impl_test_nrm2w(int N) { typename AT::mag_type term = AT::abs(a.h_view(i)) / AT::abs(w.h_view(i)); expected_result += term * term; } - expected_result = - Kokkos::ArithTraits::sqrt(expected_result); + expected_result = Kokkos::ArithTraits::sqrt(expected_result); typename AT::mag_type nonconst_result = KokkosBlas::nrm2w(a.d_view, w.d_view); EXPECT_NEAR_KK(nonconst_result, expected_result, max_error); @@ -69,11 
+66,9 @@ void impl_test_nrm2w_mv(int N, int K) { constexpr MagnitudeA max_val = 10; const MagnitudeA eps = AT::epsilon(); - const MagnitudeA max_error = - max_val * std::sqrt(static_cast(N)) * eps; + const MagnitudeA max_error = max_val * std::sqrt(static_cast(N)) * eps; - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(max_val, randStart, randEnd); @@ -88,12 +83,10 @@ void impl_test_nrm2w_mv(int N, int K) { for (int j = 0; j < K; j++) { expected_result[j] = typename AT::mag_type(); for (int i = 0; i < N; i++) { - typename AT::mag_type term = - AT::abs(a.h_view(i, j)) / AT::abs(w.h_view(i, j)); + typename AT::mag_type term = AT::abs(a.h_view(i, j)) / AT::abs(w.h_view(i, j)); expected_result[j] += term * term; } - expected_result[j] = - Kokkos::ArithTraits::sqrt(expected_result[j]); + expected_result[j] = Kokkos::ArithTraits::sqrt(expected_result[j]); } Kokkos::View r("Dot::Result", K); @@ -112,8 +105,7 @@ void impl_test_nrm2w_mv(int N, int K) { template int test_nrm2w() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_nrm2w(0); Test::impl_test_nrm2w(13); @@ -122,8 +114,7 @@ int test_nrm2w() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_nrm2w(0); Test::impl_test_nrm2w(13); @@ -131,8 +122,7 @@ int test_nrm2w() { // Test::impl_test_nrm2(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_nrm2w(0); Test::impl_test_nrm2w(13); @@ -146,8 +136,7 @@ int test_nrm2w() { template int test_nrm2w_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_nrm2w_mv(0, 5); Test::impl_test_nrm2w_mv(13, 5); @@ -157,8 +146,7 @@ int test_nrm2w_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_nrm2w_mv(0, 5); Test::impl_test_nrm2w_mv(13, 5); @@ -167,8 +155,7 @@ int test_nrm2w_mv() { // Test::impl_test_nrm2w_mv(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_nrm2w_mv(0, 5); Test::impl_test_nrm2w_mv(13, 5); @@ -181,8 +168,7 @@ int test_nrm2w_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2w_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_float"); test_nrm2w(); @@ -196,8 +182,7 @@ TEST_F(TestCategory, nrm2w_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2w_double) { 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_double"); test_nrm2w(); @@ -211,8 +196,7 @@ TEST_F(TestCategory, nrm2w_mv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2w_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_complex_double"); test_nrm2w, TestDevice>(); @@ -225,9 +209,8 @@ TEST_F(TestCategory, nrm2w_mv_complex_double) { } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2w_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_int"); test_nrm2w(); diff --git a/blas/unit_test/Test_Blas1_nrm2w_squared.hpp b/blas/unit_test/Test_Blas1_nrm2w_squared.hpp index 5a55d15fad..42bcdb0848 100644 --- a/blas/unit_test/Test_Blas1_nrm2w_squared.hpp +++ b/blas/unit_test/Test_Blas1_nrm2w_squared.hpp @@ -33,8 +33,7 @@ void impl_test_nrm2w_squared(int N) { const MagnitudeA eps = AT::epsilon(); const MagnitudeA max_error = max_val * max_val * N * eps; - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(max_val, randStart, randEnd); @@ -51,8 +50,7 @@ void impl_test_nrm2w_squared(int N) { expected_result += term * term; } - typename AT::mag_type nonconst_result = - KokkosBlas::nrm2w_squared(a.d_view, w.d_view); + typename AT::mag_type nonconst_result = KokkosBlas::nrm2w_squared(a.d_view, w.d_view); EXPECT_NEAR_KK(nonconst_result, expected_result, max_error); } @@ -69,8 +67,7 @@ void impl_test_nrm2w_squared_mv(int N, int K) { const MagnitudeA eps = AT::epsilon(); const MagnitudeA 
max_error = max_val * max_val * N * eps; - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(max_val, randStart, randEnd); @@ -84,8 +81,7 @@ void impl_test_nrm2w_squared_mv(int N, int K) { for (int j = 0; j < K; j++) { expected_result[j] = typename AT::mag_type(); for (int i = 0; i < N; i++) { - typename AT::mag_type term = - AT::abs(a.h_view(i, j)) / AT::abs(w.h_view(i, j)); + typename AT::mag_type term = AT::abs(a.h_view(i, j)) / AT::abs(w.h_view(i, j)); expected_result[j] += term * term; } } @@ -106,8 +102,7 @@ void impl_test_nrm2w_squared_mv(int N, int K) { template int test_nrm2w_squared() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_nrm2w_squared(0); Test::impl_test_nrm2w_squared(13); @@ -116,8 +111,7 @@ int test_nrm2w_squared() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_nrm2w_squared(0); Test::impl_test_nrm2w_squared(13); @@ -125,8 +119,7 @@ int test_nrm2w_squared() { // Test::impl_test_nrm2(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_nrm2w_squared(0); Test::impl_test_nrm2w_squared(13); @@ -140,8 +133,7 @@ int test_nrm2w_squared() { template int test_nrm2w_squared_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_nrm2w_squared_mv(0, 5); Test::impl_test_nrm2w_squared_mv(13, 5); @@ -151,8 +143,7 @@ int test_nrm2w_squared_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_nrm2w_squared_mv(0, 5); Test::impl_test_nrm2w_squared_mv(13, 5); @@ -161,8 +152,7 @@ int test_nrm2w_squared_mv() { // Test::impl_test_nrm2w_squared_mv(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_nrm2w_squared_mv(0, 5); Test::impl_test_nrm2w_squared_mv(13, 5); @@ -175,8 +165,7 @@ int test_nrm2w_squared_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2w_squared_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_float"); test_nrm2w_squared(); @@ -190,8 +179,7 @@ TEST_F(TestCategory, nrm2w_squared_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2w_squared_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_double"); test_nrm2w_squared(); @@ -205,25 +193,21 @@ TEST_F(TestCategory, nrm2w_squared_mv_double) { #endif #if 
defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2w_squared_complex_double) { - Kokkos::Profiling::pushRegion( - "KokkosBlas::Test::nrm2w_squared_complex_double"); + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_complex_double"); test_nrm2w_squared, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2w_squared_mv_complex_double) { - Kokkos::Profiling::pushRegion( - "KokkosBlas::Test::nrm2w_squared_mv_complex_double"); + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_mv_complex_double"); test_nrm2w_squared_mv, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2w_squared_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_int"); test_nrm2w_squared(); diff --git a/blas/unit_test/Test_Blas1_nrminf.hpp b/blas/unit_test/Test_Blas1_nrminf.hpp index 91cc1c7502..e4a9101e85 100644 --- a/blas/unit_test/Test_Blas1_nrminf.hpp +++ b/blas/unit_test/Test_Blas1_nrminf.hpp @@ -27,8 +27,7 @@ void impl_test_nrminf(int N) { view_stride_adapter a("A", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); @@ -38,11 +37,9 @@ void impl_test_nrminf(int N) { double eps = std::is_same::value ? 
2 * 1e-5 : 1e-7; - typename AT::mag_type expected_result = - Kokkos::ArithTraits::min(); + typename AT::mag_type expected_result = Kokkos::ArithTraits::min(); for (int i = 0; i < N; i++) - if (AT::abs(a.h_view(i)) > expected_result) - expected_result = AT::abs(a.h_view(i)); + if (AT::abs(a.h_view(i)) > expected_result) expected_result = AT::abs(a.h_view(i)); if (N == 0) expected_result = typename AT::mag_type(0); @@ -60,8 +57,7 @@ void impl_test_nrminf_mv(int N, int K) { view_stride_adapter a("A", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); @@ -73,8 +69,7 @@ void impl_test_nrminf_mv(int N, int K) { for (int j = 0; j < K; j++) { expected_result[j] = Kokkos::ArithTraits::min(); for (int i = 0; i < N; i++) { - if (AT::abs(a.h_view(i, j)) > expected_result[j]) - expected_result[j] = AT::abs(a.h_view(i, j)); + if (AT::abs(a.h_view(i, j)) > expected_result[j]) expected_result[j] = AT::abs(a.h_view(i, j)); } if (N == 0) expected_result[j] = typename AT::mag_type(0); } @@ -103,8 +98,7 @@ void impl_test_nrminf_mv(int N, int K) { template int test_nrminf() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_nrminf(0); Test::impl_test_nrminf(13); @@ -113,8 +107,7 @@ int test_nrminf() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_nrminf(0); Test::impl_test_nrminf(13); @@ -122,8 +115,7 @@ int test_nrminf() { // Test::impl_test_nrminf(132231); #endif -#if 
(!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_nrminf(0); Test::impl_test_nrminf(13); @@ -137,8 +129,7 @@ int test_nrminf() { template int test_nrminf_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_nrminf_mv(0, 5); Test::impl_test_nrminf_mv(13, 5); @@ -147,8 +138,7 @@ int test_nrminf_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_nrminf_mv(0, 5); Test::impl_test_nrminf_mv(13, 5); @@ -156,8 +146,7 @@ int test_nrminf_mv() { // Test::impl_test_nrminf_mv(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_nrminf_mv(0, 5); Test::impl_test_nrminf_mv(13, 5); @@ -169,8 +158,7 @@ int test_nrminf_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrminf_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_float"); test_nrminf(); @@ -184,8 +172,7 @@ TEST_F(TestCategory, nrminf_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + 
(!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrminf_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_double"); test_nrminf(); @@ -199,8 +186,7 @@ TEST_F(TestCategory, nrminf_mv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrminf_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_complex_double"); test_nrminf, TestDevice>(); @@ -213,9 +199,8 @@ TEST_F(TestCategory, nrminf_mv_complex_double) { } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrminf_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_int"); test_nrminf(); diff --git a/blas/unit_test/Test_Blas1_reciprocal.hpp b/blas/unit_test/Test_Blas1_reciprocal.hpp index c293fa04eb..2b8a07a552 100644 --- a/blas/unit_test/Test_Blas1_reciprocal.hpp +++ b/blas/unit_test/Test_Blas1_reciprocal.hpp @@ -36,8 +36,7 @@ void impl_test_reciprocal(int N) { view_stride_adapter x("X", N); view_stride_adapter y("Y", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; @@ -71,14 +70,12 @@ void impl_test_reciprocal_mv(int N, int K) { view_stride_adapter x("X", N, K); view_stride_adapter y("Y", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; Test::getRandomBounds(10, randStart, randEnd); - Kokkos::fill_random(x.d_view, rand_pool, - Kokkos::ArithTraits::one(), randEnd); + Kokkos::fill_random(x.d_view, 
rand_pool, Kokkos::ArithTraits::one(), randEnd); } Kokkos::deep_copy(x.h_base, x.d_base); @@ -88,10 +85,8 @@ void impl_test_reciprocal_mv(int N, int K) { Kokkos::deep_copy(y.h_base, y.d_base); for (int j = 0; j < K; ++j) { for (int i = 0; i < N; ++i) { - EXPECT_NEAR_KK( - y.h_view(i, j), - Kokkos::ArithTraits::one() / ScalarB(x.h_view(i, j)), - 2 * Kokkos::ArithTraits::epsilon()); + EXPECT_NEAR_KK(y.h_view(i, j), Kokkos::ArithTraits::one() / ScalarB(x.h_view(i, j)), + 2 * Kokkos::ArithTraits::epsilon()); } } @@ -102,10 +97,8 @@ void impl_test_reciprocal_mv(int N, int K) { Kokkos::deep_copy(y.h_base, y.d_base); for (int j = 0; j < K; j++) { for (int i = 0; i < N; ++i) { - EXPECT_NEAR_KK( - y.h_view(i, j), - Kokkos::ArithTraits::one() / ScalarB(x.h_view(i, j)), - 2 * Kokkos::ArithTraits::epsilon()); + EXPECT_NEAR_KK(y.h_view(i, j), Kokkos::ArithTraits::one() / ScalarB(x.h_view(i, j)), + 2 * Kokkos::ArithTraits::epsilon()); } } } @@ -114,8 +107,7 @@ void impl_test_reciprocal_mv(int N, int K) { template int test_reciprocal() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_reciprocal(0); @@ -125,8 +117,7 @@ int test_reciprocal() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_reciprocal(0); @@ -135,8 +126,7 @@ int test_reciprocal() { // Test::impl_test_reciprocal(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_reciprocal(0); @@ -145,8 +135,7 @@ int test_reciprocal() { // Test::impl_test_reciprocal(132231); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_reciprocal(1024); Test::impl_test_reciprocal(1024); #endif @@ -157,57 +146,47 @@ int test_reciprocal() { template int test_reciprocal_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_reciprocal_mv(0, 5); Test::impl_test_reciprocal_mv(13, 5); - Test::impl_test_reciprocal_mv(1024, - 5); + Test::impl_test_reciprocal_mv(1024, 5); // Test::impl_test_reciprocal_mv(132231,5); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_reciprocal_mv(0, 5); Test::impl_test_reciprocal_mv(13, 5); - Test::impl_test_reciprocal_mv(1024, - 5); + Test::impl_test_reciprocal_mv(1024, 5); // Test::impl_test_reciprocal_mv(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_reciprocal_mv(0, 5); Test::impl_test_reciprocal_mv(13, 5); - Test::impl_test_reciprocal_mv(1024, - 5); + 
Test::impl_test_reciprocal_mv(1024, 5); // Test::impl_test_reciprocal_mv(132231,5); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_reciprocal_mv(1024, - 5); - Test::impl_test_reciprocal_mv(1024, - 5); +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_reciprocal_mv(1024, 5); + Test::impl_test_reciprocal_mv(1024, 5); #endif return 1; } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, reciprocal_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_float"); test_reciprocal(); @@ -221,8 +200,7 @@ TEST_F(TestCategory, reciprocal_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, reciprocal_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_double"); test_reciprocal(); @@ -236,26 +214,21 @@ TEST_F(TestCategory, reciprocal_mv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, reciprocal_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_complex_double"); - test_reciprocal, Kokkos::complex, - TestDevice>(); + test_reciprocal, Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, reciprocal_mv_complex_double) { - Kokkos::Profiling::pushRegion( - "KokkosBlas::Test::reciprocal_mv_complex_double"); - test_reciprocal_mv, Kokkos::complex, - TestDevice>(); + 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_mv_complex_double"); + test_reciprocal_mv, Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, reciprocal_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_int"); test_reciprocal(); diff --git a/blas/unit_test/Test_Blas1_rot.hpp b/blas/unit_test/Test_Blas1_rot.hpp index ab1f395923..db9367cb42 100644 --- a/blas/unit_test/Test_Blas1_rot.hpp +++ b/blas/unit_test/Test_Blas1_rot.hpp @@ -71,8 +71,7 @@ int test_rot() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rot_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rot"); test_rot(); @@ -81,8 +80,7 @@ TEST_F(TestCategory, rot_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rot_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rot"); test_rot(); @@ -91,8 +89,7 @@ TEST_F(TestCategory, rot_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rot_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rot"); test_rot, TestDevice>(); @@ -101,8 +98,7 @@ TEST_F(TestCategory, rot_complex_float) { #endif #if 
defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rot_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rot"); test_rot, TestDevice>(); diff --git a/blas/unit_test/Test_Blas1_rotg.hpp b/blas/unit_test/Test_Blas1_rotg.hpp index 27f9c3cf71..31945ba6d9 100644 --- a/blas/unit_test/Test_Blas1_rotg.hpp +++ b/blas/unit_test/Test_Blas1_rotg.hpp @@ -17,8 +17,7 @@ namespace Test { template -void test_rotg_impl(typename Device::execution_space const& space, - Scalar const a_in, Scalar const b_in) { +void test_rotg_impl(typename Device::execution_space const& space, Scalar const a_in, Scalar const b_in) { using magnitude_type = typename Kokkos::ArithTraits::mag_type; using SViewType = Kokkos::View; using MViewType = Kokkos::View; @@ -59,8 +58,7 @@ int test_rotg() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rotg_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rotg"); test_rotg(); @@ -69,8 +67,7 @@ TEST_F(TestCategory, rotg_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rotg_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rotg"); test_rotg(); @@ -79,8 +76,7 @@ TEST_F(TestCategory, rotg_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, 
rotg_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rotg"); test_rotg, TestDevice>(); @@ -89,8 +85,7 @@ TEST_F(TestCategory, rotg_complex_float) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rotg_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rotg"); test_rotg, TestDevice>(); diff --git a/blas/unit_test/Test_Blas1_rotm.hpp b/blas/unit_test/Test_Blas1_rotm.hpp index 1f41fd06bc..e1a7cddb3c 100644 --- a/blas/unit_test/Test_Blas1_rotm.hpp +++ b/blas/unit_test/Test_Blas1_rotm.hpp @@ -18,8 +18,7 @@ namespace Test { template -void set_rotm_inputs(const int &test_case, vector_view_type &X, - vector_view_type &Y, param_view_type &param, +void set_rotm_inputs(const int &test_case, vector_view_type &X, vector_view_type &Y, param_view_type &param, vector_ref_type &Xref, vector_ref_type &Yref) { // Initialize X and Y inputs typename vector_view_type::HostMirror X_h = Kokkos::create_mirror_view(X); @@ -37,8 +36,7 @@ void set_rotm_inputs(const int &test_case, vector_view_type &X, Kokkos::deep_copy(Y, Y_h); // Initialize Xref, Yref and param (test case dependent) - typename param_view_type::HostMirror param_h = - Kokkos::create_mirror_view(param); + typename param_view_type::HostMirror param_h = Kokkos::create_mirror_view(param); switch (test_case) { case 0: param_h(0) = -2.0; @@ -116,8 +114,7 @@ void set_rotm_inputs(const int &test_case, vector_view_type &X, } template -void check_results(vector_view_type &X, vector_view_type &Y, - vector_ref_type &Xref, vector_ref_type &Yref) { +void check_results(vector_view_type &X, vector_view_type &Y, vector_ref_type &Xref, vector_ref_type &Yref) { using Scalar = typename vector_view_type::value_type; typename vector_view_type::HostMirror X_h = Kokkos::create_mirror_view(X); @@ -162,8 +159,7 @@ int
test_rotm() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rotm_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rotm"); test_rotm(); @@ -172,8 +168,7 @@ TEST_F(TestCategory, rotm_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rotm_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rotm"); test_rotm(); diff --git a/blas/unit_test/Test_Blas1_rotmg.hpp b/blas/unit_test/Test_Blas1_rotmg.hpp index ecfc3b6815..0fb3c5f67e 100644 --- a/blas/unit_test/Test_Blas1_rotmg.hpp +++ b/blas/unit_test/Test_Blas1_rotmg.hpp @@ -17,8 +17,7 @@ namespace Test { template -void test_rotmg_impl(View0& d1, View0& d2, View0& x1, View0& y1, PView& param, - RView& ref_vals) { +void test_rotmg_impl(View0& d1, View0& d2, View0& x1, View0& y1, PView& param, RView& ref_vals) { using scalar_type = typename View0::non_const_value_type; using YView = typename View0::const_type; @@ -28,10 +27,8 @@ void test_rotmg_impl(View0& d1, View0& d2, View0& x1, View0& y1, PView& param, const scalar_type eps = Kokkos::ArithTraits::eps(); const scalar_type tol = -#if defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) || \ - defined(KOKKOSKERNELS_ENABLE_TPL_MKL) - 100 * - eps; // Guessing MKL implements sin/cos differently so need larger tol +#if defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) || defined(KOKKOSKERNELS_ENABLE_TPL_MKL) + 100 * eps; // Guessing MKL implements sin/cos differently so need larger tol #else 10 * eps; #endif @@ -61,8 +58,7 @@ void test_rotmg_impl(View0& d1, View0& d2, View0& x1, View0& y1, PView& param, } template -void set_rotmg_input_ref_vals(const int test_case, View0& d1, View0& d2, - View0& 
x1, View0& y1, PView& param, +void set_rotmg_input_ref_vals(const int test_case, View0& d1, View0& d2, View0& x1, View0& y1, PView& param, RView& ref_vals) { constexpr double gamma = 4096; Kokkos::deep_copy(param, 0.0); @@ -211,9 +207,7 @@ void set_rotmg_input_ref_vals(const int test_case, View0& d1, View0& d2, ref_vals(7) = -0.25; ref_vals(8) = 0.0; break; - default: - throw std::runtime_error("rotmg test: test case unrecognized!"); - break; + default: throw std::runtime_error("rotmg test: test case unrecognized!"); break; } } } // namespace Test @@ -222,8 +216,7 @@ template int test_rotmg() { Kokkos::View d1("d1"), d2("d2"), x1("x1"), y1("y1"); Kokkos::View param("param"); - Kokkos::View ref_vals( - "reference values"); + Kokkos::View ref_vals("reference values"); constexpr int num_test_cases = 9; for (int test_case = 0; test_case < num_test_cases; ++test_case) { diff --git a/blas/unit_test/Test_Blas1_scal.hpp b/blas/unit_test/Test_Blas1_scal.hpp index a88ed646f1..b0169095fd 100644 --- a/blas/unit_test/Test_Blas1_scal.hpp +++ b/blas/unit_test/Test_Blas1_scal.hpp @@ -33,8 +33,7 @@ void impl_test_scal(int N) { view_stride_adapter x("X", N); view_stride_adapter y("Y", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; @@ -68,8 +67,7 @@ void impl_test_scal_mv(int N, int K) { view_stride_adapter x("X", N, K); view_stride_adapter y("Y", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; @@ -89,8 +87,7 @@ void impl_test_scal_mv(int N, int K) { Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(a * x.h_view(i, j)), y.h_view(i, j), - eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i, j)), y.h_view(i, j), eps); } } @@ -100,8 +97,7 @@ void impl_test_scal_mv(int N, int K) { Kokkos::deep_copy(y.h_base, y.d_base); for (int i 
= 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(a * x.h_view(i, j)), y.h_view(i, j), - eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i, j)), y.h_view(i, j), eps); } } @@ -113,16 +109,14 @@ void impl_test_scal_mv(int N, int K) { Kokkos::deep_copy(param_j, ScalarA(3 + j)); } - auto h_params = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), params); + auto h_params = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), params); Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); KokkosBlas::scal(y.d_view, params, x.d_view); Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(h_params(j) * x.h_view(i, j)), - y.h_view(i, j), eps); + EXPECT_NEAR_KK(static_cast(h_params(j) * x.h_view(i, j)), y.h_view(i, j), eps); } } @@ -131,8 +125,7 @@ void impl_test_scal_mv(int N, int K) { Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(h_params(j) * x.h_view(i, j)), - y.h_view(i, j), eps); + EXPECT_NEAR_KK(static_cast(h_params(j) * x.h_view(i, j)), y.h_view(i, j), eps); } } } @@ -141,8 +134,7 @@ void impl_test_scal_mv(int N, int K) { template int test_scal() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_scal(0); @@ -152,8 +144,7 @@ int test_scal() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_scal(0); @@ -162,8 +153,7 @@ int test_scal() { // 
Test::impl_test_scal(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_scal(0); @@ -172,8 +162,7 @@ int test_scal() { // Test::impl_test_scal(132231); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_scal(1024); Test::impl_test_scal(1024); #endif @@ -184,8 +173,7 @@ int test_scal() { template int test_scal_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_scal_mv(0, 5); @@ -195,8 +183,7 @@ int test_scal_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_scal_mv(0, 5); @@ -205,8 +192,7 @@ int test_scal_mv() { // Test::impl_test_scal_mv(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_scal_mv(0, 5); @@ -215,8 +201,7 @@ int test_scal_mv() { // Test::impl_test_scal_mv(132231,5); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_scal_mv(1024, 5); Test::impl_test_scal_mv(1024, 5); #endif @@ -225,8 +210,7 @@ int test_scal_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, scal_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_float"); test_scal(); @@ -240,8 +224,7 @@ TEST_F(TestCategory, scal_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, scal_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_double"); test_scal(); @@ -255,8 +238,7 @@ TEST_F(TestCategory, scal_mv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, scal_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_complex_double"); test_scal, Kokkos::complex, TestDevice>(); @@ -269,9 +251,8 @@ TEST_F(TestCategory, scal_mv_complex_double) { } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, scal_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_int"); test_scal(); @@ -284,8 +265,7 @@ TEST_F(TestCategory, scal_mv_int) { } #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, scal_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_double_int"); test_scal(); diff --git a/blas/unit_test/Test_Blas1_serial_setscal.hpp b/blas/unit_test/Test_Blas1_serial_setscal.hpp index cfbe4d602d..31ad998ac4 100644 --- a/blas/unit_test/Test_Blas1_serial_setscal.hpp +++ b/blas/unit_test/Test_Blas1_serial_setscal.hpp @@ -34,15 +34,13 @@ enum : int { BlasSet = 0, BlasScale = 1 }; struct KokkosKernelTag {}; struct NaiveTag {}; -template +template struct Functor_TestBlasSerialMatUtil { ScalarType _alpha; ViewType _a; KOKKOS_INLINE_FUNCTION - Functor_TestBlasSerialMatUtil(const ScalarType alpha, const ViewType &a) - : _alpha(alpha), _a(a) {} + Functor_TestBlasSerialMatUtil(const ScalarType alpha, const ViewType &a) : _alpha(alpha), _a(a) {} KOKKOS_INLINE_FUNCTION void operator()(const KokkosKernelTag &, const int i) const { @@ -76,27 +74,20 @@ struct Functor_TestBlasSerialMatUtil { typedef typename ViewType::value_type value_type; std::string name_region("KokkosBlas::Test::SerialMatUtil"); const std::string name_value_type = Test::value_type_name(); - std::string name_work_tag = - (std::is_same::value - ? "::KokkosBlas" - : std::is_same::value ? "::Naive" - : "::UnknownWorkTag"); - std::string name_test_id = - (TestID == BlasSet ? "Set" - : TestID == BlasScale ? "Scale" : "UnknownTest"); - std::string name = - name_region + name_value_type + name_work_tag + name_test_id; + std::string name_work_tag = (std::is_same::value ? "::KokkosBlas" + : std::is_same::value ? "::Naive" + : "::UnknownWorkTag"); + std::string name_test_id = (TestID == BlasSet ? "Set" : TestID == BlasScale ? 
"Scale" : "UnknownTest"); + std::string name = name_region + name_value_type + name_work_tag + name_test_id; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy - policy(0, _a.extent(0)); + Kokkos::RangePolicy policy(0, _a.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); return 0; } }; -template +template void impl_test_blas_matutil(const int N, const int BlkSize) { /// typedefs typedef typename ViewType::value_type value_type; @@ -107,8 +98,7 @@ void impl_test_blas_matutil(const int N, const int BlkSize) { ViewType a("a", N, BlkSize, BlkSize); ViewType b("b", N, BlkSize, BlkSize); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a, random, value_type(1.0)); Kokkos::fence(); @@ -116,12 +106,8 @@ void impl_test_blas_matutil(const int N, const int BlkSize) { Kokkos::deep_copy(b, a); /// test body - Functor_TestBlasSerialMatUtil(alpha, a) - .run(); - Functor_TestBlasSerialMatUtil(alpha, b) - .run(); + Functor_TestBlasSerialMatUtil(alpha, a).run(); + Functor_TestBlasSerialMatUtil(alpha, b).run(); Kokkos::fence(); @@ -133,44 +119,31 @@ void impl_test_blas_matutil(const int N, const int BlkSize) { Kokkos::deep_copy(b_host, b); /// check a = b - typename ats::mag_type eps = - 100 * std::numeric_limits::epsilon(); + typename ats::mag_type eps = 100 * std::numeric_limits::epsilon(); for (int k = 0; k < N; ++k) for (int i = 0; i < BlkSize; ++i) - for (int j = 0; j < BlkSize; ++j) - EXPECT_NEAR_KK(b_host(k, i, j), a_host(k, i, j), eps); + for (int j = 0; j < BlkSize; ++j) EXPECT_NEAR_KK(b_host(k, i, j), a_host(k, i, j), eps); } } // namespace Test -template +template int test_blas_matutil() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - ViewType; - Test::impl_test_blas_matutil(0, - 10); - Test::impl_test_blas_matutil(10, - 15); - Test::impl_test_blas_matutil(1024, - 9); - Test::impl_test_blas_matutil( - 132231, 3); + 
typedef Kokkos::View ViewType; + Test::impl_test_blas_matutil(0, 10); + Test::impl_test_blas_matutil(10, 15); + Test::impl_test_blas_matutil(1024, 9); + Test::impl_test_blas_matutil(132231, 3); } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - Test::impl_test_blas_matutil(0, - 10); - Test::impl_test_blas_matutil(10, - 15); - Test::impl_test_blas_matutil(1024, - 9); - Test::impl_test_blas_matutil( - 132231, 3); + typedef Kokkos::View ViewType; + Test::impl_test_blas_matutil(0, 10); + Test::impl_test_blas_matutil(10, 15); + Test::impl_test_blas_matutil(1024, 9); + Test::impl_test_blas_matutil(132231, 3); } #endif @@ -201,19 +174,15 @@ TEST_F(TestCategory, blas_scalar_serial_scale_double_double) { #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, blas_scalar_serial_set_dcomplex_dcomplex) { - test_blas_matutil, - Kokkos::complex, ::Test::BlasSet>(); + test_blas_matutil, Kokkos::complex, ::Test::BlasSet>(); } TEST_F(TestCategory, blas_scalar_serial_scale_dcomplex_dcomplex) { - test_blas_matutil, - Kokkos::complex, ::Test::BlasScale>(); + test_blas_matutil, Kokkos::complex, ::Test::BlasScale>(); } TEST_F(TestCategory, blas_scalar_serial_set_dcomplex_double) { - test_blas_matutil, double, - ::Test::BlasSet>(); + test_blas_matutil, double, ::Test::BlasSet>(); } TEST_F(TestCategory, blas_scalar_serial_scale_dcomplex_double) { - test_blas_matutil, double, - ::Test::BlasScale>(); + test_blas_matutil, double, ::Test::BlasScale>(); } #endif diff --git a/blas/unit_test/Test_Blas1_sum.hpp b/blas/unit_test/Test_Blas1_sum.hpp index 34d52a7e4a..6d7ae3818e 100644 --- a/blas/unit_test/Test_Blas1_sum.hpp +++ b/blas/unit_test/Test_Blas1_sum.hpp @@ -26,8 +26,7 @@ void impl_test_sum(int N) { view_stride_adapter a("A", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); @@ -53,8 +52,7 @@ void 
impl_test_sum_mv(int N, int K) { view_stride_adapter a("A", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); @@ -76,8 +74,7 @@ void impl_test_sum_mv(int N, int K) { Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA nonconst_result = r(k); - EXPECT_NEAR_KK(nonconst_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(nonconst_result, expected_result[k], eps * expected_result[k]); } KokkosBlas::sum(r, a.d_view_const); @@ -94,8 +91,7 @@ void impl_test_sum_mv(int N, int K) { template int test_sum() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_sum(0); Test::impl_test_sum(13); @@ -104,8 +100,7 @@ int test_sum() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_sum(0); Test::impl_test_sum(13); @@ -113,8 +108,7 @@ int test_sum() { // Test::impl_test_sum(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_sum(0); Test::impl_test_sum(13); @@ -128,8 +122,7 @@ int test_sum() { template int test_sum_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; 
Test::impl_test_sum_mv(0, 5); Test::impl_test_sum_mv(13, 5); @@ -139,8 +132,7 @@ int test_sum_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_sum_mv(0, 5); Test::impl_test_sum_mv(13, 5); @@ -149,8 +141,7 @@ int test_sum_mv() { // Test::impl_test_sum_mv(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_sum_mv(0, 5); Test::impl_test_sum_mv(13, 5); @@ -163,8 +154,7 @@ int test_sum_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, sum_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_float"); test_sum(); @@ -178,8 +168,7 @@ TEST_F(TestCategory, sum_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, sum_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_double"); test_sum(); @@ -193,8 +182,7 @@ TEST_F(TestCategory, sum_mv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, sum_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_complex_double"); test_sum, TestDevice>(); @@ -207,9 +195,8 @@ 
TEST_F(TestCategory, sum_mv_complex_double) { } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, sum_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_int"); test_sum(); diff --git a/blas/unit_test/Test_Blas1_swap.hpp b/blas/unit_test/Test_Blas1_swap.hpp index 624552f1dc..15a04c652c 100644 --- a/blas/unit_test/Test_Blas1_swap.hpp +++ b/blas/unit_test/Test_Blas1_swap.hpp @@ -55,8 +55,7 @@ int test_swap() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, swap_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::swap_float"); test_swap(); @@ -65,8 +64,7 @@ TEST_F(TestCategory, swap_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, swap_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::swap_double"); test_swap(); @@ -75,8 +73,7 @@ TEST_F(TestCategory, swap_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, swap_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::swap_complex_float"); test_swap, TestDevice>(); @@ -85,8 +82,7 @@ TEST_F(TestCategory, swap_complex_float) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, swap_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::swap_complex_double"); test_swap, TestDevice>(); diff --git a/blas/unit_test/Test_Blas1_team_abs.hpp b/blas/unit_test/Test_Blas1_team_abs.hpp index eca7657b55..0f78731ab3 100644 --- a/blas/unit_test/Test_Blas1_team_abs.hpp +++ b/blas/unit_test/Test_Blas1_team_abs.hpp @@ -47,8 +47,7 @@ void impl_test_team_abs(int N) { view_stride_adapter x("X", N); view_stride_adapter y("Y", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); Kokkos::fill_random(x.d_view, rand_pool, ScalarA(1)); Kokkos::fill_random(y.d_view, rand_pool, ScalarB(1)); @@ -56,52 +55,36 @@ void impl_test_team_abs(int N) { Kokkos::deep_copy(x.h_base, x.d_base); ScalarA expected_result = 0; - for (int i = 0; i < N; i++) - expected_result += AT::abs(x.h_view(i)) * AT::abs(x.h_view(i)); + for (int i = 0; i < N; i++) expected_result += AT::abs(x.h_view(i)) * AT::abs(x.h_view(i)); // KokkosBlas::abs(y,x); Kokkos::parallel_for( - "KokkosBlas::Test::TeamAbs", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamAbs", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::abs( teamMember, - Kokkos::subview( - y.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), - Kokkos::subview( - x.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(y.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(x.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? 
(teamId + 1) * team_data_siz : N))); }); ScalarB nonconst_nonconst_result = KokkosBlas::dot(y.d_view, y.d_view); - EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, - eps * expected_result); + EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, eps * expected_result); // Zero out y and run again with const input Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); // KokkosBlas::abs(y,c_x); Kokkos::parallel_for( - "KokkosBlas::Test::TeamAbs", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamAbs", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::abs( teamMember, - Kokkos::subview( - y.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), - Kokkos::subview( - x.d_view_const, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(y.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(x.d_view_const, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? 
(teamId + 1) * team_data_siz : N))); }); ScalarB const_nonconst_result = KokkosBlas::dot(y.d_view, y.d_view); @@ -134,8 +117,7 @@ void impl_test_team_abs_mv(int N, int K) { ScalarA *expected_result = new ScalarA[K]; for (int j = 0; j < K; j++) { expected_result[j] = ScalarA(); - for (int i = 0; i < N; i++) - expected_result[j] += AT::abs(x.h_view(i, j)) * AT::abs(x.h_view(i, j)); + for (int i = 0; i < N; i++) expected_result[j] += AT::abs(x.h_view(i, j)) * AT::abs(x.h_view(i, j)); } // double eps = std::is_same::value?2*1e-5:1e-7; @@ -147,21 +129,17 @@ void impl_test_team_abs_mv(int N, int K) { // KokkosBlas::abs(y,x); Kokkos::parallel_for( - "KokkosBlas::Test::TeamAbs", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamAbs", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - KokkosBlas::Experimental::abs( - teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), - Kokkos::subview(x.d_view, Kokkos::ALL(), teamId)); + KokkosBlas::Experimental::abs(teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), + Kokkos::subview(x.d_view, Kokkos::ALL(), teamId)); }); KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { - ScalarA nonconst_result = r(k); - typename AT::mag_type divisor = - AT::abs(expected_result[k]) == zero ? one : AT::abs(expected_result[k]); - typename AT::mag_type diff = - AT::abs(nonconst_result - expected_result[k]) / divisor; + ScalarA nonconst_result = r(k); + typename AT::mag_type divisor = AT::abs(expected_result[k]) == zero ? 
one : AT::abs(expected_result[k]); + typename AT::mag_type diff = AT::abs(nonconst_result - expected_result[k]) / divisor; EXPECT_NEAR_KK(diff, zero, eps); // EXPECT_NEAR_KK( nonconst_result, expected_result[k], // eps*expected_result[k]); @@ -172,21 +150,17 @@ void impl_test_team_abs_mv(int N, int K) { // KokkosBlas::abs(y,c_x); Kokkos::parallel_for( - "KokkosBlas::Test::TeamAbs", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamAbs", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - KokkosBlas::Experimental::abs( - teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), - Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId)); + KokkosBlas::Experimental::abs(teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), + Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId)); }); KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { - ScalarA const_result = r(k); - typename AT::mag_type divisor = - AT::abs(expected_result[k]) == zero ? one : AT::abs(expected_result[k]); - typename AT::mag_type diff = - AT::abs(const_result - expected_result[k]) / divisor; + ScalarA const_result = r(k); + typename AT::mag_type divisor = AT::abs(expected_result[k]) == zero ? 
one : AT::abs(expected_result[k]); + typename AT::mag_type diff = AT::abs(const_result - expected_result[k]) / divisor; EXPECT_NEAR_KK(diff, zero, eps); // EXPECT_NEAR_KK( const_result, expected_result[k], // eps*expected_result[k]); @@ -199,8 +173,7 @@ void impl_test_team_abs_mv(int N, int K) { template int test_team_abs() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_team_abs(0); @@ -210,8 +183,7 @@ int test_team_abs() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_team_abs(0); @@ -220,8 +192,7 @@ int test_team_abs() { // Test::impl_test_team_abs(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_team_abs(0); @@ -230,8 +201,7 @@ int test_team_abs() { // Test::impl_test_team_abs(132231); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_team_abs(124); Test::impl_test_team_abs(124); #endif @@ -242,8 +212,7 @@ int test_team_abs() { template int test_team_abs_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_team_abs_mv(0, 5); @@ -254,8 +223,7 @@ int test_team_abs_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_team_abs_mv(0, 5); @@ -265,8 +233,7 @@ int test_team_abs_mv() { // Device>(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_team_abs_mv(0, 5); @@ -276,8 +243,7 @@ int test_team_abs_mv() { // Device>(132231,5); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_team_abs_mv(124, 5); Test::impl_test_team_abs_mv(124, 5); #endif @@ -286,46 +252,31 @@ int test_team_abs_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_abs_float) { - test_team_abs(); -} -TEST_F(TestCategory, team_abs_mv_float) { - test_team_abs_mv(); -} + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_abs_float) { test_team_abs(); } +TEST_F(TestCategory, team_abs_mv_float) { test_team_abs_mv(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_abs_double) { - test_team_abs(); -} -TEST_F(TestCategory, team_abs_mv_double) { 
- test_team_abs_mv(); -} + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_abs_double) { test_team_abs(); } +TEST_F(TestCategory, team_abs_mv_double) { test_team_abs_mv(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_abs_complex_double) { test_team_abs, Kokkos::complex, TestDevice>(); } TEST_F(TestCategory, team_abs_mv_complex_double) { - test_team_abs_mv, Kokkos::complex, - TestDevice>(); + test_team_abs_mv, Kokkos::complex, TestDevice>(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_abs_int) { test_team_abs(); } -TEST_F(TestCategory, team_abs_mv_int) { - test_team_abs_mv(); -} +TEST_F(TestCategory, team_abs_mv_int) { test_team_abs_mv(); } #endif /*#if !defined(KOKKOSKERNELS_ETI_ONLY) && diff --git a/blas/unit_test/Test_Blas1_team_axpby.hpp b/blas/unit_test/Test_Blas1_team_axpby.hpp index 5875f2bc1f..cadb2d0d09 100644 --- a/blas/unit_test/Test_Blas1_team_axpby.hpp +++ b/blas/unit_test/Test_Blas1_team_axpby.hpp @@ -60,57 +60,40 @@ void impl_test_team_axpby(int N) { ScalarA expected_result = 0; for (int i = 0; i < N; i++) - expected_result += ScalarB(a * x.h_view(i) + b * y.h_view(i)) * - ScalarB(a * x.h_view(i) + b * y.h_view(i)); + expected_result += ScalarB(a * x.h_view(i) + b * y.h_view(i)) * ScalarB(a * x.h_view(i) + b * y.h_view(i)); // KokkosBlas::axpby(a,x,b,y); Kokkos::parallel_for( - "KokkosBlas::Test::TeamAxpby", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamAxpby", policy, KOKKOS_LAMBDA(const 
team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::axpby( teamMember, a, - Kokkos::subview( - x.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(x.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), b, - Kokkos::subview( - y.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(y.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); ScalarB nonconst_nonconst_result = KokkosBlas::dot(y.d_view, y.d_view); - EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, - eps * expected_result); + EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, eps * expected_result); Kokkos::deep_copy(y.d_base, org_y.h_base); // KokkosBlas::axpby(a,c_x,b,y); Kokkos::parallel_for( - "KokkosBlas::Test::TeamAxpby", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamAxpby", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::axpby( teamMember, a, - Kokkos::subview( - x.d_view_const, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(x.d_view_const, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), b, - Kokkos::subview( - y.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(y.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? 
(teamId + 1) * team_data_siz : N))); }); - ScalarB const_nonconst_result = - KokkosBlas::dot(y.d_view_const, y.d_view_const); + ScalarB const_nonconst_result = KokkosBlas::dot(y.d_view_const, y.d_view_const); EXPECT_NEAR_KK(const_nonconst_result, expected_result, eps * expected_result); } @@ -146,8 +129,8 @@ void impl_test_team_axpby_mv(int N, int K) { for (int j = 0; j < K; j++) { expected_result[j] = ScalarA(); for (int i = 0; i < N; i++) - expected_result[j] += ScalarB(a * x.h_view(i, j) + b * y.h_view(i, j)) * - ScalarB(a * x.h_view(i, j) + b * y.h_view(i, j)); + expected_result[j] += + ScalarB(a * x.h_view(i, j) + b * y.h_view(i, j)) * ScalarB(a * x.h_view(i, j) + b * y.h_view(i, j)); } double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; @@ -158,40 +141,32 @@ void impl_test_team_axpby_mv(int N, int K) { // KokkosBlas::axpby(a,x,b,y); Kokkos::parallel_for( - "KokkosBlas::Test::TeamAxpby", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamAxpby", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - KokkosBlas::Experimental::axpby( - teamMember, a, Kokkos::subview(x.d_view, Kokkos::ALL(), teamId), b, - Kokkos::subview(y.d_view, Kokkos::ALL(), teamId)); + KokkosBlas::Experimental::axpby(teamMember, a, Kokkos::subview(x.d_view, Kokkos::ALL(), teamId), b, + Kokkos::subview(y.d_view, Kokkos::ALL(), teamId)); }); KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA nonconst_nonconst_result = r(k); - EXPECT_NEAR_KK(AT::abs(nonconst_nonconst_result), - AT::abs(expected_result[k]), - AT::abs(expected_result[k] * eps)); + EXPECT_NEAR_KK(AT::abs(nonconst_nonconst_result), AT::abs(expected_result[k]), AT::abs(expected_result[k] * eps)); } Kokkos::deep_copy(y.d_base, org_y.h_base); // KokkosBlas::axpby(a,c_x,b,y); Kokkos::parallel_for( - "KokkosBlas::Test::TeamAxpby", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamAxpby", policy, 
KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - KokkosBlas::Experimental::axpby( - teamMember, a, - Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId), b, - Kokkos::subview(y.d_view, Kokkos::ALL(), teamId)); + KokkosBlas::Experimental::axpby(teamMember, a, Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId), b, + Kokkos::subview(y.d_view, Kokkos::ALL(), teamId)); }); KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA const_non_const_result = r(k); - EXPECT_NEAR_KK(AT::abs(const_non_const_result), AT::abs(expected_result[k]), - AT::abs(eps * expected_result[k])); + EXPECT_NEAR_KK(AT::abs(const_non_const_result), AT::abs(expected_result[k]), AT::abs(eps * expected_result[k])); } delete[] expected_result; @@ -201,8 +176,7 @@ void impl_test_team_axpby_mv(int N, int K) { template int test_team_axpby() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_team_axpby(0); @@ -212,8 +186,7 @@ int test_team_axpby() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_team_axpby(0); @@ -222,8 +195,7 @@ int test_team_axpby() { // Test::impl_test_team_axpby(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_team_axpby(0); @@ -232,8 +204,7 @@ int 
test_team_axpby() { // Test::impl_test_team_axpby(132231); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_team_axpby(124); Test::impl_test_team_axpby(124); #endif @@ -244,8 +215,7 @@ int test_team_axpby() { template int test_team_axpby_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_team_axpby_mv(0, 5); @@ -256,8 +226,7 @@ int test_team_axpby_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_team_axpby_mv(0, 5); @@ -267,8 +236,7 @@ int test_team_axpby_mv() { // Device>(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_team_axpby_mv(0, 5); @@ -278,8 +246,7 @@ int test_team_axpby_mv() { // Device>(132231,5); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_team_axpby_mv(124, 5); Test::impl_test_team_axpby_mv(124, 5); #endif @@ -288,59 +255,36 @@ int test_team_axpby_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_axpby_float) { - test_team_axpby(); -} -TEST_F(TestCategory, team_axpby_mv_float) { - test_team_axpby_mv(); -} + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_axpby_float) { test_team_axpby(); } +TEST_F(TestCategory, team_axpby_mv_float) { test_team_axpby_mv(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_axpby_double) { - test_team_axpby(); -} -TEST_F(TestCategory, team_axpby_mv_double) { - test_team_axpby_mv(); -} + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_axpby_double) { test_team_axpby(); } +TEST_F(TestCategory, team_axpby_mv_double) { test_team_axpby_mv(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_axpby_complex_double) { - test_team_axpby, Kokkos::complex, - TestDevice>(); + test_team_axpby, Kokkos::complex, TestDevice>(); } TEST_F(TestCategory, team_axpby_mv_complex_double) { - test_team_axpby_mv, Kokkos::complex, - TestDevice>(); + test_team_axpby_mv, Kokkos::complex, TestDevice>(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_axpby_int) { - test_team_axpby(); -} -TEST_F(TestCategory, team_axpby_mv_int) { - test_team_axpby_mv(); -} +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_axpby_int) { test_team_axpby(); } +TEST_F(TestCategory, team_axpby_mv_int) { test_team_axpby_mv(); } #endif -#if 
!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) -TEST_F(TestCategory, team_axpby_double_int) { - test_team_axpby(); -} -TEST_F(TestCategory, team_axpby_double_mv_int) { - test_team_axpby_mv(); -} +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +TEST_F(TestCategory, team_axpby_double_int) { test_team_axpby(); } +TEST_F(TestCategory, team_axpby_double_mv_int) { test_team_axpby_mv(); } #endif #endif // check for lambda availability in CUDA backend diff --git a/blas/unit_test/Test_Blas1_team_axpy.hpp b/blas/unit_test/Test_Blas1_team_axpy.hpp index a5ac6a9c66..de2bf78855 100644 --- a/blas/unit_test/Test_Blas1_team_axpy.hpp +++ b/blas/unit_test/Test_Blas1_team_axpy.hpp @@ -48,8 +48,7 @@ void impl_test_team_axpy(int N) { ScalarA a = 3; double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); Kokkos::fill_random(x.d_view, rand_pool, ScalarA(10)); Kokkos::fill_random(y.d_view, rand_pool, ScalarB(10)); @@ -60,55 +59,38 @@ void impl_test_team_axpy(int N) { ScalarA expected_result = 0; for (int i = 0; i < N; i++) - expected_result += ScalarB(a * x.h_view(i) + y.h_view(i)) * - ScalarB(a * x.h_view(i) + y.h_view(i)); + expected_result += ScalarB(a * x.h_view(i) + y.h_view(i)) * ScalarB(a * x.h_view(i) + y.h_view(i)); // KokkosBlas::axpy(a,x,y); Kokkos::parallel_for( - "KokkosBlas::Test::TeamAxpy", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamAxpy", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::axpy( teamMember, a, - Kokkos::subview( - x.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), - Kokkos::subview( - y.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? 
(teamId + 1) * team_data_siz : N))); + Kokkos::subview(x.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(y.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); ScalarB nonconst_nonconst_result = KokkosBlas::dot(y.d_view, y.d_view); - EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, - eps * expected_result); + EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, eps * expected_result); Kokkos::deep_copy(y.d_base, org_y.h_base); // KokkosBlas::axpy(a,c_x,y); Kokkos::parallel_for( - "KokkosBlas::Test::TeamAxpy", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamAxpy", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::axpy( teamMember, a, - Kokkos::subview( - x.d_view_const, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), - Kokkos::subview( - y.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(x.d_view_const, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(y.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? 
(teamId + 1) * team_data_siz : N))); }); - ScalarB const_nonconst_result = - KokkosBlas::dot(y.d_view_const, y.d_view_const); + ScalarB const_nonconst_result = KokkosBlas::dot(y.d_view_const, y.d_view_const); EXPECT_NEAR_KK(const_nonconst_result, expected_result, eps * expected_result); } @@ -143,8 +125,7 @@ void impl_test_team_axpy_mv(int N, int K) { for (int j = 0; j < K; j++) { expected_result[j] = ScalarA(); for (int i = 0; i < N; i++) - expected_result[j] += ScalarB(a * x.h_view(i, j) + y.h_view(i, j)) * - ScalarB(a * x.h_view(i, j) + y.h_view(i, j)); + expected_result[j] += ScalarB(a * x.h_view(i, j) + y.h_view(i, j)) * ScalarB(a * x.h_view(i, j) + y.h_view(i, j)); } double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; @@ -153,39 +134,32 @@ void impl_test_team_axpy_mv(int N, int K) { // KokkosBlas::axpy(a,x,y); Kokkos::parallel_for( - "KokkosBlas::Test::TeamAxpy", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamAxpy", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - KokkosBlas::Experimental::axpy( - teamMember, a, Kokkos::subview(x.d_view, Kokkos::ALL(), teamId), - Kokkos::subview(y.d_view, Kokkos::ALL(), teamId)); + KokkosBlas::Experimental::axpy(teamMember, a, Kokkos::subview(x.d_view, Kokkos::ALL(), teamId), + Kokkos::subview(y.d_view, Kokkos::ALL(), teamId)); }); KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA nonconst_nonconst_result = r(k); - EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result[k], eps * expected_result[k]); } Kokkos::deep_copy(y.d_base, org_y.h_base); // KokkosBlas::axpy(a,c_x,y); Kokkos::parallel_for( - "KokkosBlas::Test::TeamAxpy", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamAxpy", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - 
KokkosBlas::Experimental::axpy( - teamMember, a, - Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId), - Kokkos::subview(y.d_view, Kokkos::ALL(), teamId)); + KokkosBlas::Experimental::axpy(teamMember, a, Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId), + Kokkos::subview(y.d_view, Kokkos::ALL(), teamId)); }); KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA const_non_const_result = r(k); - EXPECT_NEAR_KK(const_non_const_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(const_non_const_result, expected_result[k], eps * expected_result[k]); } delete[] expected_result; @@ -195,8 +169,7 @@ void impl_test_team_axpy_mv(int N, int K) { template int test_team_axpy() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_team_axpy(0); @@ -206,8 +179,7 @@ int test_team_axpy() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_team_axpy(0); @@ -216,8 +188,7 @@ int test_team_axpy() { // Test::impl_test_team_axpy(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_team_axpy(0); @@ -226,8 +197,7 @@ int test_team_axpy() { // Test::impl_test_team_axpy(132231); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if 
!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_team_axpy(124); Test::impl_test_team_axpy(124); #endif @@ -238,8 +208,7 @@ int test_team_axpy() { template int test_team_axpy_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_team_axpy_mv(0, 5); @@ -250,8 +219,7 @@ int test_team_axpy_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_team_axpy_mv(0, 5); @@ -261,8 +229,7 @@ int test_team_axpy_mv() { // Device>(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_team_axpy_mv(0, 5); @@ -272,8 +239,7 @@ int test_team_axpy_mv() { // Device>(132231,5); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_team_axpy_mv(124, 5); Test::impl_test_team_axpy_mv(124, 5); #endif @@ -282,57 +248,36 @@ int test_team_axpy_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_axpy_float) { - test_team_axpy(); -} -TEST_F(TestCategory, team_axpy_mv_float) { - test_team_axpy_mv(); -} + (!defined(KOKKOSKERNELS_ETI_ONLY) 
&& !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_axpy_float) { test_team_axpy(); } +TEST_F(TestCategory, team_axpy_mv_float) { test_team_axpy_mv(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_axpy_double) { - test_team_axpy(); -} -TEST_F(TestCategory, team_axpy_mv_double) { - test_team_axpy_mv(); -} + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_axpy_double) { test_team_axpy(); } +TEST_F(TestCategory, team_axpy_mv_double) { test_team_axpy_mv(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_axpy_complex_double) { - test_team_axpy, Kokkos::complex, - TestDevice>(); + test_team_axpy, Kokkos::complex, TestDevice>(); } TEST_F(TestCategory, team_axpy_mv_complex_double) { - test_team_axpy_mv, Kokkos::complex, - TestDevice>(); + test_team_axpy_mv, Kokkos::complex, TestDevice>(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_axpy_int) { test_team_axpy(); } -TEST_F(TestCategory, team_axpy_mv_int) { - test_team_axpy_mv(); -} +TEST_F(TestCategory, team_axpy_mv_int) { test_team_axpy_mv(); } #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) -TEST_F(TestCategory, team_axpy_double_int) { - test_team_axpy(); -} -TEST_F(TestCategory, team_axpy_double_mv_int) { - test_team_axpy_mv(); -} +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) 
+TEST_F(TestCategory, team_axpy_double_int) { test_team_axpy(); } +TEST_F(TestCategory, team_axpy_double_mv_int) { test_team_axpy_mv(); } #endif #endif // Check for lambda availability in CUDA backend diff --git a/blas/unit_test/Test_Blas1_team_dot.hpp b/blas/unit_test/Test_Blas1_team_dot.hpp index 26baf261fe..9445d5784d 100644 --- a/blas/unit_test/Test_Blas1_team_dot.hpp +++ b/blas/unit_test/Test_Blas1_team_dot.hpp @@ -61,47 +61,32 @@ void impl_test_team_dot(int N) { ScalarA nonconst_nonconst_result = 0; Kokkos::parallel_for( - "KokkosBlas::Test::TeamDot", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamDot", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); d_r(teamId) = KokkosBlas::Experimental::dot( teamMember, - Kokkos::subview( - a.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), - Kokkos::subview( - b.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(a.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(b.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < M; k++) nonconst_nonconst_result += r(k); double eps = std::is_same::value ? 
2 * 1e-5 : 1e-7; - EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, - eps * expected_result); + EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, eps * expected_result); ScalarA const_const_result = 0; Kokkos::parallel_for( - "KokkosBlas::Test::TeamDot", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamDot", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); d_r(teamId) = KokkosBlas::Experimental::dot( teamMember, - Kokkos::subview( - a.d_view_const, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), - Kokkos::subview( - b.d_view_const, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(a.d_view_const, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(b.d_view_const, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < M; k++) const_const_result += r(k); @@ -112,21 +97,14 @@ void impl_test_team_dot(int N) { ScalarA nonconst_const_result = 0; Kokkos::parallel_for( - "KokkosBlas::Test::TeamDot", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamDot", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); d_r(teamId) = KokkosBlas::Experimental::dot( teamMember, - Kokkos::subview( - a.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), - Kokkos::subview( - b.d_view_const, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(a.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? 
(teamId + 1) * team_data_siz : N)), + Kokkos::subview(b.d_view_const, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < M; k++) nonconst_const_result += r(k); @@ -137,21 +115,14 @@ void impl_test_team_dot(int N) { ScalarA const_nonconst_result = 0; Kokkos::parallel_for( - "KokkosBlas::Test::TeamDot", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamDot", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); d_r(teamId) = KokkosBlas::Experimental::dot( teamMember, - Kokkos::subview( - a.d_view_const, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), - Kokkos::subview( - b.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(a.d_view_const, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(b.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < M; k++) const_nonconst_result += r(k); @@ -185,8 +156,7 @@ void impl_test_team_dot_mv(int N, int K) { ScalarA *expected_result = new ScalarA[K]; for (int j = 0; j < K; j++) { expected_result[j] = ScalarA(); - for (int i = 0; i < N; i++) - expected_result[j] += a.h_view(i, j) * b.h_view(i, j); + for (int i = 0; i < N; i++) expected_result[j] += a.h_view(i, j) * b.h_view(i, j); } double eps = std::is_same::value ? 
2 * 1e-5 : 1e-7; @@ -196,66 +166,54 @@ void impl_test_team_dot_mv(int N, int K) { // KokkosBlas::dot(r,a,b); Kokkos::parallel_for( - "KokkosBlas::Test::TeamDot", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamDot", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - d_r(teamId) = KokkosBlas::Experimental::dot( - teamMember, Kokkos::subview(a.d_view, Kokkos::ALL(), teamId), - Kokkos::subview(b.d_view, Kokkos::ALL(), teamId)); + d_r(teamId) = KokkosBlas::Experimental::dot(teamMember, Kokkos::subview(a.d_view, Kokkos::ALL(), teamId), + Kokkos::subview(b.d_view, Kokkos::ALL(), teamId)); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < K; k++) { ScalarA nonconst_nonconst_result = r(k); - EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result[k], eps * expected_result[k]); } // KokkosBlas::dot(r,c_a,c_b); Kokkos::parallel_for( - "KokkosBlas::Test::TeamDot", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamDot", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - d_r(teamId) = KokkosBlas::Experimental::dot( - teamMember, Kokkos::subview(a.d_view_const, Kokkos::ALL(), teamId), - Kokkos::subview(b.d_view_const, Kokkos::ALL(), teamId)); + d_r(teamId) = KokkosBlas::Experimental::dot(teamMember, Kokkos::subview(a.d_view_const, Kokkos::ALL(), teamId), + Kokkos::subview(b.d_view_const, Kokkos::ALL(), teamId)); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < K; k++) { ScalarA const_const_result = r(k); - EXPECT_NEAR_KK(const_const_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(const_const_result, expected_result[k], eps * expected_result[k]); } // KokkosBlas::dot(r,a,c_b); Kokkos::parallel_for( - "KokkosBlas::Test::TeamDot", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + 
"KokkosBlas::Test::TeamDot", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - d_r(teamId) = KokkosBlas::Experimental::dot( - teamMember, Kokkos::subview(a.d_view, Kokkos::ALL(), teamId), - Kokkos::subview(b.d_view_const, Kokkos::ALL(), teamId)); + d_r(teamId) = KokkosBlas::Experimental::dot(teamMember, Kokkos::subview(a.d_view, Kokkos::ALL(), teamId), + Kokkos::subview(b.d_view_const, Kokkos::ALL(), teamId)); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < K; k++) { ScalarA non_const_const_result = r(k); - EXPECT_NEAR_KK(non_const_const_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(non_const_const_result, expected_result[k], eps * expected_result[k]); } // KokkosBlas::dot(r,c_a,b); Kokkos::parallel_for( - "KokkosBlas::Test::TeamDot", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamDot", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - d_r(teamId) = KokkosBlas::Experimental::dot( - teamMember, Kokkos::subview(a.d_view_const, Kokkos::ALL(), teamId), - Kokkos::subview(b.d_view, Kokkos::ALL(), teamId)); + d_r(teamId) = KokkosBlas::Experimental::dot(teamMember, Kokkos::subview(a.d_view_const, Kokkos::ALL(), teamId), + Kokkos::subview(b.d_view, Kokkos::ALL(), teamId)); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < K; k++) { ScalarA const_non_const_result = r(k); - EXPECT_NEAR_KK(const_non_const_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(const_non_const_result, expected_result[k], eps * expected_result[k]); } delete[] expected_result; @@ -265,8 +223,7 @@ void impl_test_team_dot_mv(int N, int K) { template int test_team_dot() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View 
view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_team_dot(0); @@ -276,8 +233,7 @@ int test_team_dot() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_team_dot(0); @@ -286,8 +242,7 @@ int test_team_dot() { // Test::impl_test_team_dot(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_team_dot(0); @@ -296,8 +251,7 @@ int test_team_dot() { // Test::impl_test_team_dot(132231); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_team_dot(124); Test::impl_test_team_dot(124); #endif @@ -308,8 +262,7 @@ int test_team_dot() { template int test_team_dot_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_team_dot_mv(0, 5); @@ -320,8 +273,7 @@ int test_team_dot_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_team_dot_mv(0, 5); @@ -331,8 +283,7 @@ int test_team_dot_mv() { // 
Device>(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_team_dot_mv(0, 5); @@ -342,8 +293,7 @@ int test_team_dot_mv() { // Device>(132231,5); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_team_dot_mv(124, 5); Test::impl_test_team_dot_mv(124, 5); #endif @@ -352,46 +302,31 @@ int test_team_dot_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_dot_float) { - test_team_dot(); -} -TEST_F(TestCategory, team_dot_mv_float) { - test_team_dot_mv(); -} + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_dot_float) { test_team_dot(); } +TEST_F(TestCategory, team_dot_mv_float) { test_team_dot_mv(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_dot_double) { - test_team_dot(); -} -TEST_F(TestCategory, team_dot_mv_double) { - test_team_dot_mv(); -} + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_dot_double) { test_team_dot(); } +TEST_F(TestCategory, team_dot_mv_double) { test_team_dot_mv(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_dot_complex_double) { test_team_dot, Kokkos::complex, TestDevice>(); } TEST_F(TestCategory, 
team_dot_mv_complex_double) { - test_team_dot_mv, Kokkos::complex, - TestDevice>(); + test_team_dot_mv, Kokkos::complex, TestDevice>(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_dot_int) { test_team_dot(); } -TEST_F(TestCategory, team_dot_mv_int) { - test_team_dot_mv(); -} +TEST_F(TestCategory, team_dot_mv_int) { test_team_dot_mv(); } #endif /*#if !defined(KOKKOSKERNELS_ETI_ONLY) && diff --git a/blas/unit_test/Test_Blas1_team_mult.hpp b/blas/unit_test/Test_Blas1_team_mult.hpp index 488e9ccf51..63fdbf99c1 100644 --- a/blas/unit_test/Test_Blas1_team_mult.hpp +++ b/blas/unit_test/Test_Blas1_team_mult.hpp @@ -65,63 +65,41 @@ void impl_test_team_mult(int N) { ScalarA expected_result = 0; for (int i = 0; i < N; i++) - expected_result += - ScalarC(b * z.h_view(i) + a * x.h_view(i) * y.h_view(i)) * - ScalarC(b * z.h_view(i) + a * x.h_view(i) * y.h_view(i)); + expected_result += ScalarC(b * z.h_view(i) + a * x.h_view(i) * y.h_view(i)) * + ScalarC(b * z.h_view(i) + a * x.h_view(i) * y.h_view(i)); // KokkosBlas::mult(b,z,a,x,y); Kokkos::parallel_for( - "KokkosBlas::Test::TeamMult", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamMult", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::mult( teamMember, b, - Kokkos::subview( - z.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(z.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), a, - Kokkos::subview( - x.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? 
(teamId + 1) * team_data_siz : N)), - Kokkos::subview( - y.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(x.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(y.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); ScalarC nonconst_nonconst_result = KokkosBlas::dot(z.d_view, z.d_view); - EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, - eps * expected_result); + EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, eps * expected_result); // Reset z on device to orig and run again with const-valued y Kokkos::deep_copy(z.d_base, org_z.h_base); // KokkosBlas::mult(b,z,a,x,c_y); Kokkos::parallel_for( - "KokkosBlas::Test::TeamMult", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamMult", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::mult( teamMember, b, - Kokkos::subview( - z.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(z.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), a, - Kokkos::subview( - x.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), - Kokkos::subview( - y.d_view_const, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(x.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(y.d_view_const, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? 
(teamId + 1) * team_data_siz : N))); }); ScalarC const_nonconst_result = KokkosBlas::dot(z.d_view, z.d_view); EXPECT_NEAR_KK(const_nonconst_result, expected_result, eps * expected_result); @@ -130,27 +108,17 @@ void impl_test_team_mult(int N) { Kokkos::deep_copy(z.d_base, org_z.h_base); // KokkosBlas::mult(b,z,a,c_x,c_y); Kokkos::parallel_for( - "KokkosBlas::Test::TeamMult", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamMult", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::mult( teamMember, b, - Kokkos::subview( - z.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(z.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), a, - Kokkos::subview( - x.d_view_const, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), - Kokkos::subview( - y.d_view_const, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(x.d_view_const, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(y.d_view_const, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? 
(teamId + 1) * team_data_siz : N))); }); ScalarC const_const_result = KokkosBlas::dot(z.d_view, z.d_view); EXPECT_NEAR_KK(const_const_result, expected_result, eps * expected_result); @@ -195,19 +163,16 @@ void impl_test_team_mult_mv(int N, int K) { // Since b and a are known and the largest value in z, x and y // is set by the variables max_val, the error upper bound will be // max_error = a * max_val * max_val - typename Kokkos::ArithTraits::mag_type const eps = - Kokkos::ArithTraits::epsilon(); + typename Kokkos::ArithTraits::mag_type const eps = Kokkos::ArithTraits::epsilon(); typename Kokkos::ArithTraits::mag_type const max_error = Kokkos::ArithTraits::abs(a) * max_val * max_val * eps; // KokkosBlas::mult(b,z,a,x,y); Kokkos::parallel_for( - "KokkosBlas::Test::TeamMult", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamMult", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - KokkosBlas::Experimental::mult( - teamMember, b, Kokkos::subview(z.d_view, Kokkos::ALL(), teamId), a, - x.d_view, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId)); + KokkosBlas::Experimental::mult(teamMember, b, Kokkos::subview(z.d_view, Kokkos::ALL(), teamId), a, x.d_view, + Kokkos::subview(y.d_view, Kokkos::ALL(), teamId)); }); Kokkos::deep_copy(z.h_base, z.d_base); @@ -224,12 +189,10 @@ void impl_test_team_mult_mv(int N, int K) { Kokkos::deep_copy(z.d_base, org_z.h_base); // KokkosBlas::mult(b,z,a,x,c_y); Kokkos::parallel_for( - "KokkosBlas::Test::TeamMult", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamMult", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - KokkosBlas::Experimental::mult( - teamMember, b, Kokkos::subview(z.d_view, Kokkos::ALL(), teamId), a, - x.d_view, Kokkos::subview(y.d_view_const, Kokkos::ALL(), teamId)); + KokkosBlas::Experimental::mult(teamMember, b, Kokkos::subview(z.d_view, Kokkos::ALL(), 
teamId), a, x.d_view, + Kokkos::subview(y.d_view_const, Kokkos::ALL(), teamId)); }); Kokkos::deep_copy(z.h_base, z.d_base); @@ -245,58 +208,43 @@ void impl_test_team_mult_mv(int N, int K) { template int test_team_mult() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; typedef Kokkos::View view_type_c_ll; - Test::impl_test_team_mult(0); - Test::impl_test_team_mult(13); - Test::impl_test_team_mult(124); + Test::impl_test_team_mult(0); + Test::impl_test_team_mult(13); + Test::impl_test_team_mult(124); // Test::impl_test_team_mult(132231); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; typedef Kokkos::View view_type_c_lr; - Test::impl_test_team_mult(0); - Test::impl_test_team_mult(13); - Test::impl_test_team_mult(124); + Test::impl_test_team_mult(0); + Test::impl_test_team_mult(13); + Test::impl_test_team_mult(124); // Test::impl_test_team_mult(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; typedef Kokkos::View view_type_c_ls; - Test::impl_test_team_mult(0); - Test::impl_test_team_mult(13); - Test::impl_test_team_mult(124); + Test::impl_test_team_mult(0); + Test::impl_test_team_mult(13); + Test::impl_test_team_mult(124); // Test::impl_test_team_mult(132231); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) 
- Test::impl_test_team_mult(124); - Test::impl_test_team_mult(124); +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_team_mult(124); + Test::impl_test_team_mult(124); #endif return 1; @@ -305,117 +253,79 @@ int test_team_mult() { template int test_team_mult_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; typedef Kokkos::View view_type_c_ll; - Test::impl_test_team_mult_mv(0, 5); - Test::impl_test_team_mult_mv(13, 5); - Test::impl_test_team_mult_mv(124, 5); + Test::impl_test_team_mult_mv(0, 5); + Test::impl_test_team_mult_mv(13, 5); + Test::impl_test_team_mult_mv(124, 5); // Test::impl_test_team_mult_mv(132231,5); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; typedef Kokkos::View view_type_c_lr; - Test::impl_test_team_mult_mv(0, 5); - Test::impl_test_team_mult_mv(13, 5); - Test::impl_test_team_mult_mv(124, 5); + Test::impl_test_team_mult_mv(0, 5); + Test::impl_test_team_mult_mv(13, 5); + Test::impl_test_team_mult_mv(124, 5); // Test::impl_test_team_mult_mv(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; typedef Kokkos::View view_type_c_ls; - Test::impl_test_team_mult_mv(0, 5); - Test::impl_test_team_mult_mv(13, 5); - Test::impl_test_team_mult_mv(124, 5); + Test::impl_test_team_mult_mv(0, 
5); + Test::impl_test_team_mult_mv(13, 5); + Test::impl_test_team_mult_mv(124, 5); // Test::impl_test_team_mult_mv(132231,5); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_team_mult_mv(124, 5); - Test::impl_test_team_mult_mv(124, 5); +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_team_mult_mv(124, 5); + Test::impl_test_team_mult_mv(124, 5); #endif return 1; } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_mult_float) { - test_team_mult(); -} -TEST_F(TestCategory, team_mult_mv_float) { - test_team_mult_mv(); -} + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_mult_float) { test_team_mult(); } +TEST_F(TestCategory, team_mult_mv_float) { test_team_mult_mv(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_mult_double) { - test_team_mult(); -} -TEST_F(TestCategory, team_mult_mv_double) { - test_team_mult_mv(); -} + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_mult_double) { test_team_mult(); } +TEST_F(TestCategory, team_mult_mv_double) { test_team_mult_mv(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_mult_complex_double) { - test_team_mult, Kokkos::complex, - Kokkos::complex, TestDevice>(); + test_team_mult, Kokkos::complex, Kokkos::complex, TestDevice>(); } TEST_F(TestCategory, team_mult_mv_complex_double) { - test_team_mult_mv, Kokkos::complex, - Kokkos::complex, 
TestDevice>(); + test_team_mult_mv, Kokkos::complex, Kokkos::complex, TestDevice>(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_mult_int) { - test_team_mult(); -} -TEST_F(TestCategory, team_mult_mv_int) { - test_team_mult_mv(); -} +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_mult_int) { test_team_mult(); } +TEST_F(TestCategory, team_mult_mv_int) { test_team_mult_mv(); } #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) -TEST_F(TestCategory, team_mult_double_int) { - test_team_mult(); -} -TEST_F(TestCategory, team_mult_double_mv_int) { - test_team_mult_mv(); -} +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +TEST_F(TestCategory, team_mult_double_int) { test_team_mult(); } +TEST_F(TestCategory, team_mult_double_mv_int) { test_team_mult_mv(); } #endif #endif // Check for lambda availability in CUDA backend diff --git a/blas/unit_test/Test_Blas1_team_nrm2.hpp b/blas/unit_test/Test_Blas1_team_nrm2.hpp index 12192032c9..befec6e57b 100644 --- a/blas/unit_test/Test_Blas1_team_nrm2.hpp +++ b/blas/unit_test/Test_Blas1_team_nrm2.hpp @@ -49,10 +49,8 @@ void impl_test_team_nrm2(int N, int K) { typename AT::mag_type *expected_result = new typename AT::mag_type[K]; for (int j = 0; j < K; j++) { expected_result[j] = typename AT::mag_type(); - for (int i = 0; i < N; i++) - expected_result[j] += AT::abs(a.h_view(i, j)) * AT::abs(a.h_view(i, j)); - expected_result[j] = - Kokkos::ArithTraits::sqrt(expected_result[j]); + for (int i = 0; i < N; i++) expected_result[j] += AT::abs(a.h_view(i, j)) * AT::abs(a.h_view(i, j)); + expected_result[j] = Kokkos::ArithTraits::sqrt(expected_result[j]); } double eps = std::is_same::value ? 
2 * 1e-5 : 1e-7; @@ -62,26 +60,22 @@ void impl_test_team_nrm2(int N, int K) { // KokkosBlas::nrm2(r,a); Kokkos::parallel_for( - "KokkosBlas::Test::TeamNrm2", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamNrm2", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - d_r(teamId) = KokkosBlas::Experimental::nrm2( - teamMember, Kokkos::subview(a.d_view, Kokkos::ALL(), teamId)); + d_r(teamId) = KokkosBlas::Experimental::nrm2(teamMember, Kokkos::subview(a.d_view, Kokkos::ALL(), teamId)); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < K; k++) { typename AT::mag_type nonconst_result = r(k); - EXPECT_NEAR_KK(nonconst_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(nonconst_result, expected_result[k], eps * expected_result[k]); } // KokkosBlas::nrm2(r,c_a); Kokkos::parallel_for( - "KokkosBlas::Test::TeamNrm2", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamNrm2", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - d_r(teamId) = KokkosBlas::Experimental::nrm2( - teamMember, Kokkos::subview(a.d_view_const, Kokkos::ALL(), teamId)); + d_r(teamId) = + KokkosBlas::Experimental::nrm2(teamMember, Kokkos::subview(a.d_view_const, Kokkos::ALL(), teamId)); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < K; k++) { @@ -96,8 +90,7 @@ void impl_test_team_nrm2(int N, int K) { template int test_team_nrm2() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_team_nrm2(0, 5); Test::impl_test_team_nrm2(13, 5); @@ -106,8 +99,7 @@ int test_team_nrm2() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_team_nrm2(0, 5); Test::impl_test_team_nrm2(13, 5); @@ -115,8 +107,7 @@ int test_team_nrm2() { // Test::impl_test_team_nrm2(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_team_nrm2(0, 5); Test::impl_test_team_nrm2(13, 5); @@ -128,28 +119,22 @@ int test_team_nrm2() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_nrm2_float) { test_team_nrm2(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_nrm2_double) { test_team_nrm2(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_nrm2_complex_double) { - test_team_nrm2, TestDevice>(); -} + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_nrm2_complex_double) { test_team_nrm2, TestDevice>(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_nrm2_int) { test_team_nrm2(); } #endif diff --git 
a/blas/unit_test/Test_Blas1_team_scal.hpp b/blas/unit_test/Test_Blas1_team_scal.hpp index 212b1e09e9..f3d6707ba3 100644 --- a/blas/unit_test/Test_Blas1_team_scal.hpp +++ b/blas/unit_test/Test_Blas1_team_scal.hpp @@ -62,60 +62,42 @@ void impl_test_team_scal(int N) { } Kokkos::parallel_for( - "KokkosBlas::Test::TeamScal", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamScal", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::scal( teamMember, - Kokkos::subview( - y.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(y.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), a, - Kokkos::subview( - x.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(x.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); { ScalarB nonconst_nonconst_result = KokkosBlas::dot(y.d_view, y.d_view); - typename AT::mag_type divisor = - AT::abs(expected_result) == zero ? one : AT::abs(expected_result); - typename AT::mag_type diff = - AT::abs(nonconst_nonconst_result - expected_result) / divisor; + typename AT::mag_type divisor = AT::abs(expected_result) == zero ? 
one : AT::abs(expected_result); + typename AT::mag_type diff = AT::abs(nonconst_nonconst_result - expected_result) / divisor; EXPECT_NEAR_KK(diff, zero, eps); } Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); Kokkos::parallel_for( - "KokkosBlas::Test::TeamScal", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamScal", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::scal( teamMember, - Kokkos::subview( - y.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(y.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), a, - Kokkos::subview( - x.d_view_const, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(x.d_view_const, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); { ScalarB const_nonconst_result = KokkosBlas::dot(y.d_view, y.d_view); - typename AT::mag_type divisor = - AT::abs(expected_result) == zero ? one : AT::abs(expected_result); - typename AT::mag_type diff = - AT::abs(const_nonconst_result - expected_result) / divisor; + typename AT::mag_type divisor = AT::abs(expected_result) == zero ? 
one : AT::abs(expected_result); + typename AT::mag_type diff = AT::abs(const_nonconst_result - expected_result) / divisor; EXPECT_NEAR_KK(diff, zero, eps); } } @@ -147,8 +129,7 @@ void impl_test_team_scal_mv(int N, int K) { for (int j = 0; j < K; j++) { expected_result[j] = ScalarA(); for (int i = 0; i < N; i++) { - expected_result[j] += - ScalarB(a * x.h_view(i, j)) * ScalarB(a * x.h_view(i, j)); + expected_result[j] += ScalarB(a * x.h_view(i, j)) * ScalarB(a * x.h_view(i, j)); } } @@ -159,21 +140,17 @@ void impl_test_team_scal_mv(int N, int K) { Kokkos::View r("Dot::Result", K); Kokkos::parallel_for( - "KokkosBlas::Test::TeamScal", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamScal", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - KokkosBlas::Experimental::scal( - teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), a, - Kokkos::subview(x.d_view, Kokkos::ALL(), teamId)); + KokkosBlas::Experimental::scal(teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), a, + Kokkos::subview(x.d_view, Kokkos::ALL(), teamId)); }); KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA nonconst_scalar_result = r(k); - typename AT::mag_type divisor = - AT::abs(expected_result[k]) == zero ? one : AT::abs(expected_result[k]); - typename AT::mag_type diff = - AT::abs(nonconst_scalar_result - expected_result[k]) / divisor; + typename AT::mag_type divisor = AT::abs(expected_result[k]) == zero ? 
one : AT::abs(expected_result[k]); + typename AT::mag_type diff = AT::abs(nonconst_scalar_result - expected_result[k]) / divisor; EXPECT_NEAR_KK(diff, zero, eps); } @@ -181,21 +158,17 @@ void impl_test_team_scal_mv(int N, int K) { Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); Kokkos::parallel_for( - "KokkosBlas::Test::TeamScal", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamScal", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - KokkosBlas::Experimental::scal( - teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), a, - Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId)); + KokkosBlas::Experimental::scal(teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), a, + Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId)); }); KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { - ScalarA const_scalar_result = r(k); - typename AT::mag_type divisor = - AT::abs(expected_result[k]) == zero ? one : AT::abs(expected_result[k]); - typename AT::mag_type diff = - AT::abs(const_scalar_result - expected_result[k]) / divisor; + ScalarA const_scalar_result = r(k); + typename AT::mag_type divisor = AT::abs(expected_result[k]) == zero ? 
one : AT::abs(expected_result[k]); + typename AT::mag_type diff = AT::abs(const_scalar_result - expected_result[k]) / divisor; EXPECT_NEAR_KK(diff, zero, eps); } @@ -211,8 +184,7 @@ void impl_test_team_scal_mv(int N, int K) { for (int j = 0; j < K; j++) { expected_result[j] = ScalarA(); for (int i = 0; i < N; i++) { - expected_result[j] += ScalarB((3.0 + j) * x.h_view(i, j)) * - ScalarB((3.0 + j) * x.h_view(i, j)); + expected_result[j] += ScalarB((3.0 + j) * x.h_view(i, j)) * ScalarB((3.0 + j) * x.h_view(i, j)); } } @@ -220,21 +192,17 @@ void impl_test_team_scal_mv(int N, int K) { Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); Kokkos::parallel_for( - "KokkosBlas::Test::TeamScal", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamScal", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - KokkosBlas::Experimental::scal( - teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), - params(teamId), Kokkos::subview(x.d_view, Kokkos::ALL(), teamId)); + KokkosBlas::Experimental::scal(teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), params(teamId), + Kokkos::subview(x.d_view, Kokkos::ALL(), teamId)); }); KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA nonconst_vector_result = r(k); - typename AT::mag_type divisor = - AT::abs(expected_result[k]) == zero ? one : AT::abs(expected_result[k]); - typename AT::mag_type diff = - AT::abs(nonconst_vector_result - expected_result[k]) / divisor; + typename AT::mag_type divisor = AT::abs(expected_result[k]) == zero ? 
one : AT::abs(expected_result[k]); + typename AT::mag_type diff = AT::abs(nonconst_vector_result - expected_result[k]) / divisor; EXPECT_NEAR_KK(diff, zero, eps); } @@ -242,22 +210,17 @@ void impl_test_team_scal_mv(int N, int K) { Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); Kokkos::parallel_for( - "KokkosBlas::Test::TeamScal", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamScal", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - KokkosBlas::Experimental::scal( - teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), - params(teamId), - Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId)); + KokkosBlas::Experimental::scal(teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), params(teamId), + Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId)); }); KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { - ScalarA const_vector_result = r(k); - typename AT::mag_type divisor = - AT::abs(expected_result[k]) == zero ? one : AT::abs(expected_result[k]); - typename AT::mag_type diff = - AT::abs(const_vector_result - expected_result[k]) / divisor; + ScalarA const_vector_result = r(k); + typename AT::mag_type divisor = AT::abs(expected_result[k]) == zero ? 
one : AT::abs(expected_result[k]); + typename AT::mag_type diff = AT::abs(const_vector_result - expected_result[k]) / divisor; EXPECT_NEAR_KK(diff, zero, eps); } @@ -268,8 +231,7 @@ void impl_test_team_scal_mv(int N, int K) { template int test_team_scal() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_team_scal(0); @@ -279,8 +241,7 @@ int test_team_scal() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_team_scal(0); @@ -289,8 +250,7 @@ int test_team_scal() { // Test::impl_test_team_scal(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_team_scal(0); @@ -299,8 +259,7 @@ int test_team_scal() { // Test::impl_test_team_scal(132231); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_team_scal(124); Test::impl_test_team_scal(124); #endif @@ -311,8 +270,7 @@ int test_team_scal() { template int test_team_scal_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View 
view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_team_scal_mv(0, 5); @@ -323,8 +281,7 @@ int test_team_scal_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_team_scal_mv(0, 5); @@ -334,8 +291,7 @@ int test_team_scal_mv() { // Device>(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_team_scal_mv(0, 5); @@ -345,8 +301,7 @@ int test_team_scal_mv() { // Device>(132231,5); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_team_scal_mv(124, 5); Test::impl_test_team_scal_mv(124, 5); #endif @@ -355,57 +310,36 @@ int test_team_scal_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_scal_float) { - test_team_scal(); -} -TEST_F(TestCategory, team_scal_mv_float) { - test_team_scal_mv(); -} + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_scal_float) { test_team_scal(); } +TEST_F(TestCategory, team_scal_mv_float) { test_team_scal_mv(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_scal_double) { - test_team_scal(); -} -TEST_F(TestCategory, team_scal_mv_double) { - test_team_scal_mv(); -} + 
(!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_scal_double) { test_team_scal(); } +TEST_F(TestCategory, team_scal_mv_double) { test_team_scal_mv(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_scal_complex_double) { - test_team_scal, Kokkos::complex, - TestDevice>(); + test_team_scal, Kokkos::complex, TestDevice>(); } TEST_F(TestCategory, team_scal_mv_complex_double) { - test_team_scal_mv, Kokkos::complex, - TestDevice>(); + test_team_scal_mv, Kokkos::complex, TestDevice>(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_scal_int) { test_team_scal(); } -TEST_F(TestCategory, team_scal_mv_int) { - test_team_scal_mv(); -} +TEST_F(TestCategory, team_scal_mv_int) { test_team_scal_mv(); } #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) -TEST_F(TestCategory, team_scal_double_int) { - test_team_scal(); -} -TEST_F(TestCategory, team_scal_double_mv_int) { - test_team_scal_mv(); -} +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +TEST_F(TestCategory, team_scal_double_int) { test_team_scal(); } +TEST_F(TestCategory, team_scal_double_mv_int) { test_team_scal_mv(); } #endif #endif // Check for lambda availability in CUDA backend diff --git a/blas/unit_test/Test_Blas1_team_setscal.hpp b/blas/unit_test/Test_Blas1_team_setscal.hpp index 4d2499a466..33b264aa79 100644 --- a/blas/unit_test/Test_Blas1_team_setscal.hpp +++ b/blas/unit_test/Test_Blas1_team_setscal.hpp @@ -33,35 +33,27 @@ 
enum : int { BlasSet = 0, BlasScale = 1 }; struct KokkosKernelTag {}; struct NaiveTag {}; -template +template struct Functor_TestBlasTeamMatUtil { using execution_space = typename DeviceType::execution_space; ScalarType _alpha; ViewType _a; KOKKOS_INLINE_FUNCTION - Functor_TestBlasTeamMatUtil(const ScalarType alpha, const ViewType &a) - : _alpha(alpha), _a(a) {} + Functor_TestBlasTeamMatUtil(const ScalarType alpha, const ViewType &a) : _alpha(alpha), _a(a) {} template - KOKKOS_INLINE_FUNCTION void operator()(const KokkosKernelTag &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const KokkosKernelTag &, const MemberType &member) const { const int i = member.league_rank(); auto A = Kokkos::subview(_a, i, Kokkos::ALL(), Kokkos::ALL()); switch (TestID) { - case BlasSet: - KokkosBlas::TeamSet::invoke(member, _alpha, A); - break; - case BlasScale: - KokkosBlas::TeamScale::invoke(member, _alpha, A); - break; + case BlasSet: KokkosBlas::TeamSet::invoke(member, _alpha, A); break; + case BlasScale: KokkosBlas::TeamScale::invoke(member, _alpha, A); break; } } template - KOKKOS_INLINE_FUNCTION void operator()(const NaiveTag &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const NaiveTag &, const MemberType &member) const { if (member.team_rank() == 0) { const int k = member.league_rank(); auto A = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); @@ -85,21 +77,15 @@ struct Functor_TestBlasTeamMatUtil { typedef typename ViewType::value_type value_type; std::string name_region("KokkosBlas::Test::SerialMatUtil"); const std::string name_value_type = Test::value_type_name(); - std::string name_work_tag = - (std::is_same::value - ? "::KokkosBlas" - : std::is_same::value ? "::Naive" - : "::UnknownWorkTag"); - std::string name_test_id = - (TestID == BlasSet ? "Set" - : TestID == BlasScale ? 
"Scale" : "UnknownTest"); - std::string name = - name_region + name_value_type + name_work_tag + name_test_id; + std::string name_work_tag = (std::is_same::value ? "::KokkosBlas" + : std::is_same::value ? "::Naive" + : "::UnknownWorkTag"); + std::string name_test_id = (TestID == BlasSet ? "Set" : TestID == BlasScale ? "Scale" : "UnknownTest"); + std::string name = name_region + name_value_type + name_work_tag + name_test_id; Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _a.extent(0); - Kokkos::TeamPolicy policy(league_size, - Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); @@ -107,8 +93,7 @@ struct Functor_TestBlasTeamMatUtil { } }; -template +template void impl_test_blas_matutil(const int N, const int BlkSize) { /// typedefs typedef typename ViewType::value_type value_type; @@ -119,8 +104,7 @@ void impl_test_blas_matutil(const int N, const int BlkSize) { ViewType a("a", N, BlkSize, BlkSize); ViewType b("b", N, BlkSize, BlkSize); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a, random, value_type(1.0)); Kokkos::fence(); @@ -128,12 +112,8 @@ void impl_test_blas_matutil(const int N, const int BlkSize) { Kokkos::deep_copy(b, a); /// test body - Functor_TestBlasTeamMatUtil(alpha, a) - .run(); - Functor_TestBlasTeamMatUtil(alpha, b) - .run(); + Functor_TestBlasTeamMatUtil(alpha, a).run(); + Functor_TestBlasTeamMatUtil(alpha, b).run(); Kokkos::fence(); @@ -145,45 +125,32 @@ void impl_test_blas_matutil(const int N, const int BlkSize) { Kokkos::deep_copy(b_host, b); /// check a = b - typename ats::mag_type eps = - 100 * std::numeric_limits::epsilon(); + typename ats::mag_type eps = 100 * std::numeric_limits::epsilon(); for (int k = 0; k < N; ++k) for (int i = 0; i < BlkSize; ++i) - for (int j = 0; j < BlkSize; ++j) - EXPECT_NEAR_KK(b_host(k, i, j), a_host(k, i, j), eps); + 
for (int j = 0; j < BlkSize; ++j) EXPECT_NEAR_KK(b_host(k, i, j), a_host(k, i, j), eps); } } // namespace TeamMatUtil } // namespace Test -template +template int test_blas_team_matutil() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - ViewType; - Test::TeamMatUtil::impl_test_blas_matutil(0, 10); - Test::TeamMatUtil::impl_test_blas_matutil(10, 15); - Test::TeamMatUtil::impl_test_blas_matutil(1024, 9); - Test::TeamMatUtil::impl_test_blas_matutil(132231, 3); + typedef Kokkos::View ViewType; + Test::TeamMatUtil::impl_test_blas_matutil(0, 10); + Test::TeamMatUtil::impl_test_blas_matutil(10, 15); + Test::TeamMatUtil::impl_test_blas_matutil(1024, 9); + Test::TeamMatUtil::impl_test_blas_matutil(132231, 3); } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - Test::TeamMatUtil::impl_test_blas_matutil(0, 10); - Test::TeamMatUtil::impl_test_blas_matutil(10, 15); - Test::TeamMatUtil::impl_test_blas_matutil(1024, 9); - Test::TeamMatUtil::impl_test_blas_matutil(132231, 3); + typedef Kokkos::View ViewType; + Test::TeamMatUtil::impl_test_blas_matutil(0, 10); + Test::TeamMatUtil::impl_test_blas_matutil(10, 15); + Test::TeamMatUtil::impl_test_blas_matutil(1024, 9); + Test::TeamMatUtil::impl_test_blas_matutil(132231, 3); } #endif @@ -214,19 +181,15 @@ TEST_F(TestCategory, blas_scalar_team_scale_double_double) { #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, blas_scalar_team_set_dcomplex_dcomplex) { - test_blas_team_matutil, - Kokkos::complex, ::Test::BlasSet>(); + test_blas_team_matutil, Kokkos::complex, ::Test::BlasSet>(); } TEST_F(TestCategory, blas_scalar_team_scale_dcomplex_dcomplex) { - test_blas_team_matutil, - Kokkos::complex, ::Test::BlasScale>(); + test_blas_team_matutil, Kokkos::complex, ::Test::BlasScale>(); } TEST_F(TestCategory, blas_scalar_team_set_dcomplex_double) { - test_blas_team_matutil, double, - ::Test::BlasSet>(); + test_blas_team_matutil, double, ::Test::BlasSet>(); } 
TEST_F(TestCategory, blas_scalar_team_scale_dcomplex_double) { - test_blas_team_matutil, double, - ::Test::BlasScale>(); + test_blas_team_matutil, double, ::Test::BlasScale>(); } #endif diff --git a/blas/unit_test/Test_Blas1_team_update.hpp b/blas/unit_test/Test_Blas1_team_update.hpp index cfc76455f3..27765b0936 100644 --- a/blas/unit_test/Test_Blas1_team_update.hpp +++ b/blas/unit_test/Test_Blas1_team_update.hpp @@ -66,64 +66,42 @@ void impl_test_team_update(int N) { ScalarA expected_result = 0; for (int i = 0; i < N; i++) - expected_result += - ScalarC(c * z.h_view(i) + a * x.h_view(i) + b * y.h_view(i)) * - ScalarC(c * z.h_view(i) + a * x.h_view(i) + b * y.h_view(i)); + expected_result += ScalarC(c * z.h_view(i) + a * x.h_view(i) + b * y.h_view(i)) * + ScalarC(c * z.h_view(i) + a * x.h_view(i) + b * y.h_view(i)); // KokkosBlas::update(a,x,b,y,c,z); Kokkos::parallel_for( - "KokkosBlas::Test::TeamUpdate", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamUpdate", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::update( teamMember, a, - Kokkos::subview( - x.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(x.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), b, - Kokkos::subview( - y.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(y.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), c, - Kokkos::subview( - z.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(z.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? 
(teamId + 1) * team_data_siz : N))); }); ScalarC nonconst_nonconst_result = KokkosBlas::dot(z.d_view, z.d_view); - EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, - eps * expected_result); + EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, eps * expected_result); Kokkos::deep_copy(z.d_base, org_z.h_base); // KokkosBlas::update(a,c_x,b,y,c,z); Kokkos::parallel_for( - "KokkosBlas::Test::TeamUpdate", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamUpdate", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::update( teamMember, a, - Kokkos::subview( - x.d_view_const, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(x.d_view_const, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), b, - Kokkos::subview( - y.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(y.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), c, - Kokkos::subview( - z.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(z.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? 
(teamId + 1) * team_data_siz : N))); }); ScalarC const_nonconst_result = KokkosBlas::dot(z.d_view, z.d_view); EXPECT_NEAR_KK(const_nonconst_result, expected_result, eps * expected_result); @@ -131,28 +109,18 @@ void impl_test_team_update(int N) { Kokkos::deep_copy(z.d_base, org_z.h_base); // KokkosBlas::update(a,c_x,b,c_y,c,z); Kokkos::parallel_for( - "KokkosBlas::Test::TeamUpdate", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamUpdate", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::update( teamMember, a, - Kokkos::subview( - x.d_view_const, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(x.d_view_const, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), b, - Kokkos::subview( - y.d_view_const, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(y.d_view_const, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), c, - Kokkos::subview( - z.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(z.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? 
(teamId + 1) * team_data_siz : N))); }); ScalarC const_const_result = KokkosBlas::dot(z.d_view, z.d_view); EXPECT_NEAR_KK(const_const_result, expected_result, eps * expected_result); @@ -196,10 +164,8 @@ void impl_test_team_update_mv(int N, int K) { for (int j = 0; j < K; j++) { expected_result[j] = ScalarC(); for (int i = 0; i < N; i++) - expected_result[j] += - ScalarC(a * x.h_view(i, j) + b * y.h_view(i, j) + - c * z.h_view(i, j)) * - ScalarC(a * x.h_view(i, j) + b * y.h_view(i, j) + c * z.h_view(i, j)); + expected_result[j] += ScalarC(a * x.h_view(i, j) + b * y.h_view(i, j) + c * z.h_view(i, j)) * + ScalarC(a * x.h_view(i, j) + b * y.h_view(i, j) + c * z.h_view(i, j)); } double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; @@ -208,38 +174,31 @@ void impl_test_team_update_mv(int N, int K) { // KokkosBlas::update(a,x,b,y,c,z); Kokkos::parallel_for( - "KokkosBlas::Test::TeamUpdate", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamUpdate", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - KokkosBlas::Experimental::update( - teamMember, a, Kokkos::subview(x.d_view, Kokkos::ALL(), teamId), b, - Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), c, - Kokkos::subview(z.d_view, Kokkos::ALL(), teamId)); + KokkosBlas::Experimental::update(teamMember, a, Kokkos::subview(x.d_view, Kokkos::ALL(), teamId), b, + Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), c, + Kokkos::subview(z.d_view, Kokkos::ALL(), teamId)); }); KokkosBlas::dot(r, z.d_view, z.d_view); for (int k = 0; k < K; k++) { ScalarA nonconst_nonconst_result = r(k); - EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result[k], eps * expected_result[k]); } Kokkos::deep_copy(z.d_base, org_z.h_base); // KokkosBlas::update(a,c_x,b,y,c,z); Kokkos::parallel_for( - "KokkosBlas::Test::TeamUpdate", policy, - KOKKOS_LAMBDA(const team_member 
&teamMember) { + "KokkosBlas::Test::TeamUpdate", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - KokkosBlas::Experimental::update( - teamMember, a, - Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId), b, - Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), c, - Kokkos::subview(z.d_view, Kokkos::ALL(), teamId)); + KokkosBlas::Experimental::update(teamMember, a, Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId), b, + Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), c, + Kokkos::subview(z.d_view, Kokkos::ALL(), teamId)); }); KokkosBlas::dot(r, z.d_view, z.d_view); for (int k = 0; k < K; k++) { ScalarA const_non_const_result = r(k); - EXPECT_NEAR_KK(const_non_const_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(const_non_const_result, expected_result[k], eps * expected_result[k]); } delete[] expected_result; @@ -249,58 +208,43 @@ void impl_test_team_update_mv(int N, int K) { template int test_team_update() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; typedef Kokkos::View view_type_c_ll; - Test::impl_test_team_update(0); - Test::impl_test_team_update(13); - Test::impl_test_team_update(124); + Test::impl_test_team_update(0); + Test::impl_test_team_update(13); + Test::impl_test_team_update(124); // Test::impl_test_team_update(132231); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; typedef Kokkos::View view_type_c_lr; - Test::impl_test_team_update(0); - 
Test::impl_test_team_update(13); - Test::impl_test_team_update(124); + Test::impl_test_team_update(0); + Test::impl_test_team_update(13); + Test::impl_test_team_update(124); // Test::impl_test_team_update(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; typedef Kokkos::View view_type_c_ls; - Test::impl_test_team_update(0); - Test::impl_test_team_update(13); - Test::impl_test_team_update(124); + Test::impl_test_team_update(0); + Test::impl_test_team_update(13); + Test::impl_test_team_update(124); // Test::impl_test_team_update(132231); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_team_update(124); - Test::impl_test_team_update(124); +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_team_update(124); + Test::impl_test_team_update(124); #endif return 1; @@ -309,117 +253,79 @@ int test_team_update() { template int test_team_update_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; typedef Kokkos::View view_type_c_ll; - Test::impl_test_team_update_mv(0, 5); - Test::impl_test_team_update_mv(13, 5); - Test::impl_test_team_update_mv(124, 5); + Test::impl_test_team_update_mv(0, 5); + Test::impl_test_team_update_mv(13, 5); + Test::impl_test_team_update_mv(124, 5); // Test::impl_test_team_update_mv(132231,5); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; typedef Kokkos::View view_type_c_lr; - Test::impl_test_team_update_mv(0, 5); - Test::impl_test_team_update_mv(13, 5); - Test::impl_test_team_update_mv(124, 5); + Test::impl_test_team_update_mv(0, 5); + Test::impl_test_team_update_mv(13, 5); + Test::impl_test_team_update_mv(124, 5); // Test::impl_test_team_update_mv(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; typedef Kokkos::View view_type_c_ls; - Test::impl_test_team_update_mv(0, 5); - Test::impl_test_team_update_mv(13, 5); - Test::impl_test_team_update_mv(124, 5); + Test::impl_test_team_update_mv(0, 5); + Test::impl_test_team_update_mv(13, 5); + Test::impl_test_team_update_mv(124, 5); // Test::impl_test_team_update_mv(132231,5); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_team_update_mv(124, 5); - Test::impl_test_team_update_mv(124, 5); +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_team_update_mv(124, 5); + Test::impl_test_team_update_mv(124, 5); #endif return 1; } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_update_float) { - test_team_update(); -} -TEST_F(TestCategory, team_update_mv_float) { - test_team_update_mv(); -} + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_update_float) { test_team_update(); } +TEST_F(TestCategory, team_update_mv_float) { test_team_update_mv(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_update_double) { - test_team_update(); -} -TEST_F(TestCategory, team_update_mv_double) { - test_team_update_mv(); -} + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_update_double) { test_team_update(); } +TEST_F(TestCategory, team_update_mv_double) { test_team_update_mv(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_update_complex_double) { - test_team_update, Kokkos::complex, - Kokkos::complex, TestDevice>(); + test_team_update, Kokkos::complex, Kokkos::complex, TestDevice>(); } TEST_F(TestCategory, team_update_mv_complex_double) { - test_team_update_mv, Kokkos::complex, - Kokkos::complex, TestDevice>(); + test_team_update_mv, Kokkos::complex, Kokkos::complex, TestDevice>(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_update_int) { - test_team_update(); -} -TEST_F(TestCategory, team_update_mv_int) { - test_team_update_mv(); -} +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_update_int) { test_team_update(); } +TEST_F(TestCategory, team_update_mv_int) { test_team_update_mv(); } #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) -TEST_F(TestCategory, team_update_double_int) { - test_team_update(); -} -TEST_F(TestCategory, team_update_double_mv_int) { - test_team_update_mv(); -} +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +TEST_F(TestCategory, team_update_double_int) { test_team_update(); } 
+TEST_F(TestCategory, team_update_double_mv_int) { test_team_update_mv(); } #endif #endif // Check for lambda availability in CUDA backend diff --git a/blas/unit_test/Test_Blas1_update.hpp b/blas/unit_test/Test_Blas1_update.hpp index cfeddb9d3d..6152a3493b 100644 --- a/blas/unit_test/Test_Blas1_update.hpp +++ b/blas/unit_test/Test_Blas1_update.hpp @@ -37,8 +37,7 @@ void impl_test_update(int N) { view_stride_adapter z("Z", N); view_stride_adapter org_z("Org_Z", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; @@ -64,27 +63,21 @@ void impl_test_update(int N) { KokkosBlas::update(a, x.d_view, b, y.d_view, c, z.d_view); Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(static_cast(a * x.h_view(i) + b * y.h_view(i) + - c * org_z.h_view(i)), - z.h_view(i), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) + b * y.h_view(i) + c * org_z.h_view(i)), z.h_view(i), eps); } Kokkos::deep_copy(z.d_base, org_z.h_base); KokkosBlas::update(a, x.d_view_const, b, y.d_view, c, z.d_view); Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(static_cast(a * x.h_view(i) + b * y.h_view(i) + - c * org_z.h_view(i)), - z.h_view(i), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) + b * y.h_view(i) + c * org_z.h_view(i)), z.h_view(i), eps); } Kokkos::deep_copy(z.d_base, org_z.h_base); KokkosBlas::update(a, x.d_view_const, b, y.d_view_const, c, z.d_view); Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(static_cast(a * x.h_view(i) + b * y.h_view(i) + - c * org_z.h_view(i)), - z.h_view(i), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) + b * y.h_view(i) + c * org_z.h_view(i)), z.h_view(i), eps); } } @@ -99,8 +92,7 @@ void impl_test_update_mv(int N, int K) { view_stride_adapter z("Z", N, K); view_stride_adapter org_z("Org_Z", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + 
Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; @@ -133,10 +125,8 @@ void impl_test_update_mv(int N, int K) { Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK( - static_cast(a * x.h_view(i, j) + b * y.h_view(i, j) + - c * org_z.h_view(i, j)), - z.h_view(i, j), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i, j) + b * y.h_view(i, j) + c * org_z.h_view(i, j)), + z.h_view(i, j), eps); } } @@ -145,10 +135,8 @@ void impl_test_update_mv(int N, int K) { Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK( - static_cast(a * x.h_view(i, j) + b * y.h_view(i, j) + - c * org_z.h_view(i, j)), - z.h_view(i, j), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i, j) + b * y.h_view(i, j) + c * org_z.h_view(i, j)), + z.h_view(i, j), eps); } } } @@ -157,58 +145,43 @@ void impl_test_update_mv(int N, int K) { template int test_update() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; typedef Kokkos::View view_type_c_ll; - Test::impl_test_update(0); - Test::impl_test_update(13); - Test::impl_test_update(1024); + Test::impl_test_update(0); + Test::impl_test_update(13); + Test::impl_test_update(1024); // Test::impl_test_update(132231); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; typedef Kokkos::View view_type_c_lr; - Test::impl_test_update(0); - Test::impl_test_update(13); - Test::impl_test_update(1024); + 
Test::impl_test_update(0); + Test::impl_test_update(13); + Test::impl_test_update(1024); // Test::impl_test_update(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; typedef Kokkos::View view_type_c_ls; - Test::impl_test_update(0); - Test::impl_test_update(13); - Test::impl_test_update(1024); + Test::impl_test_update(0); + Test::impl_test_update(13); + Test::impl_test_update(1024); // Test::impl_test_update(132231); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_update(1024); - Test::impl_test_update(1024); +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_update(1024); + Test::impl_test_update(1024); #endif return 1; @@ -217,66 +190,47 @@ int test_update() { template int test_update_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; typedef Kokkos::View view_type_c_ll; - Test::impl_test_update_mv(0, 5); - Test::impl_test_update_mv(13, 5); - Test::impl_test_update_mv(1024, 5); - Test::impl_test_update_mv(132231, 5); + Test::impl_test_update_mv(0, 5); + Test::impl_test_update_mv(13, 5); + Test::impl_test_update_mv(1024, 5); + Test::impl_test_update_mv(132231, 5); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; typedef Kokkos::View 
view_type_c_lr; - Test::impl_test_update_mv(0, 5); - Test::impl_test_update_mv(13, 5); - Test::impl_test_update_mv(1024, 5); - Test::impl_test_update_mv(132231, 5); + Test::impl_test_update_mv(0, 5); + Test::impl_test_update_mv(13, 5); + Test::impl_test_update_mv(1024, 5); + Test::impl_test_update_mv(132231, 5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; typedef Kokkos::View view_type_c_ls; - Test::impl_test_update_mv(0, 5); - Test::impl_test_update_mv(13, 5); - Test::impl_test_update_mv(1024, 5); - Test::impl_test_update_mv(132231, 5); + Test::impl_test_update_mv(0, 5); + Test::impl_test_update_mv(13, 5); + Test::impl_test_update_mv(1024, 5); + Test::impl_test_update_mv(132231, 5); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_update_mv(1024, 5); - Test::impl_test_update_mv(1024, 5); +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_update_mv(1024, 5); + Test::impl_test_update_mv(1024, 5); #endif return 1; } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, update_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_float"); test_update(); @@ -290,8 +244,7 @@ TEST_F(TestCategory, update_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, update_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_double"); 
test_update(); @@ -304,25 +257,21 @@ TEST_F(TestCategory, update_mv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, update_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_complex_double"); - test_update, Kokkos::complex, - Kokkos::complex, TestDevice>(); + test_update, Kokkos::complex, Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, update_mv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_mv_complex_double"); - test_update_mv, Kokkos::complex, - Kokkos::complex, TestDevice>(); + test_update_mv, Kokkos::complex, Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, update_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_int"); test_update(); @@ -335,8 +284,7 @@ TEST_F(TestCategory, update_mv_int) { } #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, update_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_double_int"); test_update(); diff --git a/blas/unit_test/Test_Blas2_gemv.hpp b/blas/unit_test/Test_Blas2_gemv.hpp index b3f3566f83..d70935c2ac 100644 --- a/blas/unit_test/Test_Blas2_gemv.hpp +++ b/blas/unit_test/Test_Blas2_gemv.hpp @@ -21,10 +21,8 @@ #include namespace Test { -template -void impl_test_gemv_streams(ExecutionSpace& space, const char* mode, int M, - int N) { +template 
+void impl_test_gemv_streams(ExecutionSpace& space, const char* mode, int M, int N) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeX::value_type ScalarX; typedef typename ViewTypeY::value_type ScalarY; @@ -70,10 +68,8 @@ void impl_test_gemv_streams(ExecutionSpace& space, const char* mode, int M, Kokkos::fill_random(space, A.d_view, rand_pool, randStart, randEnd); } - const typename KAT_Y::mag_type max_error = - KAT_Y::abs(alpha * max_valA * max_valX * ldx + beta * max_valY); - const typename KAT_Y::mag_type tol = - max_error * eps * 2; // adding small fudge factor of 2 + const typename KAT_Y::mag_type max_error = KAT_Y::abs(alpha * max_valA * max_valX * ldx + beta * max_valY); + const typename KAT_Y::mag_type tol = max_error * eps * 2; // adding small fudge factor of 2 Kokkos::deep_copy(org_y.h_base, y.d_base); Kokkos::deep_copy(x.h_base, x.d_base); @@ -89,39 +85,33 @@ void impl_test_gemv_streams(ExecutionSpace& space, const char* mode, int M, for (int i = 0; i < ldy; i++) { if (KAT_Y::abs(expected(i) - y.h_view(i)) > tol) { numErrors++; - std::cerr << __FILE__ << ":" << __LINE__ - << ": expected(i)=" << expected(i) << ", h_y(i)=" << y.h_view(i) + std::cerr << __FILE__ << ":" << __LINE__ << ": expected(i)=" << expected(i) << ", h_y(i)=" << y.h_view(i) << std::endl; } } - EXPECT_EQ(numErrors, 0) << "Nonconst input, " << M << 'x' << N - << ", alpha = " << alpha << ", beta = " << beta + EXPECT_EQ(numErrors, 0) << "Nonconst input, " << M << 'x' << N << ", alpha = " << alpha << ", beta = " << beta << ", mode " << mode << ": gemv incorrect"; Kokkos::deep_copy(space, y.d_base, org_y.h_base); - KokkosBlas::gemv(space, mode, alpha, A.d_view, x.d_view_const, beta, - y.d_view); + KokkosBlas::gemv(space, mode, alpha, A.d_view, x.d_view_const, beta, y.d_view); Kokkos::deep_copy(y.h_base, y.d_base); numErrors = 0; Kokkos::fence(); // Wait for vanillaGEMV for (int i = 0; i < ldy; i++) { if (KAT_Y::abs(expected(i) - y.h_view(i)) > tol) numErrors++; } - 
EXPECT_EQ(numErrors, 0) << "Const vector input, " << M << 'x' << N - << ", alpha = " << alpha << ", beta = " << beta + EXPECT_EQ(numErrors, 0) << "Const vector input, " << M << 'x' << N << ", alpha = " << alpha << ", beta = " << beta << ", mode " << mode << ": gemv incorrect"; Kokkos::deep_copy(space, y.d_base, org_y.h_base); - KokkosBlas::gemv(space, mode, alpha, A.d_view_const, x.d_view_const, beta, - y.d_view); + KokkosBlas::gemv(space, mode, alpha, A.d_view_const, x.d_view_const, beta, y.d_view); Kokkos::deep_copy(y.h_base, y.d_base); numErrors = 0; for (int i = 0; i < ldy; i++) { if (KAT_Y::abs(expected(i) - y.h_view(i)) > tol) numErrors++; } - EXPECT_EQ(numErrors, 0) << "Const matrix/vector input, " << M << 'x' << N - << ", alpha = " << alpha << ", beta = " << beta - << ", mode " << mode << ": gemv incorrect"; + EXPECT_EQ(numErrors, 0) << "Const matrix/vector input, " << M << 'x' << N << ", alpha = " << alpha + << ", beta = " << beta << ", mode " << mode << ": gemv incorrect"; // Test once with beta = 0, but with y initially filled with NaN. // This should overwrite the NaNs with the correct result. 
beta = KAT_Y::zero(); @@ -135,32 +125,28 @@ void impl_test_gemv_streams(ExecutionSpace& space, const char* mode, int M, numErrors = 0; for (int i = 0; i < ldy; i++) { if (KAT_Y::isNan(y.h_view(i)) || - KAT_Y::abs(expected(i) - y.h_view(i)) > - KAT_Y::abs(alpha * max_valA * max_valX * ldx * eps * 2)) { + KAT_Y::abs(expected(i) - y.h_view(i)) > KAT_Y::abs(alpha * max_valA * max_valX * ldx * eps * 2)) { numErrors++; - std::cerr << __FILE__ << ":" << __LINE__ << ": expected(" << i - << ")=" << expected(i) << ", h_y(" << i << ")=" << y.h_view(i) - << ", eps=" << eps - << ", 1024*2*eps=" << 1024 * 2 * KAT_Y::epsilon() << std::endl; + std::cerr << __FILE__ << ":" << __LINE__ << ": expected(" << i << ")=" << expected(i) << ", h_y(" << i + << ")=" << y.h_view(i) << ", eps=" << eps << ", 1024*2*eps=" << 1024 * 2 * KAT_Y::epsilon() + << std::endl; } } - EXPECT_EQ(numErrors, 0) << "beta = 0, input contains NaN, A is " << M << 'x' - << N << ", mode " << mode << ": gemv incorrect"; + EXPECT_EQ(numErrors, 0) << "beta = 0, input contains NaN, A is " << M << 'x' << N << ", mode " << mode + << ": gemv incorrect"; } template void impl_test_gemv(const char* mode, int M, int N) { using execution_space = typename Device::execution_space; execution_space space; - impl_test_gemv_streams(space, mode, M, N); + impl_test_gemv_streams(space, mode, M, N); } } // namespace Test template int test_gemv(const char* mode) { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; typedef Kokkos::View view_type_c_ll; @@ -172,85 +158,58 @@ int test_gemv(const char* mode) { Test::impl_test_gemv(mode,10,200); Test::impl_test_gemv(mode,200,10); #endif - Test::impl_test_gemv( - mode, 0, 1024); - Test::impl_test_gemv( - mode, 1024, 0); - Test::impl_test_gemv( - mode, 13, 
13); - Test::impl_test_gemv( - mode, 13, 1024); - Test::impl_test_gemv( - mode, 50, 40); - Test::impl_test_gemv( - mode, 1024, 1024); - Test::impl_test_gemv( - mode, 2131, 2131); + Test::impl_test_gemv(mode, 0, 1024); + Test::impl_test_gemv(mode, 1024, 0); + Test::impl_test_gemv(mode, 13, 13); + Test::impl_test_gemv(mode, 13, 1024); + Test::impl_test_gemv(mode, 50, 40); + Test::impl_test_gemv(mode, 1024, 1024); + Test::impl_test_gemv(mode, 2131, 2131); // Test::impl_test_gemv(mode,132231,1024); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; typedef Kokkos::View view_type_c_lr; - Test::impl_test_gemv( - mode, 0, 1024); - Test::impl_test_gemv( - mode, 1024, 0); - Test::impl_test_gemv( - mode, 13, 13); - Test::impl_test_gemv( - mode, 13, 1024); - Test::impl_test_gemv( - mode, 50, 40); - Test::impl_test_gemv( - mode, 1024, 1024); - Test::impl_test_gemv( - mode, 2131, 2131); + Test::impl_test_gemv(mode, 0, 1024); + Test::impl_test_gemv(mode, 1024, 0); + Test::impl_test_gemv(mode, 13, 13); + Test::impl_test_gemv(mode, 13, 1024); + Test::impl_test_gemv(mode, 50, 40); + Test::impl_test_gemv(mode, 1024, 1024); + Test::impl_test_gemv(mode, 2131, 2131); // Test::impl_test_gemv(mode,132231,1024); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; typedef Kokkos::View view_type_c_ls; - Test::impl_test_gemv( - mode, 0, 1024); - Test::impl_test_gemv( - mode, 1024, 0); - Test::impl_test_gemv( - mode, 13, 13); - Test::impl_test_gemv( - mode, 13, 1024); - Test::impl_test_gemv( - mode, 50, 40); - Test::impl_test_gemv( - mode, 
1024, 1024); - Test::impl_test_gemv( - mode, 2131, 2131); + Test::impl_test_gemv(mode, 0, 1024); + Test::impl_test_gemv(mode, 1024, 0); + Test::impl_test_gemv(mode, 13, 13); + Test::impl_test_gemv(mode, 13, 1024); + Test::impl_test_gemv(mode, 50, 40); + Test::impl_test_gemv(mode, 1024, 1024); + Test::impl_test_gemv(mode, 2131, 2131); // Test::impl_test_gemv(mode,132231,1024); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_gemv( - mode, 1024, 1024); - Test::impl_test_gemv( - mode, 1024, 1024); +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_gemv(mode, 1024, 1024); + Test::impl_test_gemv(mode, 1024, 1024); #endif return 1; } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gemv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_float"); test_gemv("N"); @@ -263,8 +222,7 @@ TEST_F(TestCategory, gemv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gemv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_double"); test_gemv("N"); @@ -277,29 +235,24 @@ TEST_F(TestCategory, gemv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gemv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_complex_double"); - test_gemv, Kokkos::complex, - Kokkos::complex, TestDevice>("N"); + test_gemv, Kokkos::complex, 
Kokkos::complex, TestDevice>("N"); Kokkos::Profiling::popRegion(); Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_tran_complex_double"); - test_gemv, Kokkos::complex, - Kokkos::complex, TestDevice>("T"); + test_gemv, Kokkos::complex, Kokkos::complex, TestDevice>("T"); Kokkos::Profiling::popRegion(); Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_conj_complex_double"); - test_gemv, Kokkos::complex, - Kokkos::complex, TestDevice>("C"); + test_gemv, Kokkos::complex, Kokkos::complex, TestDevice>("C"); Kokkos::Profiling::popRegion(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gemv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_int"); test_gemv("N"); @@ -311,8 +264,7 @@ TEST_F(TestCategory, gemv_int) { } #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, gemv_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_double_int"); test_gemv("N"); @@ -332,34 +284,33 @@ int test_gemv_streams(const char* mode) { using view_type_a_ll = Kokkos::View; using view_type_b_ll = Kokkos::View; using view_type_c_ll = Kokkos::View; - Test::impl_test_gemv_streams(space, mode, 0, 1024); - Test::impl_test_gemv_streams(space, mode, 13, 1024); - Test::impl_test_gemv_streams(space, mode, 50, 40); + Test::impl_test_gemv_streams(space, mode, 0, + 1024); + Test::impl_test_gemv_streams(space, mode, 13, + 1024); + Test::impl_test_gemv_streams(space, mode, 50, + 40); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) using view_type_a_lr = Kokkos::View; using view_type_b_lr = Kokkos::View; using view_type_c_lr = Kokkos::View; - Test::impl_test_gemv_streams(space, mode, 
0, 1024); - Test::impl_test_gemv_streams(space, mode, 13, 1024); - Test::impl_test_gemv_streams(space, mode, 50, 40); + Test::impl_test_gemv_streams(space, mode, 0, + 1024); + Test::impl_test_gemv_streams(space, mode, 13, + 1024); + Test::impl_test_gemv_streams(space, mode, 50, + 40); #endif (void)space; return 1; } -#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F(TestCategory, \ - blas##_##gemv_streams##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_gemv_streams("N"); \ - test_gemv_streams("T"); \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, blas##_##gemv_streams##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gemv_streams("N"); \ + test_gemv_streams("T"); \ } #define NO_TEST_COMPLEX diff --git a/blas/unit_test/Test_Blas2_gemv_util.hpp b/blas/unit_test/Test_Blas2_gemv_util.hpp index e28310c8eb..724a2fc004 100644 --- a/blas/unit_test/Test_Blas2_gemv_util.hpp +++ b/blas/unit_test/Test_Blas2_gemv_util.hpp @@ -23,16 +23,12 @@ namespace Test { -template ::value> -using simd_vector = - KokkosBatched::Vector, length>; +template ::value> +using simd_vector = KokkosBatched::Vector, length>; template struct GemvOpBase { - GemvOpBase(char trans_, ScalarType alpha_, AType A_, XType x_, - ScalarType beta_, YType y_) + GemvOpBase(char trans_, ScalarType alpha_, AType A_, XType x_, ScalarType beta_, YType y_) : trans(trans_), alpha(alpha_), beta(beta_), A(A_), x(x_), y(y_) {} protected: @@ -52,42 +48,32 @@ template struct RefGEMVOp : public GemvOpBase { using params = GemvOpBase; - RefGEMVOp(char trans_, ScalarType alpha_, AType A_, XType x_, - ScalarType beta_, YType y_) + RefGEMVOp(char trans_, ScalarType alpha_, AType A_, XType x_, ScalarType beta_, YType y_) : params(trans_, alpha_, A_, x_, beta_, y_) {} template - KOKKOS_INLINE_FUNCTION void operator()( - const TeamMember & /* member */) const { - vanillaGEMV(params::trans, params::alpha, params::A, params::x, - params::beta, 
params::y); + KOKKOS_INLINE_FUNCTION void operator()(const TeamMember & /* member */) const { + vanillaGEMV(params::trans, params::alpha, params::A, params::x, params::beta, params::y); } }; // RefGEMVOp // fill regular view with random values -template -typename std::enable_if::value>::type -fill_random_view(ViewType A, PoolType &rand_pool, - const ScalarType max_val = 10.0) { +template +typename std::enable_if::value>::type fill_random_view( + ViewType A, PoolType &rand_pool, const ScalarType max_val = 10.0) { Kokkos::fill_random(A, rand_pool, max_val); Kokkos::fence(); } // fill rank-1 view of SIMD vectors with random values -template +template void fill_random_view( - Kokkos::View< - KokkosBatched::Vector, VecLength> *, - Layout, Props...> - x, + Kokkos::View, VecLength> *, Layout, Props...> x, PoolType &rand_pool, const ValueType max_val = 10.0) { // the view can be strided and have Vector values, so randoms // are generated in a plain, linear view first and then copied using device_type = typename decltype(x)::device_type; - Kokkos::View rnd("random_vals", - x.extent(0) * VecLength); + Kokkos::View rnd("random_vals", x.extent(0) * VecLength); Kokkos::fill_random(rnd, rand_pool, max_val); using size_type = decltype(x.extent(0)); for (size_type i = 0; i < x.extent(0); ++i) { @@ -96,19 +82,14 @@ void fill_random_view( } // fill rank-2 view of SIMD vectors with random values -template +template static void fill_random_view( - Kokkos::View< - KokkosBatched::Vector, VecLength> **, - Layout, Props...> - A, + Kokkos::View, VecLength> **, Layout, Props...> A, PoolType &rand_pool, const ValueType max_val = 10.0) { // the view can be strided and have Vector values, so randoms // are generated in a plain, linear view first and then copied using device_type = typename decltype(A)::device_type; - Kokkos::View rnd( - "random_vals", A.extent(0) * A.extent(1) * VecLength); + Kokkos::View rnd("random_vals", A.extent(0) * A.extent(1) * VecLength); Kokkos::fill_random(rnd, 
rand_pool, max_val); using size_type = decltype(A.extent(0)); size_type idx = 0; @@ -120,29 +101,22 @@ static void fill_random_view( } } -template +template struct GEMVTest { - static void run(const char *mode) { - run_algorithms<0, typename GemvFunc::algorithms>(mode); - } + static void run(const char *mode) { run_algorithms<0, typename GemvFunc::algorithms>(mode); } private: // ScalarCoef==void default behavior is to derive alpha/beta scalar types // from A and X scalar types - using ScalarType = typename std::conditional< - !std::is_void::value, ScalarCoef, - typename std::common_type::type>::type; + using ScalarType = typename std::conditional::value, ScalarCoef, + typename std::common_type::type>::type; template - static std::enable_if_t::value> - run_algorithms(const char * /*mode*/) {} + static std::enable_if_t::value> run_algorithms(const char * /*mode*/) {} template - static - typename std::enable_if<(Idx < - std::tuple_size::value)>::type - run_algorithms(const char *mode) { + static typename std::enable_if<(Idx < std::tuple_size::value)>::type run_algorithms( + const char *mode) { run_layouts::type>(mode); run_algorithms(mode); } @@ -156,8 +130,7 @@ struct GEMVTest { #ifdef KOKKOSKERNELS_TEST_LAYOUTRIGHT run_view_types(mode); #endif -#if defined(KOKKOSKERNELS_TEST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_TEST_LAYOUTRIGHT) +#if defined(KOKKOSKERNELS_TEST_LAYOUTLEFT) && defined(KOKKOSKERNELS_TEST_LAYOUTRIGHT) using A_t = typename Kokkos::View; using x_t = typename Kokkos::View; using y_t = typename Kokkos::View; @@ -224,24 +197,16 @@ struct GEMVTest { auto y = Kokkos::subview(b_y, 0, Kokkos::ALL(), 0); // make sure it's actually LayoutStride there - static_assert(std::is_same::value, - ""); - static_assert(std::is_same::value, - ""); - static_assert(std::is_same::value, - ""); + static_assert(std::is_same::value, ""); + static_assert(std::is_same::value, ""); + static_assert(std::is_same::value, ""); run_views(trans, A, x, y); } } template - static void 
run_views(const char trans, ViewTypeA A, ViewTypeX x, - ViewTypeY y) { - Kokkos::TeamPolicy teams( - 1, 1); // just run on device + static void run_views(const char trans, ViewTypeA A, ViewTypeX x, ViewTypeY y) { + Kokkos::TeamPolicy teams(1, 1); // just run on device fill_inputs(A, x, y); ScalarType alpha = 3; // TODO: test also with zero alpha/beta ? ScalarType beta = 5; @@ -249,8 +214,7 @@ struct GEMVTest { // get reference results Kokkos::View y_ref("Y_ref", y.extent(0)); Kokkos::deep_copy(y_ref, y); - RefGEMVOp gemv_ref( - trans, alpha, A, x, beta, y_ref); + RefGEMVOp gemv_ref(trans, alpha, A, x, beta, y_ref); Kokkos::parallel_for(teams, gemv_ref); // 1. check non-consts @@ -265,10 +229,8 @@ struct GEMVTest { run_case(trans, alpha, c_A, c_x, beta, y, y_ref); } - template - static void run_case(const char trans, ScalarType alpha, ViewTypeA A, - ViewTypeX x, ScalarType beta, ViewTypeY y, + template + static void run_case(const char trans, ScalarType alpha, ViewTypeA A, ViewTypeX x, ScalarType beta, ViewTypeY y, ViewTypeYRef y_ref) { // run on original y view (not to alter the test) // but backup it and restore, so it can be reused @@ -277,12 +239,10 @@ struct GEMVTest { // fetch GEMV functor from the factory using op_type = - typename GemvFunc::template functor_type; + typename GemvFunc::template functor_type; op_type gemv_op(trans, alpha, A, x, beta, y); - Kokkos::parallel_for( - Kokkos::TeamPolicy(1, 1), gemv_op); + Kokkos::parallel_for(Kokkos::TeamPolicy(1, 1), gemv_op); const double eps = epsilon(ScalarY{}); EXPECT_NEAR_KK_REL_1DVIEW(y, y_ref, eps); @@ -317,24 +277,15 @@ struct GEMVTest { } // namespace Test -#define TEST_CASE4(PREFIX, FACTORY, NAME, SCALAR_A, SCALAR_X, SCALAR_Y, \ - SCALAR_COEF) \ - using PREFIX##_##NAME##_gemv_test = \ - ::Test::GEMVTest<::Test::FACTORY, SCALAR_A, SCALAR_X, SCALAR_Y, \ - TestDevice, SCALAR_COEF>; \ - TEST_F(TestCategory, PREFIX##_gemv_nt_##NAME) { \ - PREFIX##_##NAME##_gemv_test::run("N"); \ - } \ - TEST_F(TestCategory, 
PREFIX##_gemv_t_##NAME) { \ - PREFIX##_##NAME##_gemv_test::run("T"); \ - } \ - TEST_F(TestCategory, PREFIX##_gemv_ct_##NAME) { \ - PREFIX##_##NAME##_gemv_test::run("C"); \ - } +#define TEST_CASE4(PREFIX, FACTORY, NAME, SCALAR_A, SCALAR_X, SCALAR_Y, SCALAR_COEF) \ + using PREFIX##_##NAME##_gemv_test = \ + ::Test::GEMVTest<::Test::FACTORY, SCALAR_A, SCALAR_X, SCALAR_Y, TestDevice, SCALAR_COEF>; \ + TEST_F(TestCategory, PREFIX##_gemv_nt_##NAME) { PREFIX##_##NAME##_gemv_test::run("N"); } \ + TEST_F(TestCategory, PREFIX##_gemv_t_##NAME) { PREFIX##_##NAME##_gemv_test::run("T"); } \ + TEST_F(TestCategory, PREFIX##_gemv_ct_##NAME) { PREFIX##_##NAME##_gemv_test::run("C"); } #define TEST_CASE2(PREFIX, FACTORY, NAME, SCALAR, SCALAR_COEF) \ TEST_CASE4(PREFIX, FACTORY, NAME, SCALAR, SCALAR, SCALAR, SCALAR_COEF) -#define TEST_CASE(PREFIX, FACTORY, NAME, SCALAR) \ - TEST_CASE2(PREFIX, FACTORY, NAME, SCALAR, SCALAR) +#define TEST_CASE(PREFIX, FACTORY, NAME, SCALAR) TEST_CASE2(PREFIX, FACTORY, NAME, SCALAR, SCALAR) #endif // TEST_BLAS2_GEMV_UTIL_HPP diff --git a/blas/unit_test/Test_Blas2_ger.hpp b/blas/unit_test/Test_Blas2_ger.hpp index 9a8f740569..6e975532e1 100644 --- a/blas/unit_test/Test_Blas2_ger.hpp +++ b/blas/unit_test/Test_Blas2_ger.hpp @@ -53,107 +53,85 @@ namespace Test { -template +template class GerTester { public: GerTester(); ~GerTester(); - void test(const int M, const int N, const int nonConstConstCombinations, - const bool useAnalyticalResults = false, - const bool useHermitianOption = false); + void test(const int M, const int N, const int nonConstConstCombinations, const bool useAnalyticalResults = false, + const bool useHermitianOption = false); private: using _ViewTypeX = Kokkos::View; using _ViewTypeY = Kokkos::View; using _ViewTypeA = Kokkos::View; - using _HostViewTypeX = typename _ViewTypeX::HostMirror; - using _HostViewTypeY = typename _ViewTypeY::HostMirror; - using _HostViewTypeA = typename _ViewTypeA::HostMirror; - using _ViewTypeExpected = - 
Kokkos::View; + using _HostViewTypeX = typename _ViewTypeX::HostMirror; + using _HostViewTypeY = typename _ViewTypeY::HostMirror; + using _HostViewTypeA = typename _ViewTypeA::HostMirror; + using _ViewTypeExpected = Kokkos::View; using _KAT_A = Kokkos::ArithTraits; using _AuxType = typename _KAT_A::mag_type; - void populateVariables(ScalarA& alpha, - view_stride_adapter<_ViewTypeX, false>& x, - view_stride_adapter<_ViewTypeY, false>& y, - view_stride_adapter<_ViewTypeA, false>& A, - _ViewTypeExpected& h_expected, - bool& expectedResultIsKnown); + void populateVariables(ScalarA& alpha, view_stride_adapter<_ViewTypeX, false>& x, + view_stride_adapter<_ViewTypeY, false>& y, view_stride_adapter<_ViewTypeA, false>& A, + _ViewTypeExpected& h_expected, bool& expectedResultIsKnown); template - typename std::enable_if>::value || - std::is_same>::value, - void>::type - populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, - _HostViewTypeA& h_A, _ViewTypeExpected& h_expected); + typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type + populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected); template - typename std::enable_if>::value && - !std::is_same>::value, - void>::type - populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, - _HostViewTypeA& h_A, _ViewTypeExpected& h_expected); + typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type + populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected); template - typename std::enable_if>::value || - std::is_same>::value, - void>::type - populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, - const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, + typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type + populateVanillaValues(const T& 
alpha, const _HostViewTypeX& h_x, const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, _ViewTypeExpected& h_vanilla); template - typename std::enable_if>::value && - !std::is_same>::value, - void>::type - populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, - const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, + typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type + populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, _ViewTypeExpected& h_vanilla); template - typename std::enable_if>::value || - std::is_same>::value, - void>::type - compareVanillaAgainstExpected(const T& alpha, - const _ViewTypeExpected& h_vanilla, + typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type + compareVanillaAgainstExpected(const T& alpha, const _ViewTypeExpected& h_vanilla, const _ViewTypeExpected& h_expected); template - typename std::enable_if>::value && - !std::is_same>::value, - void>::type - compareVanillaAgainstExpected(const T& alpha, - const _ViewTypeExpected& h_vanilla, + typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type + compareVanillaAgainstExpected(const T& alpha, const _ViewTypeExpected& h_vanilla, const _ViewTypeExpected& h_expected); template - typename std::enable_if>::value || - std::is_same>::value, - void>::type - compareKkGerAgainstExpected(const T& alpha, const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_expected); + typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type + compareKkGerAgainstExpected(const T& alpha, const _HostViewTypeA& h_A, const _ViewTypeExpected& h_expected); template - typename std::enable_if>::value && - !std::is_same>::value, - void>::type - compareKkGerAgainstExpected(const T& alpha, const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_expected); + typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, 
void>::type + compareKkGerAgainstExpected(const T& alpha, const _HostViewTypeA& h_A, const _ViewTypeExpected& h_expected); template T shrinkAngleToZeroTwoPiRange(const T input); template - void callKkGerAndCompareAgainstExpected( - const ScalarA& alpha, TX& x, TY& y, - view_stride_adapter<_ViewTypeA, false>& A, - const _ViewTypeExpected& h_expected, const std::string& situation); + void callKkGerAndCompareAgainstExpected(const ScalarA& alpha, TX& x, TY& y, view_stride_adapter<_ViewTypeA, false>& A, + const _ViewTypeExpected& h_expected, const std::string& situation); const bool _A_is_complex; const bool _A_is_lr; @@ -169,16 +147,13 @@ class GerTester { bool _kkGerShouldThrowException; }; -template -GerTester::GerTester() +template +GerTester::GerTester() : _A_is_complex(std::is_same>::value || std::is_same>::value), _A_is_lr(std::is_same::value), _A_is_ll(std::is_same::value), - _testIsGpu(KokkosKernels::Impl::kk_is_gpu_exec_space< - typename Device::execution_space>()) + _testIsGpu(KokkosKernels::Impl::kk_is_gpu_exec_space()) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS , _vanillaUsesDifferentOrderOfOps(_A_is_lr && _testIsGpu) @@ -195,12 +170,8 @@ GerTester::value - ? 1.0e-6 - : (std::is_same<_AuxType, double>::value ? 1.0e-9 : 0)), - _relTol(std::is_same<_AuxType, float>::value - ? 5.0e-3 - : (std::is_same<_AuxType, double>::value ? 1.0e-6 : 0)), + _absTol(std::is_same<_AuxType, float>::value ? 1.0e-6 : (std::is_same<_AuxType, double>::value ? 1.0e-9 : 0)), + _relTol(std::is_same<_AuxType, float>::value ? 5.0e-3 : (std::is_same<_AuxType, double>::value ? 
1.0e-6 : 0)), _M(-1), _N(-1), _useAnalyticalResults(false), @@ -208,31 +179,24 @@ GerTester -GerTester::~GerTester() { +template +GerTester::~GerTester() { // Nothing to do } -template -void GerTester::test(const int M, const int N, - const int nonConstConstCombinations, - const bool useAnalyticalResults, - const bool useHermitianOption) { +template +void GerTester::test( + const int M, const int N, const int nonConstConstCombinations, const bool useAnalyticalResults, + const bool useHermitianOption) { #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Entering GerTester::test()... - - - - - - - - - - - - - - - - " "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - " "- - - - - - - - - " << std::endl; - std::cout << "_A_is_complex = " << _A_is_complex - << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + std::cout << "_A_is_complex = " << _A_is_complex << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll << ", _testIsGpu = " << _testIsGpu - << ", _vanillaUsesDifferentOrderOfOps = " - << _vanillaUsesDifferentOrderOfOps << ", _absTol = " << _absTol + << ", _vanillaUsesDifferentOrderOfOps = " << _vanillaUsesDifferentOrderOfOps << ", _absTol = " << _absTol << ", _relTol = " << _relTol << std::endl; #endif // ******************************************************************** @@ -277,8 +241,7 @@ void GerTester y("Y", _N); view_stride_adapter<_ViewTypeA, false> A("A", _M, _N); - view_stride_adapter<_ViewTypeExpected, true> h_expected( - "expected A += alpha * x * y^{t,h}", _M, _N); + view_stride_adapter<_ViewTypeExpected, true> h_expected("expected A += alpha * x * y^{t,h}", _M, _N); bool expectedResultIsKnown = false; ScalarA alpha(0.); @@ -286,21 +249,16 @@ void GerTesterpopulateVariables(alpha, x, y, A, h_expected.d_view, - expectedResultIsKnown); + this->populateVariables(alpha, x, y, A, h_expected.d_view, expectedResultIsKnown); // ******************************************************************** // Step 3 of 9: populate h_vanilla // 
******************************************************************** - view_stride_adapter<_ViewTypeExpected, true> h_vanilla( - "vanilla = A + alpha * x * y^{t,h}", _M, _N); + view_stride_adapter<_ViewTypeExpected, true> h_vanilla("vanilla = A + alpha * x * y^{t,h}", _M, _N); #ifdef HAVE_KOKKOSKERNELS_DEBUG - Kokkos::printf( - "In Test_Blas2_ger.hpp, computing vanilla A with alpha type = %s\n", - typeid(alpha).name()); + Kokkos::printf("In Test_Blas2_ger.hpp, computing vanilla A with alpha type = %s\n", typeid(alpha).name()); #endif - this->populateVanillaValues(alpha, x.h_view, y.h_view, A.h_view, - h_vanilla.d_view); + this->populateVanillaValues(alpha, x.h_view, y.h_view, A.h_view, h_vanilla.d_view); // ******************************************************************** // Step 4 of 9: use h_vanilla and h_expected as appropriate @@ -309,8 +267,7 @@ void GerTestercompareVanillaAgainstExpected(alpha, h_vanilla.d_view, - h_expected.d_view); + this->compareVanillaAgainstExpected(alpha, h_vanilla.d_view, h_expected.d_view); } else { // ****************************************************************** // Copy h_vanilla to h_expected @@ -325,8 +282,7 @@ void GerTestercallKkGerAndCompareAgainstExpected( - alpha, x.d_view, y.d_view, A, h_expected.d_view, "non const {x,y}"); + this->callKkGerAndCompareAgainstExpected(alpha, x.d_view, y.d_view, A, h_expected.d_view, "non const {x,y}"); } // ******************************************************************** @@ -335,8 +291,7 @@ void GerTestercallKkGerAndCompareAgainstExpected(alpha, x.d_view_const, y.d_view, A, - h_expected.d_view, "const x"); + this->callKkGerAndCompareAgainstExpected(alpha, x.d_view_const, y.d_view, A, h_expected.d_view, "const x"); } // ******************************************************************** @@ -345,8 +300,7 @@ void GerTestercallKkGerAndCompareAgainstExpected(alpha, x.d_view, y.d_view_const, A, - h_expected.d_view, "const y"); + this->callKkGerAndCompareAgainstExpected(alpha, x.d_view, 
y.d_view_const, A, h_expected.d_view, "const y"); } // ******************************************************************** @@ -355,9 +309,8 @@ void GerTestercallKkGerAndCompareAgainstExpected(alpha, x.d_view_const, - y.d_view_const, A, - h_expected.d_view, "const {x,y}"); + this->callKkGerAndCompareAgainstExpected(alpha, x.d_view_const, y.d_view_const, A, h_expected.d_view, + "const {x,y}"); } // ******************************************************************** @@ -376,21 +329,14 @@ void GerTester -void GerTester< - ScalarX, tLayoutX, ScalarY, tLayoutY, ScalarA, tLayoutA, - Device>::populateVariables(ScalarA& alpha, - view_stride_adapter<_ViewTypeX, false>& x, - view_stride_adapter<_ViewTypeY, false>& y, - view_stride_adapter<_ViewTypeA, false>& A, - _ViewTypeExpected& h_expected, - bool& expectedResultIsKnown) { +template +void GerTester::populateVariables( + ScalarA& alpha, view_stride_adapter<_ViewTypeX, false>& x, view_stride_adapter<_ViewTypeY, false>& y, + view_stride_adapter<_ViewTypeA, false>& A, _ViewTypeExpected& h_expected, bool& expectedResultIsKnown) { expectedResultIsKnown = false; if (_useAnalyticalResults) { - this->populateAnalyticalValues(alpha, x.h_view, y.h_view, A.h_view, - h_expected); + this->populateAnalyticalValues(alpha, x.h_view, y.h_view, A.h_view, h_expected); Kokkos::deep_copy(x.d_base, x.h_base); Kokkos::deep_copy(y.d_base, y.h_base); Kokkos::deep_copy(A.d_base, A.h_base); @@ -455,8 +401,7 @@ void GerTester< } else { alpha = 3; - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarX randStart, randEnd; @@ -483,17 +428,12 @@ void GerTester< } // Code for complex values -template +template template -typename std::enable_if>::value || - std::is_same>::value, - void>::type -GerTester::populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, - _HostViewTypeY& h_y, - _HostViewTypeA& h_A, - _ViewTypeExpected& h_expected) { +typename std::enable_if< + std::is_same>::value || 
std::is_same>::value, void>::type +GerTester::populateAnalyticalValues( + T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, _HostViewTypeA& h_A, _ViewTypeExpected& h_expected) { _AuxType auxI(0.); _AuxType auxJ(0.); _AuxType auxIpJ(0.); @@ -518,26 +458,20 @@ GerTestershrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); for (int j = 0; j < _N; ++j) { - auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); - auxIpJ = - this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); - h_A(i, j).real() = - -sin(auxIpJ) - sin(auxI) * sin(auxJ) - cos(auxI) * cos(auxJ); - h_A(i, j).imag() = - -sin(auxIpJ) - sin(auxI) * sin(auxJ) + cos(auxI) * cos(auxJ); + auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); + auxIpJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + h_A(i, j).real() = -sin(auxIpJ) - sin(auxI) * sin(auxJ) - cos(auxI) * cos(auxJ); + h_A(i, j).imag() = -sin(auxIpJ) - sin(auxI) * sin(auxJ) + cos(auxI) * cos(auxJ); } } } else { for (int i = 0; i < _M; ++i) { auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); for (int j = 0; j < _N; ++j) { - auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); - auxImJ = - this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); - h_A(i, j).real() = - -sin(auxImJ) - sin(auxI) * sin(auxJ) + cos(auxI) * cos(auxJ); - h_A(i, j).imag() = - -sin(auxImJ) - sin(auxI) * sin(auxJ) - cos(auxI) * cos(auxJ); + auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); + auxImJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); + h_A(i, j).real() = -sin(auxImJ) - sin(auxI) * sin(auxJ) + cos(auxI) * cos(auxJ); + h_A(i, j).imag() = -sin(auxImJ) - sin(auxI) * sin(auxJ) - cos(auxI) * cos(auxJ); } } } @@ -546,9 +480,8 @@ GerTestershrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); for (int j = 0; j < _N; ++j) { - auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); - auxIpJ = - 
this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); + auxIpJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); h_expected(i, j).real() = -2. * sin(auxI) * sin(auxJ); h_expected(i, j).imag() = 2. * (cos(auxIpJ) - sin(auxIpJ)); } @@ -557,9 +490,8 @@ GerTestershrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); for (int j = 0; j < _N; ++j) { - auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); - auxImJ = - this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); + auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); + auxImJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); h_expected(i, j).real() = 2. * cos(auxI) * cos(auxJ); h_expected(i, j).imag() = -2. * sin(auxImJ); } @@ -568,17 +500,12 @@ GerTester +template template -typename std::enable_if>::value && - !std::is_same>::value, - void>::type -GerTester::populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, - _HostViewTypeY& h_y, - _HostViewTypeA& h_A, - _ViewTypeExpected& h_expected) { +typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type +GerTester::populateAnalyticalValues( + T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, _HostViewTypeA& h_A, _ViewTypeExpected& h_expected) { _AuxType auxI(0.); _AuxType auxJ(0.); _AuxType auxIpJ(0.); @@ -605,25 +532,20 @@ GerTestershrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + auxIpJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); h_expected(i, j) = 3 * sin(auxIpJ); } } } // Code for complex values -template +template template -typename std::enable_if>::value || - std::is_same>::value, - void>::type -GerTester::populateVanillaValues(const T& alpha, - const _HostViewTypeX& h_x, - const _HostViewTypeY& h_y, - const _HostViewTypeA& h_A, - _ViewTypeExpected& h_vanilla) { +typename std::enable_if< + std::is_same>::value || std::is_same>::value, 
void>::type +GerTester::populateVanillaValues( + const T& alpha, const _HostViewTypeX& h_x, const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla) { if (_vanillaUsesDifferentOrderOfOps) { if (_useHermitianOption) { for (int i = 0; i < _M; ++i) { @@ -656,18 +578,13 @@ GerTester +template template -typename std::enable_if>::value && - !std::is_same>::value, - void>::type -GerTester::populateVanillaValues(const T& alpha, - const _HostViewTypeX& h_x, - const _HostViewTypeY& h_y, - const _HostViewTypeA& h_A, - _ViewTypeExpected& h_vanilla) { +typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type +GerTester::populateVanillaValues( + const T& alpha, const _HostViewTypeX& h_x, const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla) { if (_vanillaUsesDifferentOrderOfOps) { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { @@ -683,11 +600,10 @@ GerTester +template template -T GerTester::shrinkAngleToZeroTwoPiRange(const T input) { +T GerTester::shrinkAngleToZeroTwoPiRange( + const T input) { T output(input); #if 0 T twoPi( 2. 
* Kokkos::numbers::pi ); @@ -702,18 +618,13 @@ T GerTester +template template -typename std::enable_if>::value || - std::is_same>::value, - void>::type -GerTester:: - compareVanillaAgainstExpected(const T& alpha, - const _ViewTypeExpected& h_vanilla, - const _ViewTypeExpected& h_expected) { - int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * - 1.e-3); +typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type +GerTester::compareVanillaAgainstExpected( + const T& alpha, const _ViewTypeExpected& h_vanilla, const _ViewTypeExpected& h_expected) { + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); if (_useAnalyticalResults) { int numErrorsRealAbs(0); @@ -732,7 +643,7 @@ GerTester:: for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - diff = _KAT_A::abs(h_expected(i, j).real() - h_vanilla(i, j).real()); + diff = _KAT_A::abs(h_expected(i, j).real() - h_vanilla(i, j).real()); errorHappened = false; if (h_expected(i, j).real() == 0.) { diffThreshold = _KAT_A::abs(_absTol); @@ -756,17 +667,15 @@ GerTester:: } if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j).real() = " << h_expected(i, j).real() + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).real() = " << h_expected(i, j).real() << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() << ", _KAT_A::abs(h_expected(i,j).real() - " "h_vanilla(i,j).real()) = " - << diff << ", diffThreshold = " << diffThreshold - << std::endl; + << diff << ", diffThreshold = " << diffThreshold << std::endl; #endif } - diff = _KAT_A::abs(h_expected(i, j).imag() - h_vanilla(i, j).imag()); + diff = _KAT_A::abs(h_expected(i, j).imag() - h_vanilla(i, j).imag()); errorHappened = false; if (h_expected(i, j).imag() == 0.) 
{ diffThreshold = _KAT_A::abs(_absTol); @@ -790,37 +699,26 @@ GerTester:: } if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() << ", _KAT_A::abs(h_expected(i,j).imag() - " "h_vanilla(i,j).imag()) = " - << diff << ", diffThreshold = " << diffThreshold - << std::endl; + << diff << ", diffThreshold = " << diffThreshold << std::endl; #endif } } // for j } // for i { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption << ": vanilla differs too much from analytical on real components" - << ", numErrorsRealAbs = " << numErrorsRealAbs - << ", numErrorsRealRel = " << numErrorsRealRel - << ", maxErrorRealRel = " << maxErrorRealRel - << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel - << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel - << ", h_expected(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_expected(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) + << ", numErrorsRealAbs = " << numErrorsRealAbs << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel << ", h_expected(i,j).real() = " + << (((_M > 0) && (_N > 0)) ? 
h_expected(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) << ", h_vanilla(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_vanilla(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? h_vanilla(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); @@ -829,29 +727,19 @@ GerTester:: std::cout << "WARNING" << msg.str() << std::endl; #endif } - EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) - << "Failed test" << msg.str(); + EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) << "Failed test" << msg.str(); } { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption << ": vanilla differs too much from analytical on imag components" - << ", numErrorsImagAbs = " << numErrorsImagAbs - << ", numErrorsImagRel = " << numErrorsImagRel - << ", maxErrorImagRel = " << maxErrorImagRel - << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel - << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel - << ", h_expected(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_expected(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) + << ", numErrorsImagAbs = " << numErrorsImagAbs << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel << ", h_expected(i,j).imag() = " + << (((_M > 0) && (_N > 0)) ? 
h_expected(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) << ", h_vanilla(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_vanilla(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? h_vanilla(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); @@ -860,8 +748,7 @@ GerTester:: std::cout << "WARNING" << msg.str() << std::endl; #endif } - EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) - << "Failed test" << msg.str(); + EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) << "Failed test" << msg.str(); } } else { int numErrorsReal(0); @@ -872,11 +759,8 @@ GerTester:: if (h_expected(i, j).real() != h_vanilla(i, j).real()) { if (numErrorsReal == 0) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j).real() = " - << h_expected(i, j).real() - << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() - << std::endl; + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).real() = " << h_expected(i, j).real() + << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() << std::endl; #endif } numErrorsReal++; @@ -885,49 +769,37 @@ GerTester:: if (h_expected(i, j).imag() != h_vanilla(i, j).imag()) { if (numErrorsImag == 0) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j).imag() = " - << h_expected(i, j).imag() - << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() - << std::endl; + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() + << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() << std::endl; #endif } numErrorsImag++; } } // for j } // for i - EXPECT_EQ(numErrorsReal, 0) - << "Failed test" - << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha 
type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ": vanilla result is incorrect on real components" - << ", numErrorsReal = " << numErrorsReal; - EXPECT_EQ(numErrorsImag, 0) - << "Failed test" - << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ": vanilla result is incorrect on imag components" - << ", numErrorsImag = " << numErrorsImag; + EXPECT_EQ(numErrorsReal, 0) << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ": vanilla result is incorrect on real components" + << ", numErrorsReal = " << numErrorsReal; + EXPECT_EQ(numErrorsImag, 0) << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ": vanilla result is incorrect on imag components" + << ", numErrorsImag = " << numErrorsImag; } } // Code for non-complex values -template +template template -typename std::enable_if>::value && - !std::is_same>::value, - void>::type -GerTester:: - compareVanillaAgainstExpected(const T& alpha, - const _ViewTypeExpected& h_vanilla, - const _ViewTypeExpected& h_expected) { - int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * - 1.e-3); +typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type +GerTester::compareVanillaAgainstExpected( + const T& alpha, const _ViewTypeExpected& h_vanilla, const _ViewTypeExpected& h_expected) { + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); if (_useAnalyticalResults) { int numErrorsAbs(0); @@ -965,35 +837,24 @@ GerTester:: } if (errorHappened && 
(numErrorsAbs + numErrorsRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j) = " << h_expected(i, j) + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j) = " << h_expected(i, j) << ", h_vanilla(i,j) = " << h_vanilla(i, j) - << ", _KAT_A::abs(h_expected(i,j) - h_vanilla(i,j)) = " - << diff << ", diffThreshold = " << diffThreshold - << std::endl; + << ", _KAT_A::abs(h_expected(i,j) - h_vanilla(i,j)) = " << diff + << ", diffThreshold = " << diffThreshold << std::endl; #endif } } // for j } // for i { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption << ": vanilla differs too much from expected" - << ", numErrorsAbs = " << numErrorsAbs - << ", numErrorsRel = " << numErrorsRel - << ", maxErrorRel = " << maxErrorRel - << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", numErrorsAbs = " << numErrorsAbs << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel << ", iForMaxErrorRel = " << iForMaxErrorRel << ", jForMaxErrorRel = " << jForMaxErrorRel << ", h_expected(i,j) = " - << (((_M > 0) && (_N > 0)) - ? h_expected(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) - << ", h_vanilla(i,j) = " - << (((_M > 0) && (_N > 0)) - ? h_vanilla(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) + << ", h_vanilla(i,j) = " << (((_M > 0) && (_N > 0)) ? 
h_vanilla(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrors(numErrorsAbs + numErrorsRel); @@ -1012,8 +873,7 @@ GerTester:: if (h_expected(i, j) != h_vanilla(i, j)) { if (numErrors == 0) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j) = " << h_expected(i, j) + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j) = " << h_expected(i, j) << ", h_vanilla(i,j) = " << h_vanilla(i, j) << std::endl; #endif } @@ -1021,29 +881,22 @@ GerTester:: } } // for j } // for i - EXPECT_EQ(numErrors, 0) - << "Failed test" - << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ": vanilla result is incorrect" - << ", numErrors = " << numErrors; + EXPECT_EQ(numErrors, 0) << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption << ": vanilla result is incorrect" + << ", numErrors = " << numErrors; } } // Code for complex values -template +template template -typename std::enable_if>::value || - std::is_same>::value, - void>::type -GerTester:: - compareKkGerAgainstExpected(const T& alpha, const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_expected) { - int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * - 1.e-3); +typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type +GerTester::compareKkGerAgainstExpected( + const T& alpha, const _HostViewTypeA& h_A, const _ViewTypeExpected& h_expected) { + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); int numErrorsRealAbs(0); int numErrorsRealRel(0); @@ -1084,12 +937,10 @@ GerTester:: } if (errorHappened && (numErrorsRealAbs + 
numErrorsRealRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout - << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j).real() = " << h_expected(i, j).real() - << ", h_A(i,j).real() = " << h_A(i, j).real() - << ", _KAT_A::abs(h_expected(i,j).real() - h_A(i,j).real()) = " - << diff << ", diffThreshold = " << diffThreshold << std::endl; + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).real() = " << h_expected(i, j).real() + << ", h_A(i,j).real() = " << h_A(i, j).real() + << ", _KAT_A::abs(h_expected(i,j).real() - h_A(i,j).real()) = " << diff + << ", diffThreshold = " << diffThreshold << std::endl; #endif } @@ -1117,90 +968,56 @@ GerTester:: } if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout - << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() - << ", h_A(i,j).imag() = " << h_A(i, j).imag() - << ", _KAT_A::abs(h_expected(i,j).imag() - h_A(i,j).imag()) = " - << diff << ", diffThreshold = " << diffThreshold << std::endl; + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() + << ", h_A(i,j).imag() = " << h_A(i, j).imag() + << ", _KAT_A::abs(h_expected(i,j).imag() - h_A(i,j).imag()) = " << diff + << ", diffThreshold = " << diffThreshold << std::endl; #endif } } // for j } // for i #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout - << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", numErrorsRealAbs = " << numErrorsRealAbs - << ", numErrorsRealRel = " << numErrorsRealRel - << ", maxErrorRealRel = " << maxErrorRealRel - << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel - << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel - << ", h_expected(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? 
h_expected(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) - << ", h_A(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) - << ", numErrorsImagAbs = " << numErrorsImagAbs - << ", numErrorsImagRel = " << numErrorsImagRel - << ", maxErrorImagRel = " << maxErrorImagRel - << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel - << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel - << ", h_expected(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_expected(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) - << ", h_A(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) - << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; + std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", numErrorsRealAbs = " << numErrorsRealAbs << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel << ", h_expected(i,j).real() = " + << (((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) + << ", h_A(i,j).real() = " + << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) + << ", numErrorsImagAbs = " << numErrorsImagAbs << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel << ", h_expected(i,j).imag() = " + << (((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) + << ", h_A(i,j).imag() = " + << (((_M > 0) && (_N > 0)) ? 
h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; if ((_M == 2131) && (_N == 2131)) { std::cout << "Information" - << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", h_expected(11, 2119) = (" << h_expected(11, 2119).real() - << ", " << h_expected(11, 2119).imag() << ")" - << ", h_A(11, 2119) = (" << h_A(11, 2119).real() << ", " - << h_A(11, 2119).imag() << ")" << std::endl; + << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", h_expected(11, 2119) = (" << h_expected(11, 2119).real() << ", " << h_expected(11, 2119).imag() + << ")" + << ", h_A(11, 2119) = (" << h_A(11, 2119).real() << ", " << h_A(11, 2119).imag() << ")" << std::endl; std::cout << "Information" - << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", h_expected(710, 1065) = (" << h_expected(710, 1065).real() - << ", " << h_expected(710, 1065).imag() << ")" - << ", h_A(710, 1065) = (" << h_A(710, 1065).real() << ", " - << h_A(710, 1065).imag() << ")" << std::endl; + << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", h_expected(710, 1065) = (" << h_expected(710, 1065).real() << ", " << h_expected(710, 1065).imag() + << ")" + << ", h_A(710, 1065) = (" << h_A(710, 1065).real() << ", " << h_A(710, 1065).imag() << ")" << std::endl; } #endif { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", 
_A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption << ": ger result is incorrect on real components" - << ", numErrorsRealAbs = " << numErrorsRealAbs - << ", numErrorsRealRel = " << numErrorsRealRel - << ", maxErrorRealRel = " << maxErrorRealRel - << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel - << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel - << ", h_expected(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_expected(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) + << ", numErrorsRealAbs = " << numErrorsRealAbs << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel << ", h_expected(i,j).real() = " + << (((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) << ", h_A(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? 
h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); @@ -1213,24 +1030,15 @@ GerTester:: } { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption << ": ger result is incorrect on imag components" - << ", numErrorsImagAbs = " << numErrorsImagAbs - << ", numErrorsImagRel = " << numErrorsImagRel - << ", maxErrorImagRel = " << maxErrorImagRel - << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel - << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel - << ", h_expected(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_expected(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) + << ", numErrorsImagAbs = " << numErrorsImagAbs << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel << ", h_expected(i,j).imag() = " + << (((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) << ", h_A(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? 
h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); @@ -1244,17 +1052,13 @@ GerTester:: } // Code for non-complex values -template +template template -typename std::enable_if>::value && - !std::is_same>::value, - void>::type -GerTester:: - compareKkGerAgainstExpected(const T& alpha, const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_expected) { - int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * - 1.e-3); +typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type +GerTester::compareKkGerAgainstExpected( + const T& alpha, const _HostViewTypeA& h_A, const _ViewTypeExpected& h_expected) { + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); int numErrorsAbs(0); int numErrorsRel(0); @@ -1290,52 +1094,33 @@ GerTester:: } if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j) = " << h_expected(i, j) - << ", h_A(i,j) = " << h_A(i, j) - << ", _KAT_A::abs(h_expected(i,j) - h_A(i,j)) = " << diff + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j) = " << h_expected(i, j) + << ", h_A(i,j) = " << h_A(i, j) << ", _KAT_A::abs(h_expected(i,j) - h_A(i,j)) = " << diff << ", diffThreshold = " << diffThreshold << std::endl; #endif } } // for j } // for i #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", numErrorsAbs = " << numErrorsAbs - << ", numErrorsRel = " << numErrorsRel - << ", maxErrorRel = " << maxErrorRel - << ", iForMaxErrorRel = " << iForMaxErrorRel - << ", jForMaxErrorRel = " << jForMaxErrorRel - << ", h_expected(i,j) = " - << (((_M > 0) && (_N > 0)) - ? 
h_expected(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) - << ", h_A(i,j) = " - << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) + std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", numErrorsAbs = " << numErrorsAbs << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", jForMaxErrorRel = " << jForMaxErrorRel << ", h_expected(i,j) = " + << (((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) + << ", h_A(i,j) = " << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; #endif { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption << ": ger result is incorrect" - << ", numErrorsAbs = " << numErrorsAbs - << ", numErrorsRel = " << numErrorsRel - << ", maxErrorRel = " << maxErrorRel - << ", iForMaxErrorRel = " << iForMaxErrorRel - << ", jForMaxErrorRel = " << jForMaxErrorRel << ", h_expected(i,j) = " - << (((_M > 0) && (_N > 0)) - ? h_expected(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) - << ", h_A(i,j) = " - << (((_M > 0) && (_N > 0)) ? 
h_A(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) + << ", numErrorsAbs = " << numErrorsAbs << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", jForMaxErrorRel = " << jForMaxErrorRel + << ", h_expected(i,j) = " << (((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) + << ", h_A(i,j) = " << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrors(numErrorsAbs + numErrorsRel); @@ -1348,15 +1133,11 @@ GerTester:: } } -template +template template -void GerTester:: - callKkGerAndCompareAgainstExpected( - const ScalarA& alpha, TX& x, TY& y, - view_stride_adapter<_ViewTypeA, false>& A, - const _ViewTypeExpected& h_expected, const std::string& situation) { +void GerTester::callKkGerAndCompareAgainstExpected( + const ScalarA& alpha, TX& x, TY& y, view_stride_adapter<_ViewTypeA, false>& A, const _ViewTypeExpected& h_expected, + const std::string& situation) { #ifdef HAVE_KOKKOSKERNELS_DEBUG Kokkos::printf( "In Test_Blas2_ger.hpp, right before calling KokkosBlas::ger(): " @@ -1370,25 +1151,21 @@ void GerTester::value || - std::is_same::value || + bool xBool = std::is_same::value || std::is_same::value || std::is_same>::value || std::is_same>::value; - bool yBool = std::is_same::value || - std::is_same::value || + bool yBool = std::is_same::value || std::is_same::value || std::is_same>::value || std::is_same>::value; - bool aBool = std::is_same::value || - std::is_same::value || + bool aBool = std::is_same::value || std::is_same::value || std::is_same>::value || std::is_same>::value; bool useAnalyticalResults = xBool && yBool && aBool; #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef 
HAVE_KOKKOSKERNELS_DEBUG Kokkos::printf( "+-----------------------------------------------------------------------" @@ -1435,8 +1207,7 @@ int test_ger(const std::string& /*caseName*/) { Kokkos::printf("Starting %s for LAYOUTLEFT ...\n", caseName.c_str()); #endif if (true) { - Test::GerTester + Test::GerTester tester; tester.test(0, 13, 0); tester.test(1024, 0, 0); @@ -1471,8 +1242,7 @@ int test_ger(const std::string& /*caseName*/) { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef HAVE_KOKKOSKERNELS_DEBUG Kokkos::printf( "+-----------------------------------------------------------------------" @@ -1480,8 +1250,7 @@ int test_ger(const std::string& /*caseName*/) { Kokkos::printf("Starting %s for LAYOUTRIGHT ...\n", caseName.c_str()); #endif if (true) { - Test::GerTester + Test::GerTester tester; tester.test(0, 13, 0); tester.test(1024, 0, 0); @@ -1515,8 +1284,7 @@ int test_ger(const std::string& /*caseName*/) { #endif #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef HAVE_KOKKOSKERNELS_DEBUG Kokkos::printf( "+-----------------------------------------------------------------------" @@ -1524,8 +1292,7 @@ int test_ger(const std::string& /*caseName*/) { Kokkos::printf("Starting %s for LAYOUTSTRIDE ...\n", caseName.c_str()); #endif if (true) { - Test::GerTester + Test::GerTester tester; tester.test(0, 13, 0); tester.test(1024, 0, 0); @@ -1556,8 +1323,7 @@ int test_ger(const std::string& /*caseName*/) { #endif #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) #ifdef HAVE_KOKKOSKERNELS_DEBUG Kokkos::printf( 
"+-----------------------------------------------------------------------" @@ -1565,8 +1331,7 @@ int test_ger(const std::string& /*caseName*/) { Kokkos::printf("Starting %s for MIXED LAYOUTS ...\n", caseName.c_str()); #endif if (true) { - Test::GerTester + Test::GerTester tester; tester.test(1024, 1024, 0); if (useAnalyticalResults) { @@ -1578,8 +1343,7 @@ int test_ger(const std::string& /*caseName*/) { } if (true) { - Test::GerTester + Test::GerTester tester; tester.test(1024, 1024, 0); } @@ -1602,8 +1366,7 @@ int test_ger(const std::string& /*caseName*/) { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, ger_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_float"); test_ger("test case ger_float"); @@ -1612,19 +1375,17 @@ TEST_F(TestCategory, ger_float) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, ger_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_complex_float"); - test_ger, Kokkos::complex, - Kokkos::complex, TestDevice>("test case ger_complex_float"); + test_ger, Kokkos::complex, Kokkos::complex, TestDevice>( + "test case ger_complex_float"); Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, ger_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_double"); test_ger("test case ger_double"); @@ -1633,19 +1394,17 @@ TEST_F(TestCategory, ger_double) { #endif #if 
defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, ger_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_complex_double"); - test_ger, Kokkos::complex, - Kokkos::complex, TestDevice>("test case ger_complex_double"); + test_ger, Kokkos::complex, Kokkos::complex, TestDevice>( + "test case ger_complex_double"); Kokkos::Profiling::popRegion(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, ger_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_int"); test_ger("test case ger_int"); @@ -1653,8 +1412,7 @@ TEST_F(TestCategory, ger_int) { } #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, ger_double_int_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_double_int_float"); test_ger("test case ger_double_int_float"); diff --git a/blas/unit_test/Test_Blas2_serial_gemv.hpp b/blas/unit_test/Test_Blas2_serial_gemv.hpp index 5c1aaf5a67..805ac1d283 100644 --- a/blas/unit_test/Test_Blas2_serial_gemv.hpp +++ b/blas/unit_test/Test_Blas2_serial_gemv.hpp @@ -21,39 +21,31 @@ namespace Test { -template +template struct SerialGEMVOp : public GemvOpBase { using params = GemvOpBase; - SerialGEMVOp(char trans_, ScalarType alpha_, AType A_, XType x_, - ScalarType beta_, YType y_) + SerialGEMVOp(char trans_, ScalarType alpha_, AType A_, XType x_, ScalarType beta_, YType y_) : params(trans_, alpha_, A_, x_, beta_, y_) {} template KOKKOS_INLINE_FUNCTION void operator()(const 
TeamMember& member) const { KokkosBlas::Experimental::Gemv::invoke( - member, params::trans, params::alpha, params::A, params::x, - params::beta, params::y); + member, params::trans, params::alpha, params::A, params::x, params::beta, params::y); } }; struct SerialGemvFactory { - template - using functor_type = - SerialGEMVOp; + template + using functor_type = SerialGEMVOp; - using algorithms = std::tuple; + using algorithms = std::tuple; }; #ifdef __KOKKOSBLAS_ENABLE_INTEL_MKL_COMPACT__ struct SerialMKLGemvFactory { - template - using functor_type = - SerialGEMVOp; + template + using functor_type = SerialGEMVOp; using algorithms = std::tuple; }; @@ -61,10 +53,8 @@ struct SerialMKLGemvFactory { } // namespace Test -#define TEST_SERIAL_CASE4(N, A, X, Y, SC) \ - TEST_CASE4(serial, SerialGemvFactory, N, A, X, Y, SC) -#define TEST_SERIAL_CASE2(N, S, SC) \ - TEST_CASE2(serial, SerialGemvFactory, N, S, SC) +#define TEST_SERIAL_CASE4(N, A, X, Y, SC) TEST_CASE4(serial, SerialGemvFactory, N, A, X, Y, SC) +#define TEST_SERIAL_CASE2(N, S, SC) TEST_CASE2(serial, SerialGemvFactory, N, S, SC) #define TEST_SERIAL_CASE(N, S) TEST_CASE(serial, SerialGemvFactory, N, S) #ifdef KOKKOSKERNELS_TEST_FLOAT @@ -76,8 +66,7 @@ using simd_float_avx = ::Test::simd_vector; using simd_float_avx512 = ::Test::simd_vector; TEST_CASE2(serial, SerialMKLGemvFactory, mkl_float_sse, simd_float_sse, float) TEST_CASE2(serial, SerialMKLGemvFactory, mkl_float_avx, simd_float_avx, float) -TEST_CASE2(serial, SerialMKLGemvFactory, mkl_float_avx512, simd_float_avx512, - float) +TEST_CASE2(serial, SerialMKLGemvFactory, mkl_float_avx512, simd_float_avx512, float) #endif #endif @@ -88,12 +77,9 @@ TEST_SERIAL_CASE(double, double) using simd_double_sse = ::Test::simd_vector; using simd_double_avx = ::Test::simd_vector; using simd_double_avx512 = ::Test::simd_vector; -TEST_CASE2(serial, SerialMKLGemvFactory, mkl_double_sse, simd_double_sse, - double) -TEST_CASE2(serial, SerialMKLGemvFactory, mkl_double_avx, 
simd_double_avx, - double) -TEST_CASE2(serial, SerialMKLGemvFactory, mkl_double_avx512, simd_double_avx512, - double) +TEST_CASE2(serial, SerialMKLGemvFactory, mkl_double_sse, simd_double_sse, double) +TEST_CASE2(serial, SerialMKLGemvFactory, mkl_double_avx, simd_double_avx, double) +TEST_CASE2(serial, SerialMKLGemvFactory, mkl_double_avx512, simd_double_avx512, double) #endif #endif diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp index 5658ca5ea1..8dc7cadf51 100644 --- a/blas/unit_test/Test_Blas2_syr.hpp +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -51,110 +51,85 @@ namespace Test { -template +template class SyrTester { public: SyrTester(); ~SyrTester(); - void test(const int N, const int nonConstConstCombinations, - const bool useAnalyticalResults = false, - const bool useHermitianOption = false, - const bool useUpOption = false); + void test(const int N, const int nonConstConstCombinations, const bool useAnalyticalResults = false, + const bool useHermitianOption = false, const bool useUpOption = false); private: using _ViewTypeX = Kokkos::View; using _ViewTypeA = Kokkos::View; - using _HostViewTypeX = typename _ViewTypeX::HostMirror; - using _HostViewTypeA = typename _ViewTypeA::HostMirror; - using _ViewTypeExpected = - Kokkos::View; + using _HostViewTypeX = typename _ViewTypeX::HostMirror; + using _HostViewTypeA = typename _ViewTypeA::HostMirror; + using _ViewTypeExpected = Kokkos::View; using _KAT_A = Kokkos::ArithTraits; using _AuxType = typename _KAT_A::mag_type; - void populateVariables(ScalarA& alpha, - view_stride_adapter<_ViewTypeX, false>& x, - view_stride_adapter<_ViewTypeA, false>& A, - _ViewTypeExpected& h_expected, + void populateVariables(ScalarA& alpha, view_stride_adapter<_ViewTypeX, false>& x, + view_stride_adapter<_ViewTypeA, false>& A, _ViewTypeExpected& h_expected, bool& expectedResultIsKnown); template - typename std::enable_if>::value || - std::is_same>::value, - void>::type - populateAnalyticalValues(T& 
alpha, _HostViewTypeX& h_x, _HostViewTypeA& h_A, - _ViewTypeExpected& h_expected); + typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type + populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeA& h_A, _ViewTypeExpected& h_expected); template - typename std::enable_if>::value && - !std::is_same>::value, - void>::type - populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeA& h_A, - _ViewTypeExpected& h_expected); + typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type + populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeA& h_A, _ViewTypeExpected& h_expected); template - typename std::enable_if>::value || - std::is_same>::value, - void>::type - populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, - const _HostViewTypeA& h_A, + typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type + populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, const _HostViewTypeA& h_A, _ViewTypeExpected& h_vanilla); template - typename std::enable_if>::value && - !std::is_same>::value, - void>::type - populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, - const _HostViewTypeA& h_A, + typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type + populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, const _HostViewTypeA& h_A, _ViewTypeExpected& h_vanilla); template - typename std::enable_if>::value || - std::is_same>::value, - void>::type - compareVanillaAgainstExpected(const T& alpha, - const _ViewTypeExpected& h_vanilla, + typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type + compareVanillaAgainstExpected(const T& alpha, const _ViewTypeExpected& h_vanilla, const _ViewTypeExpected& h_expected); template - typename std::enable_if>::value && - !std::is_same>::value, - void>::type - compareVanillaAgainstExpected(const T& alpha, - const 
_ViewTypeExpected& h_vanilla, + typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type + compareVanillaAgainstExpected(const T& alpha, const _ViewTypeExpected& h_vanilla, const _ViewTypeExpected& h_expected); template - typename std::enable_if>::value || - std::is_same>::value, - void>::type - compareKkSyrAgainstReference(const T& alpha, const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_reference); + typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type + compareKkSyrAgainstReference(const T& alpha, const _HostViewTypeA& h_A, const _ViewTypeExpected& h_reference); template - typename std::enable_if>::value && - !std::is_same>::value, - void>::type - compareKkSyrAgainstReference(const T& alpha, const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_reference); + typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type + compareKkSyrAgainstReference(const T& alpha, const _HostViewTypeA& h_A, const _ViewTypeExpected& h_reference); template T shrinkAngleToZeroTwoPiRange(const T input); template - void callKkSyrAndCompareAgainstExpected( - const ScalarA& alpha, TX& x, view_stride_adapter<_ViewTypeA, false>& A, - const _ViewTypeExpected& h_expected, const std::string& situation); + void callKkSyrAndCompareAgainstExpected(const ScalarA& alpha, TX& x, view_stride_adapter<_ViewTypeA, false>& A, + const _ViewTypeExpected& h_expected, const std::string& situation); template - void callKkGerAndCompareKkSyrAgainstIt( - const ScalarA& alpha, TX& x, - view_stride_adapter<_ViewTypeA, false>& org_A, - const _HostViewTypeA& h_A_syr, const std::string& situation); + void callKkGerAndCompareKkSyrAgainstIt(const ScalarA& alpha, TX& x, view_stride_adapter<_ViewTypeA, false>& org_A, + const _HostViewTypeA& h_A_syr, const std::string& situation); const bool _A_is_complex; const bool _A_is_lr; @@ -172,15 +147,13 @@ class SyrTester { bool _kkGerShouldThrowException; }; -template +template 
SyrTester::SyrTester() : _A_is_complex(std::is_same>::value || std::is_same>::value), _A_is_lr(std::is_same::value), _A_is_ll(std::is_same::value), - _testIsGpu(KokkosKernels::Impl::kk_is_gpu_exec_space< - typename Device::execution_space>()) + _testIsGpu(KokkosKernels::Impl::kk_is_gpu_exec_space()) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS , _vanillaUsesDifferentOrderOfOps(_A_is_lr) @@ -197,12 +170,8 @@ SyrTester::SyrTester() // large enough to require 'relTol' to value 5.0e-3. The same // calculations show no discrepancies for calculations with double. // **************************************************************** - _absTol(std::is_same<_AuxType, float>::value - ? 1.0e-6 - : (std::is_same<_AuxType, double>::value ? 1.0e-9 : 0)), - _relTol(std::is_same<_AuxType, float>::value - ? 5.0e-3 - : (std::is_same<_AuxType, double>::value ? 1.0e-6 : 0)), + _absTol(std::is_same<_AuxType, float>::value ? 1.0e-6 : (std::is_same<_AuxType, double>::value ? 1.0e-9 : 0)), + _relTol(std::is_same<_AuxType, float>::value ? 5.0e-3 : (std::is_same<_AuxType, double>::value ? 1.0e-6 : 0)), _M(-1), _N(-1), _useAnalyticalResults(false), @@ -212,33 +181,27 @@ SyrTester::SyrTester() _kkGerShouldThrowException(false) { } -template +template SyrTester::~SyrTester() { // Nothing to do } -template -void SyrTester::test( - const int N, const int nonConstConstCombinations, - const bool useAnalyticalResults, const bool useHermitianOption, - const bool useUpOption) { +template +void SyrTester::test(const int N, const int nonConstConstCombinations, + const bool useAnalyticalResults, + const bool useHermitianOption, + const bool useUpOption) { #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Entering SyrTester::test()... 
- - - - - - - - - - - - - - - - " "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - " "- - - - - - - - - " << std::endl; - std::cout << "_A_is_complex = " << _A_is_complex - << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + std::cout << "_A_is_complex = " << _A_is_complex << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll << ", _testIsGpu = " << _testIsGpu - << ", _vanillaUsesDifferentOrderOfOps = " - << _vanillaUsesDifferentOrderOfOps << ", _absTol = " << _absTol - << ", _relTol = " << _relTol - << ", nonConstConstCombinations = " << nonConstConstCombinations - << ", useAnalyticalResults = " << useAnalyticalResults - << ", useHermitianOption = " << useHermitianOption + << ", _vanillaUsesDifferentOrderOfOps = " << _vanillaUsesDifferentOrderOfOps << ", _absTol = " << _absTol + << ", _relTol = " << _relTol << ", nonConstConstCombinations = " << nonConstConstCombinations + << ", useAnalyticalResults = " << useAnalyticalResults << ", useHermitianOption = " << useHermitianOption << ", useUpOption = " << useUpOption << std::endl; #endif // ******************************************************************** @@ -273,8 +236,7 @@ void SyrTester::test( view_stride_adapter<_ViewTypeX, false> x("X", _M); view_stride_adapter<_ViewTypeA, false> A("A", _M, _N); - view_stride_adapter<_ViewTypeExpected, true> h_expected( - "expected A += alpha * x * x^{t,h}", _M, _N); + view_stride_adapter<_ViewTypeExpected, true> h_expected("expected A += alpha * x * x^{t,h}", _M, _N); bool expectedResultIsKnown = false; ScalarA alpha(_KAT_A::zero()); @@ -282,18 +244,14 @@ void SyrTester::test( // ******************************************************************** // Step 2 of 7: populate alpha, h_x, h_A, h_expected, x, A // ******************************************************************** - this->populateVariables(alpha, x, A, h_expected.d_view, - expectedResultIsKnown); + this->populateVariables(alpha, x, A, h_expected.d_view, expectedResultIsKnown); // 
******************************************************************** // Step 3 of 7: populate h_vanilla // ******************************************************************** - view_stride_adapter<_ViewTypeExpected, true> h_vanilla( - "vanilla = A + alpha * x * x^{t,h}", _M, _N); + view_stride_adapter<_ViewTypeExpected, true> h_vanilla("vanilla = A + alpha * x * x^{t,h}", _M, _N); #ifdef HAVE_KOKKOSKERNELS_DEBUG - Kokkos::printf( - "In Test_Blas2_syr.hpp, computing vanilla A with alpha type = %s\n", - typeid(alpha).name()); + Kokkos::printf("In Test_Blas2_syr.hpp, computing vanilla A with alpha type = %s\n", typeid(alpha).name()); #endif this->populateVanillaValues(alpha, x.h_view, A.h_view, h_vanilla.d_view); @@ -304,8 +262,7 @@ void SyrTester::test( // ****************************************************************** // Compare h_vanilla against h_expected // ****************************************************************** - this->compareVanillaAgainstExpected(alpha, h_vanilla.d_view, - h_expected.d_view); + this->compareVanillaAgainstExpected(alpha, h_vanilla.d_view, h_expected.d_view); } else { // ****************************************************************** // Copy h_vanilla to h_expected @@ -321,13 +278,11 @@ void SyrTester::test( Kokkos::deep_copy(org_A.h_view, A.h_view); if (test_x) { - this->callKkSyrAndCompareAgainstExpected(alpha, x.d_view, A, - h_expected.d_view, "non const x"); + this->callKkSyrAndCompareAgainstExpected(alpha, x.d_view, A, h_expected.d_view, "non const x"); if ((_useAnalyticalResults == false) && // Just to save run time (_kkGerShouldThrowException == false)) { - this->callKkGerAndCompareKkSyrAgainstIt(alpha, x.d_view, org_A, A.h_view, - "non const x"); + this->callKkGerAndCompareKkSyrAgainstIt(alpha, x.d_view, org_A, A.h_view, "non const x"); } } @@ -337,8 +292,7 @@ void SyrTester::test( if (test_cx) { Kokkos::deep_copy(A.d_base, org_A.d_base); - this->callKkSyrAndCompareAgainstExpected(alpha, x.d_view_const, A, - 
h_expected.d_view, "const x"); + this->callKkSyrAndCompareAgainstExpected(alpha, x.d_view_const, A, h_expected.d_view, "const x"); } // ******************************************************************** @@ -361,12 +315,10 @@ void SyrTester::test( #endif } -template +template void SyrTester::populateVariables( - ScalarA& alpha, view_stride_adapter<_ViewTypeX, false>& x, - view_stride_adapter<_ViewTypeA, false>& A, _ViewTypeExpected& h_expected, - bool& expectedResultIsKnown) { + ScalarA& alpha, view_stride_adapter<_ViewTypeX, false>& x, view_stride_adapter<_ViewTypeA, false>& A, + _ViewTypeExpected& h_expected, bool& expectedResultIsKnown) { expectedResultIsKnown = false; if (_useAnalyticalResults) { @@ -416,8 +368,7 @@ void SyrTester::populateVariables( } else { alpha = 3; - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarX randStart, randEnd; @@ -464,8 +415,7 @@ void SyrTester::populateVariables( if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - std::cout << "h_origA(" << i << "," << j << ")=" << A.h_view(i, j) - << std::endl; + std::cout << "h_origA(" << i << "," << j << ")=" << A.h_view(i, j) << std::endl; } } } @@ -473,16 +423,13 @@ void SyrTester::populateVariables( } // Code for complex values -template +template template -typename std::enable_if>::value || - std::is_same>::value, - void>::type -SyrTester::populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, - _HostViewTypeA& h_A, - _ViewTypeExpected& h_expected) { +typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type +SyrTester::populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, + _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected) { if (_useHermitianOption) { alpha.real() = 1.; alpha.imag() = 0.; @@ -500,10 +447,8 @@ SyrTestershrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); - if (((_useUpOption == true) && (i <= j)) || - ((_useUpOption == false) && (i >= j))) { + 
_AuxType auxImJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { h_A(i, j).real() = cos(auxImJ); h_A(i, j).imag() = -sin(auxImJ); } else { @@ -515,8 +460,7 @@ SyrTestershrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + _AuxType auxIpJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); h_A(i, j).real() = sin(auxIpJ) + cos(auxIpJ); h_A(i, j).imag() = sin(auxIpJ) - cos(auxIpJ); } @@ -526,10 +470,8 @@ SyrTester= j))) { - _AuxType auxImJ = - this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { + _AuxType auxImJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); h_expected(i, j).real() = 2. * cos(auxImJ); h_expected(i, j).imag() = -2. * sin(auxImJ); } else { @@ -541,10 +483,8 @@ SyrTester= j))) { - _AuxType auxIpJ = - this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { + _AuxType auxIpJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); h_expected(i, j).real() = 2. * sin(auxIpJ); h_expected(i, j).imag() = 2. 
* sin(auxIpJ); } else { @@ -557,16 +497,13 @@ SyrTester +template template -typename std::enable_if>::value && - !std::is_same>::value, - void>::type -SyrTester::populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, - _HostViewTypeA& h_A, - _ViewTypeExpected& h_expected) { +typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type +SyrTester::populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, + _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected) { alpha = 2; for (int i = 0; i < _M; ++i) { @@ -577,18 +514,15 @@ SyrTestershrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); for (int j = 0; j < _N; ++j) { - _AuxType auxJ = - this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); - h_A(i, j) = 2 * cos(auxI) * cos(auxJ); + _AuxType auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); + h_A(i, j) = 2 * cos(auxI) * cos(auxJ); } } for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (((_useUpOption == true) && (i <= j)) || - ((_useUpOption == false) && (i >= j))) { - _AuxType auxImJ = - this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { + _AuxType auxImJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); h_expected(i, j) = 2 * cos(auxImJ); } else { h_expected(i, j) = h_A(i, j); @@ -598,21 +532,19 @@ SyrTester +template template -typename std::enable_if>::value || - std::is_same>::value, - void>::type -SyrTester::populateVanillaValues( - const T& alpha, const _HostViewTypeX& h_x, const _HostViewTypeA& h_A, - _ViewTypeExpected& h_vanilla) { +typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type +SyrTester::populateVanillaValues(const T& alpha, + const _HostViewTypeX& h_x, + const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla) { if (_vanillaUsesDifferentOrderOfOps) { if (_useHermitianOption) { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) 
{ - if (((_useUpOption == true) && (i <= j)) || - ((_useUpOption == false) && (i >= j))) { + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { h_vanilla(i, j) = h_A(i, j) + alpha * _KAT_A::conj(h_x(j)) * h_x(i); } else { h_vanilla(i, j) = h_A(i, j); @@ -625,8 +557,7 @@ SyrTester::populateVanillaValues( } else { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (((_useUpOption == true) && (i <= j)) || - ((_useUpOption == false) && (i >= j))) { + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { h_vanilla(i, j) = h_A(i, j) + alpha * h_x(j) * h_x(i); } else { h_vanilla(i, j) = h_A(i, j); @@ -638,8 +569,7 @@ SyrTester::populateVanillaValues( if (_useHermitianOption) { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (((_useUpOption == true) && (i <= j)) || - ((_useUpOption == false) && (i >= j))) { + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { h_vanilla(i, j) = h_A(i, j) + alpha * h_x(i) * _KAT_A::conj(h_x(j)); } else { h_vanilla(i, j) = h_A(i, j); @@ -652,8 +582,7 @@ SyrTester::populateVanillaValues( } else { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (((_useUpOption == true) && (i <= j)) || - ((_useUpOption == false) && (i >= j))) { + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { h_vanilla(i, j) = h_A(i, j) + alpha * h_x(i) * h_x(j); } else { h_vanilla(i, j) = h_A(i, j); @@ -665,20 +594,18 @@ SyrTester::populateVanillaValues( } // Code for non-complex values -template +template template -typename std::enable_if>::value && - !std::is_same>::value, - void>::type -SyrTester::populateVanillaValues( - const T& alpha, const _HostViewTypeX& h_x, const _HostViewTypeA& h_A, - _ViewTypeExpected& h_vanilla) { +typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type +SyrTester::populateVanillaValues(const T& alpha, + const _HostViewTypeX& h_x, 
+ const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla) { if (_vanillaUsesDifferentOrderOfOps) { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (((_useUpOption == true) && (i <= j)) || - ((_useUpOption == false) && (i >= j))) { + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { h_vanilla(i, j) = h_A(i, j) + alpha * h_x(j) * h_x(i); } else { h_vanilla(i, j) = h_A(i, j); @@ -688,8 +615,7 @@ SyrTester::populateVanillaValues( } else { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (((_useUpOption == true) && (i <= j)) || - ((_useUpOption == false) && (i >= j))) { + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { h_vanilla(i, j) = h_A(i, j) + alpha * h_x(i) * h_x(j); } else { h_vanilla(i, j) = h_A(i, j); @@ -699,11 +625,9 @@ SyrTester::populateVanillaValues( } } -template +template template -T SyrTester::shrinkAngleToZeroTwoPiRange(const T input) { +T SyrTester::shrinkAngleToZeroTwoPiRange(const T input) { T output(input); #if 0 T twoPi( 2. 
* Kokkos::numbers::pi ); @@ -718,29 +642,23 @@ T SyrTester +template template -typename std::enable_if>::value || - std::is_same>::value, - void>::type -SyrTester:: - compareVanillaAgainstExpected(const T& alpha, - const _ViewTypeExpected& h_vanilla, - const _ViewTypeExpected& h_expected) { +typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type +SyrTester::compareVanillaAgainstExpected( + const T& alpha, const _ViewTypeExpected& h_vanilla, const _ViewTypeExpected& h_expected) { #ifdef HAVE_KOKKOSKERNELS_DEBUG if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - std::cout << "h_exp(" << i << "," << j << ")=" << h_expected(i, j) - << ", h_van(" << i << "," << j << ")=" << h_vanilla(i, j) - << std::endl; + std::cout << "h_exp(" << i << "," << j << ")=" << h_expected(i, j) << ", h_van(" << i << "," << j + << ")=" << h_vanilla(i, j) << std::endl; } } } #endif - int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * - 1.e-3); + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); if (_useAnalyticalResults) { int numErrorsRealAbs(0); @@ -759,7 +677,7 @@ SyrTester:: for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - diff = _KAT_A::abs(h_expected(i, j).real() - h_vanilla(i, j).real()); + diff = _KAT_A::abs(h_expected(i, j).real() - h_vanilla(i, j).real()); errorHappened = false; if (h_expected(i, j).real() == 0.) 
{ diffThreshold = _KAT_A::abs(_absTol); @@ -783,17 +701,15 @@ SyrTester:: } if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j).real() = " << h_expected(i, j).real() + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).real() = " << h_expected(i, j).real() << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() << ", _KAT_A::abs(h_expected(i,j).real() - " "h_vanilla(i,j).real()) = " - << diff << ", diffThreshold = " << diffThreshold - << std::endl; + << diff << ", diffThreshold = " << diffThreshold << std::endl; #endif } - diff = _KAT_A::abs(h_expected(i, j).imag() - h_vanilla(i, j).imag()); + diff = _KAT_A::abs(h_expected(i, j).imag() - h_vanilla(i, j).imag()); errorHappened = false; if (h_expected(i, j).imag() == 0.) { diffThreshold = _KAT_A::abs(_absTol); @@ -817,13 +733,11 @@ SyrTester:: } if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() << ", _KAT_A::abs(h_expected(i,j).imag() - " "h_vanilla(i,j).imag()) = " - << diff << ", diffThreshold = " << diffThreshold - << std::endl; + << diff << ", diffThreshold = " << diffThreshold << std::endl; #endif } } // for j @@ -831,25 +745,15 @@ SyrTester:: { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": vanilla differs too much from analytical on real components" - << ", numErrorsRealAbs = " << numErrorsRealAbs - << ", 
numErrorsRealRel = " << numErrorsRealRel - << ", maxErrorRealRel = " << maxErrorRealRel - << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel - << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel - << ", h_expected(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_expected(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ": vanilla differs too much from analytical on real components" + << ", numErrorsRealAbs = " << numErrorsRealAbs << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel << ", h_expected(i,j).real() = " + << (((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) << ", h_vanilla(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_vanilla(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? 
h_vanilla(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); @@ -858,30 +762,19 @@ SyrTester:: std::cout << "WARNING" << msg.str() << std::endl; #endif } - EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) - << "Failed test" << msg.str(); + EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) << "Failed test" << msg.str(); } { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": vanilla differs too much from analytical on imag components" - << ", numErrorsImagAbs = " << numErrorsImagAbs - << ", numErrorsImagRel = " << numErrorsImagRel - << ", maxErrorImagRel = " << maxErrorImagRel - << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel - << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel - << ", h_expected(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_expected(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ": vanilla differs too much from analytical on imag components" + << ", numErrorsImagAbs = " << numErrorsImagAbs << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel << ", h_expected(i,j).imag() = " + << (((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) << ", h_vanilla(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? 
h_vanilla(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? h_vanilla(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); @@ -890,8 +783,7 @@ SyrTester:: std::cout << "WARNING" << msg.str() << std::endl; #endif } - EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) - << "Failed test" << msg.str(); + EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) << "Failed test" << msg.str(); } } else { int numErrorsReal(0); @@ -902,11 +794,8 @@ SyrTester:: if (h_expected(i, j).real() != h_vanilla(i, j).real()) { if (numErrorsReal == 0) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j).real() = " - << h_expected(i, j).real() - << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() - << std::endl; + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).real() = " << h_expected(i, j).real() + << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() << std::endl; #endif } numErrorsReal++; @@ -915,63 +804,50 @@ SyrTester:: if (h_expected(i, j).imag() != h_vanilla(i, j).imag()) { if (numErrorsImag == 0) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j).imag() = " - << h_expected(i, j).imag() - << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() - << std::endl; + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() + << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() << std::endl; #endif } numErrorsImag++; } } // for j } // for i - EXPECT_EQ(numErrorsReal, 0) - << "Failed test" - << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": 
vanilla result is incorrect on real components" - << ", numErrorsReal = " << numErrorsReal; - EXPECT_EQ(numErrorsImag, 0) - << "Failed test" - << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": vanilla result is incorrect on imag components" - << ", numErrorsImag = " << numErrorsImag; + EXPECT_EQ(numErrorsReal, 0) << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla result is incorrect on real components" + << ", numErrorsReal = " << numErrorsReal; + EXPECT_EQ(numErrorsImag, 0) << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla result is incorrect on imag components" + << ", numErrorsImag = " << numErrorsImag; } } // Code for non-complex values -template +template template -typename std::enable_if>::value && - !std::is_same>::value, - void>::type -SyrTester:: - compareVanillaAgainstExpected(const T& alpha, - const _ViewTypeExpected& h_vanilla, - const _ViewTypeExpected& h_expected) { +typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type +SyrTester::compareVanillaAgainstExpected( + const T& alpha, const _ViewTypeExpected& h_vanilla, const _ViewTypeExpected& h_expected) { if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "h_exp(" << i << "," << j << ")=" << h_expected(i, j) - << ", h_van(" << i << "," << j << ")=" << h_vanilla(i, 
j) - << std::endl; + std::cout << "h_exp(" << i << "," << j << ")=" << h_expected(i, j) << ", h_van(" << i << "," << j + << ")=" << h_vanilla(i, j) << std::endl; #endif } } } - int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * - 1.e-3); + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); if (_useAnalyticalResults) { int numErrorsAbs(0); @@ -1009,12 +885,10 @@ SyrTester:: } if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j) = " << h_expected(i, j) + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j) = " << h_expected(i, j) << ", h_vanilla(i,j) = " << h_vanilla(i, j) - << ", _KAT_A::abs(h_expected(i,j) - h_vanilla(i,j)) = " - << diff << ", diffThreshold = " << diffThreshold - << std::endl; + << ", _KAT_A::abs(h_expected(i,j) - h_vanilla(i,j)) = " << diff + << ", diffThreshold = " << diffThreshold << std::endl; #endif } } // for j @@ -1022,24 +896,14 @@ SyrTester:: { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": vanilla differs too much from expected" - << ", numErrorsAbs = " << numErrorsAbs - << ", numErrorsRel = " << numErrorsRel - << ", maxErrorRel = " << maxErrorRel - << ", iForMaxErrorRel = " << iForMaxErrorRel + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ": vanilla differs too much from expected" + << ", numErrorsAbs = " << numErrorsAbs << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel << ", iForMaxErrorRel = " << iForMaxErrorRel << ", 
jForMaxErrorRel = " << jForMaxErrorRel << ", h_expected(i,j) = " - << (((_M > 0) && (_N > 0)) - ? h_expected(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) - << ", h_vanilla(i,j) = " - << (((_M > 0) && (_N > 0)) - ? h_vanilla(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) + << ", h_vanilla(i,j) = " << (((_M > 0) && (_N > 0)) ? h_vanilla(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrors(numErrorsAbs + numErrorsRel); @@ -1058,8 +922,7 @@ SyrTester:: if (h_expected(i, j) != h_vanilla(i, j)) { if (numErrors == 0) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j) = " << h_expected(i, j) + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j) = " << h_expected(i, j) << ", h_vanilla(i,j) = " << h_vanilla(i, j) << std::endl; #endif } @@ -1067,42 +930,34 @@ SyrTester:: } } // for j } // for i - EXPECT_EQ(numErrors, 0) - << "Failed test" - << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": vanilla result is incorrect" - << ", numErrors = " << numErrors; + EXPECT_EQ(numErrors, 0) << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption << ", _useUpOption = " << _useUpOption + << ": vanilla result is incorrect" + << ", numErrors = " << numErrors; } } // Code for complex values -template +template template -typename std::enable_if>::value || - std::is_same>::value, - void>::type -SyrTester:: - compareKkSyrAgainstReference(const T& alpha, const _HostViewTypeA& h_A, - const _ViewTypeExpected& 
h_reference) { +typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type +SyrTester::compareKkSyrAgainstReference( + const T& alpha, const _HostViewTypeA& h_A, const _ViewTypeExpected& h_reference) { if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "h_exp(" << i << "," << j << ")=" << h_reference(i, j) - << ", h_A(" << i << "," << j << ")=" << h_A(i, j) - << std::endl; + std::cout << "h_exp(" << i << "," << j << ")=" << h_reference(i, j) << ", h_A(" << i << "," << j + << ")=" << h_A(i, j) << std::endl; #endif } } } - int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * - 1.e-3); + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); int numErrorsRealAbs(0); int numErrorsRealRel(0); @@ -1143,12 +998,10 @@ SyrTester:: } if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout - << "ERROR, i = " << i << ", j = " << j - << ": h_reference(i,j).real() = " << h_reference(i, j).real() - << ", h_A(i,j).real() = " << h_A(i, j).real() - << ", _KAT_A::abs(h_reference(i,j).real() - h_A(i,j).real()) = " - << diff << ", diffThreshold = " << diffThreshold << std::endl; + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_reference(i,j).real() = " << h_reference(i, j).real() + << ", h_A(i,j).real() = " << h_A(i, j).real() + << ", _KAT_A::abs(h_reference(i,j).real() - h_A(i,j).real()) = " << diff + << ", diffThreshold = " << diffThreshold << std::endl; #endif } @@ -1176,95 +1029,58 @@ SyrTester:: } if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout - << "ERROR, i = " << i << ", j = " << j - << ": h_reference(i,j).imag() = " << h_reference(i, j).imag() - << ", h_A(i,j).imag() = " << h_A(i, j).imag() - << ", _KAT_A::abs(h_reference(i,j).imag() - h_A(i,j).imag()) = " - << diff << ", diffThreshold = " << diffThreshold << std::endl; + std::cout << 
"ERROR, i = " << i << ", j = " << j << ": h_reference(i,j).imag() = " << h_reference(i, j).imag() + << ", h_A(i,j).imag() = " << h_A(i, j).imag() + << ", _KAT_A::abs(h_reference(i,j).imag() - h_A(i,j).imag()) = " << diff + << ", diffThreshold = " << diffThreshold << std::endl; #endif } } // for j } // for i #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout - << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ", numErrorsRealAbs = " << numErrorsRealAbs - << ", numErrorsRealRel = " << numErrorsRealRel - << ", maxErrorRealRel = " << maxErrorRealRel - << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel - << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel - << ", h_reference(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_reference(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) - << ", h_A(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) - << ", numErrorsImagAbs = " << numErrorsImagAbs - << ", numErrorsImagRel = " << numErrorsImagRel - << ", maxErrorImagRel = " << maxErrorImagRel - << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel - << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel - << ", h_reference(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_reference(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) - << ", h_A(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? 
h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) - << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; + std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_reference(i,j).real() = " + << (((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) + << ", h_A(i,j).real() = " + << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) + << ", numErrorsImagAbs = " << numErrorsImagAbs << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel << ", h_reference(i,j).imag() = " + << (((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) + << ", h_A(i,j).imag() = " + << (((_M > 0) && (_N > 0)) ? 
h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; if ((_M == 2131) && (_N == 2131)) { std::cout << "Information" - << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ", h_reference(11, 2119) = (" << h_reference(11, 2119).real() + << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ", h_reference(11, 2119) = (" << h_reference(11, 2119).real() << ", " << h_reference(11, 2119).imag() << ")" - << ", h_A(11, 2119) = (" << h_A(11, 2119).real() << ", " - << h_A(11, 2119).imag() << ")" << std::endl; + << ", h_A(11, 2119) = (" << h_A(11, 2119).real() << ", " << h_A(11, 2119).imag() << ")" << std::endl; std::cout << "Information" - << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ", h_reference(710, 1065) = (" << h_reference(710, 1065).real() + << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ", h_reference(710, 1065) = (" << h_reference(710, 1065).real() << ", " << h_reference(710, 1065).imag() << ")" - << ", h_A(710, 1065) = (" << h_A(710, 1065).real() << ", " - << h_A(710, 1065).imag() << ")" << std::endl; + << ", h_A(710, 1065) = (" << h_A(710, 1065).real() << ", " << h_A(710, 1065).imag() << ")" << std::endl; } #endif { 
std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": syr result is incorrect on real components" - << ", numErrorsRealAbs = " << numErrorsRealAbs - << ", numErrorsRealRel = " << numErrorsRealRel - << ", maxErrorRealRel = " << maxErrorRealRel - << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel - << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel - << ", h_reference(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_reference(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ": syr result is incorrect on real components" + << ", numErrorsRealAbs = " << numErrorsRealAbs << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel << ", h_reference(i,j).real() = " + << (((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) << ", h_A(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? 
h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); @@ -1277,25 +1093,15 @@ SyrTester:: } { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": syr result is incorrect on imag components" - << ", numErrorsImagAbs = " << numErrorsImagAbs - << ", numErrorsImagRel = " << numErrorsImagRel - << ", maxErrorImagRel = " << maxErrorImagRel - << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel - << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel - << ", h_reference(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_reference(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ": syr result is incorrect on imag components" + << ", numErrorsImagAbs = " << numErrorsImagAbs << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel << ", h_reference(i,j).imag() = " + << (((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) << ", h_A(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? 
h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); @@ -1309,28 +1115,23 @@ SyrTester:: } // Code for non-complex values -template +template template -typename std::enable_if>::value && - !std::is_same>::value, - void>::type -SyrTester:: - compareKkSyrAgainstReference(const T& alpha, const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_reference) { +typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type +SyrTester::compareKkSyrAgainstReference( + const T& alpha, const _HostViewTypeA& h_A, const _ViewTypeExpected& h_reference) { #ifdef HAVE_KOKKOSKERNELS_DEBUG if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - std::cout << "h_exp(" << i << "," << j << ")=" << h_reference(i, j) - << ", h_A(" << i << "," << j << ")=" << h_A(i, j) - << std::endl; + std::cout << "h_exp(" << i << "," << j << ")=" << h_reference(i, j) << ", h_A(" << i << "," << j + << ")=" << h_A(i, j) << std::endl; } } } #endif - int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * - 1.e-3); + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); int numErrorsAbs(0); int numErrorsRel(0); @@ -1366,53 +1167,34 @@ SyrTester:: } if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_reference(i,j) = " << h_reference(i, j) - << ", h_A(i,j) = " << h_A(i, j) - << ", _KAT_A::abs(h_reference(i,j) - h_A(i,j)) = " << diff + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_reference(i,j) = " << h_reference(i, j) + << ", h_A(i,j) = " << h_A(i, j) << ", _KAT_A::abs(h_reference(i,j) - h_A(i,j)) = " << diff << ", diffThreshold = " << diffThreshold << std::endl; #endif } } // for j } // for i #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll 
= " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ", numErrorsAbs = " << numErrorsAbs - << ", numErrorsRel = " << numErrorsRel - << ", maxErrorRel = " << maxErrorRel - << ", iForMaxErrorRel = " << iForMaxErrorRel - << ", jForMaxErrorRel = " << jForMaxErrorRel + std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ", numErrorsAbs = " << numErrorsAbs + << ", numErrorsRel = " << numErrorsRel << ", maxErrorRel = " << maxErrorRel + << ", iForMaxErrorRel = " << iForMaxErrorRel << ", jForMaxErrorRel = " << jForMaxErrorRel << ", h_reference(i,j) = " - << (((_M > 0) && (_N > 0)) - ? h_reference(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) - << ", h_A(i,j) = " - << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) + << ", h_A(i,j) = " << (((_M > 0) && (_N > 0)) ? 
h_A(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; #endif { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption << ", _useUpOption = " << _useUpOption << ": syr result is incorrect" - << ", numErrorsAbs = " << numErrorsAbs - << ", numErrorsRel = " << numErrorsRel - << ", maxErrorRel = " << maxErrorRel - << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", numErrorsAbs = " << numErrorsAbs << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel << ", iForMaxErrorRel = " << iForMaxErrorRel << ", jForMaxErrorRel = " << jForMaxErrorRel << ", h_reference(i,j) = " - << (((_M > 0) && (_N > 0)) - ? h_reference(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) - << ", h_A(i,j) = " - << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) + << ", h_A(i,j) = " << (((_M > 0) && (_N > 0)) ? 
h_A(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrors(numErrorsAbs + numErrorsRel); @@ -1425,16 +1207,13 @@ SyrTester:: } } -template +template template -void SyrTester:: - callKkSyrAndCompareAgainstExpected( - const ScalarA& alpha, TX& x, view_stride_adapter<_ViewTypeA, false>& A, - const _ViewTypeExpected& h_expected, const std::string& situation) { +void SyrTester::callKkSyrAndCompareAgainstExpected( + const ScalarA& alpha, TX& x, view_stride_adapter<_ViewTypeA, false>& A, const _ViewTypeExpected& h_expected, + const std::string& situation) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "In Test_Blas2_syr, '" << situation << "', alpha = " << alpha - << std::endl; + std::cout << "In Test_Blas2_syr, '" << situation << "', alpha = " << alpha << std::endl; Kokkos::printf( "In Test_Blas2_syr.hpp, right before calling KokkosBlas::syr(): " "ViewTypeA = %s, _kkSyrShouldThrowException=%d\n", @@ -1448,25 +1227,21 @@ void SyrTester:: KokkosBlas::syr(mode.c_str(), uplo.c_str(), alpha, x, A.d_view); } catch (const std::exception& e) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "In Test_Blas2_syr, '" << situation - << "': caught exception, e.what() = " << e.what() << std::endl; + std::cout << "In Test_Blas2_syr, '" << situation << "': caught exception, e.what() = " << e.what() << std::endl; #endif gotStdException = true; } catch (...) 
{ #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "In Test_Blas2_syr, '" << situation - << "': caught unknown exception" << std::endl; + std::cout << "In Test_Blas2_syr, '" << situation << "': caught unknown exception" << std::endl; #endif gotUnknownException = true; } - EXPECT_EQ(gotUnknownException, false) - << "Failed test, '" << situation - << "': unknown exception should not have happened"; + EXPECT_EQ(gotUnknownException, false) << "Failed test, '" << situation + << "': unknown exception should not have happened"; EXPECT_EQ(gotStdException, _kkSyrShouldThrowException) - << "Failed test, '" << situation << "': kk syr() should" - << (_kkSyrShouldThrowException ? " " : " not ") + << "Failed test, '" << situation << "': kk syr() should" << (_kkSyrShouldThrowException ? " " : " not ") << "have thrown a std::exception"; if ((gotStdException == false) && (gotUnknownException == false)) { @@ -1475,14 +1250,11 @@ void SyrTester:: } } -template +template template -void SyrTester:: - callKkGerAndCompareKkSyrAgainstIt( - const ScalarA& alpha, TX& x, - view_stride_adapter<_ViewTypeA, false>& org_A, - const _HostViewTypeA& h_A_syr, const std::string& situation) { +void SyrTester::callKkGerAndCompareKkSyrAgainstIt( + const ScalarA& alpha, TX& x, view_stride_adapter<_ViewTypeA, false>& org_A, const _HostViewTypeA& h_A_syr, + const std::string& situation) { view_stride_adapter<_ViewTypeA, false> A_ger("A_ger", _M, _N); Kokkos::deep_copy(A_ger.d_base, org_A.d_base); @@ -1490,8 +1262,7 @@ void SyrTester:: // Call ger() // ******************************************************************** #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "In Test_Blas2_syr, '" << situation << "', alpha = " << alpha - << std::endl; + std::cout << "In Test_Blas2_syr, '" << situation << "', alpha = " << alpha << std::endl; Kokkos::printf( "In Test_Blas2_syr.hpp, right before calling KokkosBlas::ger(): " "ViewTypeA = %s, _kkGerShouldThrowException=%d\n", @@ -1504,39 +1275,33 @@ void SyrTester:: 
KokkosBlas::ger(mode.c_str(), alpha, x, x, A_ger.d_view); } catch (const std::exception& e) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "In Test_Blas2_syr, '" << situation - << "', ger() call: caught exception, e.what() = " << e.what() + std::cout << "In Test_Blas2_syr, '" << situation << "', ger() call: caught exception, e.what() = " << e.what() << std::endl; #endif gotStdException = true; } catch (...) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "In Test_Blas2_syr, '" << situation - << "', ger() call: caught unknown exception" << std::endl; + std::cout << "In Test_Blas2_syr, '" << situation << "', ger() call: caught unknown exception" << std::endl; #endif gotUnknownException = true; } - EXPECT_EQ(gotUnknownException, false) - << "Failed test, '" << situation - << "': unknown exception should not have happened for ger() call"; + EXPECT_EQ(gotUnknownException, false) << "Failed test, '" << situation + << "': unknown exception should not have happened for ger() call"; - EXPECT_EQ(gotStdException, false) - << "Failed test, '" << situation - << "': kk ger() should not have thrown a std::exception"; + EXPECT_EQ(gotStdException, false) << "Failed test, '" << situation + << "': kk ger() should not have thrown a std::exception"; // ******************************************************************** // Prepare h_ger_reference to be compared against h_A_syr // ******************************************************************** - view_stride_adapter<_ViewTypeExpected, true> h_ger_reference( - "h_ger_reference", _M, _N); + view_stride_adapter<_ViewTypeExpected, true> h_ger_reference("h_ger_reference", _M, _N); Kokkos::deep_copy(h_ger_reference.d_base, A_ger.d_base); std::string uplo = _useUpOption ? 
"U" : "L"; for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (((_useUpOption == true) && (i <= j)) || - ((_useUpOption == false) && (i >= j))) { + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { // Keep h_ger_reference as already computed } else { h_ger_reference.d_view(i, j) = org_A.h_view(i, j); @@ -1545,9 +1310,7 @@ void SyrTester:: } if (_useHermitianOption && _A_is_complex) { for (int i(0); i < _N; ++i) { - h_ger_reference.d_view(i, i) = - 0.5 * (h_ger_reference.d_view(i, i) + - _KAT_A::conj(h_ger_reference.d_view(i, i))); + h_ger_reference.d_view(i, i) = 0.5 * (h_ger_reference.d_view(i, i) + _KAT_A::conj(h_ger_reference.d_view(i, i))); } } @@ -1569,19 +1332,16 @@ int test_syr(const std::string& caseName) { #else int test_syr(const std::string& /*caseName*/) { #endif - bool xBool = std::is_same::value || - std::is_same::value || + bool xBool = std::is_same::value || std::is_same::value || std::is_same>::value || std::is_same>::value; - bool aBool = std::is_same::value || - std::is_same::value || + bool aBool = std::is_same::value || std::is_same::value || std::is_same>::value || std::is_same>::value; bool useAnalyticalResults = xBool && aBool; #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef HAVE_KOKKOSKERNELS_DEBUG Kokkos::printf( "+-----------------------------------------------------------------------" @@ -1589,9 +1349,7 @@ int test_syr(const std::string& /*caseName*/) { Kokkos::printf("Starting %s for LAYOUTLEFT ...\n", caseName.c_str()); #endif if (true) { - Test::SyrTester - tester; + Test::SyrTester tester; tester.test(0, 0); tester.test(1, 0); tester.test(2, 0); @@ -1625,8 +1383,7 @@ int test_syr(const std::string& /*caseName*/) { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) 
&& \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef HAVE_KOKKOSKERNELS_DEBUG Kokkos::printf( "+-----------------------------------------------------------------------" @@ -1634,9 +1391,7 @@ int test_syr(const std::string& /*caseName*/) { Kokkos::printf("Starting %s for LAYOUTRIGHT ...\n", caseName.c_str()); #endif if (true) { - Test::SyrTester - tester; + Test::SyrTester tester; tester.test(0, 0); tester.test(1, 0); tester.test(2, 0); @@ -1670,8 +1425,7 @@ int test_syr(const std::string& /*caseName*/) { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef HAVE_KOKKOSKERNELS_DEBUG Kokkos::printf( "+-----------------------------------------------------------------------" @@ -1679,9 +1433,7 @@ int test_syr(const std::string& /*caseName*/) { Kokkos::printf("Starting %s for LAYOUTSTRIDE ...\n", caseName.c_str()); #endif if (true) { - Test::SyrTester - tester; + Test::SyrTester tester; tester.test(0, 0); tester.test(1, 0); tester.test(2, 0); @@ -1714,8 +1466,7 @@ int test_syr(const std::string& /*caseName*/) { #endif #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) #ifdef HAVE_KOKKOSKERNELS_DEBUG Kokkos::printf( "+-----------------------------------------------------------------------" @@ -1723,9 +1474,7 @@ int test_syr(const std::string& /*caseName*/) { Kokkos::printf("Starting %s for MIXED LAYOUTS ...\n", caseName.c_str()); #endif if (true) { - Test::SyrTester - tester; + Test::SyrTester tester; tester.test(1, 0); tester.test(2, 0); tester.test(1024, 0); @@ -1742,9 +1491,7 @@ int test_syr(const std::string& /*caseName*/) { } if (true) { - Test::SyrTester - tester; 
+ Test::SyrTester tester; tester.test(1024, 0); } @@ -1766,8 +1513,7 @@ int test_syr(const std::string& /*caseName*/) { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_float"); test_syr("test case syr_float"); @@ -1776,19 +1522,16 @@ TEST_F(TestCategory, syr_float) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_complex_float"); - test_syr, Kokkos::complex, TestDevice>( - "test case syr_complex_float"); + test_syr, Kokkos::complex, TestDevice>("test case syr_complex_float"); Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_double"); test_syr("test case syr_double"); @@ -1797,19 +1540,16 @@ TEST_F(TestCategory, syr_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_complex_double"); - test_syr, Kokkos::complex, TestDevice>( - "test case syr_complex_double"); + test_syr, Kokkos::complex, TestDevice>("test case syr_complex_double"); Kokkos::Profiling::popRegion(); 
} #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_int"); test_syr("test case syr_int"); @@ -1817,8 +1557,7 @@ TEST_F(TestCategory, syr_int) { } #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, syr_int_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_int_float"); test_syr("test case syr_int_float"); diff --git a/blas/unit_test/Test_Blas2_syr2.hpp b/blas/unit_test/Test_Blas2_syr2.hpp index c49eba765b..2d6792f8c8 100644 --- a/blas/unit_test/Test_Blas2_syr2.hpp +++ b/blas/unit_test/Test_Blas2_syr2.hpp @@ -56,114 +56,91 @@ namespace Test { -template +template class Syr2Tester { public: Syr2Tester(); ~Syr2Tester(); - void test(const int N, const int nonConstConstCombinations, - const bool useAnalyticalResults = false, - const bool useHermitianOption = false, - const bool useUpOption = false); + void test(const int N, const int nonConstConstCombinations, const bool useAnalyticalResults = false, + const bool useHermitianOption = false, const bool useUpOption = false); private: using _ViewTypeX = Kokkos::View; using _ViewTypeY = Kokkos::View; using _ViewTypeA = Kokkos::View; - using _HostViewTypeX = typename _ViewTypeX::HostMirror; - using _HostViewTypeY = typename _ViewTypeY::HostMirror; - using _HostViewTypeA = typename _ViewTypeA::HostMirror; - using _ViewTypeExpected = - Kokkos::View; + using _HostViewTypeX = typename _ViewTypeX::HostMirror; + using _HostViewTypeY = typename _ViewTypeY::HostMirror; + using _HostViewTypeA = typename _ViewTypeA::HostMirror; + using _ViewTypeExpected = Kokkos::View; using _KAT_A 
= Kokkos::ArithTraits; using _AuxType = typename _KAT_A::mag_type; - void populateVariables(ScalarA& alpha, - view_stride_adapter<_ViewTypeX, false>& x, - view_stride_adapter<_ViewTypeY, false>& y, - view_stride_adapter<_ViewTypeA, false>& A, - _ViewTypeExpected& h_expected, - bool& expectedResultIsKnown); + void populateVariables(ScalarA& alpha, view_stride_adapter<_ViewTypeX, false>& x, + view_stride_adapter<_ViewTypeY, false>& y, view_stride_adapter<_ViewTypeA, false>& A, + _ViewTypeExpected& h_expected, bool& expectedResultIsKnown); template - typename std::enable_if>::value || - std::is_same>::value, - void>::type - populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, - _HostViewTypeA& h_A, _ViewTypeExpected& h_expected); + typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type + populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected); template - typename std::enable_if>::value && - !std::is_same>::value, - void>::type - populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, - _HostViewTypeA& h_A, _ViewTypeExpected& h_expected); + typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type + populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected); template - typename std::enable_if>::value || - std::is_same>::value, - void>::type - populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, - const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, + typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type + populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, _ViewTypeExpected& h_vanilla); template - typename std::enable_if>::value && - !std::is_same>::value, - void>::type - populateVanillaValues(const T& alpha, 
const _HostViewTypeX& h_x, - const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, + typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type + populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, _ViewTypeExpected& h_vanilla); template - typename std::enable_if>::value || - std::is_same>::value, - void>::type - compareVanillaAgainstExpected(const T& alpha, - const _ViewTypeExpected& h_vanilla, + typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type + compareVanillaAgainstExpected(const T& alpha, const _ViewTypeExpected& h_vanilla, const _ViewTypeExpected& h_expected); template - typename std::enable_if>::value && - !std::is_same>::value, - void>::type - compareVanillaAgainstExpected(const T& alpha, - const _ViewTypeExpected& h_vanilla, + typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type + compareVanillaAgainstExpected(const T& alpha, const _ViewTypeExpected& h_vanilla, const _ViewTypeExpected& h_expected); template - typename std::enable_if>::value || - std::is_same>::value, - void>::type - compareKkSyr2AgainstReference(const T& alpha, const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_reference); + typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type + compareKkSyr2AgainstReference(const T& alpha, const _HostViewTypeA& h_A, const _ViewTypeExpected& h_reference); template - typename std::enable_if>::value && - !std::is_same>::value, - void>::type - compareKkSyr2AgainstReference(const T& alpha, const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_reference); + typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type + compareKkSyr2AgainstReference(const T& alpha, const _HostViewTypeA& h_A, const _ViewTypeExpected& h_reference); template T shrinkAngleToZeroTwoPiRange(const T input); template - void callKkSyr2AndCompareAgainstExpected( - 
const ScalarA& alpha, TX& x, TY& y, - view_stride_adapter<_ViewTypeA, false>& A, - const _ViewTypeExpected& h_expected, const std::string& situation); + void callKkSyr2AndCompareAgainstExpected(const ScalarA& alpha, TX& x, TY& y, + view_stride_adapter<_ViewTypeA, false>& A, + const _ViewTypeExpected& h_expected, const std::string& situation); template - void callKkGerAndCompareKkSyr2AgainstIt( - const ScalarA& alpha, TX& x, TY& y, - view_stride_adapter<_ViewTypeA, false>& org_A, - const _HostViewTypeA& h_A_syr2, const std::string& situation); + void callKkGerAndCompareKkSyr2AgainstIt(const ScalarA& alpha, TX& x, TY& y, + view_stride_adapter<_ViewTypeA, false>& org_A, const _HostViewTypeA& h_A_syr2, + const std::string& situation); const bool _A_is_complex; const bool _A_is_lr; @@ -181,16 +158,13 @@ class Syr2Tester { bool _kkGerShouldThrowException; }; -template -Syr2Tester::Syr2Tester() +template +Syr2Tester::Syr2Tester() : _A_is_complex(std::is_same>::value || std::is_same>::value), _A_is_lr(std::is_same::value), _A_is_ll(std::is_same::value), - _testIsGpu(KokkosKernels::Impl::kk_is_gpu_exec_space< - typename Device::execution_space>()) + _testIsGpu(KokkosKernels::Impl::kk_is_gpu_exec_space()) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS , _vanillaUsesDifferentOrderOfOps(_A_is_lr) @@ -207,12 +181,8 @@ Syr2Tester::value - ? 1.0e-6 - : (std::is_same<_AuxType, double>::value ? 1.0e-9 : 0)), - _relTol(std::is_same<_AuxType, float>::value - ? 5.0e-3 - : (std::is_same<_AuxType, double>::value ? 1.0e-6 : 0)), + _absTol(std::is_same<_AuxType, float>::value ? 1.0e-6 : (std::is_same<_AuxType, double>::value ? 1.0e-9 : 0)), + _relTol(std::is_same<_AuxType, float>::value ? 5.0e-3 : (std::is_same<_AuxType, double>::value ? 
1.0e-6 : 0)), _M(-1), _N(-1), _useAnalyticalResults(false), @@ -222,35 +192,26 @@ Syr2Tester -Syr2Tester::~Syr2Tester() { +template +Syr2Tester::~Syr2Tester() { // Nothing to do } -template -void Syr2Tester::test(const int N, const int nonConstConstCombinations, - const bool useAnalyticalResults, - const bool useHermitianOption, - const bool useUpOption) { +template +void Syr2Tester::test( + const int N, const int nonConstConstCombinations, const bool useAnalyticalResults, const bool useHermitianOption, + const bool useUpOption) { #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Entering Syr2Tester::test()... - - - - - - - - - - - - - - - - " "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - " "- - - - - - - - - " << std::endl; - std::cout << "_A_is_complex = " << _A_is_complex - << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + std::cout << "_A_is_complex = " << _A_is_complex << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll << ", _testIsGpu = " << _testIsGpu - << ", _vanillaUsesDifferentOrderOfOps = " - << _vanillaUsesDifferentOrderOfOps << ", _absTol = " << _absTol - << ", _relTol = " << _relTol - << ", nonConstConstCombinations = " << nonConstConstCombinations - << ", useAnalyticalResults = " << useAnalyticalResults - << ", useHermitianOption = " << useHermitianOption + << ", _vanillaUsesDifferentOrderOfOps = " << _vanillaUsesDifferentOrderOfOps << ", _absTol = " << _absTol + << ", _relTol = " << _relTol << ", nonConstConstCombinations = " << nonConstConstCombinations + << ", useAnalyticalResults = " << useAnalyticalResults << ", useHermitianOption = " << useHermitianOption << ", useUpOption = " << useUpOption << std::endl; #endif // ******************************************************************** @@ -286,8 +247,7 @@ void Syr2Tester y("Y", _N); view_stride_adapter<_ViewTypeA, false> A("A", _M, _N); - view_stride_adapter<_ViewTypeExpected, true> h_expected( - "expected A += alpha * x * x^{t,h}", _M, _N); + 
view_stride_adapter<_ViewTypeExpected, true> h_expected("expected A += alpha * x * x^{t,h}", _M, _N); bool expectedResultIsKnown = false; using AlphaCoeffType = typename _ViewTypeA::non_const_value_type; @@ -296,20 +256,16 @@ void Syr2TesterpopulateVariables(alpha, x, y, A, h_expected.d_view, - expectedResultIsKnown); + this->populateVariables(alpha, x, y, A, h_expected.d_view, expectedResultIsKnown); // ******************************************************************** // Step 3 of 7: populate h_vanilla // ******************************************************************** - view_stride_adapter<_ViewTypeExpected, true> h_vanilla( - "vanilla = A + alpha * x * x^{t,h}", _M, _N); + view_stride_adapter<_ViewTypeExpected, true> h_vanilla("vanilla = A + alpha * x * x^{t,h}", _M, _N); #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "In Test_Blas2_syr2.hpp, computing vanilla A with alpha type = " - << typeid(alpha).name() << std::endl; + std::cout << "In Test_Blas2_syr2.hpp, computing vanilla A with alpha type = " << typeid(alpha).name() << std::endl; #endif - this->populateVanillaValues(alpha, x.h_view, y.h_view, A.h_view, - h_vanilla.d_view); + this->populateVanillaValues(alpha, x.h_view, y.h_view, A.h_view, h_vanilla.d_view); // ******************************************************************** // Step 4 of 7: use h_vanilla and h_expected as appropriate @@ -318,8 +274,7 @@ void Syr2TestercompareVanillaAgainstExpected(alpha, h_vanilla.d_view, - h_expected.d_view); + this->compareVanillaAgainstExpected(alpha, h_vanilla.d_view, h_expected.d_view); } else { // ****************************************************************** // Copy h_vanilla to h_expected @@ -335,13 +290,11 @@ void Syr2TestercallKkSyr2AndCompareAgainstExpected(alpha, x.d_view, y.d_view, A, - h_expected.d_view, "non const x"); + this->callKkSyr2AndCompareAgainstExpected(alpha, x.d_view, y.d_view, A, h_expected.d_view, "non const x"); if ((_useAnalyticalResults == false) && // Just to save run time 
(_kkGerShouldThrowException == false)) { - this->callKkGerAndCompareKkSyr2AgainstIt(alpha, x.d_view, y.d_view, org_A, - A.h_view, "non const x"); + this->callKkGerAndCompareKkSyr2AgainstIt(alpha, x.d_view, y.d_view, org_A, A.h_view, "non const x"); } } @@ -351,24 +304,19 @@ void Syr2TestercallKkSyr2AndCompareAgainstExpected( - alpha, x.d_view_const, y.d_view_const, A, h_expected.d_view, "const x"); + this->callKkSyr2AndCompareAgainstExpected(alpha, x.d_view_const, y.d_view_const, A, h_expected.d_view, "const x"); } // ******************************************************************** // Step 7 of 7: tests with invalid values on the first input parameter // ******************************************************************** - EXPECT_ANY_THROW( - KokkosBlas::syr2(".", "U", alpha, x.d_view, y.d_view, A.d_view)) + EXPECT_ANY_THROW(KokkosBlas::syr2(".", "U", alpha, x.d_view, y.d_view, A.d_view)) << "Failed test: kk syr2 should have thrown an exception for mode '.'"; - EXPECT_ANY_THROW( - KokkosBlas::syr2("", "U", alpha, x.d_view, y.d_view, A.d_view)) + EXPECT_ANY_THROW(KokkosBlas::syr2("", "U", alpha, x.d_view, y.d_view, A.d_view)) << "Failed test: kk syr2 should have thrown an exception for mode ''"; - EXPECT_ANY_THROW( - KokkosBlas::syr2("T", ".", alpha, x.d_view, y.d_view, A.d_view)) + EXPECT_ANY_THROW(KokkosBlas::syr2("T", ".", alpha, x.d_view, y.d_view, A.d_view)) << "Failed test: kk syr2 should have thrown an exception for uplo '.'"; - EXPECT_ANY_THROW( - KokkosBlas::syr2("T", "", alpha, x.d_view, y.d_view, A.d_view)) + EXPECT_ANY_THROW(KokkosBlas::syr2("T", "", alpha, x.d_view, y.d_view, A.d_view)) << "Failed test: kk syr2 should have thrown an exception for uplo ''"; #ifdef HAVE_KOKKOSKERNELS_DEBUG @@ -379,21 +327,14 @@ void Syr2Tester -void Syr2Tester< - ScalarX, tLayoutX, ScalarY, tLayoutY, ScalarA, tLayoutA, - Device>::populateVariables(ScalarA& alpha, - view_stride_adapter<_ViewTypeX, false>& x, - view_stride_adapter<_ViewTypeY, false>& y, - 
view_stride_adapter<_ViewTypeA, false>& A, - _ViewTypeExpected& h_expected, - bool& expectedResultIsKnown) { +template +void Syr2Tester::populateVariables( + ScalarA& alpha, view_stride_adapter<_ViewTypeX, false>& x, view_stride_adapter<_ViewTypeY, false>& y, + view_stride_adapter<_ViewTypeA, false>& A, _ViewTypeExpected& h_expected, bool& expectedResultIsKnown) { expectedResultIsKnown = false; if (_useAnalyticalResults) { - this->populateAnalyticalValues(alpha, x.h_view, y.h_view, A.h_view, - h_expected); + this->populateAnalyticalValues(alpha, x.h_view, y.h_view, A.h_view, h_expected); Kokkos::deep_copy(x.d_base, x.h_base); Kokkos::deep_copy(y.d_base, y.h_base); Kokkos::deep_copy(A.d_base, A.h_base); @@ -447,8 +388,7 @@ void Syr2Tester< } else { alpha = 3; - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarX randStart, randEnd; @@ -502,8 +442,7 @@ void Syr2Tester< if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - std::cout << "h_origA(" << i << "," << j << ") = " << A.h_view(i, j) - << std::endl; + std::cout << "h_origA(" << i << "," << j << ") = " << A.h_view(i, j) << std::endl; } } } @@ -511,17 +450,12 @@ void Syr2Tester< } // Code for complex values -template +template template -typename std::enable_if>::value || - std::is_same>::value, - void>::type -Syr2Tester::populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, - _HostViewTypeY& h_y, - _HostViewTypeA& h_A, - _ViewTypeExpected& h_expected) { +typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type +Syr2Tester::populateAnalyticalValues( + T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, _HostViewTypeA& h_A, _ViewTypeExpected& h_expected) { alpha.real() = 1.4; alpha.imag() = -2.3; @@ -540,12 +474,9 @@ Syr2TestershrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); - _AuxType auxImJ = - this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); - if (((_useUpOption == true) && 
(i <= j)) || - ((_useUpOption == false) && (i >= j))) { + _AuxType auxIpJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + _AuxType auxImJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { h_A(i, j).real() = sin(auxIpJ); h_A(i, j).imag() = -sin(auxImJ); } else { @@ -557,8 +488,7 @@ Syr2TestershrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + _AuxType auxIpJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); h_A(i, j).real() = sin(auxIpJ); h_A(i, j).imag() = sin(auxIpJ); } @@ -568,12 +498,9 @@ Syr2Tester= j))) { - _AuxType auxIpJ = - this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); - _AuxType auxImJ = - this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { + _AuxType auxIpJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + _AuxType auxImJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); h_expected(i, j).real() = 3.8 * sin(auxIpJ); h_expected(i, j).imag() = -5.6 * sin(auxImJ); } else { @@ -585,10 +512,8 @@ Syr2Tester= j))) { - _AuxType auxIpJ = - this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { + _AuxType auxIpJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); h_expected(i, j).real() = 5.6 * sin(auxIpJ); h_expected(i, j).imag() = 3.8 * sin(auxIpJ); } else { @@ -601,17 +526,12 @@ Syr2Tester +template template -typename std::enable_if>::value && - !std::is_same>::value, - void>::type -Syr2Tester::populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, - _HostViewTypeY& h_y, - _HostViewTypeA& h_A, - _ViewTypeExpected& h_expected) { +typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type +Syr2Tester::populateAnalyticalValues( 
+ T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, _HostViewTypeA& h_A, _ViewTypeExpected& h_expected) { alpha = std::is_same<_AuxType, int>::value ? 1 : 1.1; for (int i = 0; i < _M; ++i) { @@ -626,18 +546,15 @@ Syr2TestershrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); - h_A(i, j) = .1 * sin(auxIpJ); + _AuxType auxIpJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + h_A(i, j) = .1 * sin(auxIpJ); } } for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (((_useUpOption == true) && (i <= j)) || - ((_useUpOption == false) && (i >= j))) { - _AuxType auxIpJ = - this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { + _AuxType auxIpJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); h_expected(i, j) = 1.2 * sin(auxIpJ); } else { h_expected(i, j) = h_A(i, j); @@ -647,27 +564,20 @@ Syr2Tester +template template -typename std::enable_if>::value || - std::is_same>::value, - void>::type -Syr2Tester::populateVanillaValues(const T& alpha, - const _HostViewTypeX& h_x, - const _HostViewTypeY& h_y, - const _HostViewTypeA& h_A, - _ViewTypeExpected& h_vanilla) { +typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type +Syr2Tester::populateVanillaValues( + const T& alpha, const _HostViewTypeX& h_x, const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla) { if (_vanillaUsesDifferentOrderOfOps) { if (_useHermitianOption) { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (((_useUpOption == true) && (i <= j)) || - ((_useUpOption == false) && (i >= j))) { + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { h_vanilla(i, j) = - h_A(i, j) + alpha * _KAT_A::conj(h_y(j)) * h_x(i) + - _KAT_A::conj(alpha) * _KAT_A::conj(h_x(j)) * h_y(i); + h_A(i, j) + alpha * _KAT_A::conj(h_y(j)) * h_x(i) + _KAT_A::conj(alpha) * 
_KAT_A::conj(h_x(j)) * h_y(i); } else { h_vanilla(i, j) = h_A(i, j); } @@ -679,10 +589,8 @@ Syr2Tester= j))) { - h_vanilla(i, j) = - h_A(i, j) + alpha * h_x(j) * h_y(i) + alpha * h_y(j) * h_x(i); + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = h_A(i, j) + alpha * h_x(j) * h_y(i) + alpha * h_y(j) * h_x(i); } else { h_vanilla(i, j) = h_A(i, j); } @@ -693,11 +601,9 @@ Syr2Tester= j))) { + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { h_vanilla(i, j) = - h_A(i, j) + alpha * h_x(i) * _KAT_A::conj(h_y(j)) + - _KAT_A::conj(alpha) * h_y(i) * _KAT_A::conj(h_x(j)); + h_A(i, j) + alpha * h_x(i) * _KAT_A::conj(h_y(j)) + _KAT_A::conj(alpha) * h_y(i) * _KAT_A::conj(h_x(j)); } else { h_vanilla(i, j) = h_A(i, j); } @@ -709,10 +615,8 @@ Syr2Tester= j))) { - h_vanilla(i, j) = - h_A(i, j) + alpha * h_x(i) * h_y(j) + alpha * h_y(i) * h_x(j); + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = h_A(i, j) + alpha * h_x(i) * h_y(j) + alpha * h_y(i) * h_x(j); } else { h_vanilla(i, j) = h_A(i, j); } @@ -723,27 +627,20 @@ Syr2Tester +template template -typename std::enable_if>::value && - !std::is_same>::value, - void>::type -Syr2Tester::populateVanillaValues(const T& alpha, - const _HostViewTypeX& h_x, - const _HostViewTypeY& h_y, - const _HostViewTypeA& h_A, - _ViewTypeExpected& h_vanilla) { +typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type +Syr2Tester::populateVanillaValues( + const T& alpha, const _HostViewTypeX& h_x, const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla) { if (_useHermitianOption) { if (_vanillaUsesDifferentOrderOfOps) { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (((_useUpOption == true) && (i <= j)) || - ((_useUpOption == false) && (i >= j))) { + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) 
{ h_vanilla(i, j) = - h_A(i, j) + alpha * h_x(j) * _KAT_A::conj(h_y(i)) + - _KAT_A::conj(alpha) * h_y(j) * _KAT_A::conj(h_x(i)); + h_A(i, j) + alpha * h_x(j) * _KAT_A::conj(h_y(i)) + _KAT_A::conj(alpha) * h_y(j) * _KAT_A::conj(h_x(i)); } else { h_vanilla(i, j) = h_A(i, j); } @@ -752,11 +649,9 @@ Syr2Tester= j))) { + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { h_vanilla(i, j) = - h_A(i, j) + alpha * h_x(i) * _KAT_A::conj(h_y(j)) + - _KAT_A::conj(alpha) * h_y(i) * _KAT_A::conj(h_x(j)); + h_A(i, j) + alpha * h_x(i) * _KAT_A::conj(h_y(j)) + _KAT_A::conj(alpha) * h_y(i) * _KAT_A::conj(h_x(j)); } else { h_vanilla(i, j) = h_A(i, j); } @@ -767,10 +662,8 @@ Syr2Tester= j))) { - h_vanilla(i, j) = - h_A(i, j) + alpha * h_x(j) * h_y(i) + alpha * h_y(j) * h_x(i); + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = h_A(i, j) + alpha * h_x(j) * h_y(i) + alpha * h_y(j) * h_x(i); } else { h_vanilla(i, j) = h_A(i, j); } @@ -779,10 +672,8 @@ Syr2Tester= j))) { - h_vanilla(i, j) = - h_A(i, j) + alpha * h_x(i) * h_y(j) + alpha * h_y(i) * h_x(j); + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = h_A(i, j) + alpha * h_x(i) * h_y(j) + alpha * h_y(i) * h_x(j); } else { h_vanilla(i, j) = h_A(i, j); } @@ -792,11 +683,10 @@ Syr2Tester +template template -T Syr2Tester::shrinkAngleToZeroTwoPiRange(const T input) { +T Syr2Tester::shrinkAngleToZeroTwoPiRange( + const T input) { T output(input); #if 0 T twoPi( 2. 
* Kokkos::numbers::pi ); @@ -811,29 +701,23 @@ T Syr2Tester +template template -typename std::enable_if>::value || - std::is_same>::value, - void>::type -Syr2Tester:: - compareVanillaAgainstExpected(const T& alpha, - const _ViewTypeExpected& h_vanilla, - const _ViewTypeExpected& h_expected) { +typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type +Syr2Tester::compareVanillaAgainstExpected( + const T& alpha, const _ViewTypeExpected& h_vanilla, const _ViewTypeExpected& h_expected) { #ifdef HAVE_KOKKOSKERNELS_DEBUG if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - std::cout << "h_exp(" << i << "," << j << ") = " << h_expected(i, j) - << ", h_van(" << i << "," << j << ") = " << h_vanilla(i, j) - << std::endl; + std::cout << "h_exp(" << i << "," << j << ") = " << h_expected(i, j) << ", h_van(" << i << "," << j + << ") = " << h_vanilla(i, j) << std::endl; } } } #endif - int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * - 1.e-3); + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); if (_useAnalyticalResults) { int numErrorsRealAbs(0); @@ -852,7 +736,7 @@ Syr2Tester:: for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - diff = _KAT_A::abs(h_expected(i, j).real() - h_vanilla(i, j).real()); + diff = _KAT_A::abs(h_expected(i, j).real() - h_vanilla(i, j).real()); errorHappened = false; if (h_expected(i, j).real() == 0.) 
{ diffThreshold = _KAT_A::abs(_absTol); @@ -876,16 +760,14 @@ Syr2Tester:: } if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j).real() = " << h_expected(i, j).real() + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).real() = " << h_expected(i, j).real() << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() << ", _KAT_A::abs(h_expected(i,j).real() - " "h_vanilla(i,j).real()) = " - << diff << ", diffThreshold = " << diffThreshold - << std::endl; + << diff << ", diffThreshold = " << diffThreshold << std::endl; #endif } - diff = _KAT_A::abs(h_expected(i, j).imag() - h_vanilla(i, j).imag()); + diff = _KAT_A::abs(h_expected(i, j).imag() - h_vanilla(i, j).imag()); errorHappened = false; if (h_expected(i, j).imag() == 0.) { diffThreshold = _KAT_A::abs(_absTol); @@ -909,13 +791,11 @@ Syr2Tester:: } if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() << ", _KAT_A::abs(h_expected(i,j).imag() - " "h_vanilla(i,j).imag()) = " - << diff << ", diffThreshold = " << diffThreshold - << std::endl; + << diff << ", diffThreshold = " << diffThreshold << std::endl; #endif } } // for j @@ -923,25 +803,15 @@ Syr2Tester:: { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": vanilla differs too much from analytical on real components" - << ", numErrorsRealAbs = " << numErrorsRealAbs - << ", 
numErrorsRealRel = " << numErrorsRealRel - << ", maxErrorRealRel = " << maxErrorRealRel - << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel - << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel - << ", h_expected(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_expected(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ": vanilla differs too much from analytical on real components" + << ", numErrorsRealAbs = " << numErrorsRealAbs << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel << ", h_expected(i,j).real() = " + << (((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) << ", h_vanilla(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_vanilla(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? 
h_vanilla(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); @@ -950,30 +820,19 @@ Syr2Tester:: std::cout << "WARNING" << msg.str() << std::endl; } #endif - EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) - << "Failed test" << msg.str(); + EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) << "Failed test" << msg.str(); } { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": vanilla differs too much from analytical on imag components" - << ", numErrorsImagAbs = " << numErrorsImagAbs - << ", numErrorsImagRel = " << numErrorsImagRel - << ", maxErrorImagRel = " << maxErrorImagRel - << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel - << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel - << ", h_expected(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_expected(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ": vanilla differs too much from analytical on imag components" + << ", numErrorsImagAbs = " << numErrorsImagAbs << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel << ", h_expected(i,j).imag() = " + << (((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) << ", h_vanilla(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? 
h_vanilla(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? h_vanilla(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); @@ -982,8 +841,7 @@ Syr2Tester:: std::cout << "WARNING" << msg.str() << std::endl; } #endif - EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) - << "Failed test" << msg.str(); + EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) << "Failed test" << msg.str(); } } else { int numErrorsReal(0); @@ -994,11 +852,8 @@ Syr2Tester:: if (h_expected(i, j).real() != h_vanilla(i, j).real()) { #ifdef HAVE_KOKKOSKERNELS_DEBUG if (numErrorsReal == 0) { - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j).real() = " - << h_expected(i, j).real() - << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() - << std::endl; + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).real() = " << h_expected(i, j).real() + << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() << std::endl; } #endif numErrorsReal++; @@ -1007,62 +862,49 @@ Syr2Tester:: if (h_expected(i, j).imag() != h_vanilla(i, j).imag()) { #ifdef HAVE_KOKKOSKERNELS_DEBUG if (numErrorsImag == 0) { - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j).imag() = " - << h_expected(i, j).imag() - << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() - << std::endl; + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() + << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() << std::endl; } #endif numErrorsImag++; } } // for j } // for i - EXPECT_EQ(numErrorsReal, 0) - << "Failed test" - << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << 
": vanilla result is incorrect on real components" - << ", numErrorsReal = " << numErrorsReal; - EXPECT_EQ(numErrorsImag, 0) - << "Failed test" - << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": vanilla result is incorrect on imag components" - << ", numErrorsImag = " << numErrorsImag; + EXPECT_EQ(numErrorsReal, 0) << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla result is incorrect on real components" + << ", numErrorsReal = " << numErrorsReal; + EXPECT_EQ(numErrorsImag, 0) << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla result is incorrect on imag components" + << ", numErrorsImag = " << numErrorsImag; } } // Code for non-complex values -template +template template -typename std::enable_if>::value && - !std::is_same>::value, - void>::type -Syr2Tester:: - compareVanillaAgainstExpected(const T& alpha, - const _ViewTypeExpected& h_vanilla, - const _ViewTypeExpected& h_expected) { +typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type +Syr2Tester::compareVanillaAgainstExpected( + const T& alpha, const _ViewTypeExpected& h_vanilla, const _ViewTypeExpected& h_expected) { #ifdef HAVE_KOKKOSKERNELS_DEBUG if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - std::cout << "h_exp(" << i << "," << j << ") = " << h_expected(i, j) - << ", h_van(" << i << "," << j << ") = " << 
h_vanilla(i, j) - << std::endl; + std::cout << "h_exp(" << i << "," << j << ") = " << h_expected(i, j) << ", h_van(" << i << "," << j + << ") = " << h_vanilla(i, j) << std::endl; } } } #endif - int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * - 1.e-3); + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); if (_useAnalyticalResults) { int numErrorsAbs(0); @@ -1100,12 +942,10 @@ Syr2Tester:: } if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j) = " << h_expected(i, j) + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j) = " << h_expected(i, j) << ", h_vanilla(i,j) = " << h_vanilla(i, j) - << ", _KAT_A::abs(h_expected(i,j) - h_vanilla(i,j)) = " - << diff << ", diffThreshold = " << diffThreshold - << std::endl; + << ", _KAT_A::abs(h_expected(i,j) - h_vanilla(i,j)) = " << diff + << ", diffThreshold = " << diffThreshold << std::endl; #endif } } // for j @@ -1113,24 +953,14 @@ Syr2Tester:: { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": vanilla differs too much from expected" - << ", numErrorsAbs = " << numErrorsAbs - << ", numErrorsRel = " << numErrorsRel - << ", maxErrorRel = " << maxErrorRel - << ", iForMaxErrorRel = " << iForMaxErrorRel + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ": vanilla differs too much from expected" + << ", numErrorsAbs = " << numErrorsAbs << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel << ", iForMaxErrorRel = " << 
iForMaxErrorRel << ", jForMaxErrorRel = " << jForMaxErrorRel << ", h_expected(i,j) = " - << (((_M > 0) && (_N > 0)) - ? h_expected(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) - << ", h_vanilla(i,j) = " - << (((_M > 0) && (_N > 0)) - ? h_vanilla(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) + << ", h_vanilla(i,j) = " << (((_M > 0) && (_N > 0)) ? h_vanilla(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrors(numErrorsAbs + numErrorsRel); @@ -1149,8 +979,7 @@ Syr2Tester:: if (h_expected(i, j) != h_vanilla(i, j)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG if (numErrors == 0) { - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j) = " << h_expected(i, j) + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j) = " << h_expected(i, j) << ", h_vanilla(i,j) = " << h_vanilla(i, j) << std::endl; } #endif @@ -1158,41 +987,33 @@ Syr2Tester:: } } // for j } // for i - EXPECT_EQ(numErrors, 0) - << "Failed test" - << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": vanilla result is incorrect" - << ", numErrors = " << numErrors; + EXPECT_EQ(numErrors, 0) << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption << ", _useUpOption = " << _useUpOption + << ": vanilla result is incorrect" + << ", numErrors = " << numErrors; } } // Code for complex values -template +template template -typename std::enable_if>::value || - std::is_same>::value, - void>::type -Syr2Tester:: - compareKkSyr2AgainstReference(const T& alpha, const _HostViewTypeA& h_A, - 
const _ViewTypeExpected& h_reference) { +typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type +Syr2Tester::compareKkSyr2AgainstReference( + const T& alpha, const _HostViewTypeA& h_A, const _ViewTypeExpected& h_reference) { #ifdef HAVE_KOKKOSKERNELS_DEBUG if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - std::cout << "h_exp(" << i << "," << j << ") = " << h_reference(i, j) - << ", h_A(" << i << "," << j << ") = " << h_A(i, j) - << std::endl; + std::cout << "h_exp(" << i << "," << j << ") = " << h_reference(i, j) << ", h_A(" << i << "," << j + << ") = " << h_A(i, j) << std::endl; } } } #endif - int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * - 1.e-3); + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); int numErrorsRealAbs(0); int numErrorsRealRel(0); @@ -1233,12 +1054,10 @@ Syr2Tester:: } if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout - << "ERROR, i = " << i << ", j = " << j - << ": h_reference(i,j).real() = " << h_reference(i, j).real() - << ", h_A(i,j).real() = " << h_A(i, j).real() - << ", _KAT_A::abs(h_reference(i,j).real() - h_A(i,j).real()) = " - << diff << ", diffThreshold = " << diffThreshold << std::endl; + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_reference(i,j).real() = " << h_reference(i, j).real() + << ", h_A(i,j).real() = " << h_A(i, j).real() + << ", _KAT_A::abs(h_reference(i,j).real() - h_A(i,j).real()) = " << diff + << ", diffThreshold = " << diffThreshold << std::endl; #endif } diff = _KAT_A::abs(h_reference(i, j).imag() - h_A(i, j).imag()); @@ -1265,95 +1084,58 @@ Syr2Tester:: } if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout - << "ERROR, i = " << i << ", j = " << j - << ": h_reference(i,j).imag() = " << h_reference(i, j).imag() - << ", h_A(i,j).imag() = " << h_A(i, j).imag() - << ", _KAT_A::abs(h_reference(i,j).imag() - 
h_A(i,j).imag()) = " - << diff << ", diffThreshold = " << diffThreshold << std::endl; + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_reference(i,j).imag() = " << h_reference(i, j).imag() + << ", h_A(i,j).imag() = " << h_A(i, j).imag() + << ", _KAT_A::abs(h_reference(i,j).imag() - h_A(i,j).imag()) = " << diff + << ", diffThreshold = " << diffThreshold << std::endl; #endif } } // for j } // for i #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout - << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ", numErrorsRealAbs = " << numErrorsRealAbs - << ", numErrorsRealRel = " << numErrorsRealRel - << ", maxErrorRealRel = " << maxErrorRealRel - << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel - << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel - << ", h_reference(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_reference(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) - << ", h_A(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) - << ", numErrorsImagAbs = " << numErrorsImagAbs - << ", numErrorsImagRel = " << numErrorsImagRel - << ", maxErrorImagRel = " << maxErrorImagRel - << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel - << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel - << ", h_reference(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_reference(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) - << ", h_A(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? 
h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) - << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; + std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_reference(i,j).real() = " + << (((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) + << ", h_A(i,j).real() = " + << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) + << ", numErrorsImagAbs = " << numErrorsImagAbs << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel << ", h_reference(i,j).imag() = " + << (((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) + << ", h_A(i,j).imag() = " + << (((_M > 0) && (_N > 0)) ? 
h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; if ((_M == 2131) && (_N == 2131)) { std::cout << "Information" - << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ", h_reference(11, 2119) = (" << h_reference(11, 2119).real() + << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ", h_reference(11, 2119) = (" << h_reference(11, 2119).real() << ", " << h_reference(11, 2119).imag() << ")" - << ", h_A(11, 2119) = (" << h_A(11, 2119).real() << ", " - << h_A(11, 2119).imag() << ")" << std::endl; + << ", h_A(11, 2119) = (" << h_A(11, 2119).real() << ", " << h_A(11, 2119).imag() << ")" << std::endl; std::cout << "Information" - << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ", h_reference(710, 1065) = (" << h_reference(710, 1065).real() + << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ", h_reference(710, 1065) = (" << h_reference(710, 1065).real() << ", " << h_reference(710, 1065).imag() << ")" - << ", h_A(710, 1065) = (" << h_A(710, 1065).real() << ", " - << h_A(710, 1065).imag() << ")" << std::endl; + << ", h_A(710, 1065) = (" << h_A(710, 1065).real() << ", " << h_A(710, 1065).imag() << ")" << std::endl; } #endif { 
std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": syr2 result is incorrect on real components" - << ", numErrorsRealAbs = " << numErrorsRealAbs - << ", numErrorsRealRel = " << numErrorsRealRel - << ", maxErrorRealRel = " << maxErrorRealRel - << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel - << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel - << ", h_reference(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_reference(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ": syr2 result is incorrect on real components" + << ", numErrorsRealAbs = " << numErrorsRealAbs << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel << ", h_reference(i,j).real() = " + << (((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) << ", h_A(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? 
h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); @@ -1366,25 +1148,15 @@ Syr2Tester:: } { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": syr2 result is incorrect on imag components" - << ", numErrorsImagAbs = " << numErrorsImagAbs - << ", numErrorsImagRel = " << numErrorsImagRel - << ", maxErrorImagRel = " << maxErrorImagRel - << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel - << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel - << ", h_reference(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_reference(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ": syr2 result is incorrect on imag components" + << ", numErrorsImagAbs = " << numErrorsImagAbs << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel << ", h_reference(i,j).imag() = " + << (((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) << ", h_A(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? 
h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); @@ -1398,28 +1170,23 @@ Syr2Tester:: } // Code for non-complex values -template +template template -typename std::enable_if>::value && - !std::is_same>::value, - void>::type -Syr2Tester:: - compareKkSyr2AgainstReference(const T& alpha, const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_reference) { +typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type +Syr2Tester::compareKkSyr2AgainstReference( + const T& alpha, const _HostViewTypeA& h_A, const _ViewTypeExpected& h_reference) { #ifdef HAVE_KOKKOSKERNELS_DEBUG if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - std::cout << "h_exp(" << i << "," << j << ") = " << h_reference(i, j) - << ", h_A(" << i << "," << j << ") = " << h_A(i, j) - << std::endl; + std::cout << "h_exp(" << i << "," << j << ") = " << h_reference(i, j) << ", h_A(" << i << "," << j + << ") = " << h_A(i, j) << std::endl; } } } #endif - int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * - 1.e-3); + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); int numErrorsAbs(0); int numErrorsRel(0); @@ -1455,53 +1222,34 @@ Syr2Tester:: } if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_reference(i,j) = " << h_reference(i, j) - << ", h_A(i,j) = " << h_A(i, j) - << ", _KAT_A::abs(h_reference(i,j) - h_A(i,j)) = " << diff + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_reference(i,j) = " << h_reference(i, j) + << ", h_A(i,j) = " << h_A(i, j) << ", _KAT_A::abs(h_reference(i,j) - h_A(i,j)) = " << diff << ", diffThreshold = " << diffThreshold << std::endl; #endif } } // for j } // for i #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - 
<< ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ", numErrorsAbs = " << numErrorsAbs - << ", numErrorsRel = " << numErrorsRel - << ", maxErrorRel = " << maxErrorRel - << ", iForMaxErrorRel = " << iForMaxErrorRel - << ", jForMaxErrorRel = " << jForMaxErrorRel + std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ", numErrorsAbs = " << numErrorsAbs + << ", numErrorsRel = " << numErrorsRel << ", maxErrorRel = " << maxErrorRel + << ", iForMaxErrorRel = " << iForMaxErrorRel << ", jForMaxErrorRel = " << jForMaxErrorRel << ", h_reference(i,j) = " - << (((_M > 0) && (_N > 0)) - ? h_reference(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) - << ", h_A(i,j) = " - << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) + << ", h_A(i,j) = " << (((_M > 0) && (_N > 0)) ? 
h_A(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; #endif { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption << ", _useUpOption = " << _useUpOption << ": syr2 result is incorrect" - << ", numErrorsAbs = " << numErrorsAbs - << ", numErrorsRel = " << numErrorsRel - << ", maxErrorRel = " << maxErrorRel - << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", numErrorsAbs = " << numErrorsAbs << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel << ", iForMaxErrorRel = " << iForMaxErrorRel << ", jForMaxErrorRel = " << jForMaxErrorRel << ", h_reference(i,j) = " - << (((_M > 0) && (_N > 0)) - ? h_reference(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) - << ", h_A(i,j) = " - << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) + << ", h_A(i,j) = " << (((_M > 0) && (_N > 0)) ? 
h_A(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrors(numErrorsAbs + numErrorsRel); @@ -1514,22 +1262,16 @@ Syr2Tester:: } } -template +template template -void Syr2Tester:: - callKkSyr2AndCompareAgainstExpected( - const ScalarA& alpha, TX& x, TY& y, - view_stride_adapter<_ViewTypeA, false>& A, - const _ViewTypeExpected& h_expected, const std::string& situation) { +void Syr2Tester::callKkSyr2AndCompareAgainstExpected( + const ScalarA& alpha, TX& x, TY& y, view_stride_adapter<_ViewTypeA, false>& A, const _ViewTypeExpected& h_expected, + const std::string& situation) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "In Test_Blas2_syr2, '" << situation << "', alpha = " << alpha - << std::endl; + std::cout << "In Test_Blas2_syr2, '" << situation << "', alpha = " << alpha << std::endl; std::cout << "In Test_Blas2_syr2.hpp, right before calling KokkosBlas::syr2()" << ": ViewTypeA = " << typeid(_ViewTypeA).name() - << ", _kkSyr2ShouldThrowException = " << _kkSyr2ShouldThrowException - << std::endl; + << ", _kkSyr2ShouldThrowException = " << _kkSyr2ShouldThrowException << std::endl; #endif std::string mode = _useHermitianOption ? "H" : "T"; std::string uplo = _useUpOption ? 
"U" : "L"; @@ -1540,25 +1282,21 @@ void Syr2Tester +template template -void Syr2Tester:: - callKkGerAndCompareKkSyr2AgainstIt( - const ScalarA& alpha, TX& x, TY& y, - view_stride_adapter<_ViewTypeA, false>& org_A, - const _HostViewTypeA& h_A_syr2, const std::string& situation) { +void Syr2Tester::callKkGerAndCompareKkSyr2AgainstIt( + const ScalarA& alpha, TX& x, TY& y, view_stride_adapter<_ViewTypeA, false>& org_A, const _HostViewTypeA& h_A_syr2, + const std::string& situation) { view_stride_adapter<_ViewTypeA, false> A_ger("A_ger", _M, _N); Kokkos::deep_copy(A_ger.d_base, org_A.d_base); @@ -1583,12 +1317,10 @@ void Syr2Tester h_ger_reference( - "h_ger_reference", _M, _N); + view_stride_adapter<_ViewTypeExpected, true> h_ger_reference("h_ger_reference", _M, _N); Kokkos::deep_copy(h_ger_reference.d_base, A_ger.d_base); Kokkos::deep_copy(h_ger_reference.h_base, h_ger_reference.d_base); std::string uplo = _useUpOption ? "U" : "L"; for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (((_useUpOption == true) && (i <= j)) || - ((_useUpOption == false) && (i >= j))) { + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { // Keep h_ger_reference as already computed } else { h_ger_reference.h_view(i, j) = org_A.h_view(i, j); @@ -1677,9 +1398,7 @@ void Syr2Tester::value || - std::is_same::value || + bool xBool = std::is_same::value || std::is_same::value || std::is_same>::value || std::is_same>::value; - bool yBool = std::is_same::value || - std::is_same::value || + bool yBool = std::is_same::value || std::is_same::value || std::is_same>::value || std::is_same>::value; - bool aBool = std::is_same::value || - std::is_same::value || + bool aBool = std::is_same::value || std::is_same::value || std::is_same>::value || std::is_same>::value; bool useAnalyticalResults = xBool && yBool && aBool; #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + 
(!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "+--------------------------------------------------------------" "------------" @@ -1725,8 +1440,7 @@ int test_syr2(const std::string& /*caseName*/) { std::cout << "Starting " << caseName << " for LAYOUTLEFT ..." << std::endl; #endif if (true) { - Test::Syr2Tester + Test::Syr2Tester tester; tester.test(0, 0); tester.test(1, 0); @@ -1761,8 +1475,7 @@ int test_syr2(const std::string& /*caseName*/) { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "+--------------------------------------------------------------" "------------" @@ -1770,8 +1483,7 @@ int test_syr2(const std::string& /*caseName*/) { std::cout << "Starting " << caseName << " for LAYOUTRIGHT ..." << std::endl; #endif if (true) { - Test::Syr2Tester + Test::Syr2Tester tester; tester.test(0, 0); tester.test(1, 0); @@ -1806,8 +1518,7 @@ int test_syr2(const std::string& /*caseName*/) { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "+--------------------------------------------------------------" "------------" @@ -1815,8 +1526,7 @@ int test_syr2(const std::string& /*caseName*/) { std::cout << "Starting " << caseName << " for LAYOUTSTRIDE ..." 
<< std::endl; #endif if (true) { - Test::Syr2Tester tester; tester.test(0, 0); @@ -1851,8 +1561,7 @@ int test_syr2(const std::string& /*caseName*/) { #endif #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "+--------------------------------------------------------------" "------------" @@ -1860,8 +1569,7 @@ int test_syr2(const std::string& /*caseName*/) { std::cout << "Starting " << caseName << " for MIXED LAYOUTS ..." << std::endl; #endif if (true) { - Test::Syr2Tester + Test::Syr2Tester tester; tester.test(1, 0); tester.test(2, 0); @@ -1879,8 +1587,7 @@ int test_syr2(const std::string& /*caseName*/) { } if (true) { - Test::Syr2Tester + Test::Syr2Tester tester; tester.test(1024, 0); } @@ -1903,8 +1610,7 @@ int test_syr2(const std::string& /*caseName*/) { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr2_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_float"); test_syr2("test case syr2_float"); @@ -1913,19 +1619,17 @@ TEST_F(TestCategory, syr2_float) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr2_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_complex_float"); - test_syr2, Kokkos::complex, - Kokkos::complex, TestDevice>("test case syr2_complex_float"); + test_syr2, Kokkos::complex, Kokkos::complex, TestDevice>( + "test case syr2_complex_float"); Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - 
(!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr2_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_double"); test_syr2("test case syr2_double"); @@ -1934,20 +1638,17 @@ TEST_F(TestCategory, syr2_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr2_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_complex_double"); - test_syr2, Kokkos::complex, - Kokkos::complex, TestDevice>( + test_syr2, Kokkos::complex, Kokkos::complex, TestDevice>( "test case syr2_complex_double"); Kokkos::Profiling::popRegion(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr2_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_int"); test_syr2("test case syr2_int"); @@ -1955,8 +1656,7 @@ TEST_F(TestCategory, syr2_int) { } #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, syr2_int_float_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_int_float_double"); test_syr2("test case syr2_mixed_types"); diff --git a/blas/unit_test/Test_Blas2_team_gemv.hpp b/blas/unit_test/Test_Blas2_team_gemv.hpp index 808532a98e..851410fdb7 100644 --- a/blas/unit_test/Test_Blas2_team_gemv.hpp +++ b/blas/unit_test/Test_Blas2_team_gemv.hpp @@ -27,37 +27,30 @@ namespace Test { -template +template struct TeamGEMVOp : 
public GemvOpBase { using params = GemvOpBase; - TeamGEMVOp(char trans_, ScalarType alpha_, AType A_, XType x_, - ScalarType beta_, YType y_) + TeamGEMVOp(char trans_, ScalarType alpha_, AType A_, XType x_, ScalarType beta_, YType y_) : params(trans_, alpha_, A_, x_, beta_, y_) {} template KOKKOS_INLINE_FUNCTION void operator()(const TeamMember& member) const { KokkosBlas::Experimental::Gemv::invoke( - member, params::trans, params::alpha, params::A, params::x, - params::beta, params::y); + member, params::trans, params::alpha, params::A, params::x, params::beta, params::y); } }; struct TeamGemvFactory { - template - using functor_type = - TeamGEMVOp; + template + using functor_type = TeamGEMVOp; - using algorithms = std::tuple; + using algorithms = std::tuple; }; } // namespace Test -#define TEST_TEAM_CASE4(N, A, X, Y, SC) \ - TEST_CASE4(team, TeamGemvFactory, N, A, X, Y, SC) +#define TEST_TEAM_CASE4(N, A, X, Y, SC) TEST_CASE4(team, TeamGemvFactory, N, A, X, Y, SC) #define TEST_TEAM_CASE2(N, S, SC) TEST_CASE2(team, TeamGemvFactory, N, S, SC) #define TEST_TEAM_CASE(N, S) TEST_CASE(team, TeamGemvFactory, N, S) diff --git a/blas/unit_test/Test_Blas2_teamvector_gemv.hpp b/blas/unit_test/Test_Blas2_teamvector_gemv.hpp index 655a5e2f12..74cdebf062 100644 --- a/blas/unit_test/Test_Blas2_teamvector_gemv.hpp +++ b/blas/unit_test/Test_Blas2_teamvector_gemv.hpp @@ -27,30 +27,23 @@ namespace Test { -template +template struct TeamVectorGEMVOp : public GemvOpBase { using params = GemvOpBase; - TeamVectorGEMVOp(char trans_, ScalarType alpha_, AType A_, XType x_, - ScalarType beta_, YType y_) + TeamVectorGEMVOp(char trans_, ScalarType alpha_, AType A_, XType x_, ScalarType beta_, YType y_) : params(trans_, alpha_, A_, x_, beta_, y_) {} template KOKKOS_INLINE_FUNCTION void operator()(const TeamMember& member) const { - KokkosBlas::Experimental::Gemv::invoke(member, params::trans, - params::alpha, params::A, - params::x, params::beta, - params::y); + 
KokkosBlas::Experimental::Gemv::invoke( + member, params::trans, params::alpha, params::A, params::x, params::beta, params::y); } }; struct TeamVectorGemvFactory { - template - using functor_type = - TeamVectorGEMVOp; + template + using functor_type = TeamVectorGEMVOp; // no Blocked implementation using algorithms = std::tuple; @@ -58,12 +51,9 @@ struct TeamVectorGemvFactory { } // namespace Test -#define TEST_TEAMVECTOR_CASE4(N, A, X, Y, SC) \ - TEST_CASE4(teamvector, TeamVectorGemvFactory, N, A, X, Y, SC) -#define TEST_TEAMVECTOR_CASE2(N, S, SC) \ - TEST_CASE2(teamvector, TeamVectorGemvFactory, N, S, SC) -#define TEST_TEAMVECTOR_CASE(N, S) \ - TEST_CASE(teamvector, TeamVectorGemvFactory, N, S) +#define TEST_TEAMVECTOR_CASE4(N, A, X, Y, SC) TEST_CASE4(teamvector, TeamVectorGemvFactory, N, A, X, Y, SC) +#define TEST_TEAMVECTOR_CASE2(N, S, SC) TEST_CASE2(teamvector, TeamVectorGemvFactory, N, S, SC) +#define TEST_TEAMVECTOR_CASE(N, S) TEST_CASE(teamvector, TeamVectorGemvFactory, N, S) #ifdef KOKKOSKERNELS_TEST_FLOAT TEST_TEAMVECTOR_CASE(float, float) diff --git a/blas/unit_test/Test_Blas3_gemm.hpp b/blas/unit_test/Test_Blas3_gemm.hpp index cd91bc6d95..d56886cf13 100644 --- a/blas/unit_test/Test_Blas3_gemm.hpp +++ b/blas/unit_test/Test_Blas3_gemm.hpp @@ -23,8 +23,7 @@ namespace Test { -template +template struct gemm_VanillaGEMM { bool A_t, B_t, A_c, B_c; int N, K; @@ -41,12 +40,9 @@ struct gemm_VanillaGEMM { ScalarC beta; KOKKOS_INLINE_FUNCTION - void operator()( - const typename Kokkos::TeamPolicy::member_type& team) - const { + void operator()(const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && \ - !defined(__HIP_DEVICE_COMPILE__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int i = team.league_rank(); #else const int i = team.league_rank(); @@ -77,10 +73,8 @@ struct gemm_VanillaGEMM { }; template -void 
build_matrices(const int M, const int N, const int K, - const typename ViewTypeA::value_type alpha, ViewTypeA& A, - ViewTypeB& B, const typename ViewTypeA::value_type beta, - ViewTypeC& C, ViewTypeC& Cref) { +void build_matrices(const int M, const int N, const int K, const typename ViewTypeA::value_type alpha, ViewTypeA& A, + ViewTypeB& B, const typename ViewTypeA::value_type beta, ViewTypeC& C, ViewTypeC& Cref) { using execution_space = typename TestDevice::execution_space; using ScalarA = typename ViewTypeA::non_const_value_type; using ScalarB = typename ViewTypeB::non_const_value_type; @@ -93,28 +87,22 @@ void build_matrices(const int M, const int N, const int K, // (SA 11 Dec 2019) Max (previously: 10) increased to detect the bug in // Trilinos issue #6418 - const uint64_t seed = - std::chrono::high_resolution_clock::now().time_since_epoch().count(); + const uint64_t seed = std::chrono::high_resolution_clock::now().time_since_epoch().count(); Kokkos::Random_XorShift64_Pool rand_pool(seed); - Kokkos::fill_random(A, rand_pool, - Kokkos::rand::generator_type, - ScalarA>::max()); - Kokkos::fill_random(B, rand_pool, - Kokkos::rand::generator_type, - ScalarB>::max()); - Kokkos::fill_random(C, rand_pool, - Kokkos::rand::generator_type, - ScalarC>::max()); + Kokkos::fill_random( + A, rand_pool, + Kokkos::rand::generator_type, ScalarA>::max()); + Kokkos::fill_random( + B, rand_pool, + Kokkos::rand::generator_type, ScalarB>::max()); + Kokkos::fill_random( + C, rand_pool, + Kokkos::rand::generator_type, ScalarC>::max()); Kokkos::deep_copy(Cref, C); Kokkos::fence(); - struct Test::gemm_VanillaGEMM - vgemm; + struct Test::gemm_VanillaGEMM vgemm; vgemm.A_t = false; vgemm.B_t = false; vgemm.A_c = false; @@ -127,12 +115,10 @@ void build_matrices(const int M, const int N, const int K, vgemm.alpha = alpha; vgemm.beta = beta; - Kokkos::parallel_for( - "KokkosBlas::Test::gemm_VanillaGEMM", - Kokkos::TeamPolicy( - M, Kokkos::AUTO, - KokkosKernels::Impl::kk_get_max_vector_size()), - 
vgemm); + Kokkos::parallel_for("KokkosBlas::Test::gemm_VanillaGEMM", + Kokkos::TeamPolicy( + M, Kokkos::AUTO, KokkosKernels::Impl::kk_get_max_vector_size()), + vgemm); Kokkos::fence(); } @@ -146,9 +132,7 @@ struct DiffGEMM { typedef typename APT::mag_type mag_type; KOKKOS_INLINE_FUNCTION - void operator()( - const typename Kokkos::TeamPolicy::member_type& team, - mag_type& diff) const { + void operator()(const typename Kokkos::TeamPolicy::member_type& team, mag_type& diff) const { const int i = team.league_rank(); mag_type diff_row = 0; Kokkos::parallel_reduce( @@ -166,8 +150,7 @@ struct DiffGEMM { }; template -void impl_test_gemm(const char* TA, const char* TB, int M, int N, int K, - typename ViewTypeA::value_type alpha, +void impl_test_gemm(const char* TA, const char* TB, int M, int N, int K, typename ViewTypeA::value_type alpha, typename ViewTypeC::value_type beta) { bool A_t = (TA[0] != 'N') && (TA[0] != 'n'); bool B_t = (TB[0] != 'N') && (TB[0] != 'n'); @@ -187,30 +170,25 @@ void impl_test_gemm(const char* TA, const char* TB, int M, int N, int K, ViewTypeC C("C", M, N); ViewTypeC C2("C", M, N); - const uint64_t seed = - std::chrono::high_resolution_clock::now().time_since_epoch().count(); + const uint64_t seed = std::chrono::high_resolution_clock::now().time_since_epoch().count(); Kokkos::Random_XorShift64_Pool rand_pool(seed); // (SA 11 Dec 2019) Max (previously: 10) increased to detect the bug in // Trilinos issue #6418 - Kokkos::fill_random(A, rand_pool, - Kokkos::rand::generator_type, - ScalarA>::max()); - Kokkos::fill_random(B, rand_pool, - Kokkos::rand::generator_type, - ScalarB>::max()); - Kokkos::fill_random(C, rand_pool, - Kokkos::rand::generator_type, - ScalarC>::max()); + Kokkos::fill_random( + A, rand_pool, + Kokkos::rand::generator_type, ScalarA>::max()); + Kokkos::fill_random( + B, rand_pool, + Kokkos::rand::generator_type, ScalarB>::max()); + Kokkos::fill_random( + C, rand_pool, + Kokkos::rand::generator_type, ScalarC>::max()); 
Kokkos::deep_copy(C2, C); Kokkos::fence(); - struct gemm_VanillaGEMM - vgemm; + struct gemm_VanillaGEMM vgemm; vgemm.A_t = A_t; vgemm.B_t = B_t; vgemm.A_c = A_c; @@ -223,12 +201,10 @@ void impl_test_gemm(const char* TA, const char* TB, int M, int N, int K, vgemm.alpha = alpha; vgemm.beta = beta; - Kokkos::parallel_for( - "KokkosBlas::Test::gemm_VanillaGEMM", - Kokkos::TeamPolicy( - M, Kokkos::AUTO, - KokkosKernels::Impl::kk_get_max_vector_size()), - vgemm); + Kokkos::parallel_for("KokkosBlas::Test::gemm_VanillaGEMM", + Kokkos::TeamPolicy( + M, Kokkos::AUTO, KokkosKernels::Impl::kk_get_max_vector_size()), + vgemm); KokkosBlas::gemm(TA, TB, alpha, A, B, beta, C); @@ -238,9 +214,8 @@ void impl_test_gemm(const char* TA, const char* TB, int M, int N, int K, diffgemm.C = C; diffgemm.C2 = C2; - Kokkos::parallel_reduce("KokkosBlas::Test::DiffGEMM", - Kokkos::TeamPolicy(M, Kokkos::AUTO), - diffgemm, diff_C); + Kokkos::parallel_reduce("KokkosBlas::Test::DiffGEMM", Kokkos::TeamPolicy(M, Kokkos::AUTO), diffgemm, + diff_C); if (N != 0 && M != 0) { int K_eff = (K == 0) ? 
1 : K; @@ -258,8 +233,7 @@ void impl_test_gemm(const char* TA, const char* TB, int M, int N, int K, } template -void impl_test_stream_gemm_psge2(const int M, const int N, const int K, - const Scalar alpha, const Scalar beta) { +void impl_test_stream_gemm_psge2(const int M, const int N, const int K, const Scalar alpha, const Scalar beta) { using execution_space = typename Device::execution_space; using ViewTypeA = Kokkos::View; using ViewTypeB = Kokkos::View; @@ -279,8 +253,7 @@ void impl_test_stream_gemm_psge2(const int M, const int N, const int K, Test::build_matrices(M, N, K, alpha, A1, B1, beta, C1, C1ref); Test::build_matrices(N, M, K, alpha, A2, B2, beta, C2, C2ref); - auto instances = - Kokkos::Experimental::partition_space(execution_space(), 1, 1); + auto instances = Kokkos::Experimental::partition_space(execution_space(), 1, 1); KokkosBlas::gemm(instances[0], tA, tB, alpha, A1, B1, beta, C1); KokkosBlas::gemm(instances[1], tA, tB, alpha, A2, B2, beta, C2); Kokkos::fence(); @@ -291,12 +264,10 @@ void impl_test_stream_gemm_psge2(const int M, const int N, const int K, diffgemm1.C = C1; diffgemm1.C2 = C1ref; - Kokkos::parallel_reduce( - "KokkosBlas::Test::DiffGEMM1", - Kokkos::TeamPolicy( - M, Kokkos::AUTO, - KokkosKernels::Impl::kk_get_max_vector_size()), - diffgemm1, diff_C1); + Kokkos::parallel_reduce("KokkosBlas::Test::DiffGEMM1", + Kokkos::TeamPolicy( + M, Kokkos::AUTO, KokkosKernels::Impl::kk_get_max_vector_size()), + diffgemm1, diff_C1); mag_type diff_C2 = 0; struct Test::DiffGEMM diffgemm2; @@ -304,12 +275,10 @@ void impl_test_stream_gemm_psge2(const int M, const int N, const int K, diffgemm2.C = C2; diffgemm2.C2 = C2ref; - Kokkos::parallel_reduce( - "KokkosBlas::Test::DiffGEMM2", - Kokkos::TeamPolicy( - N, Kokkos::AUTO, - KokkosKernels::Impl::kk_get_max_vector_size()), - diffgemm2, diff_C2); + Kokkos::parallel_reduce("KokkosBlas::Test::DiffGEMM2", + Kokkos::TeamPolicy( + N, Kokkos::AUTO, KokkosKernels::Impl::kk_get_max_vector_size()), + diffgemm2, 
diff_C2); Kokkos::fence(); if (N != 0 && M != 0) { @@ -317,8 +286,7 @@ void impl_test_stream_gemm_psge2(const int M, const int N, const int K, // Expected Result: Random Walk in the least significant bit (i.e. ~ // sqrt(K)*eps eps scales with the total sum and has a factor in it for the // accuracy of the operations -> eps = K * 75 * machine_eps * 7 - const double diff_C_expected = - 1.0 * sqrt(K_eff) * K_eff * 75 * machine_eps * 7; + const double diff_C_expected = 1.0 * sqrt(K_eff) * K_eff * 75 * machine_eps * 7; const double diff_C1_average = diff_C1 / (N * M); if ((diff_C1_average >= 1.05 * diff_C_expected)) { @@ -342,55 +310,45 @@ void test_gemm() { typedef Kokkos::View view_type_b; typedef Kokkos::View view_type_c; std::vector modes = {"N", "T"}; - if (std::is_same>::value || - std::is_same>::value) + if (std::is_same>::value || std::is_same>::value) modes.push_back("C"); Scalar alpha = 4.5; std::vector betas = {0.0, 3.0}; for (Scalar beta : betas) { for (auto amode : modes) { for (auto bmode : modes) { - Test::impl_test_gemm( - amode, bmode, 0, 0, 0, alpha, beta); + Test::impl_test_gemm(amode, bmode, 0, 0, 0, alpha, beta); // BMK: N = 1 exercises the special GEMV code path in GEMM (currently, // only for modes N/N) - Test::impl_test_gemm( - amode, bmode, 50, 1, 40, alpha, beta); + Test::impl_test_gemm(amode, bmode, 50, 1, 40, alpha, beta); // LBV: K = 0 exercise the quick return code path in GEMM - Test::impl_test_gemm( - amode, bmode, 20, 14, 0, alpha, beta); - Test::impl_test_gemm( - amode, bmode, 13, 15, 17, alpha, beta); - Test::impl_test_gemm( - amode, bmode, 179, 15, 211, alpha, beta); - Test::impl_test_gemm( - amode, bmode, 12, 3071, 517, alpha, beta); + Test::impl_test_gemm(amode, bmode, 20, 14, 0, alpha, beta); + Test::impl_test_gemm(amode, bmode, 13, 15, 17, alpha, beta); + Test::impl_test_gemm(amode, bmode, 179, 15, 211, alpha, + beta); + Test::impl_test_gemm(amode, bmode, 12, 3071, 517, alpha, + beta); } } } auto pool_size = 
execution_space().concurrency(); if (pool_size >= 2) { - Test::impl_test_stream_gemm_psge2( - 53, 42, 17, 4.5, - 3.0); // General code path - Test::impl_test_stream_gemm_psge2( - 13, 1, 17, 4.5, 3.0); // gemv based gemm code path - Test::impl_test_stream_gemm_psge2( - 7, 13, 17, 4.5, - 3.0); // dot based gemm code path + Test::impl_test_stream_gemm_psge2(53, 42, 17, 4.5, + 3.0); // General code path + Test::impl_test_stream_gemm_psge2(13, 1, 17, 4.5, 3.0); // gemv based gemm code path + Test::impl_test_stream_gemm_psge2(7, 13, 17, 4.5, + 3.0); // dot based gemm code path } } template void test_gemm_enabled_layouts() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) test_gemm(); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) test_gemm(); #endif } @@ -416,8 +374,7 @@ void test_gemm_mixed_scalars() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gemm_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemm_float"); test_gemm_enabled_layouts(); @@ -426,8 +383,7 @@ TEST_F(TestCategory, gemm_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gemm_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemm_double"); test_gemm_enabled_layouts(); @@ -436,8 +392,7 @@ TEST_F(TestCategory, gemm_double) { #endif #if 
defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gemm_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemm_complex_double"); test_gemm_enabled_layouts>(); @@ -446,8 +401,7 @@ TEST_F(TestCategory, gemm_complex_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gemm_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemm_complex_float"); test_gemm_enabled_layouts>(); @@ -455,21 +409,17 @@ TEST_F(TestCategory, gemm_complex_float) { } #endif -#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) && \ - !defined(KOKKOSKERNELS_ETI_ONLY) +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) && !defined(KOKKOSKERNELS_ETI_ONLY) TEST_F(TestCategory, gemm_mixed_scalars_complex_double_double) { - Kokkos::Profiling::pushRegion( - "KokkosBlas::Test::gemm_mixed_complex_double_double"); + Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemm_mixed_complex_double_double"); test_gemm_mixed_scalars, double>(); Kokkos::Profiling::popRegion(); } #endif -#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) && \ - !defined(KOKKOSKERNELS_ETI_ONLY) +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) && !defined(KOKKOSKERNELS_ETI_ONLY) TEST_F(TestCategory, gemm_mixed_scalar_complex_float_float) { - Kokkos::Profiling::pushRegion( - "KokkosBlas::Test::gemm_mixed_complex_float_float"); + Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemm_mixed_complex_float_float"); test_gemm_mixed_scalars, float>(); Kokkos::Profiling::popRegion(); } diff --git a/blas/unit_test/Test_Blas3_trmm.hpp b/blas/unit_test/Test_Blas3_trmm.hpp index a186835aaa..d5ba622969 100644 --- 
a/blas/unit_test/Test_Blas3_trmm.hpp +++ b/blas/unit_test/Test_Blas3_trmm.hpp @@ -44,8 +44,7 @@ struct NonUnitDiagTRMM { void operator()(const int& i) const { A_(i, i) = A_(i, i) + 10; } }; -template +template struct trmm_VanillaGEMM { bool A_t, B_t, A_c, B_c; int N, K; @@ -62,12 +61,9 @@ struct trmm_VanillaGEMM { ScalarC beta; KOKKOS_INLINE_FUNCTION - void operator()( - const typename Kokkos::TeamPolicy::member_type& team) - const { + void operator()(const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && \ - !defined(__HIP_DEVICE_COMPILE__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int i = team.league_rank(); #else const int i = team.league_rank(); @@ -98,8 +94,8 @@ struct trmm_VanillaGEMM { }; template -void impl_test_trmm(const char* side, const char* uplo, const char* trans, - const char* diag, int M, int N, Scalar alpha) { +void impl_test_trmm(const char* side, const char* uplo, const char* trans, const char* diag, int M, int N, + Scalar alpha) { using execution_space = typename ViewTypeA::device_type::execution_space; using ScalarA = typename ViewTypeA::value_type; using APT = Kokkos::ArithTraits; @@ -112,45 +108,35 @@ void impl_test_trmm(const char* side, const char* uplo, const char* trans, ViewTypeA A("A", K, K); ViewTypeB B("B", M, N); ViewTypeB B_expected("B_expected", M, N); - uint64_t seed = - std::chrono::high_resolution_clock::now().time_since_epoch().count(); - ScalarA beta = ScalarA(0); + uint64_t seed = std::chrono::high_resolution_clock::now().time_since_epoch().count(); + ScalarA beta = ScalarA(0); // printf("KokkosBlas::trmm test for alpha %g, %c %c %c %c, M %d, N %d, eps // %g, ViewType: %s\n", // Kokkos::ArithTraits::real(alpha),side[0],uplo[0],trans[0],diag[0],M,N,eps,typeid(ViewTypeA).name()); - typename ViewTypeA::HostMirror host_A = Kokkos::create_mirror_view(A); - typename 
ViewTypeB::HostMirror host_B_actual = Kokkos::create_mirror_view(B); - typename ViewTypeB::HostMirror host_B_expected = - Kokkos::create_mirror_view(B_expected); + typename ViewTypeA::HostMirror host_A = Kokkos::create_mirror_view(A); + typename ViewTypeB::HostMirror host_B_actual = Kokkos::create_mirror_view(B); + typename ViewTypeB::HostMirror host_B_expected = Kokkos::create_mirror_view(B_expected); Kokkos::Random_XorShift64_Pool rand_pool(seed); if ((diag[0] == 'U') || (diag[0] == 'u')) { // Initialize A with deterministic random numbers - Kokkos::fill_random(A, rand_pool, - Kokkos::rand, - ScalarA>::max()); + Kokkos::fill_random(A, rand_pool, Kokkos::rand, ScalarA>::max()); using functor_type = UnitDiagTRMM; functor_type udtrmm(A); // Initialize As diag with 1s - Kokkos::parallel_for("KokkosBlas::Test::UnitDiagTRMM", - Kokkos::RangePolicy(0, K), udtrmm); + Kokkos::parallel_for("KokkosBlas::Test::UnitDiagTRMM", Kokkos::RangePolicy(0, K), udtrmm); } else { //(diag[0]=='N')||(diag[0]=='n') // Initialize A with random numbers - Kokkos::fill_random(A, rand_pool, - Kokkos::rand, - ScalarA>::max()); + Kokkos::fill_random(A, rand_pool, Kokkos::rand, ScalarA>::max()); using functor_type = NonUnitDiagTRMM; functor_type nudtrmm(A); // Initialize As diag with A(i,i)+10 - Kokkos::parallel_for("KokkosBlas::Test::NonUnitDiagTRMM", - Kokkos::RangePolicy(0, K), nudtrmm); + Kokkos::parallel_for("KokkosBlas::Test::NonUnitDiagTRMM", Kokkos::RangePolicy(0, K), nudtrmm); } - Kokkos::fill_random( - B, rand_pool, - Kokkos::rand, ScalarA>::max()); + Kokkos::fill_random(B, rand_pool, Kokkos::rand, ScalarA>::max()); Kokkos::deep_copy(host_A, A); // Make host_A a lower triangle @@ -164,8 +150,7 @@ void impl_test_trmm(const char* side, const char* uplo, const char* trans, } Kokkos::deep_copy(A, host_A); - struct trmm_VanillaGEMM - vgemm; + struct trmm_VanillaGEMM vgemm; if (A_l) { // B_expected = alpha * op(A) * B + beta * C = 1 * op(A) * B + 0 * C vgemm.A_t = (trans[0] != 'N') && 
(trans[0] != 'n'); @@ -188,12 +173,10 @@ void impl_test_trmm(const char* side, const char* uplo, const char* trans, vgemm.C = B_expected; // out vgemm.alpha = alpha; vgemm.beta = beta; - Kokkos::parallel_for( - "KokkosBlas::Test::trmm_VanillaGEMM", - Kokkos::TeamPolicy( - M, Kokkos::AUTO, - KokkosKernels::Impl::kk_get_max_vector_size()), - vgemm); + Kokkos::parallel_for("KokkosBlas::Test::trmm_VanillaGEMM", + Kokkos::TeamPolicy( + M, Kokkos::AUTO, KokkosKernels::Impl::kk_get_max_vector_size()), + vgemm); Kokkos::fence(); Kokkos::deep_copy(host_B_expected, B_expected); @@ -221,41 +204,38 @@ void impl_test_trmm(const char* side, const char* uplo, const char* trans, template int test_trmm(const char* mode, ScalarA alpha) { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) using view_type_a_ll = Kokkos::View; using view_type_b_ll = Kokkos::View; - Test::impl_test_trmm( - &mode[0], &mode[1], &mode[2], &mode[3], 0, 0, alpha); - Test::impl_test_trmm( - &mode[0], &mode[1], &mode[2], &mode[3], 101, 19, alpha); - Test::impl_test_trmm( - &mode[0], &mode[1], &mode[2], &mode[3], 19, 101, alpha); - Test::impl_test_trmm( - &mode[0], &mode[1], &mode[2], &mode[3], 12, 731, alpha); + Test::impl_test_trmm(&mode[0], &mode[1], &mode[2], &mode[3], 0, 0, + alpha); + Test::impl_test_trmm(&mode[0], &mode[1], &mode[2], &mode[3], 101, 19, + alpha); + Test::impl_test_trmm(&mode[0], &mode[1], &mode[2], &mode[3], 19, 101, + alpha); + Test::impl_test_trmm(&mode[0], &mode[1], &mode[2], &mode[3], 12, 731, + alpha); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) using view_type_a_lr = Kokkos::View; using view_type_b_lr = Kokkos::View; - 
Test::impl_test_trmm( - &mode[0], &mode[1], &mode[2], &mode[3], 0, 0, alpha); - Test::impl_test_trmm( - &mode[0], &mode[1], &mode[2], &mode[3], 101, 19, alpha); - Test::impl_test_trmm( - &mode[0], &mode[1], &mode[2], &mode[3], 19, 101, alpha); - Test::impl_test_trmm( - &mode[0], &mode[1], &mode[2], &mode[3], 12, 731, alpha); + Test::impl_test_trmm(&mode[0], &mode[1], &mode[2], &mode[3], 0, 0, + alpha); + Test::impl_test_trmm(&mode[0], &mode[1], &mode[2], &mode[3], 101, 19, + alpha); + Test::impl_test_trmm(&mode[0], &mode[1], &mode[2], &mode[3], 19, 101, + alpha); + Test::impl_test_trmm(&mode[0], &mode[1], &mode[2], &mode[3], 12, 731, + alpha); #endif return 1; } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trmm_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_float"); float alpha = 1.0f; @@ -300,8 +280,7 @@ TEST_F(TestCategory, trmm_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trmm_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_double"); double alpha = 1.0; @@ -346,399 +325,333 @@ TEST_F(TestCategory, trmm_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) ///////////////// alpha 1.0 ///////////////// TEST_F(TestCategory, trmm_complex_double_LLNN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLNN"); - test_trmm, Kokkos::complex, TestDevice>( - "LLNN", 1.0); + test_trmm, Kokkos::complex, TestDevice>("LLNN", 1.0); 
Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LLNU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLNU"); - test_trmm, Kokkos::complex, TestDevice>( - "LLNU", 1.0); + test_trmm, Kokkos::complex, TestDevice>("LLNU", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LLCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLCN"); - test_trmm, Kokkos::complex, TestDevice>( - "LLCN", 1.0); + test_trmm, Kokkos::complex, TestDevice>("LLCN", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LLCU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLCU"); - test_trmm, Kokkos::complex, TestDevice>( - "LLCU", 1.0); + test_trmm, Kokkos::complex, TestDevice>("LLCU", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LUNN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUNN"); - test_trmm, Kokkos::complex, TestDevice>( - "LUNN", 1.0); + test_trmm, Kokkos::complex, TestDevice>("LUNN", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LUNU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUNU"); - test_trmm, Kokkos::complex, TestDevice>( - "LUNU", 1.0); + test_trmm, Kokkos::complex, TestDevice>("LUNU", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LUCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUCN"); - test_trmm, Kokkos::complex, TestDevice>( - "LUCN", 1.0); + test_trmm, Kokkos::complex, TestDevice>("LUCN", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LUCU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUCU"); - test_trmm, Kokkos::complex, TestDevice>( - "LUCU", 1.0); + test_trmm, Kokkos::complex, TestDevice>("LUCU", 1.0); Kokkos::Profiling::popRegion(); } 
TEST_F(TestCategory, trmm_complex_double_RLNN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLNN"); - test_trmm, Kokkos::complex, TestDevice>( - "RLNN", 1.0); + test_trmm, Kokkos::complex, TestDevice>("RLNN", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RLNU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLNU"); - test_trmm, Kokkos::complex, TestDevice>( - "RLNU", 1.0); + test_trmm, Kokkos::complex, TestDevice>("RLNU", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RLCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLCN"); - test_trmm, Kokkos::complex, TestDevice>( - "RLCN", 1.0); + test_trmm, Kokkos::complex, TestDevice>("RLCN", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RLCU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLCU"); - test_trmm, Kokkos::complex, TestDevice>( - "RLCU", 1.0); + test_trmm, Kokkos::complex, TestDevice>("RLCU", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RUNN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUNN"); - test_trmm, Kokkos::complex, TestDevice>( - "RUNN", 1.0); + test_trmm, Kokkos::complex, TestDevice>("RUNN", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RUNU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUNU"); - test_trmm, Kokkos::complex, TestDevice>( - "RUNU", 1.0); + test_trmm, Kokkos::complex, TestDevice>("RUNU", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RUCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUCN"); - test_trmm, Kokkos::complex, TestDevice>( - "RUCN", 1.0); + test_trmm, Kokkos::complex, TestDevice>("RUCN", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, 
trmm_complex_double_RUCU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUCU"); - test_trmm, Kokkos::complex, TestDevice>( - "RUCU", 1.0); + test_trmm, Kokkos::complex, TestDevice>("RUCU", 1.0); Kokkos::Profiling::popRegion(); } ///////////////// alpha 4.5 ///////////////// TEST_F(TestCategory, trmm_complex_double_LLNN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLNN"); - test_trmm, Kokkos::complex, TestDevice>( - "LLNN", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("LLNN", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LLNU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLNU"); - test_trmm, Kokkos::complex, TestDevice>( - "LLNU", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("LLNU", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LLCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLCN"); - test_trmm, Kokkos::complex, TestDevice>( - "LLCN", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("LLCN", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LLCU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLCU"); - test_trmm, Kokkos::complex, TestDevice>( - "LLCU", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("LLCU", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LUNN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUNN"); - test_trmm, Kokkos::complex, TestDevice>( - "LUNN", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("LUNN", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LUNU_fourfive) 
{ Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUNU"); - test_trmm, Kokkos::complex, TestDevice>( - "LUNU", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("LUNU", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LUCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUCN"); - test_trmm, Kokkos::complex, TestDevice>( - "LUCN", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("LUCN", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LUCU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUCU"); - test_trmm, Kokkos::complex, TestDevice>( - "LUCU", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("LUCU", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RLNN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLNN"); - test_trmm, Kokkos::complex, TestDevice>( - "RLNN", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("RLNN", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RLNU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLNU"); - test_trmm, Kokkos::complex, TestDevice>( - "RLNU", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("RLNU", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RLCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLCN"); - test_trmm, Kokkos::complex, TestDevice>( - "RLCN", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("RLCN", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RLCU_fourfive) { 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLCU"); - test_trmm, Kokkos::complex, TestDevice>( - "RLCU", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("RLCU", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RUNN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUNN"); - test_trmm, Kokkos::complex, TestDevice>( - "RUNN", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("RUNN", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RUNU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUNU"); - test_trmm, Kokkos::complex, TestDevice>( - "RUNU", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("RUNU", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RUCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUCN"); - test_trmm, Kokkos::complex, TestDevice>( - "RUCN", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("RUCN", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RUCU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUCU"); - test_trmm, Kokkos::complex, TestDevice>( - "RUCU", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("RUCU", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) ///////////////// alpha 1.0 ///////////////// TEST_F(TestCategory, trmm_complex_float_LLNN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLNN"); - 
test_trmm, Kokkos::complex, TestDevice>("LLNN", - 1.0f); + test_trmm, Kokkos::complex, TestDevice>("LLNN", 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LLNU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLNU"); - test_trmm, Kokkos::complex, TestDevice>("LLNU", - 1.0f); + test_trmm, Kokkos::complex, TestDevice>("LLNU", 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LLCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLCN"); - test_trmm, Kokkos::complex, TestDevice>("LLCN", - 1.0f); + test_trmm, Kokkos::complex, TestDevice>("LLCN", 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LLCU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLCU"); - test_trmm, Kokkos::complex, TestDevice>("LLCU", - 1.0f); + test_trmm, Kokkos::complex, TestDevice>("LLCU", 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUNN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUNN"); - test_trmm, Kokkos::complex, TestDevice>("LUNN", - 1.0f); + test_trmm, Kokkos::complex, TestDevice>("LUNN", 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUNU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUNU"); - test_trmm, Kokkos::complex, TestDevice>("LUNU", - 1.0f); + test_trmm, Kokkos::complex, TestDevice>("LUNU", 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUCN"); - test_trmm, Kokkos::complex, TestDevice>("LUCN", - 1.0f); + test_trmm, Kokkos::complex, TestDevice>("LUCN", 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUCU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUCU"); - test_trmm, Kokkos::complex, TestDevice>("LUCU", - 
1.0f); + test_trmm, Kokkos::complex, TestDevice>("LUCU", 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLNN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLNN"); - test_trmm, Kokkos::complex, TestDevice>("RLNN", - 1.0f); + test_trmm, Kokkos::complex, TestDevice>("RLNN", 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLNU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLNU"); - test_trmm, Kokkos::complex, TestDevice>("RLNU", - 1.0f); + test_trmm, Kokkos::complex, TestDevice>("RLNU", 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLCN"); - test_trmm, Kokkos::complex, TestDevice>("RLCN", - 1.0f); + test_trmm, Kokkos::complex, TestDevice>("RLCN", 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLCU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLCU"); - test_trmm, Kokkos::complex, TestDevice>("RLCU", - 1.0f); + test_trmm, Kokkos::complex, TestDevice>("RLCU", 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUNN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUNN"); - test_trmm, Kokkos::complex, TestDevice>("RUNN", - 1.0f); + test_trmm, Kokkos::complex, TestDevice>("RUNN", 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUNU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUNU"); - test_trmm, Kokkos::complex, TestDevice>("RUNU", - 1.0f); + test_trmm, Kokkos::complex, TestDevice>("RUNU", 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUCN"); - test_trmm, Kokkos::complex, TestDevice>("RUCN", - 1.0f); + test_trmm, Kokkos::complex, 
TestDevice>("RUCN", 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUCU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUCU"); - test_trmm, Kokkos::complex, TestDevice>("RUCU", - 1.0f); + test_trmm, Kokkos::complex, TestDevice>("RUCU", 1.0f); Kokkos::Profiling::popRegion(); } ///////////////// alpha 4.5 ///////////////// TEST_F(TestCategory, trmm_complex_float_LLNN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLNN"); - test_trmm, Kokkos::complex, TestDevice>( - "LLNN", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("LLNN", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LLNU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLNU"); - test_trmm, Kokkos::complex, TestDevice>( - "LLNU", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("LLNU", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LLCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLCN"); - test_trmm, Kokkos::complex, TestDevice>( - "LLCN", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("LLCN", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LLCU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLCU"); - test_trmm, Kokkos::complex, TestDevice>( - "LLCU", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("LLCU", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUNN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUNN"); - test_trmm, Kokkos::complex, TestDevice>( - "LUNN", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("LUNN", Kokkos::complex(4.5f, 
0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUNU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUNU"); - test_trmm, Kokkos::complex, TestDevice>( - "LUNU", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("LUNU", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUCN"); - test_trmm, Kokkos::complex, TestDevice>( - "LUCN", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("LUCN", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUCU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUCU"); - test_trmm, Kokkos::complex, TestDevice>( - "LUCU", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("LUCU", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLNN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLNN"); - test_trmm, Kokkos::complex, TestDevice>( - "RLNN", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("RLNN", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLNU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLNU"); - test_trmm, Kokkos::complex, TestDevice>( - "RLNU", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("RLNU", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLCN"); - test_trmm, Kokkos::complex, TestDevice>( - "RLCN", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("RLCN", Kokkos::complex(4.5f, 0.0f)); 
Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLCU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLCU"); - test_trmm, Kokkos::complex, TestDevice>( - "RLCU", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("RLCU", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUNN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUNN"); - test_trmm, Kokkos::complex, TestDevice>( - "RUNN", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("RUNN", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUNU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUNU"); - test_trmm, Kokkos::complex, TestDevice>( - "RUNU", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("RUNU", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUCN"); - test_trmm, Kokkos::complex, TestDevice>( - "RUCN", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("RUCN", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUCU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUCU"); - test_trmm, Kokkos::complex, TestDevice>( - "RUCU", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("RUCU", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas3_trsm.hpp b/blas/unit_test/Test_Blas3_trsm.hpp index 9a00f22263..81fdad8929 100644 --- a/blas/unit_test/Test_Blas3_trsm.hpp +++ b/blas/unit_test/Test_Blas3_trsm.hpp @@ -44,8 +44,7 @@ struct NonUnitDiagTRSM { void operator()(const int& i) const { A_(i, i) = A_(i, i) + 
10; } }; -template +template struct trsm_VanillaGEMM { bool A_t, B_t, A_c, B_c; int N, K; @@ -62,12 +61,9 @@ struct trsm_VanillaGEMM { ScalarC beta; KOKKOS_INLINE_FUNCTION - void operator()( - const typename Kokkos::TeamPolicy::member_type& team) - const { + void operator()(const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && \ - !defined(__HIP_DEVICE_COMPILE__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int i = team.league_rank(); #else @@ -99,8 +95,7 @@ struct trsm_VanillaGEMM { }; template -void impl_test_trsm(const char* side, const char* uplo, const char* trans, - const char* diag, int M, int N, +void impl_test_trsm(const char* side, const char* uplo, const char* trans, const char* diag, int M, int N, typename ViewTypeA::value_type alpha) { using execution_space = typename ViewTypeA::device_type::execution_space; using ScalarA = typename ViewTypeA::value_type; @@ -123,31 +118,21 @@ void impl_test_trsm(const char* side, const char* uplo, const char* trans, typename ViewTypeB::HostMirror h_B = Kokkos::create_mirror_view(B); typename ViewTypeB::HostMirror h_X0 = Kokkos::create_mirror_view(X0); - uint64_t seed = - std::chrono::high_resolution_clock::now().time_since_epoch().count(); + uint64_t seed = std::chrono::high_resolution_clock::now().time_since_epoch().count(); Kokkos::Random_XorShift64_Pool rand_pool(seed); if ((diag[0] == 'U') || (diag[0] == 'u')) { - Kokkos::fill_random(A, rand_pool, - Kokkos::rand, - ScalarA>::max() * - 0.1); + Kokkos::fill_random(A, rand_pool, Kokkos::rand, ScalarA>::max() * 0.1); using functor_type = UnitDiagTRSM; functor_type udtrsm(A); - Kokkos::parallel_for("KokkosBlas::Test::UnitDiagTRSM", - Kokkos::RangePolicy(0, K), udtrsm); + Kokkos::parallel_for("KokkosBlas::Test::UnitDiagTRSM", Kokkos::RangePolicy(0, K), udtrsm); } else { //(diag[0]=='N')||(diag[0]=='n') - 
Kokkos::fill_random(A, rand_pool, - Kokkos::rand, - ScalarA>::max()); + Kokkos::fill_random(A, rand_pool, Kokkos::rand, ScalarA>::max()); using functor_type = NonUnitDiagTRSM; functor_type nudtrsm(A); - Kokkos::parallel_for("KokkosBlas::Test::NonUnitDiagTRSM", - Kokkos::RangePolicy(0, K), nudtrsm); + Kokkos::parallel_for("KokkosBlas::Test::NonUnitDiagTRSM", Kokkos::RangePolicy(0, K), nudtrsm); } - Kokkos::fill_random( - X0, rand_pool, - Kokkos::rand, ScalarA>::max()); + Kokkos::fill_random(X0, rand_pool, Kokkos::rand, ScalarA>::max()); Kokkos::deep_copy(h_A, A); Kokkos::deep_copy(h_X0, X0); @@ -165,8 +150,7 @@ void impl_test_trsm(const char* side, const char* uplo, const char* trans, Kokkos::deep_copy(A, h_A); - struct trsm_VanillaGEMM - vgemm; + struct trsm_VanillaGEMM vgemm; if (A_l) { vgemm.A_t = (trans[0] != 'N') && (trans[0] != 'n'); vgemm.B_t = false; @@ -187,12 +171,10 @@ void impl_test_trsm(const char* side, const char* uplo, const char* trans, vgemm.C = B; vgemm.alpha = alpha_trmm; vgemm.beta = beta; - Kokkos::parallel_for( - "KokkosBlas::Test::trsm_VanillaGEMM", - Kokkos::TeamPolicy( - M, Kokkos::AUTO, - KokkosKernels::Impl::kk_get_max_vector_size()), - vgemm); + Kokkos::parallel_for("KokkosBlas::Test::trsm_VanillaGEMM", + Kokkos::TeamPolicy( + M, Kokkos::AUTO, KokkosKernels::Impl::kk_get_max_vector_size()), + vgemm); Kokkos::fence(); KokkosBlas::trsm(side, uplo, trans, diag, alpha, A, B); @@ -223,41 +205,30 @@ void impl_test_trsm(const char* side, const char* uplo, const char* trans, template int test_trsm(const char* mode, ScalarA alpha) { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) using view_type_a_ll = Kokkos::View; using view_type_b_ll = Kokkos::View; - Test::impl_test_trsm( - &mode[0], &mode[1], &mode[2], &mode[3], 0, 0, alpha); - Test::impl_test_trsm( - &mode[0], &mode[1], 
&mode[2], &mode[3], 101, 19, alpha); - Test::impl_test_trsm( - &mode[0], &mode[1], &mode[2], &mode[3], 19, 101, alpha); - Test::impl_test_trsm( - &mode[0], &mode[1], &mode[2], &mode[3], 343, 201, alpha); + Test::impl_test_trsm(&mode[0], &mode[1], &mode[2], &mode[3], 0, 0, alpha); + Test::impl_test_trsm(&mode[0], &mode[1], &mode[2], &mode[3], 101, 19, alpha); + Test::impl_test_trsm(&mode[0], &mode[1], &mode[2], &mode[3], 19, 101, alpha); + Test::impl_test_trsm(&mode[0], &mode[1], &mode[2], &mode[3], 343, 201, alpha); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) using view_type_a_lr = Kokkos::View; using view_type_b_lr = Kokkos::View; - Test::impl_test_trsm( - &mode[0], &mode[1], &mode[2], &mode[3], 0, 0, alpha); - Test::impl_test_trsm( - &mode[0], &mode[1], &mode[2], &mode[3], 101, 19, alpha); - Test::impl_test_trsm( - &mode[0], &mode[1], &mode[2], &mode[3], 19, 101, alpha); - Test::impl_test_trsm( - &mode[0], &mode[1], &mode[2], &mode[3], 343, 201, alpha); + Test::impl_test_trsm(&mode[0], &mode[1], &mode[2], &mode[3], 0, 0, alpha); + Test::impl_test_trsm(&mode[0], &mode[1], &mode[2], &mode[3], 101, 19, alpha); + Test::impl_test_trsm(&mode[0], &mode[1], &mode[2], &mode[3], 19, 101, alpha); + Test::impl_test_trsm(&mode[0], &mode[1], &mode[2], &mode[3], 343, 201, alpha); #endif return 1; } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trsm_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trsm_float"); float alpha = 1.0f; @@ -302,8 +273,7 @@ TEST_F(TestCategory, trsm_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trsm_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trsm_double"); double alpha = 1.0; @@ -348,157 +318,91 @@ TEST_F(TestCategory, trsm_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trsm_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trsm_complex_double"); Kokkos::complex alpha = 1.0; - test_trsm, Kokkos::complex, TestDevice>( - "LLNN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "LLNU", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "LLCN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "LLCU", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "LUNN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "LUNU", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "LUCN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "LUCU", alpha); - - test_trsm, Kokkos::complex, TestDevice>( - "RLNN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "RLNU", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "RLCN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "RLCU", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "RUNN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "RUNU", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "RUCN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "RUCU", alpha); + test_trsm, Kokkos::complex, TestDevice>("LLNN", alpha); + test_trsm, Kokkos::complex, TestDevice>("LLNU", alpha); + test_trsm, Kokkos::complex, TestDevice>("LLCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("LLCU", alpha); + test_trsm, Kokkos::complex, TestDevice>("LUNN", alpha); + test_trsm, Kokkos::complex, 
TestDevice>("LUNU", alpha); + test_trsm, Kokkos::complex, TestDevice>("LUCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("LUCU", alpha); + + test_trsm, Kokkos::complex, TestDevice>("RLNN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RLNU", alpha); + test_trsm, Kokkos::complex, TestDevice>("RLCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RLCU", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUNN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUNU", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUCU", alpha); alpha = Kokkos::complex(4.5, 0.0); - test_trsm, Kokkos::complex, TestDevice>( - "LLNN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "LLNU", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "LLCN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "LLCU", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "LUNN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "LUNU", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "LUCN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "LUCU", alpha); - - test_trsm, Kokkos::complex, TestDevice>( - "RLNN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "RLNU", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "RLCN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "RLCU", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "RUNN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "RUNU", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "RUCN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "RUCU", alpha); + test_trsm, Kokkos::complex, TestDevice>("LLNN", alpha); + test_trsm, Kokkos::complex, TestDevice>("LLNU", alpha); + test_trsm, Kokkos::complex, TestDevice>("LLCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("LLCU", alpha); + test_trsm, Kokkos::complex, TestDevice>("LUNN", alpha); + test_trsm, Kokkos::complex, TestDevice>("LUNU", alpha); + test_trsm, 
Kokkos::complex, TestDevice>("LUCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("LUCU", alpha); + + test_trsm, Kokkos::complex, TestDevice>("RLNN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RLNU", alpha); + test_trsm, Kokkos::complex, TestDevice>("RLCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RLCU", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUNN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUNU", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUCU", alpha); Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trsm_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trsm_complex_float"); Kokkos::complex alpha = 1.0f; - test_trsm, Kokkos::complex, TestDevice>("LLNN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("LLNU", - alpha); - test_trsm, Kokkos::complex, TestDevice>("LLCN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("LLCU", - alpha); - test_trsm, Kokkos::complex, TestDevice>("LUNN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("LUNU", - alpha); - test_trsm, Kokkos::complex, TestDevice>("LUCN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("LUCU", - alpha); - - test_trsm, Kokkos::complex, TestDevice>("RLNN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("RLNU", - alpha); - test_trsm, Kokkos::complex, TestDevice>("RLCN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("RLCU", - alpha); - test_trsm, Kokkos::complex, TestDevice>("RUNN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("RUNU", - alpha); - test_trsm, Kokkos::complex, TestDevice>("RUCN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("RUCU", - alpha); + test_trsm, Kokkos::complex, TestDevice>("LLNN", alpha); 
+ test_trsm, Kokkos::complex, TestDevice>("LLNU", alpha); + test_trsm, Kokkos::complex, TestDevice>("LLCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("LLCU", alpha); + test_trsm, Kokkos::complex, TestDevice>("LUNN", alpha); + test_trsm, Kokkos::complex, TestDevice>("LUNU", alpha); + test_trsm, Kokkos::complex, TestDevice>("LUCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("LUCU", alpha); + + test_trsm, Kokkos::complex, TestDevice>("RLNN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RLNU", alpha); + test_trsm, Kokkos::complex, TestDevice>("RLCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RLCU", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUNN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUNU", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUCU", alpha); alpha = Kokkos::complex(4.5f, 0.0f); - test_trsm, Kokkos::complex, TestDevice>("LLNN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("LLNU", - alpha); - test_trsm, Kokkos::complex, TestDevice>("LLCN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("LLCU", - alpha); - test_trsm, Kokkos::complex, TestDevice>("LUNN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("LUNU", - alpha); - test_trsm, Kokkos::complex, TestDevice>("LUCN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("LUCU", - alpha); - - test_trsm, Kokkos::complex, TestDevice>("RLNN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("RLNU", - alpha); - test_trsm, Kokkos::complex, TestDevice>("RLCN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("RLCU", - alpha); - test_trsm, Kokkos::complex, TestDevice>("RUNN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("RUNU", - alpha); - test_trsm, Kokkos::complex, TestDevice>("RUCN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("RUCU", - alpha); + test_trsm, Kokkos::complex, TestDevice>("LLNN", alpha); + test_trsm, Kokkos::complex, TestDevice>("LLNU", alpha); + test_trsm, 
Kokkos::complex, TestDevice>("LLCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("LLCU", alpha); + test_trsm, Kokkos::complex, TestDevice>("LUNN", alpha); + test_trsm, Kokkos::complex, TestDevice>("LUNU", alpha); + test_trsm, Kokkos::complex, TestDevice>("LUCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("LUCU", alpha); + + test_trsm, Kokkos::complex, TestDevice>("RLNN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RLNU", alpha); + test_trsm, Kokkos::complex, TestDevice>("RLCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RLCU", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUNN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUNU", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUCU", alpha); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas_Newton.hpp b/blas/unit_test/Test_Blas_Newton.hpp index 5bb6946e99..7b6d4a9049 100644 --- a/blas/unit_test/Test_Blas_Newton.hpp +++ b/blas/unit_test/Test_Blas_Newton.hpp @@ -40,16 +40,13 @@ struct LogisticEquation { scalar_type dt; vec_type state; - LogisticEquation(const scalar_type dt_, vec_type initial_state) - : dt(dt_), state(initial_state) {} + LogisticEquation(const scalar_type dt_, vec_type initial_state) : dt(dt_), state(initial_state) {} KOKKOS_FUNCTION void residual(const vec_type& y, const vec_type& dydt) const { dydt(0) = y(0) - state(0) - dt * y(0) * (1 - y(0)); } - KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { - jac(0, 0) = 1 - dt + 2 * dt * y(0); - } + KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { jac(0, 0) = 1 - dt + 2 * dt * y(0); } KOKKOS_FUNCTION scalar_type expected_val(const scalar_type t) const { using Kokkos::exp; @@ -112,9 +109,7 @@ int test_logistic() { using norm_type = typename Kokkos::View; using handle_type = KokkosBlas::Impl::NewtonHandle; using system_type = LogisticEquation; - using newton_type = - 
KokkosBlas::Impl::NewtonFunctor; + using newton_type = KokkosBlas::Impl::NewtonFunctor; // Create the non-linear system and initialize data vec_type state("state", 1); @@ -150,9 +145,7 @@ int test_intersection() { using norm_type = typename Kokkos::View; using handle_type = KokkosBlas::Impl::NewtonHandle; using system_type = Intersection; - using newton_type = - KokkosBlas::Impl::NewtonFunctor; + using newton_type = KokkosBlas::Impl::NewtonFunctor; // Create the non-linear system and initialize data system_type intersection; diff --git a/blas/unit_test/Test_Blas_rocblas.hpp b/blas/unit_test/Test_Blas_rocblas.hpp index ed68b7a8b6..091fac7259 100644 --- a/blas/unit_test/Test_Blas_rocblas.hpp +++ b/blas/unit_test/Test_Blas_rocblas.hpp @@ -58,8 +58,7 @@ void test_rocblas_safe_call() { // fails it throws an error with the // KOKKOS_ROCBLAS_SAFE_CALL_IMPL macro void test_rocblas_singleton() { - KokkosBlas::Impl::RocBlasSingleton& s = - KokkosBlas::Impl::RocBlasSingleton::singleton(); + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); (void)s; } diff --git a/blas/unit_test/Test_Blas_serial_axpy.hpp b/blas/unit_test/Test_Blas_serial_axpy.hpp index 427925a3dc..cd58eba920 100644 --- a/blas/unit_test/Test_Blas_serial_axpy.hpp +++ b/blas/unit_test/Test_Blas_serial_axpy.hpp @@ -29,8 +29,7 @@ namespace Test { struct KokkosKernelAxpyTag {}; struct NaiveAxpyTag {}; -template +template struct Functor_TestBlasSerialAxpy { using execution_space = typename DeviceType::execution_space; ScalarType _alpha; @@ -38,8 +37,7 @@ struct Functor_TestBlasSerialAxpy { ViewType _y; KOKKOS_INLINE_FUNCTION - Functor_TestBlasSerialAxpy(const ScalarType alpha, const ViewType &x, - const ViewType &y) + Functor_TestBlasSerialAxpy(const ScalarType alpha, const ViewType &x, const ViewType &y) : _alpha(alpha), _x(x), _y(y) {} KOKKOS_INLINE_FUNCTION @@ -62,15 +60,11 @@ struct Functor_TestBlasSerialAxpy { using value_type = typename ViewType::value_type; std::string 
name_region("KokkosBlas::Test::SerialAxpy"); const std::string name_value_type = Test::value_type_name(); - std::string name_work_tag = - (std::is_same::value - ? "::KokkosBlas" - : std::is_same::value - ? "::Naive" - : "::UnknownWorkTag"); - std::string name_test_id = "Axpy"; - std::string name = - name_region + name_value_type + name_work_tag + name_test_id; + std::string name_work_tag = (std::is_same::value ? "::KokkosBlas" + : std::is_same::value ? "::Naive" + : "::UnknownWorkTag"); + std::string name_test_id = "Axpy"; + std::string name = name_region + name_value_type + name_work_tag + name_test_id; Kokkos::Profiling::pushRegion(name.c_str()); Kokkos::RangePolicy policy(0, _x.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); @@ -91,20 +85,15 @@ void impl_test_blas_serial_axpy(const int N, const int BlkSize) { ViewType Y("Y", N, BlkSize, BlkSize); ViewType Yref("Yref", N, BlkSize, BlkSize); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(X, random, ats::one()); Kokkos::fill_random(Y, random, ats::one()); Kokkos::fence(); Kokkos::deep_copy(Yref, Y); /// test body - Functor_TestBlasSerialAxpy( - alpha, X, Yref) - .run(); - Functor_TestBlasSerialAxpy(alpha, X, Y) - .run(); + Functor_TestBlasSerialAxpy(alpha, X, Yref).run(); + Functor_TestBlasSerialAxpy(alpha, X, Y).run(); Kokkos::fence(); @@ -116,12 +105,10 @@ void impl_test_blas_serial_axpy(const int N, const int BlkSize) { Kokkos::deep_copy(Yref_host, Yref); /// check a = b - typename ats::mag_type eps = - 100 * std::numeric_limits::epsilon(); + typename ats::mag_type eps = 100 * std::numeric_limits::epsilon(); for (int k = 0; k < N; ++k) for (int i = 0; i < BlkSize; ++i) - for (int j = 0; j < BlkSize; ++j) - EXPECT_NEAR_KK(Y_host(k, i, j), Yref_host(k, i, j), eps); + for (int j = 0; j < BlkSize; ++j) EXPECT_NEAR_KK(Y_host(k, i, j), Yref_host(k, i, j), eps); } } // namespace Test @@ -130,24 +117,20 @@ template int 
test_blas_serial_axpy() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - ViewType; + typedef Kokkos::View ViewType; Test::impl_test_blas_serial_axpy(0, 10); Test::impl_test_blas_serial_axpy(10, 15); Test::impl_test_blas_serial_axpy(1024, 9); - Test::impl_test_blas_serial_axpy(132231, - 3); + Test::impl_test_blas_serial_axpy(132231, 3); } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; + typedef Kokkos::View ViewType; Test::impl_test_blas_serial_axpy(0, 10); Test::impl_test_blas_serial_axpy(10, 15); Test::impl_test_blas_serial_axpy(1024, 9); - Test::impl_test_blas_serial_axpy(132231, - 3); + Test::impl_test_blas_serial_axpy(132231, 3); } #endif @@ -155,21 +138,16 @@ int test_blas_serial_axpy() { } #if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, serial_axpy_float_float) { - test_blas_serial_axpy(); -} +TEST_F(TestCategory, serial_axpy_float_float) { test_blas_serial_axpy(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, serial_axpy_double_double) { - test_blas_serial_axpy(); -} +TEST_F(TestCategory, serial_axpy_double_double) { test_blas_serial_axpy(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, serial_axpy_dcomplex_dcomplex) { - test_blas_serial_axpy, - Kokkos::complex >(); + test_blas_serial_axpy, Kokkos::complex >(); } TEST_F(TestCategory, serial_axpy_dcomplex_double) { @@ -179,13 +157,10 @@ TEST_F(TestCategory, serial_axpy_dcomplex_double) { #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) TEST_F(TestCategory, serial_axpy_fcomplex_fcomplex) { - test_blas_serial_axpy, - Kokkos::complex >(); + test_blas_serial_axpy, Kokkos::complex >(); } -TEST_F(TestCategory, serial_axpy_fcomplex_float) { - test_blas_serial_axpy, float>(); -} +TEST_F(TestCategory, serial_axpy_fcomplex_float) { test_blas_serial_axpy, float>(); } #endif #endif // TEST_BLAS_SERIAL_AXPY_HPP_ diff --git a/blas/unit_test/Test_Blas_serial_nrm2.hpp 
b/blas/unit_test/Test_Blas_serial_nrm2.hpp index 147df52353..bca8afa1f3 100644 --- a/blas/unit_test/Test_Blas_serial_nrm2.hpp +++ b/blas/unit_test/Test_Blas_serial_nrm2.hpp @@ -38,8 +38,7 @@ struct Functor_TestBlasSerialNrm2 { norm_view_type _nrm; KOKKOS_INLINE_FUNCTION - Functor_TestBlasSerialNrm2(const ViewType &x, const norm_view_type &nrm) - : _x(x), _nrm(nrm) {} + Functor_TestBlasSerialNrm2(const ViewType &x, const norm_view_type &nrm) : _x(x), _nrm(nrm) {} KOKKOS_INLINE_FUNCTION void operator()(const KokkosKernelTag &, const int i) const { @@ -61,14 +60,11 @@ struct Functor_TestBlasSerialNrm2 { inline void run() { std::string name_region("KokkosBlas::Test::SerialNrm2"); const std::string name_value_type = Test::value_type_name(); - std::string name_work_tag = - (std::is_same::value - ? "::KokkosBlas" - : std::is_same::value ? "::Naive" - : "::UnknownWorkTag"); - std::string name_test_id = "Nrm2"; - std::string name = - name_region + name_value_type + name_work_tag + name_test_id; + std::string name_work_tag = (std::is_same::value ? "::KokkosBlas" + : std::is_same::value ? 
"::Naive" + : "::UnknownWorkTag"); + std::string name_test_id = "Nrm2"; + std::string name = name_region + name_value_type + name_work_tag + name_test_id; Kokkos::Profiling::pushRegion(name.c_str()); Kokkos::RangePolicy policy(0, _x.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); @@ -89,8 +85,7 @@ struct Functor_TestBlasSerialNrm2MV { norm_view_type _nrm; KOKKOS_INLINE_FUNCTION - Functor_TestBlasSerialNrm2MV(const ViewType &x, const norm_view_type &nrm) - : _x(x), _nrm(nrm) {} + Functor_TestBlasSerialNrm2MV(const ViewType &x, const norm_view_type &nrm) : _x(x), _nrm(nrm) {} KOKKOS_INLINE_FUNCTION void operator()(const KokkosKernelTag &, const int i) const { @@ -116,14 +111,11 @@ struct Functor_TestBlasSerialNrm2MV { inline void run() { std::string name_region("KokkosBlas::Test::SerialNrm2MV"); const std::string name_value_type = Test::value_type_name(); - std::string name_work_tag = - (std::is_same::value - ? "::KokkosBlas" - : std::is_same::value ? "::Naive" - : "::UnknownWorkTag"); - std::string name_test_id = "Nrm2"; - std::string name = - name_region + name_value_type + name_work_tag + name_test_id; + std::string name_work_tag = (std::is_same::value ? "::KokkosBlas" + : std::is_same::value ? 
"::Naive" + : "::UnknownWorkTag"); + std::string name_test_id = "Nrm2"; + std::string name = name_region + name_value_type + name_work_tag + name_test_id; Kokkos::Profiling::pushRegion(name.c_str()); Kokkos::RangePolicy policy(0, _x.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); @@ -153,31 +145,24 @@ void impl_test_blas_serial_nrm2(const int N, const int BlkSize) { /// test body Functor_TestBlasSerialNrm2(X, norms).run(); - Functor_TestBlasSerialNrm2(X, - norms_ref) - .run(); + Functor_TestBlasSerialNrm2(X, norms_ref).run(); Kokkos::fence(); /// for comparison send it to host - typename norm_view_type::HostMirror norms_host = - Kokkos::create_mirror_view(norms); - typename norm_view_type::HostMirror norms_ref_host = - Kokkos::create_mirror_view(norms_ref); + typename norm_view_type::HostMirror norms_host = Kokkos::create_mirror_view(norms); + typename norm_view_type::HostMirror norms_ref_host = Kokkos::create_mirror_view(norms_ref); Kokkos::deep_copy(norms_host, norms); Kokkos::deep_copy(norms_ref_host, norms_ref); /// check a = b - typename ats::mag_type eps = - 100 * std::numeric_limits::epsilon(); - for (int k = 0; k < N; ++k) - EXPECT_NEAR_KK(norms_host(k), norms_ref_host(k), eps); + typename ats::mag_type eps = 100 * std::numeric_limits::epsilon(); + for (int k = 0; k < N; ++k) EXPECT_NEAR_KK(norms_host(k), norms_ref_host(k), eps); } template -void impl_test_blas_serial_nrm2mv(const int N, const int vecLength, - const int numVecs) { +void impl_test_blas_serial_nrm2mv(const int N, const int vecLength, const int numVecs) { /// typedefs using execution_space = typename DeviceType::execution_space; using value_type = typename ViewType::non_const_value_type; @@ -197,24 +182,19 @@ void impl_test_blas_serial_nrm2mv(const int N, const int vecLength, /// test body Functor_TestBlasSerialNrm2MV(X, norms).run(); - Functor_TestBlasSerialNrm2MV(X, - norms_ref) - .run(); + Functor_TestBlasSerialNrm2MV(X, norms_ref).run(); Kokkos::fence(); /// for comparison 
send it to host - typename norm_view_type::HostMirror norms_host = - Kokkos::create_mirror_view(norms); - typename norm_view_type::HostMirror norms_ref_host = - Kokkos::create_mirror_view(norms_ref); + typename norm_view_type::HostMirror norms_host = Kokkos::create_mirror_view(norms); + typename norm_view_type::HostMirror norms_ref_host = Kokkos::create_mirror_view(norms_ref); Kokkos::deep_copy(norms_host, norms); Kokkos::deep_copy(norms_ref_host, norms_ref); /// check a = b - typename ats::mag_type eps = - 100 * std::numeric_limits::epsilon(); + typename ats::mag_type eps = 100 * std::numeric_limits::epsilon(); for (int k = 0; k < N; ++k) for (int vecIdx = 0; vecIdx < numVecs; ++vecIdx) EXPECT_NEAR_KK(norms_host(k, vecIdx), norms_ref_host(k, vecIdx), eps); @@ -232,8 +212,7 @@ int test_blas_serial_nrm2() { Test::impl_test_blas_serial_nrm2(1024, 9); Test::impl_test_blas_serial_nrm2(132231, 3); - using MVViewType = - Kokkos::View; + using MVViewType = Kokkos::View; Test::impl_test_blas_serial_nrm2mv(0, 10, 5); Test::impl_test_blas_serial_nrm2mv(10, 15, 7); Test::impl_test_blas_serial_nrm2mv(1024, 9, 5); @@ -242,15 +221,13 @@ int test_blas_serial_nrm2() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - using ViewType = - Kokkos::View; + using ViewType = Kokkos::View; Test::impl_test_blas_serial_nrm2(0, 10); Test::impl_test_blas_serial_nrm2(10, 15); Test::impl_test_blas_serial_nrm2(1024, 9); Test::impl_test_blas_serial_nrm2(132231, 3); - using MVViewType = - Kokkos::View; + using MVViewType = Kokkos::View; Test::impl_test_blas_serial_nrm2mv(0, 10, 5); Test::impl_test_blas_serial_nrm2mv(10, 15, 5); Test::impl_test_blas_serial_nrm2mv(1024, 9, 5); @@ -262,27 +239,19 @@ int test_blas_serial_nrm2() { } #if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, serial_nrm2_float_float) { - test_blas_serial_nrm2(); -} +TEST_F(TestCategory, serial_nrm2_float_float) { test_blas_serial_nrm2(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, 
serial_nrm2_double_double) { - test_blas_serial_nrm2(); -} +TEST_F(TestCategory, serial_nrm2_double_double) { test_blas_serial_nrm2(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) -TEST_F(TestCategory, serial_nrm2_fcomplex_float) { - test_blas_serial_nrm2 >(); -} +TEST_F(TestCategory, serial_nrm2_fcomplex_float) { test_blas_serial_nrm2 >(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) -TEST_F(TestCategory, serial_nrm2_dcomplex_dcomplex) { - test_blas_serial_nrm2 >(); -} +TEST_F(TestCategory, serial_nrm2_dcomplex_dcomplex) { test_blas_serial_nrm2 >(); } #endif #endif // TEST_BLAS_SERIAL_NRM2_HPP_ diff --git a/common/impl/KokkosKernels_Iota.hpp b/common/impl/KokkosKernels_Iota.hpp index 04851e81c9..770a0201ef 100644 --- a/common/impl/KokkosKernels_Iota.hpp +++ b/common/impl/KokkosKernels_Iota.hpp @@ -67,8 +67,7 @@ class Iota { Constructing with size < 0 yeilds a 0-size Iota */ KOKKOS_INLINE_FUNCTION - constexpr Iota(const size_type &size, const value_type offset) - : size_(size), offset_(offset) { + constexpr Iota(const size_type &size, const value_type offset) : size_(size), offset_(offset) { if constexpr (std::is_signed_v) { if (size_ < size_type(0)) { size_ = 0; @@ -102,8 +101,7 @@ class Iota { Creating a subview outside of the base Iota yeilds undefined behavior */ template - KOKKOS_INLINE_FUNCTION constexpr Iota(const Iota &base, - const Kokkos::pair &range) + KOKKOS_INLINE_FUNCTION constexpr Iota(const Iota &base, const Kokkos::pair &range) : Iota(range.second - range.first, base.offset_ + range.first) {} /*! \brief Construct Iota subview @@ -111,9 +109,7 @@ class Iota { i >= size() or i < 0 yields undefined behavior. 
*/ KOKKOS_INLINE_FUNCTION - constexpr T operator()(size_type i) const noexcept { - return value_type(i + offset_); - }; + constexpr T operator()(size_type i) const noexcept { return value_type(i + offset_); }; /// \brief return the size of the iota KOKKOS_INLINE_FUNCTION diff --git a/common/impl/KokkosKernels_NaN.hpp b/common/impl/KokkosKernels_NaN.hpp index f319539a9f..75d6a3ac8c 100644 --- a/common/impl/KokkosKernels_NaN.hpp +++ b/common/impl/KokkosKernels_NaN.hpp @@ -26,10 +26,9 @@ namespace KokkosKernels::Impl { template KOKKOS_INLINE_FUNCTION T quiet_NaN() { if constexpr (std::is_same_v) { - return double(Kokkos::Experimental::quiet_NaN_v< - float>); // Kokkos::Experimetnal::quiet_NaN_v - // is undefined in - // device code + return double(Kokkos::Experimental::quiet_NaN_v); // Kokkos::Experimetnal::quiet_NaN_v + // is undefined in + // device code } else if constexpr (Kokkos::ArithTraits::is_complex) { using value_type = typename T::value_type; return T(quiet_NaN(), diff --git a/common/impl/KokkosKernels_SafeCompare.hpp b/common/impl/KokkosKernels_SafeCompare.hpp index 494ef45ada..1bd43c046a 100644 --- a/common/impl/KokkosKernels_SafeCompare.hpp +++ b/common/impl/KokkosKernels_SafeCompare.hpp @@ -47,8 +47,7 @@ KOKKOS_INLINE_FUNCTION constexpr bool safe_gt(const T &t, const U &u) { using KU = Kokkos::ArithTraits; // both are integer, but only one is signed - if constexpr (KT::is_integer && KU::is_integer && - (KT::is_signed != KU::is_signed)) { + if constexpr (KT::is_integer && KU::is_integer && (KT::is_signed != KU::is_signed)) { // how wide the signed type would need to be to hold T and U constexpr size_t t_width = KT::is_signed ? sizeof(T) : 2 * sizeof(T); constexpr size_t u_width = KU::is_signed ? 
sizeof(U) : 2 * sizeof(U); diff --git a/common/impl/KokkosKernels_ViewUtils.hpp b/common/impl/KokkosKernels_ViewUtils.hpp index 2ae8fb609d..4769f1744a 100644 --- a/common/impl/KokkosKernels_ViewUtils.hpp +++ b/common/impl/KokkosKernels_ViewUtils.hpp @@ -29,13 +29,11 @@ class with_unmanaged { using layout_type = typename View::array_layout; using memory_space = typename View::memory_space; - using orig_traits = typename View::memory_traits; - static constexpr unsigned new_traits = - orig_traits::impl_value | Kokkos::Unmanaged; + using orig_traits = typename View::memory_traits; + static constexpr unsigned new_traits = orig_traits::impl_value | Kokkos::Unmanaged; public: - using type = Kokkos::View >; + using type = Kokkos::View >; }; /*! \brief A type that is View with Kokkos::Unmanaged added to the memory traits diff --git a/common/src/KokkosKernels_BitUtils.hpp b/common/src/KokkosKernels_BitUtils.hpp index 5be56c388c..9dcf8a38ae 100644 --- a/common/src/KokkosKernels_BitUtils.hpp +++ b/common/src/KokkosKernels_BitUtils.hpp @@ -222,8 +222,7 @@ int least_set_bit( long long i ){ } */ -#elif defined(__INTEL_COMPILER) || defined(KOKKOS_COMPILER_IBM) || \ - defined(__GNUC__) || defined(__GNUG__) +#elif defined(__INTEL_COMPILER) || defined(KOKKOS_COMPILER_IBM) || defined(__GNUC__) || defined(__GNUG__) KOKKOS_FORCEINLINE_FUNCTION int least_set_bit(unsigned i) { return __builtin_ffs(i); } KOKKOS_FORCEINLINE_FUNCTION diff --git a/common/src/KokkosKernels_BlockHashmapAccumulator.hpp b/common/src/KokkosKernels_BlockHashmapAccumulator.hpp index 3ca160164c..2b64c38ce9 100644 --- a/common/src/KokkosKernels_BlockHashmapAccumulator.hpp +++ b/common/src/KokkosKernels_BlockHashmapAccumulator.hpp @@ -20,14 +20,13 @@ #include "KokkosKernels_BlockUtils.hpp" #include "KokkosKernels_HashmapAccumulator.hpp" -//#define HASHMAPACCUMULATOR_ASSERT_ENABLED +// #define HASHMAPACCUMULATOR_ASSERT_ENABLED namespace KokkosKernels { namespace Experimental { -template +template /** * \brief 
BlockHashmapAccumulator class * The use of this is described in the paper: @@ -89,13 +88,7 @@ struct BlockHashmapAccumulator { * Assumption: hash_begins_ are all initialized to -1. */ KOKKOS_INLINE_FUNCTION - BlockHashmapAccumulator() - : hash_begins(), - hash_nexts(), - keys(), - values(), - __max_value_size(), - __hashOpRHS(0) {} + BlockHashmapAccumulator() : hash_begins(), hash_nexts(), keys(), values(), __max_value_size(), __hashOpRHS(0) {} /** * \brief parameterized constructor BlockHashmapAccumulator @@ -113,10 +106,8 @@ struct BlockHashmapAccumulator { * Assumption: hash_begins_ are all initialized to -1. */ KOKKOS_INLINE_FUNCTION - BlockHashmapAccumulator(size_type block_dim_, const size_type max_value_size_, - const size_type hashOpRHS, size_type *hash_begins_, - size_type *hash_nexts_, key_type *keys_, - value_type *values_) + BlockHashmapAccumulator(size_type block_dim_, const size_type max_value_size_, const size_type hashOpRHS, + size_type *hash_begins_, size_type *hash_nexts_, key_type *keys_, value_type *values_) : hash_begins(hash_begins_), hash_nexts(hash_nexts_), keys(keys_), @@ -136,10 +127,9 @@ struct BlockHashmapAccumulator { // Insertion is sequential, no race condition for the insertion. // the mergeadd used in the numeric of KKMEM. 
KOKKOS_INLINE_FUNCTION - void sequential_insert_into_hash_mergeAdd_TrackHashes( - key_type key, const value_type *valueA, const value_type *valueB, - size_type *used_size_, size_type *used_hash_size, - size_type *used_hashes) { + void sequential_insert_into_hash_mergeAdd_TrackHashes(key_type key, const value_type *valueA, + const value_type *valueB, size_type *used_size_, + size_type *used_hash_size, size_type *used_hashes) { size_type hash, i, my_index; if (key == -1) return; @@ -149,8 +139,7 @@ struct BlockHashmapAccumulator { hash = __compute_hash(key, __hashOpRHS); for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) { if (keys[i] == key) { - KokkosSparse::Impl::kk_block_add_mul(block_dim, values + i * block_size, - valueA, valueB); + KokkosSparse::Impl::kk_block_add_mul(block_dim, values + i * block_size, valueA, valueB); return; } } @@ -164,8 +153,7 @@ struct BlockHashmapAccumulator { hash_begins[hash] = my_index; keys[my_index] = key; - KokkosSparse::Impl::kk_block_set_mul( - block_dim, values + my_index * block_size, valueA, valueB); + KokkosSparse::Impl::kk_block_set_mul(block_dim, values + my_index * block_size, valueA, valueB); } // Performs C[hash] += A * B (for existing entry) @@ -173,37 +161,28 @@ struct BlockHashmapAccumulator { // Insertion is sequential, no race condition for the insertion. // the mergeadd used in the numeric of KKMEM. 
KOKKOS_INLINE_FUNCTION - void sequential_insert_into_hash_simple(key_type key, const value_type *a_val, - const value_type *b_val, - size_type &used_size, - size_type *used_hashes) { - for (size_type hash = (key * HASHSCALAR) & __hashOpRHS;; - hash = (hash + 1) & __hashOpRHS) { + void sequential_insert_into_hash_simple(key_type key, const value_type *a_val, const value_type *b_val, + size_type &used_size, size_type *used_hashes) { + for (size_type hash = (key * HASHSCALAR) & __hashOpRHS;; hash = (hash + 1) & __hashOpRHS) { if (keys[hash] == -1) { used_hashes[used_size++] = hash; keys[hash] = key; - KokkosSparse::Impl::kk_block_set_mul( - block_dim, values + hash * block_size, a_val, b_val); + KokkosSparse::Impl::kk_block_set_mul(block_dim, values + hash * block_size, a_val, b_val); break; } else if (keys[hash] == key) { - KokkosSparse::Impl::kk_block_add_mul( - block_dim, values + hash * block_size, a_val, b_val); + KokkosSparse::Impl::kk_block_add_mul(block_dim, values + hash * block_size, a_val, b_val); break; } } } KOKKOS_INLINE_FUNCTION - void sequential_export_values_simple(const size_type used_size, - const size_type *used_hashes, - key_type *out_keys, - value_type *out_values, - const bool clear = true) { + void sequential_export_values_simple(const size_type used_size, const size_type *used_hashes, key_type *out_keys, + value_type *out_values, const bool clear = true) { for (size_type i = 0; i < used_size; ++i) { const auto hash = used_hashes[i]; out_keys[i] = keys[hash]; - KokkosSparse::Impl::kk_block_set(block_dim, out_values + i * block_size, - values + hash * block_size); + KokkosSparse::Impl::kk_block_set(block_dim, out_values + i * block_size, values + hash * block_size); if (clear) { keys[hash] = -1; } @@ -218,10 +197,9 @@ struct BlockHashmapAccumulator { // Insertion is simulteanous for the vector lanes of a thread. 
// used_size should be a shared pointer among the thread vectors KOKKOS_INLINE_FUNCTION - int vector_atomic_insert_into_hash_mergeAdd_TrackHashes( - const key_type key, const value_type *valA, const value_type *valB, - volatile size_type *used_size_, size_type *used_hash_size, - size_type *used_hashes) { + int vector_atomic_insert_into_hash_mergeAdd_TrackHashes(const key_type key, const value_type *valA, + const value_type *valB, volatile size_type *used_size_, + size_type *used_hash_size, size_type *used_hashes) { size_type hash, i, my_write_index, hashbeginning; if (key == -1) return __insert_success; @@ -232,8 +210,7 @@ struct BlockHashmapAccumulator { for (; i != -1; i = hash_nexts[i]) { if (keys[i] == key) { - KokkosSparse::Impl::kk_block_add_mul( - block_dim, values + i * block_size, valA, valB); + KokkosSparse::Impl::kk_block_add_mul(block_dim, values + i * block_size, valA, valB); return __insert_success; } } @@ -247,8 +224,7 @@ struct BlockHashmapAccumulator { return __insert_full; } else { keys[my_write_index] = key; - KokkosSparse::Impl::kk_block_set_mul( - block_dim, values + my_write_index * block_size, valA, valB); + KokkosSparse::Impl::kk_block_set_mul(block_dim, values + my_write_index * block_size, valA, valB); #ifdef KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS // this is an issue on VOLTA+ and up because warps do not go in SIMD @@ -276,11 +252,9 @@ struct BlockHashmapAccumulator { hash_nexts[my_write_index] = hash_begins[hash]; #endif - hashbeginning = - Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + hashbeginning = Kokkos::atomic_exchange(hash_begins + hash, my_write_index); if (hashbeginning == -1) { - used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = - hash; + used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = hash; } hash_nexts[my_write_index] = hashbeginning; return __insert_success; @@ -288,12 +262,9 @@ struct BlockHashmapAccumulator { } template - KOKKOS_INLINE_FUNCTION int - 
vector_atomic_insert_into_hash_mergeAdd_with_team_level_list_length( - const team_member_t & /* teamMember */, const int /* vector_size */, - size_type hash, const key_type key, const value_type *valA, - const value_type *valB, volatile size_type *used_size_, - const size_type max_value_size_) { + KOKKOS_INLINE_FUNCTION int vector_atomic_insert_into_hash_mergeAdd_with_team_level_list_length( + const team_member_t & /* teamMember */, const int /* vector_size */, size_type hash, const key_type key, + const value_type *valA, const value_type *valB, volatile size_type *used_size_, const size_type max_value_size_) { // Cannot compute hash here due to impl_speed use-case // hash = __compute_hash(key, __hashOpRHS); if (key == -1) return __insert_success; @@ -302,8 +273,7 @@ struct BlockHashmapAccumulator { size_type i = hash_begins[hash]; for (; i != -1; i = hash_nexts[i]) { if (keys[i] == key) { - KokkosSparse::Impl::kk_block_add_mul( - block_dim, values + i * block_size, valA, valB); + KokkosSparse::Impl::kk_block_add_mul(block_dim, values + i * block_size, valA, valB); return __insert_success; } } @@ -316,15 +286,13 @@ struct BlockHashmapAccumulator { if (used_size_[0] >= max_value_size_) { return __insert_full; } - size_type my_write_index = - Kokkos::atomic_fetch_add(used_size_, size_type(1)); + size_type my_write_index = Kokkos::atomic_fetch_add(used_size_, size_type(1)); if (my_write_index >= max_value_size_) { return __insert_full; } else { keys[my_write_index] = key; - KokkosSparse::Impl::kk_block_set_mul( - block_dim, values + my_write_index * block_size, valA, valB); + KokkosSparse::Impl::kk_block_set_mul(block_dim, values + my_write_index * block_size, valA, valB); #ifdef KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS // this is an issue on VOLTA+ and up because warps do not go in SIMD @@ -356,8 +324,7 @@ struct BlockHashmapAccumulator { // hashbeginning = hash_begins[hash] // hash_begins[hash] = my_write_index // hash_nexts[my_write_index] = hash_begins[hash] - 
size_type hashbeginning = - Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + size_type hashbeginning = Kokkos::atomic_exchange(hash_begins + hash, my_write_index); hash_nexts[my_write_index] = hashbeginning; return __insert_success; } @@ -371,15 +338,12 @@ struct BlockHashmapAccumulator { // Insertion is simulteanous for the vector lanes of a thread. // used_size should be a shared pointer among the thread vectors KOKKOS_INLINE_FUNCTION - int vector_atomic_insert_into_hash_mergeAdd(const key_type key, - const value_type *valA, - const value_type *valB, + int vector_atomic_insert_into_hash_mergeAdd(const key_type key, const value_type *valA, const value_type *valB, volatile size_type *used_size_) { if (key == -1) return __insert_success; return vector_atomic_insert_into_hash_mergeAdd_with_team_level_list_length( - nullptr, 0, __compute_hash(key, __hashOpRHS), key, valA, valB, - used_size_, __max_value_size); + nullptr, 0, __compute_hash(key, __hashOpRHS), key, valA, valB, used_size_, __max_value_size); } #if 0 @@ -592,11 +556,9 @@ struct BlockHashmapAccumulator { static constexpr int __insert_success = 0; static constexpr int __insert_full = 1; - template ::value || - std::is_same::value, - std::size_t>::type = 0> + template ::value || + std::is_same::value, + std::size_t>::type = 0> KOKKOS_INLINE_FUNCTION int __compute_hash(size_type key, size_type bitmask) { size_type hash = key & bitmask; #ifdef HASHMAPACCUMULATOR_ASSERT_ENABLED @@ -606,9 +568,8 @@ struct BlockHashmapAccumulator { return hash; } - template ::value, - std::size_t>::type = 0> + template ::value, std::size_t>::type = 0> KOKKOS_INLINE_FUNCTION int __compute_hash(size_type key, size_type divisor) { size_type hash = key % divisor; #ifdef HASHMAPACCUMULATOR_ASSERT_ENABLED diff --git a/common/src/KokkosKernels_BlockUtils.hpp b/common/src/KokkosKernels_BlockUtils.hpp index 6fd9d9b656..64309372ac 100644 --- a/common/src/KokkosKernels_BlockUtils.hpp +++ 
b/common/src/KokkosKernels_BlockUtils.hpp @@ -25,10 +25,9 @@ namespace Impl { // Initializes block: A = [val, val, val, ....] template -KOKKOS_INLINE_FUNCTION void kk_block_init( - const size_type block_dim, value_type *dst, - const value_type val = static_cast( - 0)) { // Note: replaces __host__ std::fill() not to be called from GPU +KOKKOS_INLINE_FUNCTION void kk_block_init(const size_type block_dim, value_type *dst, + const value_type val = static_cast( + 0)) { // Note: replaces __host__ std::fill() not to be called from GPU for (auto end = dst + (block_dim * block_dim); dst < end; ++dst) { *dst = val; } @@ -36,17 +35,13 @@ KOKKOS_INLINE_FUNCTION void kk_block_init( // Initializes block: A = B template -KOKKOS_INLINE_FUNCTION void kk_block_set(const size_type block_dim, - value_type *dst, - const value_type *val) { +KOKKOS_INLINE_FUNCTION void kk_block_set(const size_type block_dim, value_type *dst, const value_type *val) { memcpy((void *)dst, val, block_dim * block_dim * sizeof(value_type)); } // Performs A += B on blocks template -KOKKOS_INLINE_FUNCTION void kk_block_add(const size_type block_dim, - value_type *dst, - const value_type *val) { +KOKKOS_INLINE_FUNCTION void kk_block_add(const size_type block_dim, value_type *dst, const value_type *val) { const auto end = dst + block_dim * block_dim; while (dst < end) { *(dst++) += *(val++); @@ -57,33 +52,25 @@ KOKKOS_INLINE_FUNCTION void kk_block_add(const size_type block_dim, // Note: block is assumed to be row-major, dense matrix (no extra padding) // Note: set clear=true to set C = 0 before increment template > -KOKKOS_INLINE_FUNCTION void kk_block_dgemm(const size_type block_dim, - value_type *dst, - const value_type *valA, - const value_type *valB, - const bool clear = false) { + typename DGEMM = KokkosBatched::SerialGemmInternal> +KOKKOS_INLINE_FUNCTION void kk_block_dgemm(const size_type block_dim, value_type *dst, const value_type *valA, + const value_type *valB, const bool clear = false) { const auto 
ZERO = static_cast(0); const auto ONE = static_cast(1); - DGEMM::invoke(block_dim, block_dim, block_dim, ONE, valA, block_dim, 1, valB, - block_dim, 1, clear ? ZERO : ONE, dst, block_dim, 1); + DGEMM::invoke(block_dim, block_dim, block_dim, ONE, valA, block_dim, 1, valB, block_dim, 1, clear ? ZERO : ONE, dst, + block_dim, 1); } // dgemm: C = A * B template -KOKKOS_INLINE_FUNCTION void kk_block_set_mul(const size_type block_dim, - value_type *c_val, - const value_type *a_val, +KOKKOS_INLINE_FUNCTION void kk_block_set_mul(const size_type block_dim, value_type *c_val, const value_type *a_val, const value_type *b_val) { kk_block_dgemm(block_dim, c_val, a_val, b_val, true); } // dgemm: C += A * B template -KOKKOS_INLINE_FUNCTION void kk_block_add_mul(const size_type block_dim, - value_type *c_val, - const value_type *a_val, +KOKKOS_INLINE_FUNCTION void kk_block_add_mul(const size_type block_dim, value_type *c_val, const value_type *a_val, const value_type *b_val) { kk_block_dgemm(block_dim, c_val, a_val, b_val, false); } @@ -91,9 +78,7 @@ KOKKOS_INLINE_FUNCTION void kk_block_add_mul(const size_type block_dim, // Performs C += A * B (dense GEMM) on blocks // Note: all pointers reference dense row-major blocks (no extra padding) template -KOKKOS_INLINE_FUNCTION void kk_vector_block_add_mul(const size_type block_dim, - value_type *dst, - const value_type *valA, +KOKKOS_INLINE_FUNCTION void kk_vector_block_add_mul(const size_type block_dim, value_type *dst, const value_type *valA, const value_type *valB) { // NOTE: this should be replaced by batched DGEMM // once atomic increment is supported there @@ -102,8 +87,7 @@ KOKKOS_INLINE_FUNCTION void kk_vector_block_add_mul(const size_type block_dim, for (size_type col = 0; col < block_dim; ++col) { auto v = &dst[row_offset + col]; auto vb = valB + col; - for (const value_type *va = valA + row_offset, *end = va + block_dim; - va < end; ++va) { + for (const value_type *va = valA + row_offset, *end = va + block_dim; va < end; ++va) 
{ Kokkos::atomic_add(v, (*va) * (*vb)); vb += block_dim; } diff --git a/common/src/KokkosKernels_Error.hpp b/common/src/KokkosKernels_Error.hpp index 83f2c23ff2..05ce523ecf 100644 --- a/common/src/KokkosKernels_Error.hpp +++ b/common/src/KokkosKernels_Error.hpp @@ -23,32 +23,25 @@ namespace KokkosKernels { namespace Impl { -inline void throw_runtime_exception(const std::string &msg) { - throw std::runtime_error(msg); -} +inline void throw_runtime_exception(const std::string &msg) { throw std::runtime_error(msg); } #if defined(KOKKOS_ENABLE_HIP) -inline void hip_internal_error_throw(hipError_t e, const char *name, - const char *file, const int line) { +inline void hip_internal_error_throw(hipError_t e, const char *name, const char *file, const int line) { std::ostringstream out; - out << name << " error( " << hipGetErrorName(e) - << "): " << hipGetErrorString(e); + out << name << " error( " << hipGetErrorName(e) << "): " << hipGetErrorString(e); if (file) { out << " " << file << ":" << line; } throw_runtime_exception(out.str()); } -inline void hip_internal_safe_call(hipError_t e, const char *name, - const char *file = nullptr, - const int line = 0) { +inline void hip_internal_safe_call(hipError_t e, const char *name, const char *file = nullptr, const int line = 0) { if (hipSuccess != e) { hip_internal_error_throw(e, name, file, line); } } -#define KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(call) \ - hip_internal_safe_call(call, #call, __FILE__, __LINE__) +#define KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(call) hip_internal_safe_call(call, #call, __FILE__, __LINE__) #endif } // namespace Impl @@ -90,8 +83,7 @@ inline void hip_internal_safe_call(hipError_t e, const char *name, #ifndef NDEBUG #define KK_ASSERT(condition) IMPL_THROW(condition, "", std::logic_error) -#define KK_ASSERT_MSG(condition, msg) \ - IMPL_THROW(condition, msg, std::logic_error) +#define KK_ASSERT_MSG(condition, msg) IMPL_THROW(condition, msg, std::logic_error) #define KK_KERNEL_ASSERT(condition) 
IMPL_KERNEL_THROW(condition, "") #define KK_KERNEL_ASSERT_MSG(condition, msg) IMPL_KERNEL_THROW(condition, msg) #else @@ -102,12 +94,10 @@ inline void hip_internal_safe_call(hipError_t e, const char *name, #endif #define KK_REQUIRE(condition) IMPL_THROW(condition, "", std::logic_error) -#define KK_REQUIRE_MSG(condition, msg) \ - IMPL_THROW(condition, msg, std::logic_error) +#define KK_REQUIRE_MSG(condition, msg) IMPL_THROW(condition, msg, std::logic_error) #define KK_USER_REQUIRE(condition) IMPL_THROW(condition, "", std::runtime_error) -#define KK_USER_REQUIRE_MSG(condition, msg) \ - IMPL_THROW(condition, msg, std::runtime_error) +#define KK_USER_REQUIRE_MSG(condition, msg) IMPL_THROW(condition, msg, std::runtime_error) #define KK_KERNEL_REQUIRE(condition) IMPL_KERNEL_THROW(condition, "") #define KK_KERNEL_REQUIRE_MSG(condition, msg) IMPL_KERNEL_THROW(condition, msg) diff --git a/common/src/KokkosKernels_ExecSpaceUtils.hpp b/common/src/KokkosKernels_ExecSpaceUtils.hpp index 4d3a3002b4..2d167f5c73 100644 --- a/common/src/KokkosKernels_ExecSpaceUtils.hpp +++ b/common/src/KokkosKernels_ExecSpaceUtils.hpp @@ -29,14 +29,7 @@ namespace KokkosKernels { namespace Impl { -enum ExecSpaceType { - Exec_SERIAL, - Exec_OMP, - Exec_THREADS, - Exec_CUDA, - Exec_HIP, - Exec_SYCL -}; +enum ExecSpaceType { Exec_SERIAL, Exec_OMP, Exec_THREADS, Exec_CUDA, Exec_HIP, Exec_SYCL }; template KOKKOS_FORCEINLINE_FUNCTION ExecSpaceType kk_get_exec_space_type() { @@ -105,8 +98,7 @@ constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { #ifdef KOKKOS_ENABLE_SYCL template <> -constexpr KOKKOS_INLINE_FUNCTION bool -kk_is_gpu_exec_space() { +constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { return true; } #endif @@ -122,8 +114,7 @@ constexpr KOKKOS_INLINE_FUNCTION bool kk_is_x86_64_mem_space() { #if __x86_64__ template <> -constexpr KOKKOS_INLINE_FUNCTION bool -kk_is_x86_64_mem_space() { +constexpr KOKKOS_INLINE_FUNCTION bool kk_is_x86_64_mem_space() { return true; } #endif // 
x86_64 architectures @@ -139,8 +130,7 @@ constexpr KOKKOS_INLINE_FUNCTION bool kk_is_a64fx_mem_space() { #if defined(__ARM_ARCH_ISA_A64) template <> -constexpr KOKKOS_INLINE_FUNCTION bool -kk_is_a64fx_mem_space() { +constexpr KOKKOS_INLINE_FUNCTION bool kk_is_a64fx_mem_space() { return true; } #endif // a64fx architectures @@ -148,86 +138,67 @@ kk_is_a64fx_mem_space() { // Host function to determine free and total device memory. // Will throw if execution space doesn't support this. template -inline void kk_get_free_total_memory(size_t& /* free_mem */, - size_t& /* total_mem */) { +inline void kk_get_free_total_memory(size_t& /* free_mem */, size_t& /* total_mem */) { std::ostringstream oss; - oss << "Error: memory space " << MemorySpace::name() - << " does not support querying free/total memory."; + oss << "Error: memory space " << MemorySpace::name() << " does not support querying free/total memory."; throw std::runtime_error(oss.str()); } // Host function to determine free and total device memory. // Will throw if execution space doesn't support this. 
template -inline void kk_get_free_total_memory(size_t& /* free_mem */, - size_t& /* total_mem */, - int /* n_streams */) { +inline void kk_get_free_total_memory(size_t& /* free_mem */, size_t& /* total_mem */, int /* n_streams */) { std::ostringstream oss; - oss << "Error: memory space " << MemorySpace::name() - << " does not support querying free/total memory."; + oss << "Error: memory space " << MemorySpace::name() << " does not support querying free/total memory."; throw std::runtime_error(oss.str()); } #ifdef KOKKOS_ENABLE_CUDA template <> -inline void kk_get_free_total_memory(size_t& free_mem, - size_t& total_mem, - int n_streams) { +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem, int n_streams) { cudaMemGetInfo(&free_mem, &total_mem); free_mem /= n_streams; total_mem /= n_streams; } template <> -inline void kk_get_free_total_memory(size_t& free_mem, - size_t& total_mem) { +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) { kk_get_free_total_memory(free_mem, total_mem, 1); } template <> -inline void kk_get_free_total_memory(size_t& free_mem, - size_t& total_mem, - int n_streams) { +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem, int n_streams) { kk_get_free_total_memory(free_mem, total_mem, n_streams); } template <> -inline void kk_get_free_total_memory(size_t& free_mem, - size_t& total_mem) { +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) { kk_get_free_total_memory(free_mem, total_mem, 1); } template <> -inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem, int n_streams) { +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem, int n_streams) { kk_get_free_total_memory(free_mem, total_mem, n_streams); } template <> -inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem) { +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) { kk_get_free_total_memory(free_mem, total_mem, 
1); } #endif #ifdef KOKKOS_ENABLE_HIP template <> -inline void kk_get_free_total_memory(size_t& free_mem, - size_t& total_mem, - int n_streams) { +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem, int n_streams) { KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(hipMemGetInfo(&free_mem, &total_mem)); free_mem /= n_streams; total_mem /= n_streams; } template <> -inline void kk_get_free_total_memory(size_t& free_mem, - size_t& total_mem, - int n_streams) { +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem, int n_streams) { kk_get_free_total_memory(free_mem, total_mem, n_streams); } template <> -inline void kk_get_free_total_memory(size_t& free_mem, - size_t& total_mem) { +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) { kk_get_free_total_memory(free_mem, total_mem, 1); } template <> -inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem) { +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) { kk_get_free_total_memory(free_mem, total_mem, 1); } #endif @@ -236,12 +207,11 @@ inline void kk_get_free_total_memory( // available. Also, we assume to query memory associated with the default queue. 
#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) template <> -inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem, int n_streams) { +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem, + int n_streams) { sycl::queue queue; - sycl::device device = queue.get_device(); - auto level_zero_handle = - sycl::get_native(device); + sycl::device device = queue.get_device(); + auto level_zero_handle = sycl::get_native(device); uint32_t n_memory_modules = 0; zesDeviceEnumMemoryModules(level_zero_handle, &n_memory_modules, nullptr); @@ -255,8 +225,7 @@ inline void kk_get_free_total_memory( total_mem = 0; free_mem = 0; std::vector mem_handles(n_memory_modules); - zesDeviceEnumMemoryModules(level_zero_handle, &n_memory_modules, - mem_handles.data()); + zesDeviceEnumMemoryModules(level_zero_handle, &n_memory_modules, mem_handles.data()); for (auto& mem_handle : mem_handles) { zes_mem_properties_t memory_properties{ZES_STRUCTURE_TYPE_MEM_PROPERTIES}; @@ -274,38 +243,30 @@ inline void kk_get_free_total_memory( } template <> -inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem) { - kk_get_free_total_memory( - free_mem, total_mem, 1); +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) { + kk_get_free_total_memory(free_mem, total_mem, 1); } template <> -inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem, int n_streams) { - kk_get_free_total_memory( - free_mem, total_mem, n_streams); +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem, + int n_streams) { + kk_get_free_total_memory(free_mem, total_mem, n_streams); } template <> -inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem) { - kk_get_free_total_memory( - free_mem, total_mem, 1); +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) { + kk_get_free_total_memory(free_mem, total_mem, 1); } template <> -inline void 
kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem, int n_streams) { - kk_get_free_total_memory( - free_mem, total_mem, n_streams); +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem, + int n_streams) { + kk_get_free_total_memory(free_mem, total_mem, n_streams); } template <> -inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem) { - kk_get_free_total_memory( - free_mem, total_mem, 1); +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) { + kk_get_free_total_memory(free_mem, total_mem, 1); } #endif @@ -325,8 +286,7 @@ inline int kk_get_max_vector_size() { } #endif -inline int kk_get_suggested_vector_size(const size_t nr, const size_t nnz, - const ExecSpaceType exec_space) { +inline int kk_get_suggested_vector_size(const size_t nr, const size_t nnz, const ExecSpaceType exec_space) { int suggested_vector_size_ = 1; int max_vector_size = 1; switch (exec_space) { @@ -360,17 +320,14 @@ inline int kk_get_suggested_vector_size(const size_t nr, const size_t nnz, } else { suggested_vector_size_ = 64; } - if (suggested_vector_size_ > max_vector_size) - suggested_vector_size_ = max_vector_size; + if (suggested_vector_size_ > max_vector_size) suggested_vector_size_ = max_vector_size; break; } return suggested_vector_size_; } -inline int kk_get_suggested_team_size(const int vector_size, - const ExecSpaceType exec_space) { - if (exec_space == Exec_CUDA || exec_space == Exec_HIP || - exec_space == Exec_SYCL) { +inline int kk_get_suggested_team_size(const int vector_size, const ExecSpaceType exec_space) { + if (exec_space == Exec_CUDA || exec_space == Exec_HIP || exec_space == Exec_SYCL) { // TODO: where this is used, tune the target value for // threads per block (but 256 is probably OK for CUDA and HIP) return 256 / vector_size; diff --git a/common/src/KokkosKernels_HashmapAccumulator.hpp b/common/src/KokkosKernels_HashmapAccumulator.hpp index 1085cec4af..c57dfa83fd 100644 --- 
a/common/src/KokkosKernels_HashmapAccumulator.hpp +++ b/common/src/KokkosKernels_HashmapAccumulator.hpp @@ -36,8 +36,7 @@ struct HashOpType { struct pow2Modulo {}; }; -template +template /** * \brief HashmapAccumulator class * The use of this is described in the paper: @@ -96,13 +95,7 @@ struct HashmapAccumulator { * Assumption: hash_begins_ are all initialized to -1. */ KOKKOS_INLINE_FUNCTION - HashmapAccumulator() - : hash_begins(), - hash_nexts(), - keys(), - values(), - __max_value_size(), - __hashOpRHS(0) {} + HashmapAccumulator() : hash_begins(), hash_nexts(), keys(), values(), __max_value_size(), __hashOpRHS(0) {} /** * \brief parameterized constructor HashmapAccumulator @@ -120,9 +113,8 @@ struct HashmapAccumulator { * Assumption: hash_begins_ are all initialized to -1. */ KOKKOS_INLINE_FUNCTION - HashmapAccumulator(const size_type max_value_size_, const size_type hashOpRHS, - size_type *hash_begins_, size_type *hash_nexts_, - key_type *keys_, value_type *values_) + HashmapAccumulator(const size_type max_value_size_, const size_type hashOpRHS, size_type *hash_begins_, + size_type *hash_nexts_, key_type *keys_, value_type *values_) : hash_begins(hash_begins_), hash_nexts(hash_nexts_), keys(keys_), @@ -139,11 +131,8 @@ struct HashmapAccumulator { // Accumulation is OR operation. // Insertion is sequential, no race condition for the insertion. KOKKOS_INLINE_FUNCTION - int sequential_insert_into_hash_mergeOr_TrackHashes(key_type key, - value_type value, - size_type *used_size_, - size_type *used_hash_size, - size_type *used_hashes) { + int sequential_insert_into_hash_mergeOr_TrackHashes(key_type key, value_type value, size_type *used_size_, + size_type *used_hash_size, size_type *used_hashes) { size_type hash, i, my_index; if (key == -1) return __insert_success; @@ -175,10 +164,9 @@ struct HashmapAccumulator { // TODO: This function is for triangle counting. // Assume that there are 2 values for triangle count. 
KOKKOS_INLINE_FUNCTION - int sequential_insert_into_hash_mergeOr_TriangleCount_TrackHashes( - key_type key, value_type value, value_type *values2, - size_type *used_size_, size_type *used_hash_size, - size_type *used_hashes) { + int sequential_insert_into_hash_mergeOr_TriangleCount_TrackHashes(key_type key, value_type value, value_type *values2, + size_type *used_size_, size_type *used_hash_size, + size_type *used_hashes) { size_type hash, i, my_index; if (key == -1) return __insert_success; @@ -210,10 +198,10 @@ struct HashmapAccumulator { // this is used in slow triangle counting method. // L x Incidence KOKKOS_INLINE_FUNCTION - int sequential_insert_into_hash_mergeAnd_TriangleCount_TrackHashes( - key_type key, value_type value, value_type *values2, - size_type * /*used_size_*/, size_type * /*used_hash_size*/, - size_type * /*used_hashes*/) { + int sequential_insert_into_hash_mergeAnd_TriangleCount_TrackHashes(key_type key, value_type value, + value_type *values2, size_type * /*used_size_*/, + size_type * /*used_hash_size*/, + size_type * /*used_hashes*/) { size_type hash, i; if (key == -1) return __insert_success; @@ -234,8 +222,7 @@ struct HashmapAccumulator { // this is used in LxL or Incidence^T x L KOKKOS_INLINE_FUNCTION - value_type sequential_insert_into_hash_mergeAnd_TriangleCount_TrackHashes( - key_type key, value_type value) { + value_type sequential_insert_into_hash_mergeAnd_TriangleCount_TrackHashes(key_type key, value_type value) { size_type hash, i; if (key == -1) return __insert_success; @@ -254,10 +241,9 @@ struct HashmapAccumulator { // this is used in slow triangle counting method. 
// L x Incidence KOKKOS_INLINE_FUNCTION - int sequential_insert_into_hash_TriangleCount_TrackHashes( - key_type key, value_type value, value_type *values2, - size_type *used_size_, size_type *used_hash_size, - size_type *used_hashes) { + int sequential_insert_into_hash_TriangleCount_TrackHashes(key_type key, value_type value, value_type *values2, + size_type *used_size_, size_type *used_hash_size, + size_type *used_hashes) { size_type hash, my_index; if (key == -1) return __insert_success; @@ -283,11 +269,10 @@ struct HashmapAccumulator { // this is used in LxL or Incidence^T x L KOKKOS_INLINE_FUNCTION - int sequential_insert_into_hash_TriangleCount_TrackHashes( - key_type key, value_type value, size_type *used_size_, - size_type *used_hash_size, - size_type *used_hashes) // issue-508, TODO figure out what this - // "used_hashes" is for + int sequential_insert_into_hash_TriangleCount_TrackHashes(key_type key, value_type value, size_type *used_size_, + size_type *used_hash_size, + size_type *used_hashes) // issue-508, TODO figure out what + // this "used_hashes" is for { size_type hash, my_index; @@ -315,9 +300,8 @@ struct HashmapAccumulator { // Insertion is sequential, no race condition for the insertion. // the mergeadd used in the numeric of KKMEM. KOKKOS_INLINE_FUNCTION - void sequential_insert_into_hash_mergeAdd_TrackHashes( - key_type key, value_type value, size_type *used_size_, - size_type *used_hash_size, size_type *used_hashes) { + void sequential_insert_into_hash_mergeAdd_TrackHashes(key_type key, value_type value, size_type *used_size_, + size_type *used_hash_size, size_type *used_hashes) { size_type hash, i, my_index; if (key == -1) return; @@ -348,9 +332,7 @@ struct HashmapAccumulator { // used in the compression to count the sets. // also used in the symbolic of spgemm if no compression is applied. 
KOKKOS_INLINE_FUNCTION - int sequential_insert_into_hash_TrackHashes(key_type key, - size_type *used_size_, - size_type *used_hash_size, + int sequential_insert_into_hash_TrackHashes(key_type key, size_type *used_size_, size_type *used_hash_size, size_type *used_hashes) { size_type hash, i, my_index; @@ -383,10 +365,9 @@ struct HashmapAccumulator { // Insertion is simulteanous for the vector lanes of a thread. // used_size should be a shared pointer among the thread vectors KOKKOS_INLINE_FUNCTION - int vector_atomic_insert_into_hash_mergeAdd_TrackHashes( - const key_type key, const value_type value, - volatile size_type *used_size_, size_type *used_hash_size, - size_type *used_hashes) { + int vector_atomic_insert_into_hash_mergeAdd_TrackHashes(const key_type key, const value_type value, + volatile size_type *used_size_, size_type *used_hash_size, + size_type *used_hashes) { size_type hash, i, my_write_index, hashbeginning; if (key == -1) return __insert_success; @@ -438,11 +419,9 @@ struct HashmapAccumulator { hash_nexts[my_write_index] = hash_begins[hash]; #endif - hashbeginning = - Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + hashbeginning = Kokkos::atomic_exchange(hash_begins + hash, my_write_index); if (hashbeginning == -1) { - used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = - hash; + used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = hash; } hash_nexts[my_write_index] = hashbeginning; return __insert_success; @@ -453,10 +432,9 @@ struct HashmapAccumulator { // except uses atomic addition on updating the value // necessary if duplicate key insertions happen simultaneously KOKKOS_INLINE_FUNCTION - int vector_atomic_insert_into_hash_mergeAtomicAdd_TrackHashes( - const key_type key, const value_type value, - volatile size_type *used_size_, size_type *used_hash_size, - size_type *used_hashes) { + int vector_atomic_insert_into_hash_mergeAtomicAdd_TrackHashes(const key_type key, const value_type value, + 
volatile size_type *used_size_, + size_type *used_hash_size, size_type *used_hashes) { size_type hash, i, my_write_index, hashbeginning; if (key == -1) return __insert_success; @@ -509,11 +487,9 @@ struct HashmapAccumulator { hash_nexts[my_write_index] = hash_begins[hash]; #endif - hashbeginning = - Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + hashbeginning = Kokkos::atomic_exchange(hash_begins + hash, my_write_index); if (hashbeginning == -1) { - used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = - hash; + used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = hash; } hash_nexts[my_write_index] = hashbeginning; return __insert_success; @@ -521,9 +497,8 @@ struct HashmapAccumulator { } KOKKOS_INLINE_FUNCTION - int vector_atomic_insert_into_hash_mergeAdd_TrackHashes_no_list( - const key_type key, const value_type value, size_type *used_hash_size, - size_type *used_hashes) { + int vector_atomic_insert_into_hash_mergeAdd_TrackHashes_no_list(const key_type key, const value_type value, + size_type *used_hash_size, size_type *used_hashes) { size_type hash; if (key == -1) return __insert_success; @@ -541,11 +516,9 @@ struct HashmapAccumulator { Kokkos::atomic_add(values + hash, value); return __insert_success; } else if (keys[hash] == -1) { - if (Kokkos::atomic_compare_exchange_strong(keys + hash, -1, - key)) { + if (Kokkos::atomic_compare_exchange_strong(keys + hash, -1, key)) { // should only be here if we used a new hash - used_hashes[Kokkos::atomic_fetch_add(used_hash_size, - size_type(1))] = hash; + used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = hash; Kokkos::atomic_add(values + hash, value); return __insert_success; } @@ -565,11 +538,9 @@ struct HashmapAccumulator { // NOTE: this is an exact copy of vector_atmoic_insert_into_hash_mergeAdd from // https://github.com/kokkos/kokkos-kernels/blob/750fe24508a69ed4dba92bb4a9e17a6094b1a083/src/common/KokkosKernels_HashmapAccumulator.hpp#L442-L502 
template - KOKKOS_INLINE_FUNCTION int - vector_atomic_insert_into_hash_mergeAdd_with_team_level_list_length( - const team_member_t & /* teamMember */, const int /* vector_size */, - size_type hash, const key_type key, const value_type value, - volatile size_type *used_size_, const size_type max_value_size_) { + KOKKOS_INLINE_FUNCTION int vector_atomic_insert_into_hash_mergeAdd_with_team_level_list_length( + const team_member_t & /* teamMember */, const int /* vector_size */, size_type hash, const key_type key, + const value_type value, volatile size_type *used_size_, const size_type max_value_size_) { // Cannot compute hash here due to impl_speed use-case // hash = __compute_hash(key, __hashOpRHS); if (key == -1) return __insert_success; @@ -591,8 +562,7 @@ struct HashmapAccumulator { if (used_size_[0] >= max_value_size_) { return __insert_full; } - size_type my_write_index = - Kokkos::atomic_fetch_add(used_size_, size_type(1)); + size_type my_write_index = Kokkos::atomic_fetch_add(used_size_, size_type(1)); if (my_write_index >= max_value_size_) { return __insert_full; @@ -630,8 +600,7 @@ struct HashmapAccumulator { // hashbeginning = hash_begins[hash] // hash_begins[hash] = my_write_index // hash_nexts[my_write_index] = hash_begins[hash] - size_type hashbeginning = - Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + size_type hashbeginning = Kokkos::atomic_exchange(hash_begins + hash, my_write_index); hash_nexts[my_write_index] = hashbeginning; return __insert_success; } @@ -645,20 +614,17 @@ struct HashmapAccumulator { // Insertion is simulteanous for the vector lanes of a thread. 
// used_size should be a shared pointer among the thread vectors KOKKOS_INLINE_FUNCTION - int vector_atomic_insert_into_hash_mergeAdd(const key_type key, - const value_type value, + int vector_atomic_insert_into_hash_mergeAdd(const key_type key, const value_type value, volatile size_type *used_size_) { if (key == -1) return __insert_success; return vector_atomic_insert_into_hash_mergeAdd_with_team_level_list_length( - nullptr, 0, __compute_hash(key, __hashOpRHS), key, value, used_size_, - __max_value_size); + nullptr, 0, __compute_hash(key, __hashOpRHS), key, value, used_size_, __max_value_size); } // used in symbolic of kkmem if the compression is not applied. KOKKOS_INLINE_FUNCTION - int vector_atomic_insert_into_hash(const key_type &key, - volatile size_type *used_size_) { + int vector_atomic_insert_into_hash(const key_type &key, volatile size_type *used_size_) { size_type hash, i, my_write_index, hashbeginning; if (key == -1) return __insert_success; @@ -692,8 +658,7 @@ struct HashmapAccumulator { hash_nexts[my_write_index] = hash_begins[hash]; #endif - hashbeginning = - Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + hashbeginning = Kokkos::atomic_exchange(hash_begins + hash, my_write_index); hash_nexts[my_write_index] = hashbeginning; return __insert_success; } @@ -706,8 +671,7 @@ struct HashmapAccumulator { // Insertion is simulteanous for the vector lanes of a thread. 
// used_size should be a shared pointer among the thread vectors KOKKOS_INLINE_FUNCTION - int vector_atomic_insert_into_hash_mergeOr(const key_type &key, - const value_type &value, + int vector_atomic_insert_into_hash_mergeOr(const key_type &key, const value_type &value, volatile size_type *used_size_) { size_type hash, i, my_write_index, hashbeginning; @@ -744,8 +708,7 @@ struct HashmapAccumulator { hash_nexts[my_write_index] = hash_begins[hash]; #endif - hashbeginning = - Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + hashbeginning = Kokkos::atomic_exchange(hash_begins + hash, my_write_index); hash_nexts[my_write_index] = hashbeginning; return __insert_success; } @@ -758,10 +721,9 @@ struct HashmapAccumulator { // Insertion is simulteanous for the vector lanes of a thread. // used_size should be a shared pointer among the thread vectors KOKKOS_INLINE_FUNCTION - int vector_atomic_insert_into_hash_mergeOr_TrackHashes( - const key_type &key, const value_type &value, - volatile size_type *used_size_, size_type *used_hash_size, - size_type *used_hashes) { + int vector_atomic_insert_into_hash_mergeOr_TrackHashes(const key_type &key, const value_type &value, + volatile size_type *used_size_, size_type *used_hash_size, + size_type *used_hashes) { size_type hash, i, my_write_index, hashbeginning; if (key == -1) return __insert_success; @@ -797,11 +759,9 @@ struct HashmapAccumulator { hash_nexts[my_write_index] = hash_begins[hash]; #endif - hashbeginning = - Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + hashbeginning = Kokkos::atomic_exchange(hash_begins + hash, my_write_index); if (hashbeginning == -1) { - used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = - hash; + used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = hash; } hash_nexts[my_write_index] = hashbeginning; return __insert_success; @@ -809,10 +769,8 @@ struct HashmapAccumulator { } KOKKOS_INLINE_FUNCTION - int 
vector_atomic_insert_into_hash_TrackHashes(const key_type &key, - volatile size_type *used_size_, - size_type *used_hash_size, - size_type *used_hashes) { + int vector_atomic_insert_into_hash_TrackHashes(const key_type &key, volatile size_type *used_size_, + size_type *used_hash_size, size_type *used_hashes) { size_type hash, i, my_write_index, hashbeginning; if (key == -1) return __insert_success; @@ -846,11 +804,9 @@ struct HashmapAccumulator { hash_nexts[my_write_index] = hash_begins[hash]; #endif - hashbeginning = - Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + hashbeginning = Kokkos::atomic_exchange(hash_begins + hash, my_write_index); if (hashbeginning == -1) { - used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = - hash; + used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = hash; } hash_nexts[my_write_index] = hashbeginning; return __insert_success; @@ -863,11 +819,9 @@ struct HashmapAccumulator { static constexpr int __insert_success = 0; static constexpr int __insert_full = 1; - template ::value || - std::is_same::value, - std::size_t>::type = 0> + template ::value || + std::is_same::value, + std::size_t>::type = 0> KOKKOS_INLINE_FUNCTION int __compute_hash(size_type key, size_type bitmask) { size_type hash = key & bitmask; #ifdef HASHMAPACCUMULATOR_ASSERT_ENABLED @@ -877,9 +831,8 @@ struct HashmapAccumulator { return hash; } - template ::value, - std::size_t>::type = 0> + template ::value, std::size_t>::type = 0> KOKKOS_INLINE_FUNCTION int __compute_hash(size_type key, size_type divisor) { size_type hash = key % divisor; #ifdef HASHMAPACCUMULATOR_ASSERT_ENABLED diff --git a/common/src/KokkosKernels_IOUtils.hpp b/common/src/KokkosKernels_IOUtils.hpp index fd3e44db09..eb44082a74 100644 --- a/common/src/KokkosKernels_IOUtils.hpp +++ b/common/src/KokkosKernels_IOUtils.hpp @@ -47,15 +47,13 @@ inline void getRandomBounds(double mag, Scalar &start, Scalar &end) { } template <> -inline void 
getRandomBounds(double mag, Kokkos::complex &start, - Kokkos::complex &end) { +inline void getRandomBounds(double mag, Kokkos::complex &start, Kokkos::complex &end) { start = Kokkos::complex(-mag, -mag); end = Kokkos::complex(mag, mag); } template <> -inline void getRandomBounds(double mag, Kokkos::complex &start, - Kokkos::complex &end) { +inline void getRandomBounds(double mag, Kokkos::complex &start, Kokkos::complex &end) { start = Kokkos::complex(-mag, -mag); end = Kokkos::complex(mag, mag); } @@ -98,9 +96,7 @@ inline size_t kk_get_file_size(const char *file) { } template -void buildEdgeListFromBinSrcTarg_undirected(const char *fnameSrc, - const char *fnameTarg, - size_t &numEdges, lno_t **srcs, +void buildEdgeListFromBinSrcTarg_undirected(const char *fnameSrc, const char *fnameTarg, size_t &numEdges, lno_t **srcs, lno_t **dst) { size_t srcFileSize = kk_get_file_size(fnameSrc); size_t trgFileSize = kk_get_file_size(fnameTarg); @@ -150,8 +146,7 @@ inline void kk_write_1Dview_to_file(idx_array_type view, const char *filename) { } template -inline void kk_read_1Dview_from_file(idx_array_type &view, - const char *filename) { +inline void kk_read_1Dview_from_file(idx_array_type &view, const char *filename) { typedef typename idx_array_type::HostMirror host_type; // typedef typename idx_array_type::size_type idx; host_type host_view = Kokkos::create_mirror_view(view); @@ -183,8 +178,7 @@ inline void kk_write_2Dview_to_file(idx_array_type view, const char *filename) { } template -inline void kk_read_2Dview_from_file(idx_array_type &view, - const char *filename) { +inline void kk_read_2Dview_from_file(idx_array_type &view, const char *filename) { typedef typename idx_array_type::HostMirror host_type; // typedef typename idx_array_type::size_type idx; host_type host_view = Kokkos::create_mirror_view(view); @@ -221,8 +215,7 @@ inline void kk_write_3Dview_to_file(idx_array_type view, const char *filename) { } template -inline void kk_read_3Dview_from_file(idx_array_type 
&view, - const char *filename) { +inline void kk_read_3Dview_from_file(idx_array_type &view, const char *filename) { typedef typename idx_array_type::HostMirror host_type; // typedef typename idx_array_type::size_type idx; host_type host_view = Kokkos::create_mirror_view(view); @@ -241,8 +234,7 @@ inline void kk_read_3Dview_from_file(idx_array_type &view, } template -[[deprecated]] void write_edgelist_bin(size_t ne, const idx *edge_begins, - const idx *edge_ends, const wt *ew, +[[deprecated]] void write_edgelist_bin(size_t ne, const idx *edge_begins, const idx *edge_ends, const wt *ew, const char *filename) { std::ofstream myFile(filename, std::ios::out | std::ios::binary); myFile.write((char *)&ne, sizeof(idx)); @@ -253,8 +245,7 @@ template } template -void read_edgelist_bin(idx *ne, idx **edge_begins, idx **edge_ends, wt **ew, - const char *filename) { +void read_edgelist_bin(idx *ne, idx **edge_begins, idx **edge_ends, wt **ew, const char *filename) { std::ifstream myFile(filename, std::ios::in | std::ios::binary); myFile.read((char *)ne, sizeof(idx)); @@ -269,8 +260,7 @@ void read_edgelist_bin(idx *ne, idx **edge_begins, idx **edge_ends, wt **ew, inline bool endswith(std::string const &fullString, std::string const &ending) { if (fullString.length() >= ending.length()) { - return (0 == fullString.compare(fullString.length() - ending.length(), - ending.length(), ending)); + return (0 == fullString.compare(fullString.length() - ending.length(), ending.length(), ending)); } else { return false; } diff --git a/common/src/KokkosKernels_LowerBound.hpp b/common/src/KokkosKernels_LowerBound.hpp index e091932453..f7a5ccef96 100644 --- a/common/src/KokkosKernels_LowerBound.hpp +++ b/common/src/KokkosKernels_LowerBound.hpp @@ -87,15 +87,11 @@ namespace Impl { At most view.size() predicate function calls */ -template > -KOKKOS_INLINE_FUNCTION typename ViewLike::size_type -lower_bound_sequential_thread( - const ViewLike &view, const typename ViewLike::non_const_value_type 
&value, - Pred pred = Pred()) { +template > +KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_sequential_thread( + const ViewLike &view, const typename ViewLike::non_const_value_type &value, Pred pred = Pred()) { using size_type = typename ViewLike::size_type; - static_assert(1 == ViewLike::rank, - "lower_bound_sequential_thread requires rank-1 views"); + static_assert(1 == ViewLike::rank, "lower_bound_sequential_thread requires rank-1 views"); size_type i = 0; while (i < view.size() && pred(view(i), value)) { @@ -116,14 +112,11 @@ lower_bound_sequential_thread( At most log2(view.size()) + 1 predicate function calls */ -template > +template > KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_binary_thread( - const ViewLike &view, const typename ViewLike::non_const_value_type &value, - Pred pred = Pred()) { + const ViewLike &view, const typename ViewLike::non_const_value_type &value, Pred pred = Pred()) { using size_type = typename ViewLike::size_type; - static_assert(1 == ViewLike::rank, - "lower_bound_binary_thread requires rank-1 views"); + static_assert(1 == ViewLike::rank, "lower_bound_binary_thread requires rank-1 views"); size_type lo = 0; size_type hi = view.size(); @@ -155,13 +148,10 @@ KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_binary_thread( This minimizes the calls to predicate: for view.size() >= 8, this does a binary search, otherwise, a linear search */ -template > +template > KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_thread( - const ViewLike &view, const typename ViewLike::non_const_value_type &value, - Pred pred = Pred()) { - static_assert(1 == ViewLike::rank, - "lower_bound_thread requires rank-1 views"); + const ViewLike &view, const typename ViewLike::non_const_value_type &value, Pred pred = Pred()) { + static_assert(1 == ViewLike::rank, "lower_bound_thread requires rank-1 views"); /* sequential search makes on average 0.5 * view.size memory accesses binary search makes 
log2(view.size)+1 accesses @@ -196,18 +186,14 @@ namespace Impl { Uses a single thread to call \c lower_bound_thread, and broadcasts that to all team members. */ -template > +template > KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_single_team( - const TeamMember &handle, const ViewLike &view, - const typename ViewLike::non_const_value_type &value, Pred pred = Pred()) { + const TeamMember &handle, const ViewLike &view, const typename ViewLike::non_const_value_type &value, + Pred pred = Pred()) { typename ViewLike::size_type idx; Kokkos::single( Kokkos::PerTeam(handle), - [&](typename ViewLike::size_type &lidx) { - lidx = KokkosKernels::lower_bound_thread(view, value, pred); - }, - idx); + [&](typename ViewLike::size_type &lidx) { lidx = KokkosKernels::lower_bound_thread(view, value, pred); }, idx); return idx; } @@ -229,16 +215,12 @@ KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_single_team( Apply pred(view(i), value) for i in [lo, hi) */ -template > +template > KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_sequential_team( - const TeamMember &handle, const ViewLike &view, - const typename ViewLike::non_const_value_type &value, - typename ViewLike::size_type lo, typename ViewLike::size_type hi, - Pred pred = Pred()) { + const TeamMember &handle, const ViewLike &view, const typename ViewLike::non_const_value_type &value, + typename ViewLike::size_type lo, typename ViewLike::size_type hi, Pred pred = Pred()) { using size_type = typename ViewLike::size_type; - static_assert(1 == ViewLike::rank, - "lower_bound_sequential_team requires rank-1 views"); + static_assert(1 == ViewLike::rank, "lower_bound_sequential_team requires rank-1 views"); static_assert(is_iota_v || Kokkos::is_view::value, "lower_bound_sequential_team requires a " "KokkosKernels::Impl::Iota or a Kokkos::View"); @@ -251,7 +233,7 @@ KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_sequential_team( Kokkos::TeamThreadRange(handle, lo, 
hi), [&](const size_type &i, size_type &li) { li = KOKKOSKERNELS_MACRO_MIN(li, hi); - if (i < li) { // no need to search higher than the smallest so far + if (i < li) { // no need to search higher than the smallest so far if (!pred(view(i), value)) { // look for the smallest index that does // not satisfy li = i; @@ -276,11 +258,10 @@ KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_sequential_team( \returns To all team members, the smallest i for which pred(view(i), value) is false or view.size() if no such value */ -template > +template > KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_sequential_team( - const TeamMember &handle, const ViewLike &view, - const typename ViewLike::non_const_value_type &value, Pred pred = Pred()) { + const TeamMember &handle, const ViewLike &view, const typename ViewLike::non_const_value_type &value, + Pred pred = Pred()) { return lower_bound_sequential_team(handle, view, value, 0, view.size(), pred); } @@ -310,10 +291,9 @@ struct Range { /// \brief maximizes the lower bound, and minimizes the upper bound of a Range template struct RangeReducer { - using reducer = RangeReducer; - using value_type = Range; - using result_view_type = - Kokkos::View *, Space, Kokkos::MemoryUnmanaged>; + using reducer = RangeReducer; + using value_type = Range; + using result_view_type = Kokkos::View *, Space, Kokkos::MemoryUnmanaged>; private: value_type &value; @@ -356,13 +336,11 @@ struct RangeReducer { false Once there are fewer values left than threads in the team, switch to team sequential search */ -template > +template > KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_kary_team( - const TeamMember &handle, const ViewLike &view, - const typename ViewLike::non_const_value_type &value, Pred pred = Pred()) { - static_assert(1 == ViewLike::rank, - "lower_bound_kary_team requires rank-1 views"); + const TeamMember &handle, const ViewLike &view, const typename ViewLike::non_const_value_type &value, + Pred 
pred = Pred()) { + static_assert(1 == ViewLike::rank, "lower_bound_kary_team requires rank-1 views"); static_assert(is_iota_v || Kokkos::is_view::value, "lower_bound_kary_team requires a " "KokkosKernels::Impl::Iota or a Kokkos::View"); @@ -378,9 +356,8 @@ KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_kary_team( } // otherwise, split the region up among threads - size_type mid = - lo + (hi - lo) * (handle.team_rank() + 1) / (handle.team_size() + 1); - auto ve = view(mid); + size_type mid = lo + (hi - lo) * (handle.team_rank() + 1) / (handle.team_size() + 1); + auto ve = view(mid); // reduce across threads to figure out where the new search bounds are // if a thread satisfies the predicate, the first element that does not @@ -433,14 +410,12 @@ KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_kary_team( Pred should be a binary function comparing two `typename View::non_const_value_type` */ -template > +template > KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_team( - const TeamMember &handle, const ViewLike &view, - const typename ViewLike::non_const_value_type &value, Pred pred = Pred()) { + const TeamMember &handle, const ViewLike &view, const typename ViewLike::non_const_value_type &value, + Pred pred = Pred()) { static_assert(1 == ViewLike::rank, "lower_bound_team requires rank-1 views"); - static_assert(KokkosKernels::Impl::is_iota_v || - Kokkos::is_view::value, + static_assert(KokkosKernels::Impl::is_iota_v || Kokkos::is_view::value, "lower_bound_team requires a " "KokkosKernels::Impl::Iota or a Kokkos::View"); diff --git a/common/src/KokkosKernels_Macros.hpp b/common/src/KokkosKernels_Macros.hpp index 04234a5ce2..6c4093ca10 100644 --- a/common/src/KokkosKernels_Macros.hpp +++ b/common/src/KokkosKernels_Macros.hpp @@ -34,15 +34,13 @@ // is enabled, since in that case, Kokkos::ThreadVectorRange should be used // instead for SIMD parallel loops. 
-#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && \ - defined(KOKKOS_ENABLE_OPENMP) +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_ENABLE_OPENMP) // For clang OpenMP support, see // https://clang.llvm.org/docs/OpenMPSupport.html#id1 #if defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) // GCC 4.8.5 and older do not support #pragma omp simd // Do not enable when using GCC 7.2.0 or 7.3.0 + C++17 due to a bug in gcc -#if (KOKKOS_COMPILER_GNU > 485) && \ - !(KOKKOS_COMPILER_GNU == 720 && defined(KOKKOS_ENABLE_CXX17)) && \ +#if (KOKKOS_COMPILER_GNU > 485) && !(KOKKOS_COMPILER_GNU == 720 && defined(KOKKOS_ENABLE_CXX17)) && \ !(KOKKOS_COMPILER_GNU == 730 && defined(KOKKOS_ENABLE_CXX17)) #define KOKKOSKERNELS_ENABLE_OMP_SIMD #endif @@ -99,9 +97,8 @@ // define KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS if we are targeting a CUDA // architecture with "independent thread scheduling" (Volta70 and up). This // requires some extra logic in HashmapAccumulator to avoid data races. 
-#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ - defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_ADA89) || \ - defined(KOKKOS_ARCH_HOPPER) +#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \ + defined(KOKKOS_ARCH_ADA89) || defined(KOKKOS_ARCH_HOPPER) #define KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS #endif diff --git a/common/src/KokkosKernels_Predicates.hpp b/common/src/KokkosKernels_Predicates.hpp index a741d1353a..f3bc6f2b2c 100644 --- a/common/src/KokkosKernels_Predicates.hpp +++ b/common/src/KokkosKernels_Predicates.hpp @@ -32,17 +32,14 @@ namespace KokkosKernels { template struct GT { using value_type = T; - static_assert(!Kokkos::ArithTraits::is_complex, - "Please define custom predicates for ordering complex types"); + static_assert(!Kokkos::ArithTraits::is_complex, "Please define custom predicates for ordering complex types"); /** * @brief Return true if a is greater than b * @param a First value to be compared * @param b Second value to be compared */ - KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, - const value_type &b) const - noexcept { + KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, const value_type &b) const noexcept { return a > b; } }; @@ -53,13 +50,10 @@ struct GT { template struct GTE { using value_type = T; - static_assert(!Kokkos::ArithTraits::is_complex, - "Please define custom predicates for ordering complex types"); + static_assert(!Kokkos::ArithTraits::is_complex, "Please define custom predicates for ordering complex types"); /// \brief return a >= b - KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, - const value_type &b) const - noexcept { + KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, const value_type &b) const noexcept { return a >= b; } }; @@ -70,13 +64,10 @@ struct GTE { template struct LT { using value_type = T; - static_assert(!Kokkos::ArithTraits::is_complex, - "Please 
define custom predicates for ordering complex types"); + static_assert(!Kokkos::ArithTraits::is_complex, "Please define custom predicates for ordering complex types"); /// \brief return a < b - KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, - const value_type &b) const - noexcept { + KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, const value_type &b) const noexcept { return a < b; } }; @@ -87,13 +78,10 @@ struct LT { template struct LTE { using value_type = T; - static_assert(!Kokkos::ArithTraits::is_complex, - "Please define custom predicates for ordering complex types"); + static_assert(!Kokkos::ArithTraits::is_complex, "Please define custom predicates for ordering complex types"); /// \brief return a <= b - KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, - const value_type &b) const - noexcept { + KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, const value_type &b) const noexcept { return a <= b; } }; @@ -106,10 +94,7 @@ struct Equal { using value_type = T; /// \brief return a == b - KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, - const value_type &b) const { - return a == b; - } + KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, const value_type &b) const { return a == b; } }; /** @@ -133,8 +118,7 @@ struct Neg { * @param b Second value to be compared by the predicate * @return Boolean inverse of the result of the predicate applied to a and b */ - KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, - const value_type &b) const { + KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, const value_type &b) const { return !pred_(a, b); } @@ -153,8 +137,7 @@ struct Refl { constexpr Refl(const Pred &pred) : pred_(pred) {} /// \brief return the underlying binary predicate with reversed arguments - KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, - const value_type &b) const { + 
KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, const value_type &b) const { return pred_(b, a); } diff --git a/common/src/KokkosKernels_PrintConfiguration.hpp b/common/src/KokkosKernels_PrintConfiguration.hpp index c2e3a5187f..5870210912 100644 --- a/common/src/KokkosKernels_PrintConfiguration.hpp +++ b/common/src/KokkosKernels_PrintConfiguration.hpp @@ -37,8 +37,7 @@ inline void print_cublas_version_if_enabled(std::ostream& os) { inline void print_cusparse_version_if_enabled(std::ostream& os) { #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE os << " " - << "KOKKOSKERNELS_ENABLE_TPL_CUSPARSE: " << cusparse_version_string() - << "\n"; + << "KOKKOSKERNELS_ENABLE_TPL_CUSPARSE: " << cusparse_version_string() << "\n"; #else os << " " << "KOKKOSKERNELS_ENABLE_TPL_CUSPARSE: no\n"; @@ -48,8 +47,7 @@ inline void print_cusparse_version_if_enabled(std::ostream& os) { inline void print_cusolver_version_if_enabled(std::ostream& os) { #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER os << " " - << "KOKKOSKERNELS_ENABLE_TPL_CUSOLVER: " << cusolver_version_string() - << "\n"; + << "KOKKOSKERNELS_ENABLE_TPL_CUSOLVER: " << cusolver_version_string() << "\n"; #else os << " " << "KOKKOSKERNELS_ENABLE_TPL_CUSOLVER: no\n"; @@ -156,9 +154,8 @@ inline void print_version(std::ostream& os) { // KOKKOSKERNELS_VERSION is used because MAJOR, MINOR and PATCH macros // are not available in Kernels os << " " - << "KokkosKernels Version: " << KOKKOSKERNELS_VERSION_MAJOR << "." - << KOKKOSKERNELS_VERSION_MINOR << "." << KOKKOSKERNELS_VERSION_PATCH - << '\n'; + << "KokkosKernels Version: " << KOKKOSKERNELS_VERSION_MAJOR << "." << KOKKOSKERNELS_VERSION_MINOR << "." 
+ << KOKKOSKERNELS_VERSION_PATCH << '\n'; } } // namespace Impl diff --git a/common/src/KokkosKernels_PrintUtils.hpp b/common/src/KokkosKernels_PrintUtils.hpp index 74b32c793a..b4817022fc 100644 --- a/common/src/KokkosKernels_PrintUtils.hpp +++ b/common/src/KokkosKernels_PrintUtils.hpp @@ -27,13 +27,11 @@ template struct Histogram { in_lno_view_t inview; out_lno_view_t outview; - Histogram(in_lno_view_t inview_, out_lno_view_t outview_) - : inview(inview_), outview(outview_) {} + Histogram(in_lno_view_t inview_, out_lno_view_t outview_) : inview(inview_), outview(outview_) {} KOKKOS_INLINE_FUNCTION void operator()(const size_t& ii) const { - typedef typename std::remove_reference::type - atomic_incr_type; + typedef typename std::remove_reference::type atomic_incr_type; Kokkos::atomic_fetch_add(&(outview(inview(ii))), atomic_incr_type(1)); } }; @@ -47,13 +45,11 @@ struct Histogram { * them with 0, and size must be big enough to hold all values in input view. */ template -inline void kk_get_histogram( - typename in_lno_view_t::size_type in_elements, in_lno_view_t in_view, - out_lno_view_t histogram /*must be initialized with 0s*/) { +inline void kk_get_histogram(typename in_lno_view_t::size_type in_elements, in_lno_view_t in_view, + out_lno_view_t histogram /*must be initialized with 0s*/) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_for( - "KokkosKernels::Common::GetHistogram", my_exec_space(0, in_elements), - Histogram(in_view, histogram)); + Kokkos::parallel_for("KokkosKernels::Common::GetHistogram", my_exec_space(0, in_elements), + Histogram(in_view, histogram)); MyExecSpace().fence(); } @@ -68,9 +64,9 @@ inline void kk_get_histogram( * pritned. This parameter is not used if print_all is set to true. 
*/ template -inline std::enable_if_t kk_print_1Dview( - std::ostream& os, idx_array_type view, bool print_all = false, - const char* sep = " ", size_t print_size = 40) { +inline std::enable_if_t kk_print_1Dview(std::ostream& os, idx_array_type view, + bool print_all = false, const char* sep = " ", + size_t print_size = 40) { typedef typename idx_array_type::HostMirror host_type; typedef typename idx_array_type::size_type idx; host_type host_view = Kokkos::create_mirror_view(view); @@ -95,12 +91,11 @@ inline std::enable_if_t kk_print_1Dview( * rank-2 vectors same like rank-1 vectors and prints multi-vector dimensions. */ template -inline std::enable_if_t= 2> kk_print_1Dview( - std::ostream& os, idx_array_type view, bool print_all = false, - const char* sep = " ", size_t print_size = 40) { +inline std::enable_if_t= 2> kk_print_1Dview(std::ostream& os, idx_array_type view, + bool print_all = false, const char* sep = " ", + size_t print_size = 40) { if (idx_array_type::rank == 2 && view.extent(1) == 1) { - kk_print_1Dview(os, subview(view, Kokkos::ALL, 0), print_all, sep, - print_size); + kk_print_1Dview(os, subview(view, Kokkos::ALL, 0), print_all, sep, print_size); return; } os << "[" << view.extent(0); @@ -120,8 +115,7 @@ inline std::enable_if_t= 2> kk_print_1Dview( * This interface is provided for backwards compatiblity. */ template -inline void kk_print_1Dview(idx_array_type view, bool print_all = false, - size_t print_size = 40) { +inline void kk_print_1Dview(idx_array_type view, bool print_all = false, size_t print_size = 40) { kk_print_1Dview(std::cout, view, print_all, " ", print_size); } diff --git a/common/src/KokkosKernels_SimpleUtils.hpp b/common/src/KokkosKernels_SimpleUtils.hpp index 055c1d6d32..0ae29a2f50 100644 --- a/common/src/KokkosKernels_SimpleUtils.hpp +++ b/common/src/KokkosKernels_SimpleUtils.hpp @@ -21,8 +21,7 @@ #define KOKKOSKERNELS_MACRO_MIN(x, y) ((x) < (y) ? (x) : (y)) #define KOKKOSKERNELS_MACRO_MAX(x, y) ((x) < (y) ? 
(y) : (x)) -#define KOKKOSKERNELS_MACRO_ABS(x) \ - Kokkos::ArithTraits::type>::abs(x) +#define KOKKOSKERNELS_MACRO_ABS(x) Kokkos::ArithTraits::type>::abs(x) namespace KokkosKernels { @@ -53,8 +52,7 @@ struct ExclusiveParallelPrefixSum { KOKKOS_INLINE_FUNCTION void operator()(const size_t ii, value_type &update, const bool final) const { - value_type val = - (ii == array_sum.extent(0) - 1) ? value_type(0) : array_sum(ii); + value_type val = (ii == array_sum.extent(0) - 1) ? value_type(0) : array_sum(ii); if (final) { array_sum(ii) = value_type(update); } @@ -85,12 +83,10 @@ struct InclusiveParallelPrefixSum { * \param arr: the array for which the prefix sum will be performed. */ template -inline void kk_exclusive_parallel_prefix_sum( - const MyExecSpace &exec, typename view_t::value_type num_elements, - view_t arr) { +inline void kk_exclusive_parallel_prefix_sum(const MyExecSpace &exec, typename view_t::value_type num_elements, + view_t arr) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_scan("KokkosKernels::Common::PrefixSum", - my_exec_space(exec, 0, num_elements), + Kokkos::parallel_scan("KokkosKernels::Common::PrefixSum", my_exec_space(exec, 0, num_elements), ExclusiveParallelPrefixSum(arr)); } @@ -101,8 +97,7 @@ inline void kk_exclusive_parallel_prefix_sum( * \param arr: the array for which the prefix sum will be performed. */ template -inline void kk_exclusive_parallel_prefix_sum( - typename view_t::value_type num_elements, view_t arr) { +inline void kk_exclusive_parallel_prefix_sum(typename view_t::value_type num_elements, view_t arr) { kk_exclusive_parallel_prefix_sum(MyExecSpace(), num_elements, arr); } @@ -117,12 +112,10 @@ inline void kk_exclusive_parallel_prefix_sum( * prefix sum. 
*/ template -inline void kk_exclusive_parallel_prefix_sum( - const MyExecSpace &exec, typename view_t::value_type num_elements, - view_t arr, typename view_t::non_const_value_type &finalSum) { +inline void kk_exclusive_parallel_prefix_sum(const MyExecSpace &exec, typename view_t::value_type num_elements, + view_t arr, typename view_t::non_const_value_type &finalSum) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_scan("KokkosKernels::Common::PrefixSum", - my_exec_space(exec, 0, num_elements), + Kokkos::parallel_scan("KokkosKernels::Common::PrefixSum", my_exec_space(exec, 0, num_elements), ExclusiveParallelPrefixSum(arr), finalSum); } @@ -136,9 +129,8 @@ inline void kk_exclusive_parallel_prefix_sum( * prefix sum. */ template -inline void kk_exclusive_parallel_prefix_sum( - typename view_t::value_type num_elements, view_t arr, - typename view_t::non_const_value_type &finalSum) { +inline void kk_exclusive_parallel_prefix_sum(typename view_t::value_type num_elements, view_t arr, + typename view_t::non_const_value_type &finalSum) { kk_exclusive_parallel_prefix_sum(MyExecSpace(), num_elements, arr, finalSum); } @@ -150,13 +142,10 @@ inline void kk_exclusive_parallel_prefix_sum( /// \param arr: the array for which the prefix sum will be performed. 
/// template -void kk_inclusive_parallel_prefix_sum( - MyExecSpace my_exec_space, - typename forward_array_type::value_type num_elements, - forward_array_type arr) { +void kk_inclusive_parallel_prefix_sum(MyExecSpace my_exec_space, typename forward_array_type::value_type num_elements, + forward_array_type arr) { typedef Kokkos::RangePolicy range_policy_t; - Kokkos::parallel_scan("KokkosKernels::Common::PrefixSum", - range_policy_t(my_exec_space, 0, num_elements), + Kokkos::parallel_scan("KokkosKernels::Common::PrefixSum", range_policy_t(my_exec_space, 0, num_elements), InclusiveParallelPrefixSum(arr)); } @@ -167,9 +156,7 @@ void kk_inclusive_parallel_prefix_sum( /// \param arr: the array for which the prefix sum will be performed. /// template -void kk_inclusive_parallel_prefix_sum( - typename forward_array_type::value_type num_elements, - forward_array_type arr) { +void kk_inclusive_parallel_prefix_sum(typename forward_array_type::value_type num_elements, forward_array_type arr) { MyExecSpace my_exec_space; return kk_inclusive_parallel_prefix_sum(my_exec_space, num_elements, arr); } @@ -180,9 +167,7 @@ struct ReductionFunctor { ReductionFunctor(view_t arr_) : array_sum(arr_) {} KOKKOS_INLINE_FUNCTION - void operator()(const size_t ii, typename view_t::value_type &update) const { - update += array_sum(ii); - } + void operator()(const size_t ii, typename view_t::value_type &update) const { update += array_sum(ii); } }; template @@ -191,55 +176,44 @@ struct ReductionFunctor2 { ReductionFunctor2(view_t arr_) : array_sum(arr_) {} KOKKOS_INLINE_FUNCTION - void operator()(const size_t ii, size_t &update) const { - update += array_sum(ii); - } + void operator()(const size_t ii, size_t &update) const { update += array_sum(ii); } }; template struct DiffReductionFunctor { view_t array_begins; view2_t array_ends; - DiffReductionFunctor(view_t begins, view2_t ends) - : array_begins(begins), array_ends(ends) {} + DiffReductionFunctor(view_t begins, view2_t ends) : 
array_begins(begins), array_ends(ends) {} KOKKOS_INLINE_FUNCTION - void operator()(const size_t ii, - typename view_t::non_const_value_type &update) const { + void operator()(const size_t ii, typename view_t::non_const_value_type &update) const { update += (array_ends(ii) - array_begins(ii)); } }; template -inline void kk_reduce_diff_view( - size_t num_elements, view_t smaller, view2_t bigger, - typename view_t::non_const_value_type &reduction) { +inline void kk_reduce_diff_view(size_t num_elements, view_t smaller, view2_t bigger, + typename view_t::non_const_value_type &reduction) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_reduce( - "KokkosKernels::Common::ReduceDiffView", my_exec_space(0, num_elements), - DiffReductionFunctor(smaller, bigger), reduction); + Kokkos::parallel_reduce("KokkosKernels::Common::ReduceDiffView", my_exec_space(0, num_elements), + DiffReductionFunctor(smaller, bigger), reduction); } template struct DiffReductionFunctorP { const it *array_begins; const it *array_ends; - DiffReductionFunctorP(const it *begins, const it *ends) - : array_begins(begins), array_ends(ends) {} + DiffReductionFunctorP(const it *begins, const it *ends) : array_begins(begins), array_ends(ends) {} KOKKOS_INLINE_FUNCTION - void operator()(const size_t ii, it &update) const { - update += (array_ends[ii] - array_begins[ii]); - } + void operator()(const size_t ii, it &update) const { update += (array_ends[ii] - array_begins[ii]); } }; template -inline void kkp_reduce_diff_view(const size_t num_elements, const it *smaller, - const it *bigger, it &reduction) { +inline void kkp_reduce_diff_view(const size_t num_elements, const it *smaller, const it *bigger, it &reduction) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_reduce( - "KokkosKernels::Common::ReduceDiffView", my_exec_space(0, num_elements), - DiffReductionFunctorP(smaller, bigger), reduction); + Kokkos::parallel_reduce("KokkosKernels::Common::ReduceDiffView", my_exec_space(0, 
num_elements), + DiffReductionFunctorP(smaller, bigger), reduction); } /*** @@ -249,33 +223,27 @@ inline void kkp_reduce_diff_view(const size_t num_elements, const it *smaller, * \param arr: the array for which the prefix sum will be performed. */ template -inline void kk_reduce_view(size_t num_elements, view_t arr, - typename view_t::value_type &reduction) { +inline void kk_reduce_view(size_t num_elements, view_t arr, typename view_t::value_type &reduction) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_reduce("KokkosKernels::Common::ReduceView", - my_exec_space(0, num_elements), + Kokkos::parallel_reduce("KokkosKernels::Common::ReduceView", my_exec_space(0, num_elements), ReductionFunctor(arr), reduction); } template -inline void kk_reduce_view2(size_t num_elements, view_t arr, - size_t &reduction) { +inline void kk_reduce_view2(size_t num_elements, view_t arr, size_t &reduction) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_reduce("KokkosKernels::Common::ReduceView2", - my_exec_space(0, num_elements), + Kokkos::parallel_reduce("KokkosKernels::Common::ReduceView2", my_exec_space(0, num_elements), ReductionFunctor2(arr), reduction); } template ::mag_type> + typename eps_type = typename Kokkos::ArithTraits::mag_type> struct IsIdenticalFunctor { view_type1 view1; view_type2 view2; eps_type eps; - IsIdenticalFunctor(view_type1 view1_, view_type2 view2_, eps_type eps_) - : view1(view1_), view2(view2_), eps(eps_) {} + IsIdenticalFunctor(view_type1 view1_, view_type2 view2_, eps_type eps_) : view1(view1_), view2(view2_), eps(eps_) {} KOKKOS_INLINE_FUNCTION void operator()(const size_t &i, size_t &is_equal) const { @@ -290,8 +258,7 @@ struct IsIdenticalFunctor { } }; -template +template bool kk_is_identical_view(view_type1 view1, view_type2 view2, eps_type eps) { if (view1.extent(0) != view2.extent(0)) { return false; @@ -301,10 +268,8 @@ bool kk_is_identical_view(view_type1 view1, view_type2 view2, eps_type eps) { typedef 
Kokkos::RangePolicy my_exec_space; size_t issame = 0; - Kokkos::parallel_reduce( - "KokkosKernels::Common::IsIdenticalView", my_exec_space(0, num_elements), - IsIdenticalFunctor(view1, view2, eps), - issame); + Kokkos::parallel_reduce("KokkosKernels::Common::IsIdenticalView", my_exec_space(0, num_elements), + IsIdenticalFunctor(view1, view2, eps), issame); MyExecSpace().fence(); if (issame > 0) { return false; @@ -314,15 +279,13 @@ bool kk_is_identical_view(view_type1 view1, view_type2 view2, eps_type eps) { } template ::mag_type> + typename eps_type = typename Kokkos::ArithTraits::mag_type> struct IsRelativelyIdenticalFunctor { view_type1 view1; view_type2 view2; eps_type eps; - IsRelativelyIdenticalFunctor(view_type1 view1_, view_type2 view2_, - eps_type eps_) + IsRelativelyIdenticalFunctor(view_type1 view1_, view_type2 view2_, eps_type eps_) : view1(view1_), view2(view2_), eps(eps_) {} KOKKOS_INLINE_FUNCTION @@ -333,27 +296,22 @@ struct IsRelativelyIdenticalFunctor { typedef Kokkos::ArithTraits KATM; mag_type val_diff = KATM::zero(); - if (KAT::abs(view1(i)) > mag_type(eps) || - KAT::abs(view2(i)) > mag_type(eps)) { - val_diff = KAT::abs(view1(i) - view2(i)) / - (KAT::abs(view1(i)) + KAT::abs(view2(i))); + if (KAT::abs(view1(i)) > mag_type(eps) || KAT::abs(view2(i)) > mag_type(eps)) { + val_diff = KAT::abs(view1(i) - view2(i)) / (KAT::abs(view1(i)) + KAT::abs(view2(i))); } if (val_diff > mag_type(eps)) { Kokkos::printf( "Values at index %d, %.6f + %.6fi and %.6f + %.6fi, differ too much " "(eps = %e, rel err = %e)\n", - (int)i, KAT::real(view1(i)), KAT::imag(view1(i)), KAT::real(view2(i)), - KAT::imag(view2(i)), eps, val_diff); + (int)i, KAT::real(view1(i)), KAT::imag(view1(i)), KAT::real(view2(i)), KAT::imag(view2(i)), eps, val_diff); num_diffs++; } } }; -template -bool kk_is_relatively_identical_view(view_type1 view1, view_type2 view2, - eps_type eps) { +template +bool kk_is_relatively_identical_view(view_type1 view1, view_type2 view2, eps_type eps) { if 
(view1.extent(0) != view2.extent(0)) { return false; } @@ -362,12 +320,9 @@ bool kk_is_relatively_identical_view(view_type1 view1, view_type2 view2, typedef Kokkos::RangePolicy my_exec_space; size_t numDifferences = 0; - Kokkos::parallel_reduce( - "KokkosKernels::Common::IsRelativelyIdenticalView", - my_exec_space(0, num_elements), - IsRelativelyIdenticalFunctor( - view1, view2, eps), - numDifferences); + Kokkos::parallel_reduce("KokkosKernels::Common::IsRelativelyIdenticalView", my_exec_space(0, num_elements), + IsRelativelyIdenticalFunctor(view1, view2, eps), + numDifferences); return numDifferences == 0; } @@ -377,8 +332,7 @@ struct ReduceMaxFunctor { typedef typename view_type::non_const_value_type value_type; const value_type min_val; ReduceMaxFunctor(view_type view_to_reduce_) - : view_to_reduce(view_to_reduce_), - min_val((std::numeric_limits::lowest())) {} + : view_to_reduce(view_to_reduce_), min_val((std::numeric_limits::lowest())) {} KOKKOS_INLINE_FUNCTION void operator()(const size_t &i, value_type &max_reduction) const { value_type val = view_to_reduce(i); @@ -404,28 +358,24 @@ struct ReduceMaxFunctor { }; template -void kk_view_reduce_max( - size_t num_elements, view_type view_to_reduce, - typename view_type::non_const_value_type &max_reduction) { +void kk_view_reduce_max(size_t num_elements, view_type view_to_reduce, + typename view_type::non_const_value_type &max_reduction) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_reduce( - "KokkosKernels::Common::ReduceMax", my_exec_space(0, num_elements), - ReduceMaxFunctor(view_to_reduce), max_reduction); + Kokkos::parallel_reduce("KokkosKernels::Common::ReduceMax", my_exec_space(0, num_elements), + ReduceMaxFunctor(view_to_reduce), max_reduction); } // xorshift hash/pseudorandom function (supported for 32- and 64-bit integer // types only) template KOKKOS_FORCEINLINE_FUNCTION Value xorshiftHash(Value v) { - static_assert(std::is_unsigned::value, - "xorshiftHash: value must be an unsigned 
integer type"); + static_assert(std::is_unsigned::value, "xorshiftHash: value must be an unsigned integer type"); uint64_t x = v; x ^= x >> 12; x ^= x << 25; x ^= x >> 27; - return std::is_same::value - ? static_cast((x * 2685821657736338717ULL - 1) >> 16) - : static_cast(x * 2685821657736338717ULL - 1); + return std::is_same::value ? static_cast((x * 2685821657736338717ULL - 1) >> 16) + : static_cast(x * 2685821657736338717ULL - 1); } struct ViewHashFunctor { @@ -458,16 +408,14 @@ uint32_t hashView(const View &v) { // but it's not defined on Intel 19 (with GCC 7.2.0 standard library). // So just check if it's available before using. #ifdef __cpp_lib_has_unique_object_representations - static_assert(std::has_unique_object_representations< - typename View::non_const_value_type>::value, + static_assert(std::has_unique_object_representations::value, "KokkosKernels::Impl::hashView: the view's element type must " "not have any padding bytes."); #endif size_t nbytes = v.span() * sizeof(typename View::value_type); uint32_t h; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, nbytes), - ViewHashFunctor(reinterpret_cast(v.data())), h); + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, nbytes), + ViewHashFunctor(reinterpret_cast(v.data())), h); return h; } @@ -476,18 +424,15 @@ struct SequentialFillFunctor { using size_type = typename V::size_type; using val_type = typename V::non_const_value_type; SequentialFillFunctor(const V &v_, val_type start_) : v(v_), start(start_) {} - KOKKOS_INLINE_FUNCTION void operator()(size_type i) const { - v(i) = start + (val_type)i; - } + KOKKOS_INLINE_FUNCTION void operator()(size_type i) const { v(i) = start + (val_type)i; } V v; val_type start; }; template void sequential_fill(const V &v, typename V::non_const_value_type start = 0) { - Kokkos::parallel_for( - Kokkos::RangePolicy(0, v.extent(0)), - SequentialFillFunctor(v, start)); + Kokkos::parallel_for(Kokkos::RangePolicy(0, v.extent(0)), + SequentialFillFunctor(v, start)); } } // 
namespace Impl diff --git a/common/src/KokkosKernels_Sorting.hpp b/common/src/KokkosKernels_Sorting.hpp index 20ce6deaa2..f91f11c164 100644 --- a/common/src/KokkosKernels_Sorting.hpp +++ b/common/src/KokkosKernels_Sorting.hpp @@ -17,7 +17,7 @@ #define _KOKKOSKERNELS_SORTING_HPP #include "Kokkos_Core.hpp" -#include "KokkosKernels_SimpleUtils.hpp" //for kk_exclusive_parallel_prefix_sum +#include "KokkosKernels_SimpleUtils.hpp" //for kk_exclusive_parallel_prefix_sum #include "KokkosKernels_ExecSpaceUtils.hpp" //for kk_is_gpu_exec_space #include @@ -26,10 +26,7 @@ namespace KokkosKernels { namespace Impl { template struct DefaultComparator { - KOKKOS_INLINE_FUNCTION bool operator()(const Value lhs, - const Value rhs) const { - return lhs < rhs; - } + KOKKOS_INLINE_FUNCTION bool operator()(const Value lhs, const Value rhs) const { return lhs < rhs; } }; } // namespace Impl @@ -39,9 +36,8 @@ struct DefaultComparator { // Bitonic sort: sorts v according to the comparator object's operator(). // Default comparator is just operator< for v's element type. -template < - typename View, typename ExecSpace, typename Ordinal, - typename Comparator = Impl::DefaultComparator> +template > void bitonicSort(View v, const Comparator& comp = Comparator()); // -------------------------------------------------------- @@ -51,15 +47,12 @@ void bitonicSort(View v, const Comparator& comp = Comparator()); // Radix sort. Not in-place: requires scratch array 'valuesAux' to be the same // size as values. ValueType must be an unsigned integer type. template -KOKKOS_INLINE_FUNCTION void SerialRadixSort(ValueType* values, - ValueType* valuesAux, Ordinal n); +KOKKOS_INLINE_FUNCTION void SerialRadixSort(ValueType* values, ValueType* valuesAux, Ordinal n); // Same as SerialRadixSort, but also permutes perm[0...n] as it sorts // values[0...n]. 
template -KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, - ValueType* valuesAux, - PermType* perm, PermType* permAux, +KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, ValueType* valuesAux, PermType* perm, PermType* permAux, Ordinal n); // ------------------------------------------------------------------- @@ -70,39 +63,32 @@ KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, // raw array according to the comparator. template > -KOKKOS_INLINE_FUNCTION void TeamBitonicSort( - ValueType* values, Ordinal n, const TeamMember mem, - const Comparator& comp = Comparator()); +KOKKOS_INLINE_FUNCTION void TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem, + const Comparator& comp = Comparator()); // Same as SerialRadixSort, but also permutes perm[0...n] as it sorts // values[0...n]. -template > -KOKKOS_INLINE_FUNCTION void TeamBitonicSort2( - ValueType* values, PermType* perm, Ordinal n, const TeamMember mem, - const Comparator& comp = Comparator()); +KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember mem, + const Comparator& comp = Comparator()); namespace Impl { // Functor that sorts a view on one team -template +template struct BitonicSingleTeamFunctor { - BitonicSingleTeamFunctor(View& v_, const Comparator& comp_) - : v(v_), comp(comp_) {} + BitonicSingleTeamFunctor(View& v_, const Comparator& comp_) : v(v_), comp(comp_) {} KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { - KokkosKernels::TeamBitonicSort( - v.data(), v.extent(0), t, comp); + KokkosKernels::TeamBitonicSort(v.data(), v.extent(0), t, + comp); }; View v; Comparator comp; }; // Functor that sorts equally sized chunks on each team -template +template struct BitonicChunkFunctor { BitonicChunkFunctor(View& v_, const Comparator& comp_, Ordinal chunkSize_) : v(v_), comp(comp_), chunkSize(chunkSize_) {} @@ -111,9 +97,8 @@ struct BitonicChunkFunctor { Ordinal chunkStart = chunk * 
chunkSize; Ordinal n = chunkSize; if (chunkStart + n > Ordinal(v.extent(0))) n = v.extent(0) - chunkStart; - KokkosKernels::TeamBitonicSort( - v.data() + chunkStart, n, t, comp); + KokkosKernels::TeamBitonicSort(v.data() + chunkStart, n, + t, comp); }; View v; Comparator comp; @@ -122,12 +107,10 @@ struct BitonicChunkFunctor { // Functor that does just the first phase (brown) of bitonic sort on // equally-sized chunks -template +template struct BitonicPhase1Functor { typedef typename View::value_type Value; - BitonicPhase1Functor(View& v_, const Comparator& comp_, Ordinal boxSize_, - Ordinal teamsPerBox_) + BitonicPhase1Functor(View& v_, const Comparator& comp_, Ordinal boxSize_, Ordinal teamsPerBox_) : v(v_), comp(comp_), boxSize(boxSize_), teamsPerBox(teamsPerBox_) {} KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { Ordinal box = t.league_rank() / teamsPerBox; @@ -135,18 +118,17 @@ struct BitonicPhase1Functor { Ordinal work = boxSize / teamsPerBox / 2; Ordinal workStart = work * (t.league_rank() % teamsPerBox); Ordinal workReflect = boxSize - workStart - 1; - Kokkos::parallel_for(Kokkos::TeamThreadRange(t, work), - [&](const Ordinal i) { - Ordinal elem1 = boxStart + workStart + i; - Ordinal elem2 = boxStart + workReflect - i; - if (elem2 < Ordinal(v.extent(0))) { - if (comp(v(elem2), v(elem1))) { - Value temp = v(elem1); - v(elem1) = v(elem2); - v(elem2) = temp; - } - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(t, work), [&](const Ordinal i) { + Ordinal elem1 = boxStart + workStart + i; + Ordinal elem2 = boxStart + workReflect - i; + if (elem2 < Ordinal(v.extent(0))) { + if (comp(v(elem2), v(elem1))) { + Value temp = v(elem1); + v(elem1) = v(elem2); + v(elem2) = temp; + } + } + }); }; View v; Comparator comp; @@ -155,12 +137,10 @@ struct BitonicPhase1Functor { }; // Functor that does the second phase (red) of bitonic sort -template +template struct BitonicPhase2Functor { typedef typename View::value_type Value; - 
BitonicPhase2Functor(View& v_, const Comparator& comp_, Ordinal boxSize_, - Ordinal teamsPerBox_) + BitonicPhase2Functor(View& v_, const Comparator& comp_, Ordinal boxSize_, Ordinal teamsPerBox_) : v(v_), comp(comp_), boxSize(boxSize_), teamsPerBox(teamsPerBox_) {} KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { Ordinal logBoxSize = 1; @@ -170,18 +150,17 @@ struct BitonicPhase2Functor { Ordinal work = boxSize / teamsPerBox / 2; Ordinal workStart = boxStart + work * (t.league_rank() % teamsPerBox); Ordinal jump = boxSize / 2; - Kokkos::parallel_for(Kokkos::TeamThreadRange(t, work), - [&](const Ordinal i) { - Ordinal elem1 = workStart + i; - Ordinal elem2 = workStart + jump + i; - if (elem2 < Ordinal(v.extent(0))) { - if (comp(v(elem2), v(elem1))) { - Value temp = v(elem1); - v(elem1) = v(elem2); - v(elem2) = temp; - } - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(t, work), [&](const Ordinal i) { + Ordinal elem1 = workStart + i; + Ordinal elem2 = workStart + jump + i; + if (elem2 < Ordinal(v.extent(0))) { + if (comp(v(elem2), v(elem1))) { + Value temp = v(elem1); + v(elem1) = v(elem2); + v(elem2) = temp; + } + } + }); if (teamsPerBox == 1) { // This team can finish phase 2 for all the smaller red boxes that follow, // since there are no longer cross-team data dependencies @@ -189,26 +168,23 @@ struct BitonicPhase2Functor { t.team_barrier(); Ordinal logSubBoxSize = logBoxSize - subLevel; Ordinal subBoxSize = Ordinal(1) << logSubBoxSize; - Kokkos::parallel_for( - Kokkos::TeamThreadRange(t, work), [&](const Ordinal i) { - Ordinal globalThread = i + t.league_rank() * work; - Ordinal subBox = globalThread >> (logSubBoxSize - 1); - Ordinal subBoxStart = subBox << logSubBoxSize; - Ordinal subBoxOffset = - globalThread & ((Ordinal(1) << (logSubBoxSize - 1)) - - 1); // i % (subBoxSize / 2) - Ordinal elem1 = subBoxStart + subBoxOffset; - // later phases (pink box): within a block, compare with fixed - // distance (boxSize / 2) apart - Ordinal 
elem2 = elem1 + subBoxSize / 2; - if (elem2 < Ordinal(v.extent(0))) { - if (comp(v(elem2), v(elem1))) { - Value temp = v(elem1); - v(elem1) = v(elem2); - v(elem2) = temp; - } - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(t, work), [&](const Ordinal i) { + Ordinal globalThread = i + t.league_rank() * work; + Ordinal subBox = globalThread >> (logSubBoxSize - 1); + Ordinal subBoxStart = subBox << logSubBoxSize; + Ordinal subBoxOffset = globalThread & ((Ordinal(1) << (logSubBoxSize - 1)) - 1); // i % (subBoxSize / 2) + Ordinal elem1 = subBoxStart + subBoxOffset; + // later phases (pink box): within a block, compare with fixed + // distance (boxSize / 2) apart + Ordinal elem2 = elem1 + subBoxSize / 2; + if (elem2 < Ordinal(v.extent(0))) { + if (comp(v(elem2), v(elem1))) { + Value temp = v(elem1); + v(elem1) = v(elem2); + v(elem2) = temp; + } + } + }); } } }; @@ -228,18 +204,15 @@ struct BitonicPhase2Functor { // type and an arbitrary device-compatible comparison operator (provided through // operator() of Comparator) If comparator is void, use operator< (which should // only be used for primitives) -template +template void bitonicSort(View v, const Comparator& comp) { typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; Ordinal n = v.extent(0); // If n is small, just sort on a single team if (n <= Ordinal(1) << 12) { - Kokkos::parallel_for( - team_policy(1, Kokkos::AUTO()), - Impl::BitonicSingleTeamFunctor( - v, comp)); + Kokkos::parallel_for(team_policy(1, Kokkos::AUTO()), + Impl::BitonicSingleTeamFunctor(v, comp)); } else { Ordinal npot = 1; while (npot < n) npot <<= 1; @@ -247,22 +220,17 @@ void bitonicSort(View v, const Comparator& comp) { Ordinal chunkSize = 512; Ordinal numTeams = npot / chunkSize; // First, sort within teams - Kokkos::parallel_for( - team_policy(numTeams, Kokkos::AUTO()), - Impl::BitonicChunkFunctor( - v, comp, chunkSize)); - for (int teamsPerBox = 2; teamsPerBox <= npot / chunkSize; - 
teamsPerBox *= 2) { + Kokkos::parallel_for(team_policy(numTeams, Kokkos::AUTO()), + Impl::BitonicChunkFunctor(v, comp, chunkSize)); + for (int teamsPerBox = 2; teamsPerBox <= npot / chunkSize; teamsPerBox *= 2) { Ordinal boxSize = teamsPerBox * chunkSize; Kokkos::parallel_for( team_policy(numTeams, Kokkos::AUTO()), - Impl::BitonicPhase1Functor( - v, comp, boxSize, teamsPerBox)); + Impl::BitonicPhase1Functor(v, comp, boxSize, teamsPerBox)); for (int boxDiv = 1; teamsPerBox >> boxDiv; boxDiv++) { - Kokkos::parallel_for( - team_policy(numTeams, Kokkos::AUTO()), - Impl::BitonicPhase2Functor( - v, comp, boxSize >> boxDiv, teamsPerBox >> boxDiv)); + Kokkos::parallel_for(team_policy(numTeams, Kokkos::AUTO()), + Impl::BitonicPhase2Functor( + v, comp, boxSize >> boxDiv, teamsPerBox >> boxDiv)); } } } @@ -273,11 +241,9 @@ void bitonicSort(View v, const Comparator& comp) { // Better on CPU cores. Con: requires auxiliary storage, and this version only // works for integers template -KOKKOS_INLINE_FUNCTION void SerialRadixSort(ValueType* values, - ValueType* valuesAux, Ordinal n) { - static_assert( - std::is_integral::value && std::is_unsigned::value, - "radixSort can only be run on unsigned integers."); +KOKKOS_INLINE_FUNCTION void SerialRadixSort(ValueType* values, ValueType* valuesAux, Ordinal n) { + static_assert(std::is_integral::value && std::is_unsigned::value, + "radixSort can only be run on unsigned integers."); if (n <= 1) return; ValueType maxVal = 0; for (Ordinal i = 0; i < n; i++) { @@ -318,13 +284,13 @@ KOKKOS_INLINE_FUNCTION void SerialRadixSort(ValueType* values, // threads if (!inAux) { for (Ordinal i = 0; i < n; i++) { - Ordinal bucket = (values[i] & mask) >> maskPos; + Ordinal bucket = (values[i] & mask) >> maskPos; valuesAux[offset[bucket + 1] - count[bucket]] = values[i]; count[bucket]--; } } else { for (Ordinal i = 0; i < n; i++) { - Ordinal bucket = (valuesAux[i] & mask) >> maskPos; + Ordinal bucket = (valuesAux[i] & mask) >> maskPos; values[offset[bucket 
+ 1] - count[bucket]] = valuesAux[i]; count[bucket]--; } @@ -348,13 +314,10 @@ KOKKOS_INLINE_FUNCTION void SerialRadixSort(ValueType* values, // lane. Con: requires auxiliary storage, this version only works for integers // (although float/double is possible) template -KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, - ValueType* valuesAux, - PermType* perm, PermType* permAux, +KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, ValueType* valuesAux, PermType* perm, PermType* permAux, Ordinal n) { - static_assert( - std::is_integral::value && std::is_unsigned::value, - "radixSort can only be run on unsigned integers."); + static_assert(std::is_integral::value && std::is_unsigned::value, + "radixSort can only be run on unsigned integers."); if (n <= 1) return; ValueType maxVal = 0; for (Ordinal i = 0; i < n; i++) { @@ -394,14 +357,14 @@ KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, // threads if (!inAux) { for (Ordinal i = 0; i < n; i++) { - Ordinal bucket = (values[i] & mask) >> maskPos; + Ordinal bucket = (values[i] & mask) >> maskPos; valuesAux[offset[bucket + 1] - count[bucket]] = values[i]; permAux[offset[bucket + 1] - count[bucket]] = perm[i]; count[bucket]--; } } else { for (Ordinal i = 0; i < n; i++) { - Ordinal bucket = (valuesAux[i] & mask) >> maskPos; + Ordinal bucket = (valuesAux[i] & mask) >> maskPos; values[offset[bucket + 1] - count[bucket]] = valuesAux[i]; perm[offset[bucket + 1] - count[bucket]] = permAux[i]; count[bucket]--; @@ -425,10 +388,8 @@ KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, // trivially-copyable) Pros: In-place, plenty of parallelism for GPUs, and // memory references are coalesced Con: O(n log^2(n)) serial time is bad on CPUs // Good diagram of the algorithm at https://en.wikipedia.org/wiki/Bitonic_sorter -template -KOKKOS_INLINE_FUNCTION void TeamBitonicSort(ValueType* values, Ordinal n, - const TeamMember mem, +template +KOKKOS_INLINE_FUNCTION void 
TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem, const Comparator& comp) { // Algorithm only works on power-of-two input size only. // If n is not a power-of-two, will implicitly pretend @@ -443,52 +404,49 @@ KOKKOS_INLINE_FUNCTION void TeamBitonicSort(ValueType* values, Ordinal n, for (Ordinal i = 0; i < levels; i++) { for (Ordinal j = 0; j <= i; j++) { // n/2 pairs of items are compared in parallel - Kokkos::parallel_for( - Kokkos::TeamVectorRange(mem, npot / 2), [=](const Ordinal t) { - // How big are the brown/pink boxes? - Ordinal boxSize = Ordinal(2) << (i - j); - // Which box contains this thread? - Ordinal boxID = t >> (i - j); // t * 2 / boxSize; - Ordinal boxStart = boxID << (1 + i - j); // boxID * boxSize - Ordinal boxOffset = t - (boxStart >> 1); // t - boxID * boxSize / - // 2; - Ordinal elem1 = boxStart + boxOffset; - if (j == 0) { - // first phase (brown box): within a block, compare with the - // opposite value in the box - Ordinal elem2 = boxStart + boxSize - 1 - boxOffset; - if (elem2 < n) { - // both elements in bounds, so compare them and swap if out of - // order - if (comp(values[elem2], values[elem1])) { - ValueType temp = values[elem1]; - values[elem1] = values[elem2]; - values[elem2] = temp; - } - } - } else { - // later phases (pink box): within a block, compare with fixed - // distance (boxSize / 2) apart - Ordinal elem2 = elem1 + boxSize / 2; - if (elem2 < n) { - if (comp(values[elem2], values[elem1])) { - ValueType temp = values[elem1]; - values[elem1] = values[elem2]; - values[elem2] = temp; - } - } + Kokkos::parallel_for(Kokkos::TeamVectorRange(mem, npot / 2), [=](const Ordinal t) { + // How big are the brown/pink boxes? + Ordinal boxSize = Ordinal(2) << (i - j); + // Which box contains this thread? 
+ Ordinal boxID = t >> (i - j); // t * 2 / boxSize; + Ordinal boxStart = boxID << (1 + i - j); // boxID * boxSize + Ordinal boxOffset = t - (boxStart >> 1); // t - boxID * boxSize / + // 2; + Ordinal elem1 = boxStart + boxOffset; + if (j == 0) { + // first phase (brown box): within a block, compare with the + // opposite value in the box + Ordinal elem2 = boxStart + boxSize - 1 - boxOffset; + if (elem2 < n) { + // both elements in bounds, so compare them and swap if out of + // order + if (comp(values[elem2], values[elem1])) { + ValueType temp = values[elem1]; + values[elem1] = values[elem2]; + values[elem2] = temp; + } + } + } else { + // later phases (pink box): within a block, compare with fixed + // distance (boxSize / 2) apart + Ordinal elem2 = elem1 + boxSize / 2; + if (elem2 < n) { + if (comp(values[elem2], values[elem1])) { + ValueType temp = values[elem1]; + values[elem1] = values[elem2]; + values[elem2] = temp; } - }); + } + } + }); mem.team_barrier(); } } } // Sort "values", while applying the same swaps to "perm" -template -KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, - Ordinal n, const TeamMember mem, +template +KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember mem, const Comparator& comp) { // Algorithm only works on power-of-two input size only. // If n is not a power-of-two, will implicitly pretend @@ -503,48 +461,47 @@ KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, for (Ordinal i = 0; i < levels; i++) { for (Ordinal j = 0; j <= i; j++) { // n/2 pairs of items are compared in parallel - Kokkos::parallel_for( - Kokkos::TeamVectorRange(mem, npot / 2), [=](const Ordinal t) { - // How big are the brown/pink boxes? - Ordinal boxSize = Ordinal(2) << (i - j); - // Which box contains this thread? 
- Ordinal boxID = t >> (i - j); // t * 2 / boxSize; - Ordinal boxStart = boxID << (1 + i - j); // boxID * boxSize - Ordinal boxOffset = t - (boxStart >> 1); // t - boxID * boxSize / - // 2; - Ordinal elem1 = boxStart + boxOffset; - if (j == 0) { - // first phase (brown box): within a block, compare with the - // opposite value in the box - Ordinal elem2 = boxStart + boxSize - 1 - boxOffset; - if (elem2 < n) { - // both elements in bounds, so compare them and swap if out of - // order - if (comp(values[elem2], values[elem1])) { - ValueType temp1 = values[elem1]; - values[elem1] = values[elem2]; - values[elem2] = temp1; - PermType temp2 = perm[elem1]; - perm[elem1] = perm[elem2]; - perm[elem2] = temp2; - } - } - } else { - // later phases (pink box): within a block, compare with fixed - // distance (boxSize / 2) apart - Ordinal elem2 = elem1 + boxSize / 2; - if (elem2 < n) { - if (comp(values[elem2], values[elem1])) { - ValueType temp1 = values[elem1]; - values[elem1] = values[elem2]; - values[elem2] = temp1; - PermType temp2 = perm[elem1]; - perm[elem1] = perm[elem2]; - perm[elem2] = temp2; - } - } + Kokkos::parallel_for(Kokkos::TeamVectorRange(mem, npot / 2), [=](const Ordinal t) { + // How big are the brown/pink boxes? + Ordinal boxSize = Ordinal(2) << (i - j); + // Which box contains this thread? 
+ Ordinal boxID = t >> (i - j); // t * 2 / boxSize; + Ordinal boxStart = boxID << (1 + i - j); // boxID * boxSize + Ordinal boxOffset = t - (boxStart >> 1); // t - boxID * boxSize / + // 2; + Ordinal elem1 = boxStart + boxOffset; + if (j == 0) { + // first phase (brown box): within a block, compare with the + // opposite value in the box + Ordinal elem2 = boxStart + boxSize - 1 - boxOffset; + if (elem2 < n) { + // both elements in bounds, so compare them and swap if out of + // order + if (comp(values[elem2], values[elem1])) { + ValueType temp1 = values[elem1]; + values[elem1] = values[elem2]; + values[elem2] = temp1; + PermType temp2 = perm[elem1]; + perm[elem1] = perm[elem2]; + perm[elem2] = temp2; + } + } + } else { + // later phases (pink box): within a block, compare with fixed + // distance (boxSize / 2) apart + Ordinal elem2 = elem1 + boxSize / 2; + if (elem2 < n) { + if (comp(values[elem2], values[elem1])) { + ValueType temp1 = values[elem1]; + values[elem1] = values[elem2]; + values[elem2] = temp1; + PermType temp2 = perm[elem1]; + perm[elem1] = perm[elem2]; + perm[elem2] = temp2; } - }); + } + } + }); mem.team_barrier(); } } @@ -554,49 +511,40 @@ KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, // KokkosKernels::Impl:: namespace Impl { -template < - typename View, typename ExecSpace, typename Ordinal, - typename Comparator = Impl::DefaultComparator> +template > [[deprecated]] void bitonicSort(View v, const Comparator& comp = Comparator()) { KokkosKernels::bitonicSort(v, comp); } template -[[deprecated]] KOKKOS_INLINE_FUNCTION void SerialRadixSort(ValueType* values, - ValueType* valuesAux, - Ordinal n) { +[[deprecated]] KOKKOS_INLINE_FUNCTION void SerialRadixSort(ValueType* values, ValueType* valuesAux, Ordinal n) { KokkosKernels::SerialRadixSort(values, valuesAux, n); } // Same as SerialRadixSort, but also permutes perm[0...n] as it sorts // values[0...n]. 
template -[[deprecated]] KOKKOS_INLINE_FUNCTION void SerialRadixSort2( - ValueType* values, ValueType* valuesAux, PermType* perm, PermType* permAux, - Ordinal n) { - KokkosKernels::SerialRadixSort2( - values, valuesAux, perm, permAux, n); +[[deprecated]] KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, ValueType* valuesAux, PermType* perm, + PermType* permAux, Ordinal n) { + KokkosKernels::SerialRadixSort2(values, valuesAux, perm, permAux, n); } template > -[[deprecated]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort( - ValueType* values, Ordinal n, const TeamMember mem, - const Comparator& comp = Comparator()) { - KokkosKernels::TeamBitonicSort( - values, n, mem, comp); +[[deprecated]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem, + const Comparator& comp = Comparator()) { + KokkosKernels::TeamBitonicSort(values, n, mem, comp); } // Same as SerialRadixSort, but also permutes perm[0...n] as it sorts // values[0...n]. -template > -[[deprecated]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort2( - ValueType* values, PermType* perm, Ordinal n, const TeamMember mem, - const Comparator& comp = Comparator()) { - KokkosKernels::TeamBitonicSort2(values, perm, n, mem, comp); +[[deprecated]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, + const TeamMember mem, + const Comparator& comp = Comparator()) { + KokkosKernels::TeamBitonicSort2(values, perm, n, mem, comp); } } // namespace Impl diff --git a/common/src/KokkosKernels_TplsVersion.hpp b/common/src/KokkosKernels_TplsVersion.hpp index 3e00d72457..692f0fd350 100644 --- a/common/src/KokkosKernels_TplsVersion.hpp +++ b/common/src/KokkosKernels_TplsVersion.hpp @@ -50,8 +50,7 @@ inline std::string cusparse_version_string() { // Print version std::stringstream ss; - ss << CUSPARSE_VER_MAJOR << "." << CUSPARSE_VER_MINOR << "." - << CUSPARSE_VER_PATCH << "." << CUSPARSE_VER_BUILD; + ss << CUSPARSE_VER_MAJOR << "." 
<< CUSPARSE_VER_MINOR << "." << CUSPARSE_VER_PATCH << "." << CUSPARSE_VER_BUILD; return ss.str(); } @@ -61,8 +60,7 @@ inline std::string cusparse_version_string() { inline std::string cusolver_version_string() { std::stringstream ss; - ss << CUSOLVER_VER_MAJOR << "." << CUSOLVER_VER_MINOR << "." - << CUSOLVER_VER_PATCH << "." << CUSOLVER_VER_BUILD; + ss << CUSOLVER_VER_MAJOR << "." << CUSOLVER_VER_MINOR << "." << CUSOLVER_VER_PATCH << "." << CUSOLVER_VER_BUILD; return ss.str(); } diff --git a/common/src/KokkosKernels_Uniform_Initialized_MemoryPool.hpp b/common/src/KokkosKernels_Uniform_Initialized_MemoryPool.hpp index e40b81a762..aa477815d6 100644 --- a/common/src/KokkosKernels_Uniform_Initialized_MemoryPool.hpp +++ b/common/src/KokkosKernels_Uniform_Initialized_MemoryPool.hpp @@ -176,10 +176,8 @@ class UniformMemoryPool { * initialized_value: the value to initialize \param pool_type_: whether * ManyThread2OneChunk or OneThread2OneChunk */ - UniformMemoryPool(const size_t num_chunks_, const size_t set_chunk_size_, - const data_type initialized_value = 0, - const PoolType pool_type_ = OneThread2OneChunk, - bool initialize = true) + UniformMemoryPool(const size_t num_chunks_, const size_t set_chunk_size_, const data_type initialized_value = 0, + const PoolType pool_type_ = OneThread2OneChunk, bool initialize = true) : num_chunks(1), num_set_chunks(num_chunks_), modular_num_chunks(0), @@ -200,9 +198,7 @@ class UniformMemoryPool { modular_num_chunks = num_chunks - 1; overall_size = num_chunks * chunk_size; if (num_set_chunks > 0) { - data_view = data_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "pool data"), - overall_size); + data_view = data_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "pool data"), overall_size); } data = (data_view.data()); @@ -233,9 +229,9 @@ class UniformMemoryPool { ~UniformMemoryPool() = default; - UniformMemoryPool(UniformMemoryPool &&) = default; - UniformMemoryPool(const UniformMemoryPool &) = default; - 
UniformMemoryPool &operator=(UniformMemoryPool &&) = default; + UniformMemoryPool(UniformMemoryPool &&) = default; + UniformMemoryPool(const UniformMemoryPool &) = default; + UniformMemoryPool &operator=(UniformMemoryPool &&) = default; UniformMemoryPool &operator=(const UniformMemoryPool &) = default; /** @@ -295,12 +291,10 @@ class UniformMemoryPool { } KOKKOS_INLINE_FUNCTION - data_type *get_arbitrary_free_chunk(const size_t &thread_index, - const size_t max_tries) const { + data_type *get_arbitrary_free_chunk(const size_t &thread_index, const size_t max_tries) const { size_t chunk_index = thread_index & modular_num_chunks; size_t num_try = 0; - while (!Kokkos::atomic_compare_exchange_strong(pchunk_locks + chunk_index, - 0, 1)) { + while (!Kokkos::atomic_compare_exchange_strong(pchunk_locks + chunk_index, 0, 1)) { chunk_index = (chunk_index + 1) & modular_num_chunks; ++num_try; if (num_try > max_tries) { @@ -344,9 +338,7 @@ class UniformMemoryPool { * \brief Returns the chunk index of the pointer. */ KOKKOS_INLINE_FUNCTION - size_t get_chunk_index(const data_type *chunk_ptr) const { - return (chunk_ptr - data) / chunk_size; - } + size_t get_chunk_index(const data_type *chunk_ptr) const { return (chunk_ptr - data) / chunk_size; } /** * \brief Releases the memory that has been allocated. 
diff --git a/common/src/KokkosKernels_UpperBound.hpp b/common/src/KokkosKernels_UpperBound.hpp index 901c865743..97efd7559c 100644 --- a/common/src/KokkosKernels_UpperBound.hpp +++ b/common/src/KokkosKernels_UpperBound.hpp @@ -70,11 +70,9 @@ namespace KokkosKernels { \returns index of first element in view where pred(value,element) is true, or view.size if no such element exists */ -template > +template > KOKKOS_INLINE_FUNCTION typename ViewLike::size_type upper_bound_thread( - const ViewLike &view, const typename ViewLike::non_const_value_type &value, - Pred pred = Pred()) { + const ViewLike &view, const typename ViewLike::non_const_value_type &value, Pred pred = Pred()) { return lower_bound_thread(view, value, Neg(Refl(pred))); } @@ -88,11 +86,10 @@ KOKKOS_INLINE_FUNCTION typename ViewLike::size_type upper_bound_thread( \returns index of first element in view where pred(value,element) is true, or view.size if no such element exists */ -template > +template > KOKKOS_INLINE_FUNCTION typename ViewLike::size_type upper_bound_team( - const TeamMember &handle, const ViewLike &view, - const typename ViewLike::non_const_value_type &value, Pred pred = Pred()) { + const TeamMember &handle, const ViewLike &view, const typename ViewLike::non_const_value_type &value, + Pred pred = Pred()) { return lower_bound_team(handle, view, value, Neg(Refl(pred))); } diff --git a/common/src/KokkosKernels_Utils.hpp b/common/src/KokkosKernels_Utils.hpp index 92419424b6..a087002d31 100644 --- a/common/src/KokkosKernels_Utils.hpp +++ b/common/src/KokkosKernels_Utils.hpp @@ -36,31 +36,26 @@ ExecSpaceType get_exec_space_type() { return kk_get_exec_space_type(); } -inline int get_suggested_vector__size(size_t nr, size_t nnz, - ExecSpaceType exec_space) { +inline int get_suggested_vector__size(size_t nr, size_t nnz, ExecSpaceType exec_space) { return kk_get_suggested_vector_size(nr, nnz, exec_space); } template -void get_histogram(typename in_lno_view_t::size_type in_elements, - in_lno_view_t 
in_view, +void get_histogram(typename in_lno_view_t::size_type in_elements, in_lno_view_t in_view, out_lno_view_t histogram /*must be initialized with 0s*/) { - kk_get_histogram( - in_elements, in_view, histogram); + kk_get_histogram(in_elements, in_view, histogram); } template void get_suggested_vector_size(int &suggested_vector_size_, idx nr, idx nnz) { - suggested_vector_size_ = kk_get_suggested_vector_size( - nr, nnz, get_exec_space_type()); + suggested_vector_size_ = kk_get_suggested_vector_size(nr, nnz, get_exec_space_type()); } // Get the best team size for the given functor. // If it uses shared memory, the amount used must be available through // f.team_shmem_size(n), not through the TeamPolicy. If this is how dynamic // shared is set, just use AUTO for the team size. -template +template int get_suggested_team_size(Functor &f, int vector_size) { using execution_space = typename team_policy_t::traits::execution_space; if (kk_is_gpu_exec_space()) { @@ -70,23 +65,18 @@ int get_suggested_team_size(Functor &f, int vector_size) { return 1; } -template -int get_suggested_team_size(Functor &f, int vector_size, size_t sharedPerTeam, - size_t sharedPerThread) { +template +int get_suggested_team_size(Functor &f, int vector_size, size_t sharedPerTeam, size_t sharedPerThread) { using execution_space = typename team_policy_t::traits::execution_space; if (kk_is_gpu_exec_space()) { - team_policy_t temp = - team_policy_t(1, 1, vector_size) - .set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam), - Kokkos::PerThread(sharedPerThread)); + team_policy_t temp = team_policy_t(1, 1, vector_size) + .set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam), Kokkos::PerThread(sharedPerThread)); return temp.team_size_recommended(f, ParallelTag()); } else return 1; } -template +template struct FillSymmetricEdges { typedef typename idx_array_type::value_type idx; idx num_rows; @@ -97,44 +87,35 @@ struct FillSymmetricEdges { idx_out_edge_array_type srcs; idx_out_edge_array_type dsts; - 
FillSymmetricEdges(typename idx_array_type::value_type num_rows_, - idx_array_type xadj_, idx_edge_array_type adj_, + FillSymmetricEdges(typename idx_array_type::value_type num_rows_, idx_array_type xadj_, idx_edge_array_type adj_, - idx_out_edge_array_type srcs_, - idx_out_edge_array_type dsts_) - : num_rows(num_rows_), - nnz(adj_.extent(0)), - xadj(xadj_), - adj(adj_), - srcs(srcs_), - dsts(dsts_) {} + idx_out_edge_array_type srcs_, idx_out_edge_array_type dsts_) + : num_rows(num_rows_), nnz(adj_.extent(0)), xadj(xadj_), adj(adj_), srcs(srcs_), dsts(dsts_) {} KOKKOS_INLINE_FUNCTION void operator()(const team_member &teamMember) const { - idx ii = teamMember.league_rank() * teamMember.team_size() + - teamMember.team_rank(); + idx ii = teamMember.league_rank() * teamMember.team_size() + teamMember.team_rank(); if (ii >= num_rows) return; idx row_begin = xadj[ii]; idx row_end = xadj[ii + 1]; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(teamMember, row_end - row_begin), [&](idx i) { - idx adjind = i + row_begin; - idx colIndex = adj[adjind]; - if (colIndex < num_rows) { - srcs[adjind] = ii + 1; - dsts[adjind] = colIndex + 1; - if (colIndex != ii) { - srcs[adjind + nnz] = colIndex + 1; - dsts[adjind + nnz] = ii + 1; - } - } - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(teamMember, row_end - row_begin), [&](idx i) { + idx adjind = i + row_begin; + idx colIndex = adj[adjind]; + if (colIndex < num_rows) { + srcs[adjind] = ii + 1; + dsts[adjind] = colIndex + 1; + if (colIndex != ii) { + srcs[adjind + nnz] = colIndex + 1; + dsts[adjind + nnz] = ii + 1; + } + } + }); } }; -template +template struct FillSymmetricEdgesHashMap { typedef typename in_lno_row_view_t::value_type idx; idx num_rows; @@ -145,60 +126,47 @@ struct FillSymmetricEdgesHashMap { out_lno_row_view_t pre_pps; bool lower_only; - FillSymmetricEdgesHashMap(idx num_rows_, in_lno_row_view_t xadj_, - in_lno_nnz_view_t adj_, hashmap_t hashmap_, + FillSymmetricEdgesHashMap(idx num_rows_, 
in_lno_row_view_t xadj_, in_lno_nnz_view_t adj_, hashmap_t hashmap_, out_lno_row_view_t pre_pps_) - : num_rows(num_rows_), - nnz(adj_.extent(0)), - xadj(xadj_), - adj(adj_), - umap(hashmap_), - pre_pps(pre_pps_) {} + : num_rows(num_rows_), nnz(adj_.extent(0)), xadj(xadj_), adj(adj_), umap(hashmap_), pre_pps(pre_pps_) {} KOKKOS_INLINE_FUNCTION void operator()(const team_member &teamMember /*, idx &nnz*/) const { - typedef typename std::remove_reference::type - atomic_incr_type; - idx ii = teamMember.league_rank() * teamMember.team_size() + - teamMember.team_rank(); + typedef typename std::remove_reference::type atomic_incr_type; + idx ii = teamMember.league_rank() * teamMember.team_size() + teamMember.team_rank(); if (ii >= num_rows) { return; } idx row_begin = xadj[ii]; idx row_end = xadj[ii + 1]; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(teamMember, row_end - row_begin), [&](idx i) { - idx adjind = i + row_begin; - idx colIndex = adj[adjind]; - if (colIndex < num_rows) { - if (colIndex < ii) { - Kokkos::UnorderedMapInsertResult r = - umap.insert(Kokkos::pair(colIndex, ii)); - if (r.success()) { - Kokkos::atomic_fetch_add(&(pre_pps(ii)), atomic_incr_type(1)); - - Kokkos::atomic_fetch_add(&(pre_pps(colIndex)), - atomic_incr_type(1)); - } - } else if (colIndex > ii) { - Kokkos::UnorderedMapInsertResult r = - umap.insert(Kokkos::pair(ii, colIndex)); - if (r.success()) { - Kokkos::atomic_fetch_add(&(pre_pps(colIndex)), - atomic_incr_type(1)); - - Kokkos::atomic_fetch_add(&(pre_pps(ii)), atomic_incr_type(1)); - } - } else { - Kokkos::atomic_fetch_add(&(pre_pps(ii)), atomic_incr_type(1)); - } + Kokkos::parallel_for(Kokkos::ThreadVectorRange(teamMember, row_end - row_begin), [&](idx i) { + idx adjind = i + row_begin; + idx colIndex = adj[adjind]; + if (colIndex < num_rows) { + if (colIndex < ii) { + Kokkos::UnorderedMapInsertResult r = umap.insert(Kokkos::pair(colIndex, ii)); + if (r.success()) { + Kokkos::atomic_fetch_add(&(pre_pps(ii)), atomic_incr_type(1)); + 
+ Kokkos::atomic_fetch_add(&(pre_pps(colIndex)), atomic_incr_type(1)); } - }); + } else if (colIndex > ii) { + Kokkos::UnorderedMapInsertResult r = umap.insert(Kokkos::pair(ii, colIndex)); + if (r.success()) { + Kokkos::atomic_fetch_add(&(pre_pps(colIndex)), atomic_incr_type(1)); + + Kokkos::atomic_fetch_add(&(pre_pps(ii)), atomic_incr_type(1)); + } + } else { + Kokkos::atomic_fetch_add(&(pre_pps(ii)), atomic_incr_type(1)); + } + } + }); } }; -template +template struct FillSymmetricLowerEdgesHashMap { typedef typename in_lno_row_view_t::value_type idx; idx num_rows; @@ -208,55 +176,41 @@ struct FillSymmetricLowerEdgesHashMap { hashmap_t umap; out_lno_row_view_t pre_pps; - FillSymmetricLowerEdgesHashMap(idx num_rows_, in_lno_row_view_t xadj_, - in_lno_nnz_view_t adj_, hashmap_t hashmap_, - out_lno_row_view_t pre_pps_, - bool /* lower_only_ */ = false) - : num_rows(num_rows_), - nnz(adj_.extent(0)), - xadj(xadj_), - adj(adj_), - umap(hashmap_), - pre_pps(pre_pps_) {} + FillSymmetricLowerEdgesHashMap(idx num_rows_, in_lno_row_view_t xadj_, in_lno_nnz_view_t adj_, hashmap_t hashmap_, + out_lno_row_view_t pre_pps_, bool /* lower_only_ */ = false) + : num_rows(num_rows_), nnz(adj_.extent(0)), xadj(xadj_), adj(adj_), umap(hashmap_), pre_pps(pre_pps_) {} KOKKOS_INLINE_FUNCTION void operator()(const team_member &teamMember /*, idx &nnz*/) const { - typedef typename std::remove_reference::type - atomic_incr_type; - idx ii = teamMember.league_rank() * teamMember.team_size() + - teamMember.team_rank(); + typedef typename std::remove_reference::type atomic_incr_type; + idx ii = teamMember.league_rank() * teamMember.team_size() + teamMember.team_rank(); if (ii >= num_rows) { return; } idx row_begin = xadj[ii]; idx row_end = xadj[ii + 1]; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(teamMember, row_end - row_begin), [&](idx i) { - idx adjind = i + row_begin; - idx colIndex = adj[adjind]; - if (colIndex < num_rows) { - if (colIndex < ii) { - Kokkos::UnorderedMapInsertResult 
r = - umap.insert(Kokkos::pair(colIndex, ii)); - if (r.success()) { - Kokkos::atomic_fetch_add(&(pre_pps(colIndex)), - atomic_incr_type(1)); - } - } else if (colIndex > ii) { - Kokkos::UnorderedMapInsertResult r = - umap.insert(Kokkos::pair(ii, colIndex)); - if (r.success()) { - Kokkos::atomic_fetch_add(&(pre_pps(ii)), atomic_incr_type(1)); - } - } + Kokkos::parallel_for(Kokkos::ThreadVectorRange(teamMember, row_end - row_begin), [&](idx i) { + idx adjind = i + row_begin; + idx colIndex = adj[adjind]; + if (colIndex < num_rows) { + if (colIndex < ii) { + Kokkos::UnorderedMapInsertResult r = umap.insert(Kokkos::pair(colIndex, ii)); + if (r.success()) { + Kokkos::atomic_fetch_add(&(pre_pps(colIndex)), atomic_incr_type(1)); + } + } else if (colIndex > ii) { + Kokkos::UnorderedMapInsertResult r = umap.insert(Kokkos::pair(ii, colIndex)); + if (r.success()) { + Kokkos::atomic_fetch_add(&(pre_pps(ii)), atomic_incr_type(1)); } - }); + } + } + }); } }; -template struct FillSymmetricCRS_HashMap { typedef typename in_lno_row_view_t::value_type idx; @@ -268,10 +222,8 @@ struct FillSymmetricCRS_HashMap { out_lno_row_view_t pre_pps; out_lno_nnz_view_t sym_adj; - FillSymmetricCRS_HashMap(idx num_rows_, in_lno_row_view_t xadj_, - in_lno_nnz_view_t adj_, hashmap_t hashmap_, - out_lno_row_view_t pre_pps_, - out_lno_nnz_view_t sym_adj_) + FillSymmetricCRS_HashMap(idx num_rows_, in_lno_row_view_t xadj_, in_lno_nnz_view_t adj_, hashmap_t hashmap_, + out_lno_row_view_t pre_pps_, out_lno_nnz_view_t sym_adj_) : num_rows(num_rows_), nnz(adj_.extent(0)), xadj(xadj_), @@ -282,51 +234,42 @@ struct FillSymmetricCRS_HashMap { KOKKOS_INLINE_FUNCTION void operator()(const team_member_t &teamMember) const { - typedef typename std::remove_reference::type - atomic_incr_type; - idx ii = teamMember.league_rank() * teamMember.team_size() + - teamMember.team_rank(); + typedef typename std::remove_reference::type atomic_incr_type; + idx ii = teamMember.league_rank() * teamMember.team_size() + 
teamMember.team_rank(); if (ii >= num_rows) { return; } idx row_begin = xadj[ii]; idx row_end = xadj[ii + 1]; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(teamMember, row_end - row_begin), [&](idx i) { - idx adjind = i + row_begin; - idx colIndex = adj[adjind]; - if (colIndex < num_rows) { - if (colIndex < ii) { - if (umap.insert(Kokkos::pair(colIndex, ii)).success()) { - idx cAdjInd = Kokkos::atomic_fetch_add(&(pre_pps(colIndex)), - atomic_incr_type(1)); - idx iAdjInd = Kokkos::atomic_fetch_add(&(pre_pps(ii)), - atomic_incr_type(1)); - sym_adj[cAdjInd] = ii; - sym_adj[iAdjInd] = colIndex; - } - } else if (colIndex > ii) { - if (umap.insert(Kokkos::pair(ii, colIndex)).success()) { - idx cAdjInd = Kokkos::atomic_fetch_add(&(pre_pps(colIndex)), - atomic_incr_type(1)); - idx iAdjInd = Kokkos::atomic_fetch_add(&(pre_pps(ii)), - atomic_incr_type(1)); - sym_adj[cAdjInd] = ii; - sym_adj[iAdjInd] = colIndex; - } - } else { - idx cAdjInd = Kokkos::atomic_fetch_add(&(pre_pps(colIndex)), - atomic_incr_type(1)); - sym_adj[cAdjInd] = ii; - } + Kokkos::parallel_for(Kokkos::ThreadVectorRange(teamMember, row_end - row_begin), [&](idx i) { + idx adjind = i + row_begin; + idx colIndex = adj[adjind]; + if (colIndex < num_rows) { + if (colIndex < ii) { + if (umap.insert(Kokkos::pair(colIndex, ii)).success()) { + idx cAdjInd = Kokkos::atomic_fetch_add(&(pre_pps(colIndex)), atomic_incr_type(1)); + idx iAdjInd = Kokkos::atomic_fetch_add(&(pre_pps(ii)), atomic_incr_type(1)); + sym_adj[cAdjInd] = ii; + sym_adj[iAdjInd] = colIndex; } - }); + } else if (colIndex > ii) { + if (umap.insert(Kokkos::pair(ii, colIndex)).success()) { + idx cAdjInd = Kokkos::atomic_fetch_add(&(pre_pps(colIndex)), atomic_incr_type(1)); + idx iAdjInd = Kokkos::atomic_fetch_add(&(pre_pps(ii)), atomic_incr_type(1)); + sym_adj[cAdjInd] = ii; + sym_adj[iAdjInd] = colIndex; + } + } else { + idx cAdjInd = Kokkos::atomic_fetch_add(&(pre_pps(colIndex)), atomic_incr_type(1)); + sym_adj[cAdjInd] = ii; + } + } + }); } }; 
-template struct FillSymmetricEdgeList_HashMap { typedef typename in_lno_row_view_t::value_type idx; @@ -339,11 +282,8 @@ struct FillSymmetricEdgeList_HashMap { out_lno_nnz_view_t sym_dst; out_lno_row_view_t pps; - FillSymmetricEdgeList_HashMap(idx num_rows_, in_lno_row_view_t xadj_, - in_lno_nnz_view_t adj_, hashmap_t hashmap_, - out_lno_nnz_view_t sym_src_, - out_lno_nnz_view_t sym_dst_, - out_lno_row_view_t pps_) + FillSymmetricEdgeList_HashMap(idx num_rows_, in_lno_row_view_t xadj_, in_lno_nnz_view_t adj_, hashmap_t hashmap_, + out_lno_nnz_view_t sym_src_, out_lno_nnz_view_t sym_dst_, out_lno_row_view_t pps_) : num_rows(num_rows_), nnz(adj_.extent(0)), xadj(xadj_), @@ -355,44 +295,38 @@ struct FillSymmetricEdgeList_HashMap { KOKKOS_INLINE_FUNCTION void operator()(const team_member_t &teamMember) const { - typedef - typename std::remove_reference::type atomic_incr_type; - idx ii = teamMember.league_rank() * teamMember.team_size() + - teamMember.team_rank(); + typedef typename std::remove_reference::type atomic_incr_type; + idx ii = teamMember.league_rank() * teamMember.team_size() + teamMember.team_rank(); if (ii >= num_rows) { return; } idx row_begin = xadj[ii]; idx row_end = xadj[ii + 1]; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(teamMember, row_end - row_begin), [&](idx i) { - idx adjind = i + row_begin; - idx colIndex = adj[adjind]; - if (colIndex < num_rows) { - if (colIndex < ii) { - if (umap.insert(Kokkos::pair(colIndex, ii)).success()) { - idx cAdjInd = Kokkos::atomic_fetch_add(&(pps(colIndex)), - atomic_incr_type(1)); - sym_src[cAdjInd] = colIndex; - sym_dst[cAdjInd] = ii; - } - } else if (colIndex > ii) { - if (umap.insert(Kokkos::pair(ii, colIndex)).success()) { - idx cAdjInd = - Kokkos::atomic_fetch_add(&(pps(ii)), atomic_incr_type(1)); - sym_src[cAdjInd] = ii; - sym_dst[cAdjInd] = colIndex; - } - } + Kokkos::parallel_for(Kokkos::ThreadVectorRange(teamMember, row_end - row_begin), [&](idx i) { + idx adjind = i + row_begin; + idx colIndex = 
adj[adjind]; + if (colIndex < num_rows) { + if (colIndex < ii) { + if (umap.insert(Kokkos::pair(colIndex, ii)).success()) { + idx cAdjInd = Kokkos::atomic_fetch_add(&(pps(colIndex)), atomic_incr_type(1)); + sym_src[cAdjInd] = colIndex; + sym_dst[cAdjInd] = ii; } - }); + } else if (colIndex > ii) { + if (umap.insert(Kokkos::pair(ii, colIndex)).success()) { + idx cAdjInd = Kokkos::atomic_fetch_add(&(pps(ii)), atomic_incr_type(1)); + sym_src[cAdjInd] = ii; + sym_dst[cAdjInd] = colIndex; + } + } + } + }); } }; template -void print_1Dview(std::ostream &os, idx_array_type view, bool print_all = false, - const char *sep = " ") { +void print_1Dview(std::ostream &os, idx_array_type view, bool print_all = false, const char *sep = " ") { kk_print_1Dview(os, view, print_all, sep); } @@ -403,8 +337,7 @@ void print_1Dview(idx_array_type view, bool print_all = false) { template void print_1Dpointer(const lno_t *pview, size_t size, bool print_all = false) { - typedef Kokkos::View - um_array_type; + typedef Kokkos::View um_array_type; um_array_type view(pview, size); kk_print_1Dview(view, print_all); } @@ -415,14 +348,12 @@ struct Reverse_Map_Init { typedef typename reverse_map_type::value_type reverse_type; forward_map_type forward_map; reverse_map_type reverse_map_xadj; - Reverse_Map_Init(forward_map_type forward_map_, - reverse_map_type reverse_xadj_) + Reverse_Map_Init(forward_map_type forward_map_, reverse_map_type reverse_xadj_) : forward_map(forward_map_), reverse_map_xadj(reverse_xadj_) {} KOKKOS_INLINE_FUNCTION void operator()(const size_t &ii) const { - typedef typename std::remove_reference::type - atomic_incr_type; + typedef typename std::remove_reference::type atomic_incr_type; forward_type fm = forward_map[ii]; Kokkos::atomic_fetch_add(&(reverse_map_xadj(fm)), atomic_incr_type(1)); } @@ -436,44 +367,32 @@ struct Fill_Reverse_Map { reverse_map_type reverse_map_xadj; reverse_map_type reverse_map_adj; - Fill_Reverse_Map(forward_map_type forward_map_, - reverse_map_type 
reverse_map_xadj_, - reverse_map_type reverse_map_adj_) - : forward_map(forward_map_), - reverse_map_xadj(reverse_map_xadj_), - reverse_map_adj(reverse_map_adj_) {} + Fill_Reverse_Map(forward_map_type forward_map_, reverse_map_type reverse_map_xadj_, reverse_map_type reverse_map_adj_) + : forward_map(forward_map_), reverse_map_xadj(reverse_map_xadj_), reverse_map_adj(reverse_map_adj_) {} KOKKOS_INLINE_FUNCTION void operator()(const size_t &ii) const { - typedef typename std::remove_reference::type - atomic_incr_type; + typedef typename std::remove_reference::type atomic_incr_type; forward_type c = forward_map[ii]; - const reverse_type future_index = Kokkos::atomic_fetch_add( - &(reverse_map_xadj(c - 1)), atomic_incr_type(1)); - reverse_map_adj(future_index) = ii; + const reverse_type future_index = Kokkos::atomic_fetch_add(&(reverse_map_xadj(c - 1)), atomic_incr_type(1)); + reverse_map_adj(future_index) = ii; } }; template -void inclusive_parallel_prefix_sum( - MyExecSpace my_exec_space, - typename forward_array_type::value_type num_elements, - forward_array_type arr) { +void inclusive_parallel_prefix_sum(MyExecSpace my_exec_space, typename forward_array_type::value_type num_elements, + forward_array_type arr) { return kk_inclusive_parallel_prefix_sum(my_exec_space, num_elements, arr); } template -void inclusive_parallel_prefix_sum( - typename forward_array_type::value_type num_elements, - forward_array_type arr) { +void inclusive_parallel_prefix_sum(typename forward_array_type::value_type num_elements, forward_array_type arr) { MyExecSpace my_exec_space; return inclusive_parallel_prefix_sum(my_exec_space, num_elements, arr); } template -void exclusive_parallel_prefix_sum( - typename forward_array_type::value_type num_elements, - forward_array_type arr) { +void exclusive_parallel_prefix_sum(typename forward_array_type::value_type num_elements, forward_array_type arr) { kk_exclusive_parallel_prefix_sum(num_elements, arr); } @@ -499,21 +418,16 @@ struct 
PropogataMaxValstoZeros { } }; -template -void a_times_x_plus_b(typename in_array_t::value_type num_elements, - in_array_t out_arr, in_array_t in_arr, scalar_1 a, +template +void a_times_x_plus_b(typename in_array_t::value_type num_elements, in_array_t out_arr, in_array_t in_arr, scalar_1 a, scalar_2 b) { - kk_a_times_x_plus_b( - num_elements, out_arr, in_arr, a, b); + kk_a_times_x_plus_b(num_elements, out_arr, in_arr, a, b); } template -void modular_view(typename in_array_type::value_type num_elements, - out_array_type out_arr, in_array_type in_arr, +void modular_view(typename in_array_type::value_type num_elements, out_array_type out_arr, in_array_type in_arr, int mod_factor_) { - kk_modular_view( - num_elements, out_arr, in_arr, mod_factor_); + kk_modular_view(num_elements, out_arr, in_arr, mod_factor_); } template @@ -528,18 +442,14 @@ struct LinearInitialization { template void linear_init(typename array_type::value_type num_elements, array_type arr) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_for("KokkosKernels::Common::LinearInit", - my_exec_space(0, num_elements), + Kokkos::parallel_for("KokkosKernels::Common::LinearInit", my_exec_space(0, num_elements), LinearInitialization(arr)); } template -void remove_zeros_in_xadj_vector( - typename forward_array_type::value_type num_elements, - forward_array_type arr) { +void remove_zeros_in_xadj_vector(typename forward_array_type::value_type num_elements, forward_array_type arr) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_scan("KokkosKernels::Common::RemoveZerosInXadjVector", - my_exec_space(0, num_elements), + Kokkos::parallel_scan("KokkosKernels::Common::RemoveZerosInXadjVector", my_exec_space(0, num_elements), PropogataMaxValstoZeros(arr)); } @@ -548,10 +458,9 @@ struct FillReverseBegins { const forward_array_type &forward_map; // vertex to colors reverse_array_type &reverse_map_xadj; // colors to vertex xadj - FillReverseBegins( - const forward_array_type &forward_map_, // 
vertex to colors - reverse_array_type &reverse_map_xadj_ // colors to vertex xadj - ) + FillReverseBegins(const forward_array_type &forward_map_, // vertex to colors + reverse_array_type &reverse_map_xadj_ // colors to vertex xadj + ) : forward_map(forward_map_), reverse_map_xadj(reverse_map_xadj_) {} KOKKOS_INLINE_FUNCTION @@ -575,10 +484,8 @@ struct Reverse_Map_Scale_Init { const reverse_type multiply_shift_for_scale; const reverse_type division_shift_for_bucket; - Reverse_Map_Scale_Init(forward_map_type forward_map_, - reverse_map_type reverse_xadj_, - reverse_type multiply_shift_for_scale_, - reverse_type division_shift_for_bucket_) + Reverse_Map_Scale_Init(forward_map_type forward_map_, reverse_map_type reverse_xadj_, + reverse_type multiply_shift_for_scale_, reverse_type division_shift_for_bucket_) : forward_map(forward_map_), reverse_map_xadj(reverse_xadj_), multiply_shift_for_scale(multiply_shift_for_scale_), @@ -586,8 +493,7 @@ struct Reverse_Map_Scale_Init { KOKKOS_INLINE_FUNCTION void operator()(const size_t &ii) const { - typedef typename std::remove_reference::type - atomic_incr_type; + typedef typename std::remove_reference::type atomic_incr_type; forward_type fm = forward_map[ii]; fm = fm << multiply_shift_for_scale; fm += ii >> division_shift_for_bucket; @@ -606,10 +512,8 @@ struct Fill_Reverse_Scale_Map { const reverse_type multiply_shift_for_scale; const reverse_type division_shift_for_bucket; - Fill_Reverse_Scale_Map(forward_map_type forward_map_, - reverse_map_type reverse_map_xadj_, - reverse_map_type reverse_map_adj_, - reverse_type multiply_shift_for_scale_, + Fill_Reverse_Scale_Map(forward_map_type forward_map_, reverse_map_type reverse_map_xadj_, + reverse_map_type reverse_map_adj_, reverse_type multiply_shift_for_scale_, reverse_type division_shift_for_bucket_) : forward_map(forward_map_), reverse_map_xadj(reverse_map_xadj_), @@ -619,15 +523,13 @@ struct Fill_Reverse_Scale_Map { KOKKOS_INLINE_FUNCTION void operator()(const size_t &ii) 
const { - typedef typename std::remove_reference::type - atomic_incr_type; + typedef typename std::remove_reference::type atomic_incr_type; forward_type fm = forward_map[ii]; fm = fm << multiply_shift_for_scale; fm += ii >> division_shift_for_bucket; - const reverse_type future_index = Kokkos::atomic_fetch_add( - &(reverse_map_xadj(fm - 1)), atomic_incr_type(1)); - reverse_map_adj(future_index) = ii; + const reverse_type future_index = Kokkos::atomic_fetch_add(&(reverse_map_xadj(fm - 1)), atomic_incr_type(1)); + reverse_map_adj(future_index) = ii; } }; @@ -636,8 +538,7 @@ struct StridedCopy { const from_view_t from; to_view_t to; const size_t stride; - StridedCopy(const from_view_t from_, to_view_t to_, size_t stride_) - : from(from_), to(to_), stride(stride_) {} + StridedCopy(const from_view_t from_, to_view_t to_, size_t stride_) : from(from_), to(to_), stride(stride_) {} KOKKOS_INLINE_FUNCTION void operator()(const size_t &ii) const { @@ -665,18 +566,14 @@ struct StridedCopy { * values of reverse maps. Its size will be num_forward_elements. 
* */ -template -void create_reverse_map( - MyExecSpace my_exec_space, - const typename reverse_array_type::value_type - &num_forward_elements, // num_vertices - const typename forward_array_type::value_type - &num_reverse_elements, // num_colors +template +void create_reverse_map(MyExecSpace my_exec_space, + const typename reverse_array_type::value_type &num_forward_elements, // num_vertices + const typename forward_array_type::value_type &num_reverse_elements, // num_colors - const forward_array_type &forward_map, // vertex to colors - reverse_array_type &reverse_map_xadj, // colors to vertex xadj - reverse_array_type &reverse_map_adj) { // colros to vertex adj + const forward_array_type &forward_map, // vertex to colors + reverse_array_type &reverse_map_xadj, // colors to vertex xadj + reverse_array_type &reverse_map_adj) { // colros to vertex adj typedef typename reverse_array_type::value_type lno_t; typedef typename forward_array_type::value_type reverse_lno_t; @@ -685,110 +582,84 @@ void create_reverse_map( typedef Kokkos::RangePolicy range_policy_t; reverse_map_xadj = - reverse_array_type(Kokkos::view_alloc(my_exec_space, "Reverse Map Xadj"), - num_reverse_elements + 1); - reverse_map_adj = reverse_array_type( - Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, - "REVERSE_ADJ"), - num_forward_elements); + reverse_array_type(Kokkos::view_alloc(my_exec_space, "Reverse Map Xadj"), num_reverse_elements + 1); + reverse_map_adj = reverse_array_type(Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, "REVERSE_ADJ"), + num_forward_elements); if (num_reverse_elements < MINIMUM_TO_ATOMIC) { - const lno_t scale_size = 1024; - const lno_t multiply_shift_for_scale = 10; - const lno_t division_shift_for_bucket = - lno_t(ceil(log(double(num_forward_elements) / scale_size) / log(2))); + const lno_t scale_size = 1024; + const lno_t multiply_shift_for_scale = 10; + const lno_t division_shift_for_bucket = lno_t(ceil(log(double(num_forward_elements) / 
scale_size) / log(2))); // const lno_t bucket_range_size = pow(2, division_shift_for_bucket); // coloring indices are base-1. we end up using not using element 1. - const reverse_lno_t tmp_reverse_size = (num_reverse_elements + 1) - << multiply_shift_for_scale; + const reverse_lno_t tmp_reverse_size = (num_reverse_elements + 1) << multiply_shift_for_scale; - reverse_array_type tmp_color_xadj( - Kokkos::view_alloc(my_exec_space, "TMP_REVERSE_XADJ"), - tmp_reverse_size + 1); + reverse_array_type tmp_color_xadj(Kokkos::view_alloc(my_exec_space, "TMP_REVERSE_XADJ"), tmp_reverse_size + 1); Reverse_Map_Scale_Init rmi( - forward_map, tmp_color_xadj, multiply_shift_for_scale, - division_shift_for_bucket); + forward_map, tmp_color_xadj, multiply_shift_for_scale, division_shift_for_bucket); Kokkos::parallel_for("KokkosKernels::Common::ReverseMapScaleInit", - range_policy_t(my_exec_space, 0, num_forward_elements), - rmi); + range_policy_t(my_exec_space, 0, num_forward_elements), rmi); my_exec_space.fence(); - inclusive_parallel_prefix_sum( - my_exec_space, tmp_reverse_size + 1, tmp_color_xadj); + inclusive_parallel_prefix_sum(my_exec_space, tmp_reverse_size + 1, tmp_color_xadj); my_exec_space.fence(); Kokkos::parallel_for( - "KokkosKernels::Common::StridedCopy", - range_policy_t(my_exec_space, 0, num_reverse_elements + 1), - StridedCopy( - tmp_color_xadj, reverse_map_xadj, scale_size)); + "KokkosKernels::Common::StridedCopy", range_policy_t(my_exec_space, 0, num_reverse_elements + 1), + StridedCopy(tmp_color_xadj, reverse_map_xadj, scale_size)); my_exec_space.fence(); Fill_Reverse_Scale_Map frm( - forward_map, tmp_color_xadj, reverse_map_adj, multiply_shift_for_scale, - division_shift_for_bucket); + forward_map, tmp_color_xadj, reverse_map_adj, multiply_shift_for_scale, division_shift_for_bucket); Kokkos::parallel_for("KokkosKernels::Common::FillReverseMap", - range_policy_t(my_exec_space, 0, num_forward_elements), - frm); + range_policy_t(my_exec_space, 0, 
num_forward_elements), frm); my_exec_space.fence(); } else // atomic implementation. { reverse_array_type tmp_color_xadj( - Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, - "TMP_REVERSE_XADJ"), - num_reverse_elements + 1); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, "TMP_REVERSE_XADJ"), num_reverse_elements + 1); - Reverse_Map_Init rmi( - forward_map, reverse_map_xadj); + Reverse_Map_Init rmi(forward_map, reverse_map_xadj); Kokkos::parallel_for("KokkosKernels::Common::ReverseMapInit", - range_policy_t(my_exec_space, 0, num_forward_elements), - rmi); + range_policy_t(my_exec_space, 0, num_forward_elements), rmi); my_exec_space.fence(); // print_1Dview(reverse_map_xadj); - inclusive_parallel_prefix_sum( - my_exec_space, num_reverse_elements + 1, reverse_map_xadj); + inclusive_parallel_prefix_sum(my_exec_space, num_reverse_elements + 1, + reverse_map_xadj); Kokkos::deep_copy(my_exec_space, tmp_color_xadj, reverse_map_xadj); my_exec_space.fence(); - Fill_Reverse_Map frm( - forward_map, tmp_color_xadj, reverse_map_adj); + Fill_Reverse_Map frm(forward_map, tmp_color_xadj, reverse_map_adj); Kokkos::parallel_for("KokkosKernels::Common::FillReverseMap", - range_policy_t(my_exec_space, 0, num_forward_elements), - frm); + range_policy_t(my_exec_space, 0, num_forward_elements), frm); my_exec_space.fence(); } } template -void create_reverse_map( - const typename reverse_array_type::value_type - &num_forward_elements, // num_vertices - const typename forward_array_type::value_type - &num_reverse_elements, // num_colors - - const forward_array_type &forward_map, // vertex to colors - reverse_array_type &reverse_map_xadj, // colors to vertex xadj - reverse_array_type &reverse_map_adj) { +void create_reverse_map(const typename reverse_array_type::value_type &num_forward_elements, // num_vertices + const typename forward_array_type::value_type &num_reverse_elements, // num_colors + + const forward_array_type &forward_map, // vertex to colors + 
reverse_array_type &reverse_map_xadj, // colors to vertex xadj + reverse_array_type &reverse_map_adj) { MyExecSpace my_exec_space; - return create_reverse_map(my_exec_space, num_forward_elements, - num_reverse_elements, forward_map, reverse_map_xadj, + return create_reverse_map(my_exec_space, num_forward_elements, num_reverse_elements, forward_map, reverse_map_xadj, reverse_map_adj); } -template +template struct PermuteVector { typedef typename idx_array_type::value_type idx; value_array_type old_vector; out_value_array_type new_vector; idx_array_type old_to_new_mapping; idx mapping_size; - PermuteVector(value_array_type old_vector_, out_value_array_type new_vector_, - idx_array_type old_to_new_mapping_) + PermuteVector(value_array_type old_vector_, out_value_array_type new_vector_, idx_array_type old_to_new_mapping_) : old_vector(old_vector_), new_vector(new_vector_), old_to_new_mapping(old_to_new_mapping_), @@ -804,34 +675,24 @@ struct PermuteVector { } }; -template -void permute_vector(MyExecSpace my_exec_space, - typename idx_array_type::value_type num_elements, - idx_array_type &old_to_new_index_map, - value_array_type &old_vector, +template +void permute_vector(MyExecSpace my_exec_space, typename idx_array_type::value_type num_elements, + idx_array_type &old_to_new_index_map, value_array_type &old_vector, out_value_array_type &new_vector) { using range_policy_t = Kokkos::RangePolicy; - Kokkos::parallel_for( - "KokkosKernels::Common::PermuteVector", - range_policy_t(my_exec_space, 0, num_elements), - PermuteVector( - old_vector, new_vector, old_to_new_index_map)); + Kokkos::parallel_for("KokkosKernels::Common::PermuteVector", range_policy_t(my_exec_space, 0, num_elements), + PermuteVector(old_vector, new_vector, + old_to_new_index_map)); } -template -void permute_vector(typename idx_array_type::value_type num_elements, - idx_array_type &old_to_new_index_map, - value_array_type &old_vector, - out_value_array_type &new_vector) { - permute_vector(MyExecSpace(), 
num_elements, old_to_new_index_map, old_vector, - new_vector); +template +void permute_vector(typename idx_array_type::value_type num_elements, idx_array_type &old_to_new_index_map, + value_array_type &old_vector, out_value_array_type &new_vector) { + permute_vector(MyExecSpace(), num_elements, old_to_new_index_map, old_vector, new_vector); } -template +template struct PermuteBlockVector { typedef typename idx_array_type::value_type idx; int block_size; @@ -839,8 +700,7 @@ struct PermuteBlockVector { out_value_array_type new_vector; idx_array_type old_to_new_mapping; idx mapping_size; - PermuteBlockVector(int block_size_, value_array_type old_vector_, - out_value_array_type new_vector_, + PermuteBlockVector(int block_size_, value_array_type old_vector_, out_value_array_type new_vector_, idx_array_type old_to_new_mapping_) : block_size(block_size_), old_vector(old_vector_), @@ -854,55 +714,42 @@ struct PermuteBlockVector { if (ii < mapping_size) mapping = old_to_new_mapping[ii]; for (idx j = 0; j < static_cast(new_vector.extent(1)); j++) { for (int i = 0; i < block_size; ++i) { - new_vector.access(mapping * block_size + i, j) = - old_vector.access(ii * block_size + i, j); + new_vector.access(mapping * block_size + i, j) = old_vector.access(ii * block_size + i, j); } } } }; -template -void permute_block_vector(MyExecSpace my_exec_space, - typename idx_array_type::value_type num_elements, - int block_size, idx_array_type &old_to_new_index_map, - value_array_type &old_vector, +template +void permute_block_vector(MyExecSpace my_exec_space, typename idx_array_type::value_type num_elements, int block_size, + idx_array_type &old_to_new_index_map, value_array_type &old_vector, out_value_array_type &new_vector) { using range_policy_t = Kokkos::RangePolicy; - Kokkos::parallel_for( - "KokkosKernels::Common::PermuteVector", - range_policy_t(my_exec_space, 0, num_elements), - PermuteBlockVector(block_size, old_vector, new_vector, - old_to_new_index_map)); + 
Kokkos::parallel_for("KokkosKernels::Common::PermuteVector", range_policy_t(my_exec_space, 0, num_elements), + PermuteBlockVector( + block_size, old_vector, new_vector, old_to_new_index_map)); } -template -void permute_block_vector(typename idx_array_type::value_type num_elements, - int block_size, idx_array_type &old_to_new_index_map, - value_array_type &old_vector, +template +void permute_block_vector(typename idx_array_type::value_type num_elements, int block_size, + idx_array_type &old_to_new_index_map, value_array_type &old_vector, out_value_array_type &new_vector) { - permute_block_vector(MyExecSpace(), num_elements, block_size, - old_to_new_index_map, old_vector, new_vector); + permute_block_vector(MyExecSpace(), num_elements, block_size, old_to_new_index_map, old_vector, new_vector); } // TODO BMK: clean this up by removing 1st argument. It is unused but // its name gives the impression that only num_elements of the vector are // zeroed, when really it's always the whole thing. template -void zero_vector(ExecSpaceIn &exec_space_in, - typename value_array_type::value_type /* num_elements */, +void zero_vector(ExecSpaceIn &exec_space_in, typename value_array_type::value_type /* num_elements */, value_array_type &vector) { typedef typename value_array_type::non_const_value_type val_type; - Kokkos::deep_copy(exec_space_in, vector, - Kokkos::ArithTraits::zero()); + Kokkos::deep_copy(exec_space_in, vector, Kokkos::ArithTraits::zero()); exec_space_in.fence(); } template -void zero_vector(typename value_array_type::value_type /* num_elements */, - value_array_type &vector) { +void zero_vector(typename value_array_type::value_type /* num_elements */, value_array_type &vector) { using ne_tmp_t = typename value_array_type::value_type; ne_tmp_t ne_tmp = ne_tmp_t(0); MyExecSpace my_exec_space; @@ -915,21 +762,15 @@ struct MarkDuplicateSortedKeyValuePairs { v2 vals; v3 prefix_sum; typename v1::size_type overall_size; - MarkDuplicateSortedKeyValuePairs(v1 keys_, v2 vals_, 
v3 prefix_sum_, - typename v1::size_type overall_size_) - : keys(keys_), - vals(vals_), - prefix_sum(prefix_sum_), - overall_size(overall_size_) {} + MarkDuplicateSortedKeyValuePairs(v1 keys_, v2 vals_, v3 prefix_sum_, typename v1::size_type overall_size_) + : keys(keys_), vals(vals_), prefix_sum(prefix_sum_), overall_size(overall_size_) {} KOKKOS_INLINE_FUNCTION void operator()(const size_t &i, typename v3::value_type &num_result) const { typename v1::value_type my_key = keys(i); typename v2::value_type my_val = vals(i); - if ((my_key != 0 && my_val != 0) && - ((i + 1 >= overall_size) || - (my_key != keys(i + 1) || my_val != vals(i + 1)))) { + if ((my_key != 0 && my_val != 0) && ((i + 1 >= overall_size) || (my_key != keys(i + 1) || my_val != vals(i + 1)))) { prefix_sum(i) = 1; num_result += 1; } @@ -944,9 +785,7 @@ struct FillSymmetricCSR { typename v3::size_type array_size; v4 out_xadj; v5 out_adj; - FillSymmetricCSR(v1 keys_, v2 vals_, v3 prefix_sum_, - typename v3::size_type array_size_, v4 out_xadj_, - v5 out_adj_) + FillSymmetricCSR(v1 keys_, v2 vals_, v3 prefix_sum_, typename v3::size_type array_size_, v4 out_xadj_, v5 out_adj_) : keys(keys_), vals(vals_), prefix_sum(prefix_sum_), @@ -978,12 +817,10 @@ struct FillSymmetricCSR { } }; -template -void symmetrize_and_get_lower_diagonal_edge_list( - typename in_lno_nnz_view_t::value_type num_rows_to_symmetrize, - in_lno_row_view_t xadj, in_lno_nnz_view_t adj, out_lno_nnz_view_t &sym_srcs, - out_lno_nnz_view_t &sym_dsts_) { +template +void symmetrize_and_get_lower_diagonal_edge_list(typename in_lno_nnz_view_t::value_type num_rows_to_symmetrize, + in_lno_row_view_t xadj, in_lno_nnz_view_t adj, + out_lno_nnz_view_t &sym_srcs, out_lno_nnz_view_t &sym_dsts_) { typedef typename in_lno_row_view_t::non_const_value_type idx; idx nnz = adj.extent(0); @@ -997,8 +834,7 @@ void symmetrize_and_get_lower_diagonal_edge_list( // typedef Kokkos::RangePolicy my_exec_space; // TODO: Should change this to temporary memory space? 
- typedef Kokkos::UnorderedMap, void, MyExecSpace> - hashmap_t; + typedef Kokkos::UnorderedMap, void, MyExecSpace> hashmap_t; out_lno_nnz_view_t pre_pps_("pre_pps", num_rows_to_symmetrize + 1); @@ -1007,31 +843,26 @@ void symmetrize_and_get_lower_diagonal_edge_list( hashmap_t umap(nnz); umap.clear(); umap.end_erase(); - FillSymmetricLowerEdgesHashMap + FillSymmetricLowerEdgesHashMap fse(num_rows_to_symmetrize, xadj, adj, umap, pre_pps_); int teamSizeMax = 0; int vector_size = 0; - get_suggested_vector_size(vector_size, xadj.extent(0) - 1, - nnz); + get_suggested_vector_size(vector_size, xadj.extent(0) - 1, nnz); teamSizeMax = get_suggested_team_size(fse, vector_size); // std::cout << "max_allowed_team_size:" << max_allowed_team_size << " vs:" // << vector_size << " tsm:" << teamSizeMax<< std::endl; - team_policy pol((num_rows_to_symmetrize + teamSizeMax - 1) / teamSizeMax, - teamSizeMax, vector_size); - Kokkos::parallel_for( - "KokkosKernels::Common::SymmetrizeAndGetLowerDiagonalEdgeList::S0", pol, - fse /*, num_symmetric_edges*/); + team_policy pol((num_rows_to_symmetrize + teamSizeMax - 1) / teamSizeMax, teamSizeMax, vector_size); + Kokkos::parallel_for("KokkosKernels::Common::SymmetrizeAndGetLowerDiagonalEdgeList::S0", pol, + fse /*, num_symmetric_edges*/); MyExecSpace().fence(); } if (num_rows_to_symmetrize > 0) - exclusive_parallel_prefix_sum( - num_rows_to_symmetrize + 1, pre_pps_); + exclusive_parallel_prefix_sum(num_rows_to_symmetrize + 1, pre_pps_); MyExecSpace().fence(); auto d_sym_edge_size = Kokkos::subview(pre_pps_, num_rows_to_symmetrize); @@ -1046,45 +877,33 @@ void symmetrize_and_get_lower_diagonal_edge_list( num_symmetric_edges = h_sym_edge_size(h_sym_edge_size.extent(0) - 1); */ - sym_srcs = out_lno_nnz_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "sym_srcs"), - num_symmetric_edges); - sym_dsts_ = out_lno_nnz_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "sym_dsts_"), - num_symmetric_edges); + sym_srcs = 
out_lno_nnz_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "sym_srcs"), num_symmetric_edges); + sym_dsts_ = out_lno_nnz_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "sym_dsts_"), num_symmetric_edges); MyExecSpace().fence(); { hashmap_t umap(nnz); - FillSymmetricEdgeList_HashMap - FSCH(num_rows_to_symmetrize, xadj, adj, umap, sym_srcs, sym_dsts_, - pre_pps_); + FSCH(num_rows_to_symmetrize, xadj, adj, umap, sym_srcs, sym_dsts_, pre_pps_); int teamSizeMax = 0; int vector_size = 0; - get_suggested_vector_size(vector_size, xadj.extent(0) - 1, - nnz); + get_suggested_vector_size(vector_size, xadj.extent(0) - 1, nnz); teamSizeMax = get_suggested_team_size(FSCH, vector_size); - team_policy pol((num_rows_to_symmetrize + teamSizeMax - 1) / teamSizeMax, - teamSizeMax, vector_size); - Kokkos::parallel_for( - "KokkosKernels::Common::SymmetrizeAndGetLowerDiagonalEdgeList::S1", pol, - FSCH); + team_policy pol((num_rows_to_symmetrize + teamSizeMax - 1) / teamSizeMax, teamSizeMax, vector_size); + Kokkos::parallel_for("KokkosKernels::Common::SymmetrizeAndGetLowerDiagonalEdgeList::S1", pol, FSCH); MyExecSpace().fence(); } } -template -void symmetrize_graph_symbolic_hashmap( - typename in_lno_row_view_t::value_type num_rows_to_symmetrize, - in_lno_row_view_t xadj, in_lno_nnz_view_t adj, out_lno_row_view_t &sym_xadj, - out_lno_nnz_view_t &sym_adj) { +template +void symmetrize_graph_symbolic_hashmap(typename in_lno_row_view_t::value_type num_rows_to_symmetrize, + in_lno_row_view_t xadj, in_lno_nnz_view_t adj, out_lno_row_view_t &sym_xadj, + out_lno_nnz_view_t &sym_adj) { typedef typename in_lno_row_view_t::non_const_value_type idx; idx nnz = adj.extent(0); @@ -1098,8 +917,7 @@ void symmetrize_graph_symbolic_hashmap( // typedef Kokkos::RangePolicy my_exec_space; // TODO: Should change this to temporary memory space? 
- typedef Kokkos::UnorderedMap, void, MyExecSpace> - hashmap_t; + typedef Kokkos::UnorderedMap, void, MyExecSpace> hashmap_t; out_lno_row_view_t pre_pps_("pre_pps", num_rows_to_symmetrize + 1); @@ -1108,66 +926,53 @@ void symmetrize_graph_symbolic_hashmap( hashmap_t umap(nnz); umap.clear(); umap.end_erase(); - FillSymmetricEdgesHashMap - fse(num_rows_to_symmetrize, xadj, adj, umap, pre_pps_); + FillSymmetricEdgesHashMap fse( + num_rows_to_symmetrize, xadj, adj, umap, pre_pps_); int teamSizeMax = 0; int vector_size = 0; - get_suggested_vector_size(vector_size, xadj.extent(0) - 1, - nnz); + get_suggested_vector_size(vector_size, xadj.extent(0) - 1, nnz); teamSizeMax = get_suggested_team_size(fse, vector_size); - team_policy pol((num_rows_to_symmetrize + teamSizeMax - 1) / teamSizeMax, - teamSizeMax, vector_size); - Kokkos::parallel_for( - "KokkosKernels::Common::SymmetrizeGraphSymbolicHashMap::S0", pol, - fse /*, num_symmetric_edges*/); + team_policy pol((num_rows_to_symmetrize + teamSizeMax - 1) / teamSizeMax, teamSizeMax, vector_size); + Kokkos::parallel_for("KokkosKernels::Common::SymmetrizeGraphSymbolicHashMap::S0", pol, + fse /*, num_symmetric_edges*/); MyExecSpace().fence(); } if (num_rows_to_symmetrize > 0) - exclusive_parallel_prefix_sum( - num_rows_to_symmetrize + 1, pre_pps_); + exclusive_parallel_prefix_sum(num_rows_to_symmetrize + 1, pre_pps_); MyExecSpace().fence(); // out_lno_row_view_t d_sym_edge_size = Kokkos::subview(pre_pps_, // num_rows_to_symmetrize, num_rows_to_symmetrize ); - typename out_lno_row_view_t::HostMirror h_sym_edge_size = - Kokkos::create_mirror_view(pre_pps_); + typename out_lno_row_view_t::HostMirror h_sym_edge_size = Kokkos::create_mirror_view(pre_pps_); Kokkos::deep_copy(h_sym_edge_size, pre_pps_); num_symmetric_edges = h_sym_edge_size(h_sym_edge_size.extent(0) - 1); - sym_adj = out_lno_nnz_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "sym_adj"), - num_symmetric_edges); + sym_adj = 
out_lno_nnz_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "sym_adj"), num_symmetric_edges); MyExecSpace().fence(); - sym_xadj = out_lno_row_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "sym_xadj"), - num_rows_to_symmetrize + 1); + sym_xadj = + out_lno_row_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "sym_xadj"), num_rows_to_symmetrize + 1); Kokkos::deep_copy(sym_xadj, pre_pps_); { hashmap_t umap(nnz); - FillSymmetricCRS_HashMap FSCH(num_rows_to_symmetrize, xadj, adj, umap, pre_pps_, sym_adj); int teamSizeMax = 0; int vector_size = 0; - get_suggested_vector_size(vector_size, xadj.extent(0) - 1, - nnz); + get_suggested_vector_size(vector_size, xadj.extent(0) - 1, nnz); teamSizeMax = get_suggested_team_size(FSCH, vector_size); - team_policy pol((num_rows_to_symmetrize + teamSizeMax - 1) / teamSizeMax, - teamSizeMax, vector_size); - Kokkos::parallel_for( - "KokkosKernels::Common::SymmetrizeGraphSymbolicHashMap::S1", pol, FSCH); + team_policy pol((num_rows_to_symmetrize + teamSizeMax - 1) / teamSizeMax, teamSizeMax, vector_size); + Kokkos::parallel_for("KokkosKernels::Common::SymmetrizeGraphSymbolicHashMap::S1", pol, FSCH); MyExecSpace().fence(); } @@ -1192,44 +997,36 @@ struct CopyView { template void copy_view(size_t num_elements, from_vector from, to_vector to) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_for("KokkosKernels::Common::CopyView", - my_exec_space(0, num_elements), + Kokkos::parallel_for("KokkosKernels::Common::CopyView", my_exec_space(0, num_elements), CopyView(from, to)); } template -void safe_device_to_host_deep_copy(size_t num_elements, from_view from, - typename from_view::HostMirror to) { +void safe_device_to_host_deep_copy(size_t num_elements, from_view from, typename from_view::HostMirror to) { typedef typename from_view::value_type scalar_t; typedef typename from_view::device_type device_t; typedef Kokkos::View unstrided_from_view_t; unstrided_from_view_t unstrided_from("unstrided", 
num_elements); - copy_view(num_elements, from, - unstrided_from); + copy_view(num_elements, from, unstrided_from); Kokkos::fence(); typedef typename unstrided_from_view_t::HostMirror host_unstrided_from_view_t; - host_unstrided_from_view_t h_unstrided_from = - Kokkos::create_mirror_view(unstrided_from); + host_unstrided_from_view_t h_unstrided_from = Kokkos::create_mirror_view(unstrided_from); Kokkos::deep_copy(h_unstrided_from, unstrided_from); Kokkos::fence(); copy_view( - num_elements, h_unstrided_from, to); + typename host_unstrided_from_view_t::device_type::execution_space>(num_elements, h_unstrided_from, to); Kokkos::fence(); } template -void safe_host_to_device_deep_copy(size_t num_elements, - typename to_view::HostMirror from, - to_view to) { +void safe_host_to_device_deep_copy(size_t num_elements, typename to_view::HostMirror from, to_view to) { typedef typename to_view::value_type scalar_t; typedef typename to_view::device_type device_t; @@ -1241,17 +1038,15 @@ void safe_host_to_device_deep_copy(size_t num_elements, host_unstrided_view_t host_unstrided_from("unstrided", num_elements); device_unstrided_view_t device_unstrided_to("unstrided", num_elements); - copy_view(num_elements, from, - host_unstrided_from); + copy_view( + num_elements, from, host_unstrided_from); Kokkos::fence(); Kokkos::deep_copy(device_unstrided_to, host_unstrided_from); Kokkos::fence(); - copy_view(num_elements, - device_unstrided_to, to); + copy_view(num_elements, device_unstrided_to, + to); Kokkos::fence(); } @@ -1260,12 +1055,9 @@ template struct ReduceSumFunctor { view_type view_to_reduce; - ReduceSumFunctor(view_type view_to_reduce_) - : view_to_reduce(view_to_reduce_) {} + ReduceSumFunctor(view_type view_to_reduce_) : view_to_reduce(view_to_reduce_) {} - void operator()( - const size_t &i, - typename view_type::non_const_value_type &sum_reduction) const { + void operator()(const size_t &i, typename view_type::non_const_value_type &sum_reduction) const { sum_reduction += 
view_to_reduce(i); } }; @@ -1274,16 +1066,14 @@ template void view_reduce_sum(size_t num_elements, view_type view_to_reduce, typename view_type::non_const_value_type &sum_reduction) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_reduce( - "KokkosKernels::Common::ViewReduceSum", my_exec_space(0, num_elements), - ReduceSumFunctor(view_to_reduce), sum_reduction); + Kokkos::parallel_reduce("KokkosKernels::Common::ViewReduceSum", my_exec_space(0, num_elements), + ReduceSumFunctor(view_to_reduce), sum_reduction); } template void view_reduce_max(size_t num_elements, view_type view_to_reduce, typename view_type::non_const_value_type &max_reduction) { - kk_view_reduce_max(num_elements, view_to_reduce, - max_reduction); + kk_view_reduce_max(num_elements, view_to_reduce, max_reduction); } template @@ -1319,28 +1109,18 @@ struct ReduceRowSizeFunctor { // view has num_rows+1 elements. template -void kk_view_reduce_max_row_size(MyExecSpace my_exec_space, - const size_t num_rows, - const size_type *rowmap_view_begins, - const size_type *rowmap_view_ends, - size_type &max_row_size) { +void kk_view_reduce_max_row_size(MyExecSpace my_exec_space, const size_t num_rows, const size_type *rowmap_view_begins, + const size_type *rowmap_view_ends, size_type &max_row_size) { typedef Kokkos::RangePolicy range_policy_t; - Kokkos::parallel_reduce( - "KokkosKernels::Common::ViewReduceMaxRowSize", - range_policy_t(my_exec_space, 0, num_rows), - ReduceRowSizeFunctor(rowmap_view_begins, rowmap_view_ends), - max_row_size); + Kokkos::parallel_reduce("KokkosKernels::Common::ViewReduceMaxRowSize", range_policy_t(my_exec_space, 0, num_rows), + ReduceRowSizeFunctor(rowmap_view_begins, rowmap_view_ends), max_row_size); } // view has num_rows+1 elements. 
template -void kk_view_reduce_max_row_size(const size_t num_rows, - const size_type *rowmap_view_begins, - const size_type *rowmap_view_ends, - size_type &max_row_size) { - return kk_view_reduce_max_row_size(MyExecSpace(), num_rows, - rowmap_view_begins, rowmap_view_ends, - max_row_size); +void kk_view_reduce_max_row_size(const size_t num_rows, const size_type *rowmap_view_begins, + const size_type *rowmap_view_ends, size_type &max_row_size) { + return kk_view_reduce_max_row_size(MyExecSpace(), num_rows, rowmap_view_begins, rowmap_view_ends, max_row_size); } template @@ -1348,8 +1128,7 @@ struct ReduceMaxRowFunctor { view_type rowmap_view; typedef typename view_type::non_const_value_type value_type; const value_type min_val; - ReduceMaxRowFunctor(view_type rowmap_view_) - : rowmap_view(rowmap_view_), min_val(0) {} + ReduceMaxRowFunctor(view_type rowmap_view_) : rowmap_view(rowmap_view_), min_val(0) {} KOKKOS_INLINE_FUNCTION void operator()(const size_t &i, value_type &max_reduction) const { @@ -1377,13 +1156,11 @@ struct ReduceMaxRowFunctor { // view has num_rows+1 elements. 
template -void view_reduce_maxsizerow( - size_t num_rows, view_type rowmap_view, - typename view_type::non_const_value_type &max_reduction) { +void view_reduce_maxsizerow(size_t num_rows, view_type rowmap_view, + typename view_type::non_const_value_type &max_reduction) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_reduce( - "KokkosKernels::Common::ViewReduceMaxSizeRow", my_exec_space(0, num_rows), - ReduceMaxRowFunctor(rowmap_view), max_reduction); + Kokkos::parallel_reduce("KokkosKernels::Common::ViewReduceMaxSizeRow", my_exec_space(0, num_rows), + ReduceMaxRowFunctor(rowmap_view), max_reduction); } template @@ -1391,8 +1168,7 @@ struct IsEqualFunctor { view_type1 view1; view_type2 view2; - IsEqualFunctor(view_type1 view1_, view_type2 view2_) - : view1(view1_), view2(view2_) {} + IsEqualFunctor(view_type1 view1_, view_type2 view2_) : view1(view1_), view2(view2_) {} KOKKOS_INLINE_FUNCTION void operator()(const size_t &i, int &is_equal) const { @@ -1412,9 +1188,8 @@ template bool isSame(size_t num_elements, view_type1 view1, view_type2 view2) { typedef Kokkos::RangePolicy my_exec_space; int issame = 1; - Kokkos::parallel_reduce( - "KokkosKernels::Common::isSame", my_exec_space(0, num_elements), - IsEqualFunctor(view1, view2), issame); + Kokkos::parallel_reduce("KokkosKernels::Common::isSame", my_exec_space(0, num_elements), + IsEqualFunctor(view1, view2), issame); MyExecSpace().fence(); return issame; } @@ -1427,14 +1202,10 @@ struct MaxHeap { size_type current_size; MaxHeap(a_view_t heap_keys_, b_view_t heap_values_, size_type max_size_) - : heap_keys(heap_keys_), - heap_values(heap_values_), - max_size(max_size_), - current_size(0) {} + : heap_keys(heap_keys_), heap_values(heap_values_), max_size(max_size_), current_size(0) {} KOKKOS_INLINE_FUNCTION - void insert(typename a_view_t::value_type &key, - typename b_view_t::value_type &val) { + void insert(typename a_view_t::value_type &key, typename b_view_t::value_type &val) { for (size_type i = 0; 
i < current_size; ++i) { if (key == heap_keys(i)) { heap_values(i) = heap_values(i) & val; @@ -1459,8 +1230,7 @@ struct InitScalar { size_type team_row_chunk_size; nnz_lno_t init_val; - InitScalar(size_type num_elements_, in_view_t view_to_init_, - size_type chunk_size_, nnz_lno_t init_val_) + InitScalar(size_type num_elements_, in_view_t view_to_init_, size_type chunk_size_, nnz_lno_t init_val_) : num_elements(num_elements_), view_to_init(view_to_init_), team_row_chunk_size(chunk_size_), @@ -1471,20 +1241,16 @@ struct InitScalar { // const nnz_lno_t row_index = teamMember.league_rank() * // team_row_chunk_size; - const nnz_lno_t team_row_begin = - teamMember.league_rank() * team_row_chunk_size; - const nnz_lno_t team_row_end = KOKKOSKERNELS_MACRO_MIN( - team_row_begin + team_row_chunk_size, num_elements); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), - [&](const nnz_lno_t &row_ind) { view_to_init[row_ind] = init_val; }); + const nnz_lno_t team_row_begin = teamMember.league_rank() * team_row_chunk_size; + const nnz_lno_t team_row_end = KOKKOSKERNELS_MACRO_MIN(team_row_begin + team_row_chunk_size, num_elements); + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), + [&](const nnz_lno_t &row_ind) { view_to_init[row_ind] = init_val; }); } }; template -void init_view_withscalar( - typename in_row_view_t::size_type num_elements, in_row_view_t arr, - typename in_row_view_t::size_type team_size, - typename in_row_view_t::non_const_value_type init_val) { +void init_view_withscalar(typename in_row_view_t::size_type num_elements, in_row_view_t arr, + typename in_row_view_t::size_type team_size, + typename in_row_view_t::non_const_value_type init_val) { typename in_row_view_t::size_type chunk_size = num_elements / team_size; typedef InitScalar InitScalar_t; InitScalar_t tm(num_elements, arr, chunk_size, init_val); @@ -1492,9 +1258,8 @@ void init_view_withscalar( int vector_size = 1; 
Kokkos::Timer timer1; - Kokkos::parallel_for( - "KokkosKernels::Common::InitViewWithScalar", - tcp_t(num_elements / chunk_size + 1, team_size, vector_size), tm); + Kokkos::parallel_for("KokkosKernels::Common::InitViewWithScalar", + tcp_t(num_elements / chunk_size + 1, team_size, vector_size), tm); MyExecSpace().fence(); } @@ -1504,8 +1269,7 @@ struct array_sum_reduce { using ValueType = array_sum_reduce; // Workaround for https://github.com/kokkos/kokkos/issues/5860 static constexpr int N_internal = - ((N == 3 || N == 5 || N == 7) && - std::is_same::value && + ((N == 3 || N == 5 || N == 7) && std::is_same::value && sizeof(Kokkos::Experimental::half_t) == 2) ? (N + 1) : N; @@ -1533,11 +1297,9 @@ KOKKOS_INLINE_FUNCTION T *alignPtrTo(InPtr *p) { const std::uintptr_t ptrVal = reinterpret_cast(p); // ptrVal + (align - 1) lands inside the next valid aligned scalar_t, // and the mask produces the start of that scalar_t. - const std::uintptr_t ptrValNew = - (ptrVal + alignof(T) - 1) & (~(alignof(T) - 1)); - return reinterpret_cast( - reinterpret_cast(const_cast *>(p)) + - (ptrValNew - ptrVal)); + const std::uintptr_t ptrValNew = (ptrVal + alignof(T) - 1) & (~(alignof(T) - 1)); + return reinterpret_cast(reinterpret_cast(const_cast *>(p)) + + (ptrValNew - ptrVal)); } } // namespace Impl diff --git a/common/src/KokkosKernels_VectorUtils.hpp b/common/src/KokkosKernels_VectorUtils.hpp index f0c09a7e9f..d20d298956 100644 --- a/common/src/KokkosKernels_VectorUtils.hpp +++ b/common/src/KokkosKernels_VectorUtils.hpp @@ -22,15 +22,13 @@ namespace KokkosKernels { namespace Impl { -template +template struct A_times_X_plus_B { out_array_t out_view; in_array_t in_view; const scalar_1 a; const scalar_2 b; - A_times_X_plus_B(out_array_t out_view_, in_array_t in_view_, scalar_1 a_, - scalar_2 b_) + A_times_X_plus_B(out_array_t out_view_, in_array_t in_view_, scalar_1 a_, scalar_2 b_) : out_view(out_view_), in_view(in_view_), a(a_), b(b_) {} KOKKOS_INLINE_FUNCTION @@ -47,9 +45,7 @@ struct 
ModularView { : out_view(out_view_), in_view(in_view_), modular_constant(mod_factor_) {} KOKKOS_INLINE_FUNCTION - void operator()(const size_t ii) const { - out_view(ii) = in_view(ii) % modular_constant; - } + void operator()(const size_t ii) const { out_view(ii) = in_view(ii) % modular_constant; } }; template @@ -72,16 +68,12 @@ struct CopyVectorFunctor { * \param a: scalar for multiplication * \param b: scalar for addition */ -template -inline void kk_a_times_x_plus_b(typename in_array_t::value_type num_elements, - out_array_t out_arr, in_array_t in_arr, +template +inline void kk_a_times_x_plus_b(typename in_array_t::value_type num_elements, out_array_t out_arr, in_array_t in_arr, scalar_1 a, scalar_2 b) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_for( - "KokkosKernels::Common::ATimesXPlusB", my_exec_space(0, num_elements), - A_times_X_plus_B( - out_arr, in_arr, a, b)); + Kokkos::parallel_for("KokkosKernels::Common::ATimesXPlusB", my_exec_space(0, num_elements), + A_times_X_plus_B(out_arr, in_arr, a, b)); } /** @@ -92,20 +84,17 @@ inline void kk_a_times_x_plus_b(typename in_array_t::value_type num_elements, * applied. 
*/ template -inline void kk_modular_view(typename in_array_type::value_type num_elements, - out_array_type out_arr, in_array_type in_arr, - int mod_factor_) { +inline void kk_modular_view(typename in_array_type::value_type num_elements, out_array_type out_arr, + in_array_type in_arr, int mod_factor_) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_for( - "KokkosKernels::Common::ModularView", my_exec_space(0, num_elements), - ModularView(out_arr, in_arr, mod_factor_)); + Kokkos::parallel_for("KokkosKernels::Common::ModularView", my_exec_space(0, num_elements), + ModularView(out_arr, in_arr, mod_factor_)); } template void kk_copy_vector(size_t num_elements, from_vector from, to_vector to) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_for("KokkosKernels::Common::CopyVector", - my_exec_space(0, num_elements), + Kokkos::parallel_for("KokkosKernels::Common::CopyVector", my_exec_space(0, num_elements), CopyVectorFunctor(from, to)); } } // namespace Impl diff --git a/common/src/KokkosKernels_helpers.hpp b/common/src/KokkosKernels_helpers.hpp index 1b725f2f5c..cea3a8a061 100644 --- a/common/src/KokkosKernels_helpers.hpp +++ b/common/src/KokkosKernels_helpers.hpp @@ -16,7 +16,7 @@ #ifndef KOKKOSKERNELS_HELPERS_HPP_ #define KOKKOSKERNELS_HELPERS_HPP_ -#include "KokkosKernels_config.h" // KOKKOSKERNELS_INST_LAYOUTLEFT, KOKKOSKERNELS_INST_LAYOUTRIGHT +#include "KokkosKernels_config.h" // KOKKOSKERNELS_INST_LAYOUTLEFT, KOKKOSKERNELS_INST_LAYOUTRIGHT #include "KokkosKernels_default_types.hpp" // default_layout #include @@ -29,49 +29,43 @@ namespace Impl { // Used to reduce number of code instantiations. 
template struct GetUnifiedLayoutPreferring { - using array_layout = typename std::conditional< - ((ViewType::rank == 1) && !std::is_same_v) || - (ViewType::rank == 0), - PreferredLayoutType, typename ViewType::array_layout>::type; + using array_layout = + typename std::conditional<((ViewType::rank == 1) && + !std::is_same_v) || + (ViewType::rank == 0), + PreferredLayoutType, typename ViewType::array_layout>::type; }; template struct GetUnifiedLayout { - using array_layout = - typename GetUnifiedLayoutPreferring::array_layout; + using array_layout = typename GetUnifiedLayoutPreferring::array_layout; }; -template ::value> +template ::value> struct GetUnifiedScalarViewType { typedef typename TX::non_const_value_type type; }; template struct GetUnifiedScalarViewType { - typedef Kokkos::View::array_layout, - typename T::device_type, - Kokkos::MemoryTraits > + typedef Kokkos::View< + typename T::non_const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout, + typename T::device_type, Kokkos::MemoryTraits > type; }; template struct GetUnifiedScalarViewType { - typedef Kokkos::View::array_layout, - typename T::device_type, - Kokkos::MemoryTraits > + typedef Kokkos::View< + typename T::const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout, + typename T::device_type, Kokkos::MemoryTraits > type; }; template -struct are_integral : std::bool_constant<((std::is_integral_v || - std::is_enum_v)&&...)> {}; +struct are_integral : std::bool_constant<((std::is_integral_v || std::is_enum_v)&&...)> {}; template inline constexpr bool are_integral_v = are_integral::value; diff --git a/common/src/Kokkos_ArithTraits.hpp b/common/src/Kokkos_ArithTraits.hpp index 415189be93..25089613d4 100644 --- a/common/src/Kokkos_ArithTraits.hpp +++ b/common/src/Kokkos_ArithTraits.hpp @@ -48,8 +48,7 @@ namespace { // anonymous /// /// Use intPowSigned or intPowUnsigned for general y. 
template -KOKKOS_FORCEINLINE_FUNCTION IntType intPowImpl(const IntType x, - const IntType y) { +KOKKOS_FORCEINLINE_FUNCTION IntType intPowImpl(const IntType x, const IntType y) { // Recursion (unrolled into while loop): pow(x, 2y) = (x^y)^2 IntType prod = x; IntType y_cur = 1; @@ -120,10 +119,8 @@ struct integer_abs { /// result of this function is undefined. However, this function will /// not throw an exception in that case. template -KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if::is_signed, - IntType>::type - intPowSigned(const IntType x, const IntType y) { +KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if::is_signed, IntType>::type +intPowSigned(const IntType x, const IntType y) { // It's not entirely clear what to return if x and y are both zero. // In the case of floating-point numbers, 0^0 is NaN. Here, though, // I think it's safe to return 0. @@ -143,10 +140,8 @@ KOKKOS_FORCEINLINE_FUNCTION return intPowImpl(x, y); } template -KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if::is_signed, - IntType>::type - intPowSigned(const IntType x, const IntType y) { +KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if::is_signed, IntType>::type +intPowSigned(const IntType x, const IntType y) { // It's not entirely clear what to return if x and y are both zero. // In the case of floating-point numbers, 0^0 is NaN. Here, though, // I think it's safe to return 0. @@ -166,8 +161,7 @@ KOKKOS_FORCEINLINE_FUNCTION /// result of this function is undefined. However, this function will /// not throw an exception in that case. template -KOKKOS_FORCEINLINE_FUNCTION IntType intPowUnsigned(const IntType x, - const IntType y) { +KOKKOS_FORCEINLINE_FUNCTION IntType intPowUnsigned(const IntType x, const IntType y) { // It's not entirely clear what to return if x and y are both zero. // In the case of floating-point numbers, 0^0 is NaN. Here, though, // I think it's safe to return 0. 
@@ -196,370 +190,229 @@ KOKKOS_FORCEINLINE_FUNCTION IntType intPowUnsigned(const IntType x, namespace Kokkos { // Macro to automate the wrapping of Kokkos Mathematical Functions -#define KOKKOSKERNELS_ARITHTRAITS_REAL_FP(FUNC_QUAL) \ - static FUNC_QUAL val_type zero() { return static_cast(0); } \ - static FUNC_QUAL val_type one() { return static_cast(1); } \ - static FUNC_QUAL val_type min() { \ - return Kokkos::Experimental::finite_min::value; \ - } \ - static FUNC_QUAL val_type max() { \ - return Kokkos::Experimental::finite_max::value; \ - } \ - static FUNC_QUAL val_type infinity() { \ - return Kokkos::Experimental::infinity::value; \ - } \ - static FUNC_QUAL val_type nan() { \ - return Kokkos::Experimental::quiet_NaN::value; \ - } \ - static FUNC_QUAL mag_type epsilon() { \ - return Kokkos::Experimental::epsilon::value; \ - } \ - static FUNC_QUAL mag_type sfmin() { \ - return Kokkos::Experimental::norm_min::value; \ - } \ - static FUNC_QUAL int base() { \ - return Kokkos::Experimental::radix::value; \ - } \ - static FUNC_QUAL mag_type prec() { \ - return epsilon() * static_cast(base()); \ - } \ - static FUNC_QUAL int t() { \ - return Kokkos::Experimental::digits::value; \ - } \ - static FUNC_QUAL mag_type rnd() { return one(); } \ - static FUNC_QUAL int emin() { \ - return Kokkos::Experimental::min_exponent::value; \ - } \ - static FUNC_QUAL mag_type rmin() { \ - return Kokkos::Experimental::norm_min::value; \ - } \ - static FUNC_QUAL int emax() { \ - return Kokkos::Experimental::max_exponent::value; \ - } \ - static FUNC_QUAL mag_type rmax() { \ - return Kokkos::Experimental::finite_max::value; \ - } \ - \ - static FUNC_QUAL bool isInf(const val_type x) { return Kokkos::isinf(x); } \ - static FUNC_QUAL bool isNan(const val_type x) { return Kokkos::isnan(x); } \ - static FUNC_QUAL mag_type abs(const val_type x) { return Kokkos::abs(x); } \ - static FUNC_QUAL mag_type real(const val_type x) { return Kokkos::real(x); } \ - static FUNC_QUAL mag_type imag(const 
val_type x) { return Kokkos::imag(x); } \ - static FUNC_QUAL val_type conj(const val_type x) { return x; } \ - static FUNC_QUAL val_type pow(const val_type x, const val_type y) { \ - return Kokkos::pow(x, y); \ - } \ - static FUNC_QUAL val_type sqrt(const val_type x) { return Kokkos::sqrt(x); } \ - static FUNC_QUAL val_type cbrt(const val_type x) { return Kokkos::cbrt(x); } \ - static FUNC_QUAL val_type exp(const val_type x) { return Kokkos::exp(x); } \ - static FUNC_QUAL val_type log(const val_type x) { return Kokkos::log(x); } \ - static FUNC_QUAL val_type log10(const val_type x) { \ - return Kokkos::log10(x); \ - } \ - static FUNC_QUAL val_type sin(const val_type x) { return Kokkos::sin(x); } \ - static FUNC_QUAL val_type cos(const val_type x) { return Kokkos::cos(x); } \ - static FUNC_QUAL val_type tan(const val_type x) { return Kokkos::tan(x); } \ - static FUNC_QUAL val_type sinh(const val_type x) { return Kokkos::sinh(x); } \ - static FUNC_QUAL val_type cosh(const val_type x) { return Kokkos::cosh(x); } \ - static FUNC_QUAL val_type tanh(const val_type x) { return Kokkos::tanh(x); } \ - static FUNC_QUAL val_type asin(const val_type x) { return Kokkos::asin(x); } \ - static FUNC_QUAL val_type acos(const val_type x) { return Kokkos::acos(x); } \ - static FUNC_QUAL val_type atan(const val_type x) { return Kokkos::atan(x); } \ - \ - static FUNC_QUAL bool isnaninf(const val_type x) { \ - return isNan(x) || isInf(x); \ - } \ - static FUNC_QUAL magnitudeType magnitude(const val_type x) { \ - return abs(x); \ - } \ - static FUNC_QUAL val_type conjugate(const val_type x) { return conj(x); } \ - static FUNC_QUAL val_type squareroot(const val_type x) { return sqrt(x); } \ +#define KOKKOSKERNELS_ARITHTRAITS_REAL_FP(FUNC_QUAL) \ + static FUNC_QUAL val_type zero() { return static_cast(0); } \ + static FUNC_QUAL val_type one() { return static_cast(1); } \ + static FUNC_QUAL val_type min() { return Kokkos::Experimental::finite_min::value; } \ + static FUNC_QUAL val_type 
max() { return Kokkos::Experimental::finite_max::value; } \ + static FUNC_QUAL val_type infinity() { return Kokkos::Experimental::infinity::value; } \ + static FUNC_QUAL val_type nan() { return Kokkos::Experimental::quiet_NaN::value; } \ + static FUNC_QUAL mag_type epsilon() { return Kokkos::Experimental::epsilon::value; } \ + static FUNC_QUAL mag_type sfmin() { return Kokkos::Experimental::norm_min::value; } \ + static FUNC_QUAL int base() { return Kokkos::Experimental::radix::value; } \ + static FUNC_QUAL mag_type prec() { return epsilon() * static_cast(base()); } \ + static FUNC_QUAL int t() { return Kokkos::Experimental::digits::value; } \ + static FUNC_QUAL mag_type rnd() { return one(); } \ + static FUNC_QUAL int emin() { return Kokkos::Experimental::min_exponent::value; } \ + static FUNC_QUAL mag_type rmin() { return Kokkos::Experimental::norm_min::value; } \ + static FUNC_QUAL int emax() { return Kokkos::Experimental::max_exponent::value; } \ + static FUNC_QUAL mag_type rmax() { return Kokkos::Experimental::finite_max::value; } \ + \ + static FUNC_QUAL bool isInf(const val_type x) { return Kokkos::isinf(x); } \ + static FUNC_QUAL bool isNan(const val_type x) { return Kokkos::isnan(x); } \ + static FUNC_QUAL mag_type abs(const val_type x) { return Kokkos::abs(x); } \ + static FUNC_QUAL mag_type real(const val_type x) { return Kokkos::real(x); } \ + static FUNC_QUAL mag_type imag(const val_type x) { return Kokkos::imag(x); } \ + static FUNC_QUAL val_type conj(const val_type x) { return x; } \ + static FUNC_QUAL val_type pow(const val_type x, const val_type y) { return Kokkos::pow(x, y); } \ + static FUNC_QUAL val_type sqrt(const val_type x) { return Kokkos::sqrt(x); } \ + static FUNC_QUAL val_type cbrt(const val_type x) { return Kokkos::cbrt(x); } \ + static FUNC_QUAL val_type exp(const val_type x) { return Kokkos::exp(x); } \ + static FUNC_QUAL val_type log(const val_type x) { return Kokkos::log(x); } \ + static FUNC_QUAL val_type log10(const val_type x) { 
return Kokkos::log10(x); } \ + static FUNC_QUAL val_type sin(const val_type x) { return Kokkos::sin(x); } \ + static FUNC_QUAL val_type cos(const val_type x) { return Kokkos::cos(x); } \ + static FUNC_QUAL val_type tan(const val_type x) { return Kokkos::tan(x); } \ + static FUNC_QUAL val_type sinh(const val_type x) { return Kokkos::sinh(x); } \ + static FUNC_QUAL val_type cosh(const val_type x) { return Kokkos::cosh(x); } \ + static FUNC_QUAL val_type tanh(const val_type x) { return Kokkos::tanh(x); } \ + static FUNC_QUAL val_type asin(const val_type x) { return Kokkos::asin(x); } \ + static FUNC_QUAL val_type acos(const val_type x) { return Kokkos::acos(x); } \ + static FUNC_QUAL val_type atan(const val_type x) { return Kokkos::atan(x); } \ + \ + static FUNC_QUAL bool isnaninf(const val_type x) { return isNan(x) || isInf(x); } \ + static FUNC_QUAL magnitudeType magnitude(const val_type x) { return abs(x); } \ + static FUNC_QUAL val_type conjugate(const val_type x) { return conj(x); } \ + static FUNC_QUAL val_type squareroot(const val_type x) { return sqrt(x); } \ static FUNC_QUAL mag_type eps() { return epsilon(); } // Macro to automate the wrapping of Kokkos Mathematical Functions -#define KOKKOSKERNELS_ARITHTRAITS_HALF_FP(FUNC_QUAL) \ - static FUNC_QUAL val_type zero() { return static_cast(0); } \ - static FUNC_QUAL val_type one() { return static_cast(1); } \ - static FUNC_QUAL val_type min() { \ - return Kokkos::Experimental::finite_min::value; \ - } \ - static FUNC_QUAL val_type max() { \ - return Kokkos::Experimental::finite_max::value; \ - } \ - static FUNC_QUAL val_type infinity() { \ - return Kokkos::Experimental::infinity::value; \ - } \ - static FUNC_QUAL val_type nan() { \ - return Kokkos::Experimental::quiet_NaN::value; \ - } \ - static FUNC_QUAL mag_type epsilon() { \ - return Kokkos::Experimental::epsilon::value; \ - } \ - static FUNC_QUAL mag_type sfmin() { \ - return Kokkos::Experimental::norm_min::value; \ - } \ - static FUNC_QUAL int base() { \ - 
return Kokkos::Experimental::radix::value; \ - } \ - static FUNC_QUAL mag_type prec() { \ - return epsilon() * static_cast(base()); \ - } \ - static FUNC_QUAL int t() { \ - return Kokkos::Experimental::digits::value; \ - } \ - static FUNC_QUAL mag_type rnd() { return one(); } \ - static FUNC_QUAL int emin() { \ - return Kokkos::Experimental::min_exponent::value; \ - } \ - static FUNC_QUAL mag_type rmin() { \ - return Kokkos::Experimental::norm_min::value; \ - } \ - static FUNC_QUAL int emax() { \ - return Kokkos::Experimental::max_exponent::value; \ - } \ - static FUNC_QUAL mag_type rmax() { \ - return Kokkos::Experimental::finite_max::value; \ - } \ - \ - static FUNC_QUAL bool isInf(const val_type x) { return Kokkos::isinf(x); } \ - static FUNC_QUAL mag_type abs(const val_type x) { return Kokkos::abs(x); } \ - static FUNC_QUAL mag_type real(const val_type x) { return Kokkos::real(x); } \ - static FUNC_QUAL mag_type imag(const val_type x) { return Kokkos::imag(x); } \ - static FUNC_QUAL val_type conj(const val_type x) { return x; } \ - static FUNC_QUAL val_type pow(const val_type x, const val_type y) { \ - return Kokkos::pow(x, y); \ - } \ - static FUNC_QUAL val_type sqrt(const val_type x) { return Kokkos::sqrt(x); } \ - static FUNC_QUAL val_type cbrt(const val_type x) { return Kokkos::cbrt(x); } \ - static FUNC_QUAL val_type exp(const val_type x) { return Kokkos::exp(x); } \ - static FUNC_QUAL val_type log(const val_type x) { return Kokkos::log(x); } \ - static FUNC_QUAL val_type log10(const val_type x) { \ - return Kokkos::log10(x); \ - } \ - static FUNC_QUAL val_type sin(const val_type x) { return Kokkos::sin(x); } \ - static FUNC_QUAL val_type cos(const val_type x) { return Kokkos::cos(x); } \ - static FUNC_QUAL val_type tan(const val_type x) { return Kokkos::tan(x); } \ - static FUNC_QUAL val_type sinh(const val_type x) { return Kokkos::sinh(x); } \ - static FUNC_QUAL val_type cosh(const val_type x) { return Kokkos::cosh(x); } \ - static FUNC_QUAL val_type 
tanh(const val_type x) { return Kokkos::tanh(x); } \ - static FUNC_QUAL val_type asin(const val_type x) { return Kokkos::asin(x); } \ - static FUNC_QUAL val_type acos(const val_type x) { return Kokkos::acos(x); } \ - static FUNC_QUAL val_type atan(const val_type x) { return Kokkos::atan(x); } \ - \ - static FUNC_QUAL magnitudeType magnitude(const val_type x) { \ - return abs(x); \ - } \ - static FUNC_QUAL val_type conjugate(const val_type x) { return conj(x); } \ - static FUNC_QUAL val_type squareroot(const val_type x) { return sqrt(x); } \ +#define KOKKOSKERNELS_ARITHTRAITS_HALF_FP(FUNC_QUAL) \ + static FUNC_QUAL val_type zero() { return static_cast(0); } \ + static FUNC_QUAL val_type one() { return static_cast(1); } \ + static FUNC_QUAL val_type min() { return Kokkos::Experimental::finite_min::value; } \ + static FUNC_QUAL val_type max() { return Kokkos::Experimental::finite_max::value; } \ + static FUNC_QUAL val_type infinity() { return Kokkos::Experimental::infinity::value; } \ + static FUNC_QUAL val_type nan() { return Kokkos::Experimental::quiet_NaN::value; } \ + static FUNC_QUAL mag_type epsilon() { return Kokkos::Experimental::epsilon::value; } \ + static FUNC_QUAL mag_type sfmin() { return Kokkos::Experimental::norm_min::value; } \ + static FUNC_QUAL int base() { return Kokkos::Experimental::radix::value; } \ + static FUNC_QUAL mag_type prec() { return epsilon() * static_cast(base()); } \ + static FUNC_QUAL int t() { return Kokkos::Experimental::digits::value; } \ + static FUNC_QUAL mag_type rnd() { return one(); } \ + static FUNC_QUAL int emin() { return Kokkos::Experimental::min_exponent::value; } \ + static FUNC_QUAL mag_type rmin() { return Kokkos::Experimental::norm_min::value; } \ + static FUNC_QUAL int emax() { return Kokkos::Experimental::max_exponent::value; } \ + static FUNC_QUAL mag_type rmax() { return Kokkos::Experimental::finite_max::value; } \ + \ + static FUNC_QUAL bool isInf(const val_type x) { return Kokkos::isinf(x); } \ + static 
FUNC_QUAL mag_type abs(const val_type x) { return Kokkos::abs(x); } \ + static FUNC_QUAL mag_type real(const val_type x) { return Kokkos::real(x); } \ + static FUNC_QUAL mag_type imag(const val_type x) { return Kokkos::imag(x); } \ + static FUNC_QUAL val_type conj(const val_type x) { return x; } \ + static FUNC_QUAL val_type pow(const val_type x, const val_type y) { return Kokkos::pow(x, y); } \ + static FUNC_QUAL val_type sqrt(const val_type x) { return Kokkos::sqrt(x); } \ + static FUNC_QUAL val_type cbrt(const val_type x) { return Kokkos::cbrt(x); } \ + static FUNC_QUAL val_type exp(const val_type x) { return Kokkos::exp(x); } \ + static FUNC_QUAL val_type log(const val_type x) { return Kokkos::log(x); } \ + static FUNC_QUAL val_type log10(const val_type x) { return Kokkos::log10(x); } \ + static FUNC_QUAL val_type sin(const val_type x) { return Kokkos::sin(x); } \ + static FUNC_QUAL val_type cos(const val_type x) { return Kokkos::cos(x); } \ + static FUNC_QUAL val_type tan(const val_type x) { return Kokkos::tan(x); } \ + static FUNC_QUAL val_type sinh(const val_type x) { return Kokkos::sinh(x); } \ + static FUNC_QUAL val_type cosh(const val_type x) { return Kokkos::cosh(x); } \ + static FUNC_QUAL val_type tanh(const val_type x) { return Kokkos::tanh(x); } \ + static FUNC_QUAL val_type asin(const val_type x) { return Kokkos::asin(x); } \ + static FUNC_QUAL val_type acos(const val_type x) { return Kokkos::acos(x); } \ + static FUNC_QUAL val_type atan(const val_type x) { return Kokkos::atan(x); } \ + \ + static FUNC_QUAL magnitudeType magnitude(const val_type x) { return abs(x); } \ + static FUNC_QUAL val_type conjugate(const val_type x) { return conj(x); } \ + static FUNC_QUAL val_type squareroot(const val_type x) { return sqrt(x); } \ static FUNC_QUAL mag_type eps() { return epsilon(); } -#define KOKKOSKERNELS_ARITHTRAITS_CMPLX_FP(FUNC_QUAL) \ - \ - static constexpr bool is_specialized = true; \ - static constexpr bool is_signed = true; \ - static constexpr bool 
is_integer = false; \ - static constexpr bool is_exact = false; \ - static constexpr bool is_complex = true; \ - static constexpr bool has_infinity = true; \ - \ - using magnitudeType = mag_type; \ - using halfPrecision = \ - ::Kokkos::complex::halfPrecision>; \ - using doublePrecision = \ - ::Kokkos::complex::doublePrecision>; \ - \ - static constexpr bool isComplex = true; \ - static constexpr bool isOrdinal = false; \ - static constexpr bool isComparable = false; \ - static constexpr bool hasMachineParameters = \ - ArithTraits::hasMachineParameters; \ - \ - static FUNC_QUAL val_type zero() { \ - return val_type(ArithTraits::zero(), \ - ArithTraits::zero()); \ - } \ - static FUNC_QUAL val_type one() { \ - return val_type(ArithTraits::one(), \ - ArithTraits::zero()); \ - } \ - static FUNC_QUAL val_type min() { \ - return val_type(ArithTraits::min(), \ - ArithTraits::min()); \ - } \ - static FUNC_QUAL val_type max() { \ - return val_type(ArithTraits::max(), \ - ArithTraits::max()); \ - } \ - static FUNC_QUAL val_type infinity() { \ - return val_type(ArithTraits::infinity(), \ - ArithTraits::infinity()); \ - } \ - static FUNC_QUAL val_type nan() { \ - return val_type(ArithTraits::nan(), \ - ArithTraits::nan()); \ - } \ - static FUNC_QUAL mag_type epsilon() { \ - return ArithTraits::epsilon(); \ - } \ - static FUNC_QUAL mag_type sfmin() { return ArithTraits::sfmin(); } \ - static FUNC_QUAL int base() { return ArithTraits::base(); } \ - static FUNC_QUAL mag_type prec() { return ArithTraits::prec(); } \ - static FUNC_QUAL int t() { return ArithTraits::t(); } \ - static FUNC_QUAL mag_type rnd() { return ArithTraits::rnd(); } \ - static FUNC_QUAL int emin() { return ArithTraits::emin(); } \ - static FUNC_QUAL mag_type rmin() { return ArithTraits::rmin(); } \ - static FUNC_QUAL int emax() { return ArithTraits::emax(); } \ - static FUNC_QUAL mag_type rmax() { return ArithTraits::rmax(); } \ - static FUNC_QUAL bool isInf(const val_type x) { \ - return 
ArithTraits::isInf(x.real()) || \ - ArithTraits::isInf(x.imag()); \ - } \ - static FUNC_QUAL bool isNan(const val_type x) { \ - return ArithTraits::isNan(x.real()) || \ - ArithTraits::isNan(x.imag()); \ - } \ - static FUNC_QUAL mag_type abs(const val_type x) { return ::Kokkos::abs(x); } \ - static FUNC_QUAL mag_type real(const val_type x) { return x.real(); } \ - static FUNC_QUAL mag_type imag(const val_type x) { return x.imag(); } \ - static FUNC_QUAL val_type conj(const val_type x) { \ - return ::Kokkos::conj(x); \ - } \ - static FUNC_QUAL val_type pow(const val_type x, const val_type y) { \ - return Kokkos::pow(x, y); \ - } \ - static FUNC_QUAL val_type pow(const val_type x, const mag_type y) { \ - return Kokkos::pow(x, y); \ - } \ - static FUNC_QUAL val_type pow(const mag_type x, const val_type y) { \ - return Kokkos::pow(x, y); \ - } \ - static FUNC_QUAL val_type sqrt(const val_type x) { \ - return ::Kokkos::sqrt(x); \ - } \ - static FUNC_QUAL val_type exp(const val_type x) { return Kokkos::exp(x); } \ - static FUNC_QUAL val_type log(const val_type x) { return Kokkos::log(x); } \ - static FUNC_QUAL val_type log10(const val_type x) { \ - return Kokkos::log10(x); \ - } \ - static FUNC_QUAL val_type sin(const val_type x) { return Kokkos::sin(x); } \ - static FUNC_QUAL val_type cos(const val_type x) { return Kokkos::cos(x); } \ - static FUNC_QUAL val_type tan(const val_type x) { return Kokkos::tan(x); } \ - static FUNC_QUAL val_type sinh(const val_type x) { return Kokkos::sinh(x); } \ - static FUNC_QUAL val_type cosh(const val_type x) { return Kokkos::cosh(x); } \ - static FUNC_QUAL val_type tanh(const val_type x) { return Kokkos::tanh(x); } \ - static FUNC_QUAL val_type asin(const val_type x) { return Kokkos::asin(x); } \ - static FUNC_QUAL val_type acos(const val_type x) { return Kokkos::acos(x); } \ - static FUNC_QUAL val_type atan(const val_type x) { return Kokkos::atan(x); } \ - static FUNC_QUAL bool isnaninf(const val_type& x) { \ - return isNan(x) || 
isInf(x); \ - } \ - static FUNC_QUAL mag_type magnitude(const val_type x) { return abs(x); } \ - static FUNC_QUAL val_type conjugate(const val_type x) { return conj(x); } \ - static FUNC_QUAL val_type squareroot(const val_type x) { return sqrt(x); } \ +#define KOKKOSKERNELS_ARITHTRAITS_CMPLX_FP(FUNC_QUAL) \ + \ + static constexpr bool is_specialized = true; \ + static constexpr bool is_signed = true; \ + static constexpr bool is_integer = false; \ + static constexpr bool is_exact = false; \ + static constexpr bool is_complex = true; \ + static constexpr bool has_infinity = true; \ + \ + using magnitudeType = mag_type; \ + using halfPrecision = ::Kokkos::complex::halfPrecision>; \ + using doublePrecision = ::Kokkos::complex::doublePrecision>; \ + \ + static constexpr bool isComplex = true; \ + static constexpr bool isOrdinal = false; \ + static constexpr bool isComparable = false; \ + static constexpr bool hasMachineParameters = ArithTraits::hasMachineParameters; \ + \ + static FUNC_QUAL val_type zero() { return val_type(ArithTraits::zero(), ArithTraits::zero()); } \ + static FUNC_QUAL val_type one() { return val_type(ArithTraits::one(), ArithTraits::zero()); } \ + static FUNC_QUAL val_type min() { return val_type(ArithTraits::min(), ArithTraits::min()); } \ + static FUNC_QUAL val_type max() { return val_type(ArithTraits::max(), ArithTraits::max()); } \ + static FUNC_QUAL val_type infinity() { \ + return val_type(ArithTraits::infinity(), ArithTraits::infinity()); \ + } \ + static FUNC_QUAL val_type nan() { return val_type(ArithTraits::nan(), ArithTraits::nan()); } \ + static FUNC_QUAL mag_type epsilon() { return ArithTraits::epsilon(); } \ + static FUNC_QUAL mag_type sfmin() { return ArithTraits::sfmin(); } \ + static FUNC_QUAL int base() { return ArithTraits::base(); } \ + static FUNC_QUAL mag_type prec() { return ArithTraits::prec(); } \ + static FUNC_QUAL int t() { return ArithTraits::t(); } \ + static FUNC_QUAL mag_type rnd() { return ArithTraits::rnd(); } \ + 
static FUNC_QUAL int emin() { return ArithTraits::emin(); } \ + static FUNC_QUAL mag_type rmin() { return ArithTraits::rmin(); } \ + static FUNC_QUAL int emax() { return ArithTraits::emax(); } \ + static FUNC_QUAL mag_type rmax() { return ArithTraits::rmax(); } \ + static FUNC_QUAL bool isInf(const val_type x) { \ + return ArithTraits::isInf(x.real()) || ArithTraits::isInf(x.imag()); \ + } \ + static FUNC_QUAL bool isNan(const val_type x) { \ + return ArithTraits::isNan(x.real()) || ArithTraits::isNan(x.imag()); \ + } \ + static FUNC_QUAL mag_type abs(const val_type x) { return ::Kokkos::abs(x); } \ + static FUNC_QUAL mag_type real(const val_type x) { return x.real(); } \ + static FUNC_QUAL mag_type imag(const val_type x) { return x.imag(); } \ + static FUNC_QUAL val_type conj(const val_type x) { return ::Kokkos::conj(x); } \ + static FUNC_QUAL val_type pow(const val_type x, const val_type y) { return Kokkos::pow(x, y); } \ + static FUNC_QUAL val_type pow(const val_type x, const mag_type y) { return Kokkos::pow(x, y); } \ + static FUNC_QUAL val_type pow(const mag_type x, const val_type y) { return Kokkos::pow(x, y); } \ + static FUNC_QUAL val_type sqrt(const val_type x) { return ::Kokkos::sqrt(x); } \ + static FUNC_QUAL val_type exp(const val_type x) { return Kokkos::exp(x); } \ + static FUNC_QUAL val_type log(const val_type x) { return Kokkos::log(x); } \ + static FUNC_QUAL val_type log10(const val_type x) { return Kokkos::log10(x); } \ + static FUNC_QUAL val_type sin(const val_type x) { return Kokkos::sin(x); } \ + static FUNC_QUAL val_type cos(const val_type x) { return Kokkos::cos(x); } \ + static FUNC_QUAL val_type tan(const val_type x) { return Kokkos::tan(x); } \ + static FUNC_QUAL val_type sinh(const val_type x) { return Kokkos::sinh(x); } \ + static FUNC_QUAL val_type cosh(const val_type x) { return Kokkos::cosh(x); } \ + static FUNC_QUAL val_type tanh(const val_type x) { return Kokkos::tanh(x); } \ + static FUNC_QUAL val_type asin(const val_type x) { 
return Kokkos::asin(x); } \ + static FUNC_QUAL val_type acos(const val_type x) { return Kokkos::acos(x); } \ + static FUNC_QUAL val_type atan(const val_type x) { return Kokkos::atan(x); } \ + static FUNC_QUAL bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); } \ + static FUNC_QUAL mag_type magnitude(const val_type x) { return abs(x); } \ + static FUNC_QUAL val_type conjugate(const val_type x) { return conj(x); } \ + static FUNC_QUAL val_type squareroot(const val_type x) { return sqrt(x); } \ static FUNC_QUAL mag_type eps() { return epsilon(); } template -static KOKKOS_FUNCTION - typename std::enable_if::is_signed, - val_type>::type - KokkosKernelsAbs(const val_type x) { +static KOKKOS_FUNCTION typename std::enable_if::is_signed, val_type>::type +KokkosKernelsAbs(const val_type x) { return Kokkos::abs(x); } template -static KOKKOS_FUNCTION - typename std::enable_if::is_signed, - val_type>::type - KokkosKernelsAbs(const val_type x) { +static KOKKOS_FUNCTION typename std::enable_if::is_signed, val_type>::type +KokkosKernelsAbs(const val_type x) { return x; } template -static KOKKOS_FUNCTION - typename std::enable_if::is_signed, - val_type>::type - KokkosKernelsNan() { +static KOKKOS_FUNCTION typename std::enable_if::is_signed, val_type>::type +KokkosKernelsNan() { return -1; } template -static KOKKOS_FUNCTION - typename std::enable_if::is_signed, - val_type>::type - KokkosKernelsNan() { +static KOKKOS_FUNCTION typename std::enable_if::is_signed, val_type>::type +KokkosKernelsNan() { return Kokkos::Experimental::finite_max::value; } -#define KOKKOSKERNELS_ARITHTRAITS_INTEGRAL() \ - \ - static constexpr bool is_specialized = true; \ - static constexpr bool is_integer = true; \ - static constexpr bool is_exact = true; \ - static constexpr bool is_complex = false; \ - static constexpr bool has_infinity = false; \ - \ - using magnitudeType = mag_type; \ - using halfPrecision = val_type; \ - using doublePrecision = val_type; \ - \ - static constexpr bool 
isComplex = false; \ - static constexpr bool isOrdinal = true; \ - static constexpr bool isComparable = true; \ - static constexpr bool hasMachineParameters = false; \ - \ - static KOKKOS_FUNCTION val_type zero() { return static_cast(0); } \ - static KOKKOS_FUNCTION val_type one() { return static_cast(1); } \ - static KOKKOS_FUNCTION val_type min() { \ - return Kokkos::Experimental::finite_min::value; \ - } \ - static KOKKOS_FUNCTION val_type max() { \ - return Kokkos::Experimental::finite_max::value; \ - } \ - static KOKKOS_FUNCTION val_type infinity() { \ - return static_cast(0); \ - } \ - static KOKKOS_FUNCTION val_type nan() { \ - return KokkosKernelsNan(); \ - } \ - static KOKKOS_FUNCTION bool isInf(const val_type) { return false; } \ - static KOKKOS_FUNCTION bool isNan(const val_type) { return false; } \ - static KOKKOS_FUNCTION mag_type abs(const val_type x) { \ - return KokkosKernelsAbs(x); \ - } \ - static KOKKOS_FUNCTION mag_type real(const val_type x) { \ - return Kokkos::real(x); \ - } \ - static KOKKOS_FUNCTION mag_type imag(const val_type) { return zero(); } \ - static KOKKOS_FUNCTION val_type conj(const val_type x) { return x; } \ - static KOKKOS_FUNCTION val_type pow(const val_type x, const val_type y) { \ - return Kokkos::pow(x, y); \ - } \ - static KOKKOS_FUNCTION val_type sqrt(const val_type x) { \ - return static_cast(Kokkos::sqrt(abs(x))); \ - } \ - static KOKKOS_FUNCTION val_type cbrt(const val_type x) { \ - return static_cast(Kokkos::cbrt(abs(x))); \ - } \ - static KOKKOS_FUNCTION val_type exp(const val_type x) { \ - return static_cast(Kokkos::exp(abs(x))); \ - } \ - static KOKKOS_FUNCTION val_type log(const val_type x) { \ - return static_cast(Kokkos::log(abs(x))); \ - } \ - static KOKKOS_FUNCTION val_type log10(const val_type x) { \ - return static_cast(Kokkos::log10(abs(x))); \ - } \ - static KOKKOS_FUNCTION mag_type epsilon() { return zero(); } \ - static KOKKOS_FUNCTION magnitudeType magnitude(const val_type x) { \ - return abs(x); \ - } 
\ - static KOKKOS_FUNCTION val_type conjugate(const val_type x) { \ - return conj(x); \ - } \ - static KOKKOS_FUNCTION bool isnaninf(const val_type) { return false; } \ - static KOKKOS_FUNCTION val_type squareroot(const val_type x) { \ - return sqrt(x); \ - } +#define KOKKOSKERNELS_ARITHTRAITS_INTEGRAL() \ + \ + static constexpr bool is_specialized = true; \ + static constexpr bool is_integer = true; \ + static constexpr bool is_exact = true; \ + static constexpr bool is_complex = false; \ + static constexpr bool has_infinity = false; \ + \ + using magnitudeType = mag_type; \ + using halfPrecision = val_type; \ + using doublePrecision = val_type; \ + \ + static constexpr bool isComplex = false; \ + static constexpr bool isOrdinal = true; \ + static constexpr bool isComparable = true; \ + static constexpr bool hasMachineParameters = false; \ + \ + static KOKKOS_FUNCTION val_type zero() { return static_cast(0); } \ + static KOKKOS_FUNCTION val_type one() { return static_cast(1); } \ + static KOKKOS_FUNCTION val_type min() { return Kokkos::Experimental::finite_min::value; } \ + static KOKKOS_FUNCTION val_type max() { return Kokkos::Experimental::finite_max::value; } \ + static KOKKOS_FUNCTION val_type infinity() { return static_cast(0); } \ + static KOKKOS_FUNCTION val_type nan() { return KokkosKernelsNan(); } \ + static KOKKOS_FUNCTION bool isInf(const val_type) { return false; } \ + static KOKKOS_FUNCTION bool isNan(const val_type) { return false; } \ + static KOKKOS_FUNCTION mag_type abs(const val_type x) { return KokkosKernelsAbs(x); } \ + static KOKKOS_FUNCTION mag_type real(const val_type x) { return Kokkos::real(x); } \ + static KOKKOS_FUNCTION mag_type imag(const val_type) { return zero(); } \ + static KOKKOS_FUNCTION val_type conj(const val_type x) { return x; } \ + static KOKKOS_FUNCTION val_type pow(const val_type x, const val_type y) { return Kokkos::pow(x, y); } \ + static KOKKOS_FUNCTION val_type sqrt(const val_type x) { return 
static_cast(Kokkos::sqrt(abs(x))); } \ + static KOKKOS_FUNCTION val_type cbrt(const val_type x) { return static_cast(Kokkos::cbrt(abs(x))); } \ + static KOKKOS_FUNCTION val_type exp(const val_type x) { return static_cast(Kokkos::exp(abs(x))); } \ + static KOKKOS_FUNCTION val_type log(const val_type x) { return static_cast(Kokkos::log(abs(x))); } \ + static KOKKOS_FUNCTION val_type log10(const val_type x) { return static_cast(Kokkos::log10(abs(x))); } \ + static KOKKOS_FUNCTION mag_type epsilon() { return zero(); } \ + static KOKKOS_FUNCTION magnitudeType magnitude(const val_type x) { return abs(x); } \ + static KOKKOS_FUNCTION val_type conjugate(const val_type x) { return conj(x); } \ + static KOKKOS_FUNCTION bool isnaninf(const val_type) { return false; } \ + static KOKKOS_FUNCTION val_type squareroot(const val_type x) { return sqrt(x); } /// \class ArithTraits /// \brief Traits class for arithmetic on type T. @@ -1103,11 +956,9 @@ class ArithTraits { using magnitudeType = mag_type; using halfPrecision = float; #if defined(__CUDA_ARCH__) - using doublePrecision = - double; // CUDA doesn't support long double, unfortunately + using doublePrecision = double; // CUDA doesn't support long double, unfortunately #elif defined(__HIP_DEVICE_COMPILE__) - using doublePrecision = - double; // HIP does not support long double unfortunately + using doublePrecision = double; // HIP does not support long double unfortunately #else using doublePrecision = long double; #endif // __CUDA_ARCH__ @@ -1230,8 +1081,7 @@ class ArithTraits > { static constexpr bool has_infinity = true; static std::complex infinity() { - return std::complex(ArithTraits::infinity(), - ArithTraits::infinity()); + return std::complex(ArithTraits::infinity(), ArithTraits::infinity()); } #ifdef KOKKOS_ENABLE_SYCL @@ -1280,37 +1130,23 @@ class ArithTraits > { return isnan(real(x)) || isnan(imag(x)); } #endif - static mag_type abs(const std::complex& x) { - return std::abs(x); - } + static mag_type abs(const 
std::complex& x) { return std::abs(x); } static std::complex zero() { - return std::complex(ArithTraits::zero(), - ArithTraits::zero()); + return std::complex(ArithTraits::zero(), ArithTraits::zero()); } static std::complex one() { - return std::complex(ArithTraits::one(), - ArithTraits::zero()); + return std::complex(ArithTraits::one(), ArithTraits::zero()); } static std::complex min() { - return std::complex(ArithTraits::min(), - ArithTraits::zero()); + return std::complex(ArithTraits::min(), ArithTraits::zero()); } static std::complex max() { - return std::complex(ArithTraits::max(), - ArithTraits::zero()); - } - static mag_type real(const std::complex& x) { - return std::real(x); - } - static mag_type imag(const std::complex& x) { - return std::imag(x); + return std::complex(ArithTraits::max(), ArithTraits::zero()); } - static std::complex conj( - const std::complex& x) { - return std::conj(x); - } - static std::complex pow(const std::complex& x, - const std::complex& y) { + static mag_type real(const std::complex& x) { return std::real(x); } + static mag_type imag(const std::complex& x) { return std::imag(x); } + static std::complex conj(const std::complex& x) { return std::conj(x); } + static std::complex pow(const std::complex& x, const std::complex& y) { // Fix for some weird gcc 4.2.1 inaccuracy. if (y == one()) { return x; @@ -1320,46 +1156,29 @@ class ArithTraits > { return std::pow(x, y); } } - static std::complex pow(const std::complex& x, - const RealFloatType& y) { + static std::complex pow(const std::complex& x, const RealFloatType& y) { // Fix for some weird gcc 4.2.1 inaccuracy. 
if (y == ArithTraits::one()) { return x; - } else if (y == ArithTraits::one() + - ArithTraits::one()) { + } else if (y == ArithTraits::one() + ArithTraits::one()) { return x * x; } else { return std::pow(x, y); } } - static std::complex sqrt( - const std::complex& x) { - return std::sqrt(x); - } - static std::complex cbrt( - const std::complex& x) { + static std::complex sqrt(const std::complex& x) { return std::sqrt(x); } + static std::complex cbrt(const std::complex& x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL return sycl::cbrt(x); #else return ::cbrt(x); #endif } - static std::complex exp(const std::complex& x) { - return std::exp(x); - } - static std::complex log(const std::complex& x) { - return std::log(x); - } - static std::complex log10( - const std::complex& x) { - return std::log10(x); - } - static std::complex sin(const std::complex& x) { - return std::sin(x); - } - static std::complex cos(const std::complex& x) { - return std::cos(x); - } + static std::complex exp(const std::complex& x) { return std::exp(x); } + static std::complex log(const std::complex& x) { return std::log(x); } + static std::complex log10(const std::complex& x) { return std::log10(x); } + static std::complex sin(const std::complex& x) { return std::sin(x); } + static std::complex cos(const std::complex& x) { return std::cos(x); } static std::complex tan(const std::complex& x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL return sycl::tan(x); @@ -1367,36 +1186,24 @@ class ArithTraits > { return std::tan(x); #endif } - static std::complex sinh( - const std::complex& x) { - return std::sinh(x); - } - static std::complex cosh( - const std::complex& x) { - return std::cosh(x); - } - static std::complex tanh( - const std::complex& x) { - return std::tanh(x); - } - static std::complex asin( - const std::complex& x) { + static std::complex sinh(const std::complex& x) { return std::sinh(x); } + static std::complex cosh(const std::complex& x) { return std::cosh(x); } + static 
std::complex tanh(const std::complex& x) { return std::tanh(x); } + static std::complex asin(const std::complex& x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL return sycl::asin(x); #else return ::asin(x); #endif } - static std::complex acos( - const std::complex& x) { + static std::complex acos(const std::complex& x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL return sycl::acos(x); #else return ::acos(x); #endif } - static std::complex atan( - const std::complex& x) { + static std::complex atan(const std::complex& x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL using sycl::atan; #else @@ -1411,33 +1218,19 @@ class ArithTraits > { static mag_type epsilon() { return ArithTraits::epsilon(); } // Backwards compatibility with Teuchos::ScalarTraits. - using magnitudeType = mag_type; - using halfPrecision = - std::complex::halfPrecision>; - using doublePrecision = - std::complex::doublePrecision>; + using magnitudeType = mag_type; + using halfPrecision = std::complex::halfPrecision>; + using doublePrecision = std::complex::doublePrecision>; static constexpr bool isComplex = true; static constexpr bool isOrdinal = false; static constexpr bool isComparable = false; static constexpr bool hasMachineParameters = true; - static bool isnaninf(const std::complex& x) { - return isNan(x) || isInf(x); - } - static mag_type magnitude(const std::complex& x) { - return abs(x); - } - static std::complex conjugate( - const std::complex& x) { - return conj(x); - } - static std::string name() { - return std::string("std::complex<") + ArithTraits::name() + ">"; - } - static std::complex squareroot( - const std::complex& x) { - return sqrt(x); - } + static bool isnaninf(const std::complex& x) { return isNan(x) || isInf(x); } + static mag_type magnitude(const std::complex& x) { return abs(x); } + static std::complex conjugate(const std::complex& x) { return conj(x); } + static std::string name() { return std::string("std::complex<") + ArithTraits::name() + ">"; } + static 
std::complex squareroot(const std::complex& x) { return sqrt(x); } static mag_type eps() { return epsilon(); } static mag_type sfmin() { return ArithTraits::sfmin(); } static int base() { return ArithTraits::base(); } @@ -1637,9 +1430,7 @@ struct [[deprecated]] ArithTraits { static inline mag_type real(const val_type& x) { return x; } static inline mag_type imag(const val_type&) { return zero(); } static inline val_type conj(const val_type& x) { return x; } - static inline val_type pow(const val_type& x, const val_type& y) { - return ::pow(x, y); - } + static inline val_type pow(const val_type& x, const val_type& y) { return ::pow(x, y); } static inline val_type sqrt(const val_type& x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL return sycl::sqrt(x); @@ -1710,11 +1501,7 @@ struct [[deprecated]] ArithTraits { static int base() { return std::numeric_limits::radix; } static mag_type prec() { return eps() * base(); } static int t() { return std::numeric_limits::digits; } - static mag_type rnd() { - return std::numeric_limits::round_style == std::round_to_nearest - ? one() - : zero(); - } + static mag_type rnd() { return std::numeric_limits::round_style == std::round_to_nearest ? 
one() : zero(); } static int emin() { return std::numeric_limits::min_exponent; } static mag_type rmin() { return std::numeric_limits::min(); } static int emax() { return std::numeric_limits::max_exponent; } @@ -1753,9 +1540,7 @@ struct [[deprecated]] ArithTraits { static inline mag_type real(const val_type& x) { return x; } static inline mag_type imag(const val_type&) { return zero(); } static inline val_type conj(const val_type& x) { return x; } - static inline val_type pow(const val_type& x, const val_type& y) { - return ::pow(x, y); - } + static inline val_type pow(const val_type& x, const val_type& y) { return ::pow(x, y); } static inline val_type sqrt(const val_type& x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL return sycl::sqrt(x); @@ -1810,9 +1595,7 @@ struct [[deprecated]] ArithTraits { #endif } static inline val_type nan() { return val_type::_nan; } - static inline val_type epsilon() { - return std::numeric_limits::epsilon(); - } + static inline val_type epsilon() { return std::numeric_limits::epsilon(); } typedef qd_real magnitudeType; typedef dd_real halfPrecision; @@ -1832,11 +1615,7 @@ struct [[deprecated]] ArithTraits { static int base() { return std::numeric_limits::radix; } static mag_type prec() { return eps() * base(); } static int t() { return std::numeric_limits::digits; } - static mag_type rnd() { - return std::numeric_limits::round_style == std::round_to_nearest - ? one() - : zero(); - } + static mag_type rnd() { return std::numeric_limits::round_style == std::round_to_nearest ? 
one() : zero(); } static int emin() { return std::numeric_limits::min_exponent; } static mag_type rmin() { return std::numeric_limits::min(); } static int emax() { return std::numeric_limits::max_exponent; } @@ -1857,8 +1636,7 @@ struct [[deprecated]] ArithTraits { namespace Details { template -using ArithTraits [[deprecated("Use Kokkos::ArithTraits instead")]] = - ::Kokkos::ArithTraits; +using ArithTraits [[deprecated("Use Kokkos::ArithTraits instead")]] = ::Kokkos::ArithTraits; } // namespace Details } // namespace Kokkos diff --git a/common/src/Kokkos_InnerProductSpaceTraits.hpp b/common/src/Kokkos_InnerProductSpaceTraits.hpp index c2bc475c45..25337c925f 100644 --- a/common/src/Kokkos_InnerProductSpaceTraits.hpp +++ b/common/src/Kokkos_InnerProductSpaceTraits.hpp @@ -125,19 +125,14 @@ class InnerProductSpaceTraits { typedef val_type dot_type; //! The "norm" (absolute value or magnitude) of a value x of type val_type. - static KOKKOS_FORCEINLINE_FUNCTION mag_type norm(const val_type& x) { - return Kokkos::ArithTraits::abs(x); - } + static KOKKOS_FORCEINLINE_FUNCTION mag_type norm(const val_type& x) { return Kokkos::ArithTraits::abs(x); } /// \brief The "dot product" of two values x and y of type val_type. /// /// This default implementation should suffice unless val_type is /// complex. In that case, see the partial specialization for /// Kokkos::complex below to see our convention for which input gets /// conjugated. - static KOKKOS_FORCEINLINE_FUNCTION dot_type dot(const val_type& x, - const val_type& y) { - return x * y; - } + static KOKKOS_FORCEINLINE_FUNCTION dot_type dot(const val_type& x, const val_type& y) { return x * y; } }; /// \brief Partial specialization for long double. 
@@ -149,9 +144,7 @@ struct InnerProductSpaceTraits { typedef Kokkos::ArithTraits::mag_type mag_type; typedef val_type dot_type; - static mag_type norm(const val_type& x) { - return Kokkos::ArithTraits::abs(x); - } + static mag_type norm(const val_type& x) { return Kokkos::ArithTraits::abs(x); } static dot_type dot(const val_type& x, const val_type& y) { return x * y; } }; @@ -163,13 +156,8 @@ class InnerProductSpaceTraits> { typedef typename Kokkos::ArithTraits::mag_type mag_type; typedef val_type dot_type; - static KOKKOS_FORCEINLINE_FUNCTION mag_type norm(const val_type& x) { - return Kokkos::ArithTraits::abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION dot_type dot(const val_type& x, - const val_type& y) { - return Kokkos::conj(x) * y; - } + static KOKKOS_FORCEINLINE_FUNCTION mag_type norm(const val_type& x) { return Kokkos::ArithTraits::abs(x); } + static KOKKOS_FORCEINLINE_FUNCTION dot_type dot(const val_type& x, const val_type& y) { return Kokkos::conj(x) * y; } }; /// \brief Partial specialization for std::complex. 
@@ -182,12 +170,8 @@ struct InnerProductSpaceTraits> { typedef typename Kokkos::ArithTraits::mag_type mag_type; typedef val_type dot_type; - static mag_type norm(const val_type& x) { - return Kokkos::ArithTraits::abs(x); - } - static dot_type dot(const val_type& x, const val_type& y) { - return std::conj(x) * y; - } + static mag_type norm(const val_type& x) { return Kokkos::ArithTraits::abs(x); } + static dot_type dot(const val_type& x, const val_type& y) { return std::conj(x) * y; } }; #ifdef HAVE_KOKKOSKERNELS_QUADMATH @@ -203,9 +187,7 @@ struct InnerProductSpaceTraits<__float128> { typedef typename Kokkos::ArithTraits::mag_type mag_type; typedef val_type dot_type; - static mag_type norm(const val_type& x) { - return Kokkos::ArithTraits::abs(x); - } + static mag_type norm(const val_type& x) { return Kokkos::ArithTraits::abs(x); } static dot_type dot(const val_type& x, const val_type& y) { return x * y; } }; @@ -232,9 +214,7 @@ struct InnerProductSpaceTraits { typedef Kokkos::ArithTraits::mag_type mag_type; typedef val_type dot_type; - static mag_type norm(const val_type& x) { - return Kokkos::ArithTraits::abs(x); - } + static mag_type norm(const val_type& x) { return Kokkos::ArithTraits::abs(x); } static dot_type dot(const val_type& x, const val_type& y) { return x * y; } }; @@ -244,34 +224,24 @@ struct InnerProductSpaceTraits { typedef Kokkos::ArithTraits::mag_type mag_type; typedef val_type dot_type; - static mag_type norm(const val_type& x) { - return Kokkos::ArithTraits::abs(x); - } + static mag_type norm(const val_type& x) { return Kokkos::ArithTraits::abs(x); } static dot_type dot(const val_type& x, const val_type& y) { return x * y; } }; #endif // HAVE_KOKKOS_QD template -KOKKOS_INLINE_FUNCTION void updateDot(ResultType& sum, const InputType1& x, - const InputType2& y) { +KOKKOS_INLINE_FUNCTION void updateDot(ResultType& sum, const InputType1& x, const InputType2& y) { // FIXME (mfh 22 Jan 2020) We should actually pick the type with the // greater 
precision. sum += InnerProductSpaceTraits::dot(x, y); } -KOKKOS_INLINE_FUNCTION void updateDot(double& sum, const double x, - const double y) { - sum += x * y; -} +KOKKOS_INLINE_FUNCTION void updateDot(double& sum, const double x, const double y) { sum += x * y; } -KOKKOS_INLINE_FUNCTION void updateDot(double& sum, const float x, - const float y) { - sum += x * y; -} +KOKKOS_INLINE_FUNCTION void updateDot(double& sum, const float x, const float y) { sum += x * y; } // This exists because complex += complex is not defined. -KOKKOS_INLINE_FUNCTION void updateDot(Kokkos::complex& sum, - const Kokkos::complex x, +KOKKOS_INLINE_FUNCTION void updateDot(Kokkos::complex& sum, const Kokkos::complex x, const Kokkos::complex y) { const auto tmp = Kokkos::conj(x) * y; sum += Kokkos::complex(tmp.real(), tmp.imag()); @@ -280,8 +250,7 @@ KOKKOS_INLINE_FUNCTION void updateDot(Kokkos::complex& sum, // This exists in case people call the overload of KokkosBlas::dot // that takes an output View, and the output View has element type // Kokkos::complex. 
-KOKKOS_INLINE_FUNCTION void updateDot(Kokkos::complex& sum, - const Kokkos::complex x, +KOKKOS_INLINE_FUNCTION void updateDot(Kokkos::complex& sum, const Kokkos::complex x, const Kokkos::complex y) { sum += Kokkos::conj(x) * y; } diff --git a/common/unit_test/Test_Common_AlignPtrTo.hpp b/common/unit_test/Test_Common_AlignPtrTo.hpp index 760cddd5a2..33e7ed542c 100644 --- a/common/unit_test/Test_Common_AlignPtrTo.hpp +++ b/common/unit_test/Test_Common_AlignPtrTo.hpp @@ -60,16 +60,14 @@ KOKKOS_INLINE_FUNCTION T *f1(InPtr p) { template KOKKOS_INLINE_FUNCTION T *f2(InPtr p) { std::uintptr_t ptrVal = reinterpret_cast(p); - return reinterpret_cast((ptrVal + alignof(T) - 1) / alignof(T) * - alignof(T)); + return reinterpret_cast((ptrVal + alignof(T) - 1) / alignof(T) * alignof(T)); } // the way GCC does it (roughly) template KOKKOS_INLINE_FUNCTION T *f3(InPtr p) { std::uintptr_t ptrVal = reinterpret_cast(p); - return reinterpret_cast((ptrVal - uint64_t(1) + alignof(T)) & - -alignof(T)); + return reinterpret_cast((ptrVal - uint64_t(1) + alignof(T)) & -alignof(T)); } // Function to be executed by each team @@ -81,8 +79,7 @@ struct TeamFunction { template KOKKOS_INLINE_FUNCTION void operator()(const Team &team) const { // get an "aligned" pointer to scratch memory - char *shmem = (char *)(team.team_shmem().get_shmem(team.team_size() * - sizeof(double))); + char *shmem = (char *)(team.team_shmem().get_shmem(team.team_size() * sizeof(double))); double *vals; if constexpr (0 == TEST_FN) { vals = f0(shmem); @@ -109,9 +106,7 @@ struct TeamFunction { results_(i) = vals[i]; } - size_t team_shmem_size(int team_size) const { - return team_size * sizeof(double); - } + size_t team_shmem_size(int team_size) const { return team_size * sizeof(double); } Results results_; }; @@ -119,20 +114,18 @@ struct TeamFunction { // use atomic add to set result(i) = i template void test_alignPtrTo() { - using MemorySpace = typename Device::memory_space; - using ExecSpace = typename 
Device::execution_space; - using TestView = Kokkos::View; - using TestPolicy = Kokkos::TeamPolicy; - const int teamSize = TestPolicy(1, Kokkos::AUTO) - .team_size_max(TeamFunction(), - Kokkos::ParallelForTag{}); + using MemorySpace = typename Device::memory_space; + using ExecSpace = typename Device::execution_space; + using TestView = Kokkos::View; + using TestPolicy = Kokkos::TeamPolicy; + const int teamSize = + TestPolicy(1, Kokkos::AUTO).team_size_max(TeamFunction(), Kokkos::ParallelForTag{}); ExecSpace space; TestView results("TestView", teamSize); TestPolicy policy(space, 1, teamSize); - Kokkos::parallel_for("test alignment", policy, - TeamFunction(results)); + Kokkos::parallel_for("test alignment", policy, TeamFunction(results)); int errs; Kokkos::parallel_reduce( diff --git a/common/unit_test/Test_Common_ArithTraits.hpp b/common/unit_test/Test_Common_ArithTraits.hpp index 8c493a3666..73a4ebfefe 100644 --- a/common/unit_test/Test_Common_ArithTraits.hpp +++ b/common/unit_test/Test_Common_ArithTraits.hpp @@ -42,8 +42,7 @@ } #if 0 -#define TRACE() \ - Kokkos::printf("%s:%s:%d: Trace\n", __FILE__, __func__, __LINE__); +#define TRACE() Kokkos::printf("%s:%s:%d: Trace\n", __FILE__, __func__, __LINE__); #else #define TRACE() #endif @@ -133,8 +132,7 @@ class ArithTraitsTesterBase { /// \brief Combine two intermediate reduction results into \c dst. /// /// Subclasses need not and must not override this method. - KOKKOS_INLINE_FUNCTION void join(value_type& dst, - const value_type& src) const { + KOKKOS_INLINE_FUNCTION void join(value_type& dst, const value_type& src) const { dst = dst && src; // dst = 1; } @@ -157,8 +155,7 @@ class ArithTraitsTesterBase { /// far. On output: The result of the tests run in this method. /// The result of more than one test is the logical AND of each /// test's result. 
- KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, - value_type& dst) const { + KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const { TRACE(); typedef Kokkos::ArithTraits AT; (void)iwork; // not using this argument @@ -293,14 +290,12 @@ class ArithTraitsTesterBase { } if (AT::is_integer != std::numeric_limits::is_integer) { - out << "AT::is_integer != std::numeric_limits::is_integer" - << endl; + out << "AT::is_integer != std::numeric_limits::is_integer" << endl; FAILURE(); } if (AT::is_exact != std::numeric_limits::is_exact) { - out << "AT::is_exact != std::numeric_limits::is_exact" - << endl; + out << "AT::is_exact != std::numeric_limits::is_exact" << endl; FAILURE(); } @@ -354,11 +349,9 @@ class ArithTraitsTesterBase { if (AT::has_infinity) { // Compiler intrinsic casts from inf of type half_t / bhalf_t to inf // of type float in CUDA, SYCL and HIP do not work yet. -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_SYCL) || \ - defined(KOKKOS_ENABLE_HIP) +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_HIP) namespace KE = Kokkos::Experimental; - if constexpr (!std::is_same::value && - !std::is_same::value) { + if constexpr (!std::is_same::value && !std::is_same::value) { #else { #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_SYCL || KOKKOS_ENABLE_HIP @@ -396,10 +389,8 @@ class ArithTraitsTesterBase { /// implements transcendental functions, but the specific tests that /// are run will depend on \c ScalarType. template ::value ? 1 : 0)> -class ArithTraitsTesterTranscendentalBase - : public ArithTraitsTesterBase { + const int has_transcendentals = (HasTranscendentals::value ? 1 : 0)> +class ArithTraitsTesterTranscendentalBase : public ArithTraitsTesterBase { private: //! The base class of this class. typedef ArithTraitsTesterBase base_type; @@ -413,8 +404,7 @@ class ArithTraitsTesterTranscendentalBase /// \brief The "parallel for" part of the reduction. 
/// /// See comments of ArithTraitsTesterBase's operator(). - KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, - value_type& dst) const; + KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const; //! Constructor (does nothing, but marked as device function). KOKKOS_INLINE_FUNCTION ArithTraitsTesterTranscendentalBase(); @@ -445,8 +435,7 @@ class ArithTraitsTesterTranscendentalBase //! Constructor (does nothing, but marked as device function). KOKKOS_INLINE_FUNCTION ArithTraitsTesterTranscendentalBase() {} - KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, - value_type& dst) const { + KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const { TRACE(); // typedef Kokkos::ArithTraits AT; (void)iwork; // forestall compiler warning for unused variable @@ -524,8 +513,7 @@ class ArithTraitsTesterTranscendentalBase //! Constructor (does nothing, but marked as device function). KOKKOS_INLINE_FUNCTION ArithTraitsTesterTranscendentalBase() {} - KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, - value_type& dst) const { + KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const { TRACE(); typedef Kokkos::ArithTraits AT; (void)iwork; // forestall compiler warning for unused variable @@ -666,8 +654,7 @@ class ArithTraitsTesterTranscendentalBase if (AT::is_complex) { const ScalarType val = two; //(two.real(), two.real()); if (!equal(AT::conj(AT::exp(val)), AT::exp(AT::conj(val)))) { - Kokkos::printf( - "AT::conj(exp(complex(2,2))) != AT::exp(conj(complex(2,2)))\n"); + Kokkos::printf("AT::conj(exp(complex(2,2))) != AT::exp(conj(complex(2,2)))\n"); FAILURE(); } } @@ -685,13 +672,11 @@ class ArithTraitsTesterTranscendentalBase const auto val_sin = AT::sin(val); const auto val_cos = AT::cos(val); if (!equal(val_sin * val_sin + val_cos * val_cos, one)) { - Kokkos::printf( - "AT(complex):: sin(val)*sin(val) + cos(val)*cos(val) != 1\n"); + Kokkos::printf("AT(complex):: sin(val)*sin(val) + 
cos(val)*cos(val) != 1\n"); FAILURE(); } if (!equal(val_sin / val_cos, AT::tan(val))) { - Kokkos::printf( - "AT(complex):: sin(val)/cos(val) != AT(real)::tan(val)\n"); + Kokkos::printf("AT(complex):: sin(val)/cos(val) != AT(real)::tan(val)\n"); FAILURE(); } } else { @@ -788,8 +773,7 @@ class ArithTraitsTesterTranscendentalBase if (!AT::is_complex) { result = AT::pow(three, three); if (result != twentySeven) { - out << "AT::pow (three, three) = " << result - << " != twentySeven = " << twentySeven << endl; + out << "AT::pow (three, three) = " << result << " != twentySeven = " << twentySeven << endl; FAILURE(); } } @@ -798,20 +782,17 @@ class ArithTraitsTesterTranscendentalBase if (AT::is_signed && !AT::is_complex) { result = AT::pow(-three, one); if (result != -three) { - out << "AT::pow (-three, one) = " << result << " != -three = " << -three - << endl; + out << "AT::pow (-three, one) = " << result << " != -three = " << -three << endl; FAILURE(); } result = AT::pow(-three, two); if (result != nine) { - out << "AT::pow (-three, two) = " << result << " != nine = " << nine - << endl; + out << "AT::pow (-three, two) = " << result << " != nine = " << nine << endl; FAILURE(); } result = AT::pow(-three, three); if (result != -twentySeven) { - out << "AT::pow (-three, three) = " << result - << " != -twentySeven = " << twentySeven << endl; + out << "AT::pow (-three, three) = " << result << " != -twentySeven = " << twentySeven << endl; FAILURE(); } } @@ -877,8 +858,7 @@ class ArithTraitsTesterTranscendentalBase if (AT::is_complex) { const ScalarType val = two; //(two.real(), two.real()); if (!equal(AT::conj(AT::exp(val)), AT::exp(AT::conj(val)))) { - Kokkos::printf( - "AT::conj(exp(complex(2,0))) != AT::exp(conj(complex(2,0)))\n"); + Kokkos::printf("AT::conj(exp(complex(2,0))) != AT::exp(conj(complex(2,0)))\n"); FAILURE(); } } @@ -896,13 +876,11 @@ class ArithTraitsTesterTranscendentalBase const auto val_sin = AT::sin(val); const auto val_cos = AT::cos(val); if 
(!equal(val_sin * val_sin + val_cos * val_cos, one)) { - Kokkos::printf( - "AT(complex):: sin(val)*sin(val) + cos(val)*cos(val) != 1\n"); + Kokkos::printf("AT(complex):: sin(val)*sin(val) + cos(val)*cos(val) != 1\n"); FAILURE(); } if (!equal(val_sin / val_cos, AT::tan(val))) { - Kokkos::printf( - "AT(complex):: sin(val)/cos(val) != AT(real)::tan(val)\n"); + Kokkos::printf("AT(complex):: sin(val)/cos(val) != AT(real)::tan(val)\n"); FAILURE(); } } else { @@ -956,10 +934,8 @@ class ArithTraitsTesterTranscendentalBase /// Some tests will be executed whether or not ScalarType is /// complex, but the specific tests that are run will depend on /// ScalarType. -template ::is_complex> -class ArithTraitsTesterComplexBase - : public ArithTraitsTesterTranscendentalBase { +template ::is_complex> +class ArithTraitsTesterComplexBase : public ArithTraitsTesterTranscendentalBase { private: //! The base class of this class. typedef ArithTraitsTesterTranscendentalBase base_type; @@ -973,8 +949,7 @@ class ArithTraitsTesterComplexBase /// \brief The "parallel for" part of the reduction. /// /// See comments of ArithTraitsTesterBase's operator(). - KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, - value_type& dst) const; + KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const; //! Constructor (does nothing, but marked as device function). KOKKOS_INLINE_FUNCTION ArithTraitsTesterComplexBase(); @@ -1004,8 +979,7 @@ class ArithTraitsTesterComplexBase //! Constructor (does nothing, but marked as device function). 
KOKKOS_INLINE_FUNCTION ArithTraitsTesterComplexBase() {} - KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, - value_type& dst) const { + KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const { TRACE(); typedef Kokkos::ArithTraits AT; (void)iwork; // forestall compiler warning for unused variable @@ -1029,9 +1003,9 @@ class ArithTraitsTesterComplexBase } #endif // KOKKOS_HALF_T_IS_FLOAT - if (AT::is_complex) { - FAILURE(); - } + if (AT::is_complex) { + FAILURE(); + } // Call the base class' implementation. Every subclass' // implementation of operator() must do this, in order to include @@ -1090,8 +1064,7 @@ class ArithTraitsTesterComplexBase //! Constructor (does nothing, but marked as device function). KOKKOS_INLINE_FUNCTION ArithTraitsTesterComplexBase() {} - KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, - value_type& dst) const { + KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const { TRACE(); typedef Kokkos::ArithTraits AT; (void)iwork; // forestall compiler warning for unused variable @@ -1109,8 +1082,7 @@ class ArithTraitsTesterComplexBase const ScalarType onePlusOne(one, one); // Test conjugation. - if (AT::conj(oneMinusOne) != onePlusOne || - AT::conj(onePlusOne) != oneMinusOne) { + if (AT::conj(oneMinusOne) != onePlusOne || AT::conj(onePlusOne) != oneMinusOne) { FAILURE(); } @@ -1178,16 +1150,12 @@ class ArithTraitsTesterComplexBase /// (testHost()). The device-based test is a reduction over redundant /// executions of the test. All redundant executions must return /// '1' (passed). -template ::is_exact> +template ::is_exact> class ArithTraitsTesterFloatingPointBase - : public ArithTraitsTesterComplexBase< - ScalarType, DeviceType, Kokkos::ArithTraits::is_complex> { + : public ArithTraitsTesterComplexBase::is_complex> { private: //! The base class of this class. 
- typedef ArithTraitsTesterComplexBase< - ScalarType, DeviceType, Kokkos::ArithTraits::is_complex> - base_type; + typedef ArithTraitsTesterComplexBase::is_complex> base_type; public: typedef DeviceType execution_space; @@ -1198,8 +1166,7 @@ class ArithTraitsTesterFloatingPointBase /// \brief The "parallel for" part of the reduction. /// /// See comments of ArithTraitsTesterBase's operator(). - KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, - value_type& dst) const; + KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const; protected: virtual int testHostImpl(std::ostream& out) const; @@ -1211,13 +1178,10 @@ class ArithTraitsTesterFloatingPointBase // template class ArithTraitsTesterFloatingPointBase - : public ArithTraitsTesterComplexBase< - ScalarType, DeviceType, Kokkos::ArithTraits::is_complex> { + : public ArithTraitsTesterComplexBase::is_complex> { private: //! The base class of this class. - typedef ArithTraitsTesterComplexBase< - ScalarType, DeviceType, Kokkos::ArithTraits::is_complex> - base_type; + typedef ArithTraitsTesterComplexBase::is_complex> base_type; public: typedef typename DeviceType::execution_space execution_space; @@ -1228,8 +1192,7 @@ class ArithTraitsTesterFloatingPointBase //! Constructor (does nothing, but marked as device function). KOKKOS_INLINE_FUNCTION ArithTraitsTesterFloatingPointBase() {} - KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, - value_type& dst) const { + KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const { TRACE(); typedef Kokkos::ArithTraits AT; (void)iwork; // forestall compiler warning for unused variable @@ -1242,11 +1205,9 @@ class ArithTraitsTesterFloatingPointBase // Compiler intrinsic casts from nan of type half_t / bhalf_t to nan // of type float in CUDA, SYCL and HIP do not work yet. 
-#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_SYCL) || \ - defined(KOKKOS_ENABLE_HIP) +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_HIP) namespace KE = Kokkos::Experimental; - if constexpr (!std::is_same::value && - !std::is_same::value) { + if constexpr (!std::is_same::value && !std::is_same::value) { #else { #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_SYCL || KOKKOS_ENABLE_HIP @@ -1267,8 +1228,7 @@ class ArithTraitsTesterFloatingPointBase Kokkos::printf("1 is Inf\n"); FAILURE(); } -#if defined(KOKKOS_ENABLE_SYCL) || \ - defined(KOKKOS_ENABLE_HIP) // FIXME_SYCL, FIXME_HIP +#if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_HIP) // FIXME_SYCL, FIXME_HIP if constexpr (!std::is_same_v) { if (AT::isNan(zero)) { Kokkos::printf("0 is NaN\n"); @@ -1377,13 +1337,10 @@ class ArithTraitsTesterFloatingPointBase // template class ArithTraitsTesterFloatingPointBase - : public ArithTraitsTesterComplexBase< - ScalarType, DeviceType, Kokkos::ArithTraits::is_complex> { + : public ArithTraitsTesterComplexBase::is_complex> { private: //! The base class of this class. - typedef ArithTraitsTesterComplexBase< - ScalarType, DeviceType, Kokkos::ArithTraits::is_complex> - base_type; + typedef ArithTraitsTesterComplexBase::is_complex> base_type; public: typedef typename DeviceType::execution_space execution_space; @@ -1394,8 +1351,7 @@ class ArithTraitsTesterFloatingPointBase //! Constructor (does nothing, but marked as device function). KOKKOS_INLINE_FUNCTION ArithTraitsTesterFloatingPointBase() {} - KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, - value_type& dst) const { + KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const { TRACE(); typedef Kokkos::ArithTraits AT; (void)iwork; // forestall compiler warning for unused variable @@ -1464,8 +1420,7 @@ class ArithTraitsTesterFloatingPointBase /// executions of the test. All redundant executions must return /// '1' (passed). 
template -class ArithTraitsTester - : public ArithTraitsTesterFloatingPointBase { +class ArithTraitsTester : public ArithTraitsTesterFloatingPointBase { public: typedef typename DeviceType::execution_space execution_space; typedef typename execution_space::size_type size_type; @@ -1491,11 +1446,9 @@ int testArithTraitsOnDevice(std::ostream& out, const int verbose) { using std::endl; typedef ArithTraitsTester functor_type; int success = 1; // output argument of parallel_reduce - Kokkos::parallel_reduce("KokkosKernels::Common::Test::ArithTraitsOnDevice", 1, - functor_type(), success); + Kokkos::parallel_reduce("KokkosKernels::Common::Test::ArithTraitsOnDevice", 1, functor_type(), success); if (success) { - if (verbose) - out << Kokkos::ArithTraits::name() << " passed" << endl; + if (verbose) out << Kokkos::ArithTraits::name() << " passed" << endl; } else { out << Kokkos::ArithTraits::name() << " FAILED" << endl; } @@ -1517,8 +1470,7 @@ int testArithTraitsOnHost(std::ostream& out, const int verbose) { const int localSuccess = f.testHost(out); if (localSuccess) { - if (verbose) - out << Kokkos::ArithTraits::name() << " passed" << endl; + if (verbose) out << Kokkos::ArithTraits::name() << " passed" << endl; } else { out << Kokkos::ArithTraits::name() << " FAILED" << endl; } @@ -1558,8 +1510,7 @@ int runAllArithTraitsDeviceTests(std::ostream& out, const int verbose) { success = success && curSuccess; curSuccess = testArithTraitsOnDevice(out, verbose); success = success && curSuccess; - curSuccess = - testArithTraitsOnDevice(out, verbose); + curSuccess = testArithTraitsOnDevice(out, verbose); success = success && curSuccess; curSuccess = testArithTraitsOnDevice(out, verbose); success = success && curSuccess; @@ -1587,8 +1538,7 @@ int runAllArithTraitsDeviceTests(std::ostream& out, const int verbose) { success = success && curSuccess; curSuccess = testArithTraitsOnDevice(out, verbose); success = success && curSuccess; - curSuccess = - testArithTraitsOnDevice(out, verbose); 
+ curSuccess = testArithTraitsOnDevice(out, verbose); // // Built-in real floating-point types @@ -1596,10 +1546,8 @@ int runAllArithTraitsDeviceTests(std::ostream& out, const int verbose) { #if defined(KOKKOS_HALF_T_IS_FLOAT) TRACE(); - success = success && curSuccess; - curSuccess = - testArithTraitsOnDevice( - out, verbose); + success = success && curSuccess; + curSuccess = testArithTraitsOnDevice(out, verbose); #endif // KOKKOS_HALF_T_IS_FLOAT success = success && curSuccess; curSuccess = testArithTraitsOnDevice(out, verbose); @@ -1610,12 +1558,10 @@ int runAllArithTraitsDeviceTests(std::ostream& out, const int verbose) { // Kokkos' complex floating-point types // - success = success && curSuccess; - curSuccess = - testArithTraitsOnDevice, DeviceType>(out, verbose); success = success && curSuccess; - curSuccess = testArithTraitsOnDevice, DeviceType>( - out, verbose); + curSuccess = testArithTraitsOnDevice, DeviceType>(out, verbose); + success = success && curSuccess; + curSuccess = testArithTraitsOnDevice, DeviceType>(out, verbose); return success && curSuccess; } @@ -1682,8 +1628,7 @@ int runAllArithTraitsHostTests(std::ostream& out, const int verbose) { success = success && curSuccess; curSuccess = testArithTraitsOnHost(out, verbose); success = success && curSuccess; - curSuccess = - testArithTraitsOnHost(out, verbose); + curSuccess = testArithTraitsOnHost(out, verbose); // // Built-in real and complex floating-point types @@ -1693,20 +1638,16 @@ int runAllArithTraitsHostTests(std::ostream& out, const int verbose) { curSuccess = testArithTraitsOnHost(out, verbose); success = success && curSuccess; curSuccess = testArithTraitsOnHost(out, verbose); -#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && \ - !defined(KOKKOS_ENABLE_SYCL) +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_SYCL) // This would spill tons of warnings about host device stuff otherwise success = success && curSuccess; curSuccess = 
testArithTraitsOnHost(out, verbose); success = success && curSuccess; - curSuccess = - testArithTraitsOnHost, DeviceType>(out, verbose); - success = success && curSuccess; - curSuccess = - testArithTraitsOnHost, DeviceType>(out, verbose); + curSuccess = testArithTraitsOnHost, DeviceType>(out, verbose); + success = success && curSuccess; + curSuccess = testArithTraitsOnHost, DeviceType>(out, verbose); success = success && curSuccess; - curSuccess = testArithTraitsOnHost, DeviceType>( - out, verbose); + curSuccess = testArithTraitsOnHost, DeviceType>(out, verbose); #endif // // Kokkos' complex floating-point types @@ -1715,15 +1656,12 @@ int runAllArithTraitsHostTests(std::ostream& out, const int verbose) { #if defined(KOKKOS_HALF_T_IS_FLOAT) success = success && curSuccess; TRACE(); - curSuccess = testArithTraitsOnHost( - out, verbose); + curSuccess = testArithTraitsOnHost(out, verbose); #endif // KOKKOS_HALF_T_IS_FLOAT - success = success && curSuccess; - curSuccess = - testArithTraitsOnHost, DeviceType>(out, verbose); - success = success && curSuccess; - curSuccess = - testArithTraitsOnHost, DeviceType>(out, verbose); + success = success && curSuccess; + curSuccess = testArithTraitsOnHost, DeviceType>(out, verbose); + success = success && curSuccess; + curSuccess = testArithTraitsOnHost, DeviceType>(out, verbose); // success = success && curSuccess; curSuccess = // testArithTraitsOnHost, DeviceType> (out, // verbose); diff --git a/common/unit_test/Test_Common_Error.hpp b/common/unit_test/Test_Common_Error.hpp index 375f75b5ff..139231d63f 100644 --- a/common/unit_test/Test_Common_Error.hpp +++ b/common/unit_test/Test_Common_Error.hpp @@ -20,8 +20,7 @@ #include "KokkosKernels_Error.hpp" void test_kokkoskernels_throw() { - const std::string my_throw_msg = - "Testing Kokkos Kernels' throw_runtime_exception."; + const std::string my_throw_msg = "Testing Kokkos Kernels' throw_runtime_exception."; try { KokkosKernels::Impl::throw_runtime_exception(my_throw_msg); } catch 
(const std::runtime_error& e) { diff --git a/common/unit_test/Test_Common_Iota.hpp b/common/unit_test/Test_Common_Iota.hpp index af3b6502bf..ee1e33fda8 100644 --- a/common/unit_test/Test_Common_Iota.hpp +++ b/common/unit_test/Test_Common_Iota.hpp @@ -76,13 +76,11 @@ void test_iota_rank() { template void test_iota_non_const_value_type() { - static_assert( - std::is_same_v::non_const_value_type, T>, - "Iota's non-const value type should be same as non-const type provided"); - static_assert( - std::is_same_v::non_const_value_type, T>, - "Iota's non-const value type should be same as non-const version of " - "const type provided"); + static_assert(std::is_same_v::non_const_value_type, T>, + "Iota's non-const value type should be same as non-const type provided"); + static_assert(std::is_same_v::non_const_value_type, T>, + "Iota's non-const value type should be same as non-const version of " + "const type provided"); } template @@ -98,10 +96,8 @@ void test_iota_subview() { template void test_is_iota() { - static_assert(KokkosKernels::Impl::is_iota_v>, - "Iota should be an Iota"); - static_assert(!KokkosKernels::Impl::is_iota_v, - "int should not be an Iota"); + static_assert(KokkosKernels::Impl::is_iota_v>, "Iota should be an Iota"); + static_assert(!KokkosKernels::Impl::is_iota_v, "int should not be an Iota"); } template diff --git a/common/unit_test/Test_Common_LowerBound.hpp b/common/unit_test/Test_Common_LowerBound.hpp index 23574087ff..d471801a30 100644 --- a/common/unit_test/Test_Common_LowerBound.hpp +++ b/common/unit_test/Test_Common_LowerBound.hpp @@ -21,8 +21,7 @@ #include template -size_t std_lower_bound(const std::vector &haystack, - const Ordinal needle) { +size_t std_lower_bound(const std::vector &haystack, const Ordinal needle) { const auto it = std::lower_bound(haystack.begin(), haystack.end(), needle); return it - haystack.begin(); } @@ -33,9 +32,7 @@ struct ThreadLowerBoundFunctor { using hv_value_type = typename HaystackView::non_const_value_type; 
using hv_size_type = typename HaystackView::size_type; - ThreadLowerBoundFunctor(const hv_size_type &expected, - const HaystackView &haystack, - const hv_value_type &needle) + ThreadLowerBoundFunctor(const hv_size_type &expected, const HaystackView &haystack, const hv_value_type &needle) : expected_(expected), haystack_(haystack), needle_(needle) {} KOKKOS_INLINE_FUNCTION @@ -43,8 +40,7 @@ struct ThreadLowerBoundFunctor { if (0 == i) { hv_size_type idx = KokkosKernels::lower_bound_thread(haystack_, needle_); if (idx != expected_) { - Kokkos::printf("%s:%d thread %d expected %d got %d\n", __FILE__, - __LINE__, int(i), int(expected_), int(idx)); + Kokkos::printf("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, int(i), int(expected_), int(idx)); ++lerrCount; } } @@ -56,13 +52,11 @@ struct ThreadLowerBoundFunctor { }; template -void test_lower_bound_thread(const std::vector &_haystack, - const T &_needle) { +void test_lower_bound_thread(const std::vector &_haystack, const T &_needle) { using execution_space = typename Device::execution_space; using Policy = Kokkos::RangePolicy; using view_t = Kokkos::View; - using u_const_view_t = Kokkos::View>; + using u_const_view_t = Kokkos::View>; using size_type = typename u_const_view_t::size_type; // get expected value @@ -76,9 +70,7 @@ void test_lower_bound_thread(const std::vector &_haystack, // test lower_bound search int errCount; // run a single thread - Kokkos::parallel_reduce(Policy(0, 1), - ThreadLowerBoundFunctor(expected, haystack, _needle), - errCount); + Kokkos::parallel_reduce(Policy(0, 1), ThreadLowerBoundFunctor(expected, haystack, _needle), errCount); EXPECT_EQ(0, errCount); } @@ -89,18 +81,14 @@ struct TeamLowerBoundFunctor { using hv_value_type = typename HaystackView::non_const_value_type; using hv_size_type = typename HaystackView::size_type; - TeamLowerBoundFunctor(const hv_size_type &expected, - const HaystackView &haystack, - const hv_value_type &needle) + TeamLowerBoundFunctor(const 
hv_size_type &expected, const HaystackView &haystack, const hv_value_type &needle) : expected_(expected), haystack_(haystack), needle_(needle) {} - KOKKOS_INLINE_FUNCTION void operator()(const Member &handle, - int &lerrCount) const { - hv_size_type idx = - KokkosKernels::lower_bound_team(handle, haystack_, needle_); + KOKKOS_INLINE_FUNCTION void operator()(const Member &handle, int &lerrCount) const { + hv_size_type idx = KokkosKernels::lower_bound_team(handle, haystack_, needle_); if (idx != expected_) { - Kokkos::printf("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, - int(handle.team_rank()), int(expected_), int(idx)); + Kokkos::printf("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, int(handle.team_rank()), + int(expected_), int(idx)); ++lerrCount; } } @@ -116,8 +104,7 @@ void test_lower_bound_team(const std::vector &_haystack, const T _needle) { using Policy = Kokkos::TeamPolicy; using Member = typename Policy::member_type; using view_t = Kokkos::View; - using u_const_view_t = Kokkos::View>; + using u_const_view_t = Kokkos::View>; using size_type = typename u_const_view_t::size_type; // get expected value @@ -130,13 +117,10 @@ void test_lower_bound_team(const std::vector &_haystack, const T _needle) { // test lower_bound search const int leagueSize = 1; - const int teamSize = - KokkosKernels::Impl::kk_is_gpu_exec_space() ? 64 : 1; + const int teamSize = KokkosKernels::Impl::kk_is_gpu_exec_space() ? 
64 : 1; int errCount; - Kokkos::parallel_reduce( - Policy(leagueSize, teamSize), - TeamLowerBoundFunctor(expected, haystack, _needle), - errCount); + Kokkos::parallel_reduce(Policy(leagueSize, teamSize), + TeamLowerBoundFunctor(expected, haystack, _needle), errCount); EXPECT_EQ(0, errCount); } @@ -218,38 +202,31 @@ void test_lower_bound() { } } -#define EXECUTE_TEST(T, DEVICE) \ - TEST_F(TestCategory, common##_##lower_bound##_##T##_##DEVICE) { \ - test_lower_bound(); \ - } +#define EXECUTE_TEST(T, DEVICE) \ + TEST_F(TestCategory, common##_##lower_bound##_##T##_##DEVICE) { test_lower_bound(); } #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(int64_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_FLOAT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(float, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_DOUBLE)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, TestDevice) #endif diff --git a/common/unit_test/Test_Common_PrintConfiguration.hpp 
b/common/unit_test/Test_Common_PrintConfiguration.hpp index 6638c6e398..4f59a8857b 100644 --- a/common/unit_test/Test_Common_PrintConfiguration.hpp +++ b/common/unit_test/Test_Common_PrintConfiguration.hpp @@ -56,8 +56,6 @@ void testPrintConfiguration() { check_print_configuration(out); } -TEST_F(TestCategory, common_print_configuration) { - testPrintConfiguration(); -} +TEST_F(TestCategory, common_print_configuration) { testPrintConfiguration(); } #endif // KOKKOSKERNELS_PRINTCONFIGURATIONTEST_HPP diff --git a/common/unit_test/Test_Common_Sorting.hpp b/common/unit_test/Test_Common_Sorting.hpp index e93a9d0939..30623a8691 100644 --- a/common/unit_test/Test_Common_Sorting.hpp +++ b/common/unit_test/Test_Common_Sorting.hpp @@ -33,8 +33,7 @@ // Then prefix-sum into randomOffsets. // This simulates a CRS rowmap or other batched sorting scenario template -size_t generateRandomOffsets(OrdView randomCounts, OrdView randomOffsets, - size_t n, size_t avg) { +size_t generateRandomOffsets(OrdView randomCounts, OrdView randomOffsets, size_t n, size_t avg) { srand(54321); auto countsHost = Kokkos::create_mirror_view(randomCounts); size_t total = 0; @@ -47,8 +46,7 @@ size_t generateRandomOffsets(OrdView randomCounts, OrdView randomOffsets, } Kokkos::deep_copy(randomCounts, countsHost); Kokkos::deep_copy(randomOffsets, randomCounts); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( - n, randomOffsets); + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(n, randomOffsets); return total; } @@ -87,8 +85,7 @@ double getRandom() { template <> Coordinates getRandom() { - return Coordinates(getRandom(), getRandom(), - getRandom()); + return Coordinates(getRandom(), getRandom(), getRandom()); } // Specialize for Kokkos::complex, with the real and imaginary parts different @@ -99,9 +96,7 @@ struct kvHash { template struct kvHash> { - Kokkos::complex operator()(const Key& k) { - return Kokkos::complex(3 * k + 4, k - 10.4); - } + Kokkos::complex operator()(const Key& k) { return 
Kokkos::complex(3 * k + 4, k - 10.4); } }; template @@ -133,14 +128,12 @@ struct TestSerialRadixFunctor { using Key = typename KeyView::value_type; using UnsignedKey = typename std::make_unsigned::type; - TestSerialRadixFunctor(KeyView& keys_, KeyView& keysAux_, OrdView& counts_, - OrdView& offsets_) + TestSerialRadixFunctor(KeyView& keys_, KeyView& keysAux_, OrdView& counts_, OrdView& offsets_) : keys(keys_), keysAux(keysAux_), counts(counts_), offsets(offsets_) {} KOKKOS_INLINE_FUNCTION void operator()(const int i) const { int off = offsets(i); - KokkosKernels::SerialRadixSort( - (UnsignedKey*)keys.data() + off, (UnsignedKey*)keysAux.data() + off, - counts(i)); + KokkosKernels::SerialRadixSort((UnsignedKey*)keys.data() + off, + (UnsignedKey*)keysAux.data() + off, counts(i)); } KeyView keys; KeyView keysAux; @@ -155,20 +148,14 @@ struct TestSerialRadix2Functor { using UnsignedKey = typename std::make_unsigned::type; using Value = typename ValView::value_type; - TestSerialRadix2Functor(KeyView& keys_, KeyView& keysAux_, ValView& values_, - ValView& valuesAux_, OrdView& counts_, + TestSerialRadix2Functor(KeyView& keys_, KeyView& keysAux_, ValView& values_, ValView& valuesAux_, OrdView& counts_, OrdView& offsets_) - : keys(keys_), - keysAux(keysAux_), - values(values_), - valuesAux(valuesAux_), - counts(counts_), - offsets(offsets_) {} + : keys(keys_), keysAux(keysAux_), values(values_), valuesAux(valuesAux_), counts(counts_), offsets(offsets_) {} KOKKOS_INLINE_FUNCTION void operator()(const int i) const { int off = offsets(i); - KokkosKernels::SerialRadixSort2( - (UnsignedKey*)keys.data() + off, (UnsignedKey*)keysAux.data() + off, - values.data() + off, valuesAux.data() + off, counts(i)); + KokkosKernels::SerialRadixSort2((UnsignedKey*)keys.data() + off, + (UnsignedKey*)keysAux.data() + off, values.data() + off, + valuesAux.data() + off, counts(i)); } KeyView keys; KeyView keysAux; @@ -188,8 +175,7 @@ void testSerialRadixSort(size_t k, size_t subArraySize) { OrdView 
counts("Subarray Sizes", k); OrdView offsets("Subarray Offsets", k); // Generate k sub-array sizes, each with size about 20 - size_t n = generateRandomOffsets(counts, offsets, k, - subArraySize); + size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); KeyView keys("Radix sort testing data", n); fillRandom(keys); // Sort using std::sort on host to do correctness test @@ -198,22 +184,17 @@ void testSerialRadixSort(size_t k, size_t subArraySize) { KeyView keysAux("Radix sort aux data", n); // Run the sorting on device in all sub-arrays in parallel typedef Kokkos::RangePolicy range_policy; - Kokkos::parallel_for( - range_policy(0, k), - TestSerialRadixFunctor(keys, keysAux, counts, offsets)); + Kokkos::parallel_for(range_policy(0, k), TestSerialRadixFunctor(keys, keysAux, counts, offsets)); exec_space().fence(); - auto countsHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); - auto offsetsHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); + auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); + auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); for (size_t i = 0; i < k; i++) { Key* begin = gold.data() + offsetsHost(i); Key* end = begin + countsHost(i); std::sort(begin, end); } // Copy actual result to host and compare - auto keysHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), keys); + auto keysHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), keys); for (size_t i = 0; i < n; i++) { ASSERT_EQ(keysHost(i), gold(i)); } @@ -230,8 +211,7 @@ void testSerialRadixSort2(size_t k, size_t subArraySize) { OrdView counts("Subarray Sizes", k); OrdView offsets("Subarray Offsets", k); // Generate k sub-array sizes, each with size about 20 - size_t n = generateRandomOffsets(counts, offsets, k, - subArraySize); + size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); KeyView keys("Radix test keys", n); ValView 
data("Radix test data", n); // The keys are randomized @@ -243,25 +223,20 @@ void testSerialRadixSort2(size_t k, size_t subArraySize) { // Run the sorting on device in all sub-arrays in parallel typedef Kokkos::RangePolicy range_policy; // Deliberately using a weird number for vector length - Kokkos::parallel_for(range_policy(0, k), - TestSerialRadix2Functor( - keys, keysAux, data, dataAux, counts, offsets)); + Kokkos::parallel_for(range_policy(0, k), TestSerialRadix2Functor( + keys, keysAux, data, dataAux, counts, offsets)); exec_space().fence(); // Sort using std::sort on host to do correctness test - auto countsHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); - auto offsetsHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); + auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); + auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); for (size_t i = 0; i < k; i++) { Key* begin = gold.data() + offsetsHost(i); Key* end = begin + countsHost(i); std::sort(begin, end); } // Copy results to host - auto keysHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), keys); - auto dataHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), data); + auto keysHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), keys); + auto dataHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), data); // Make sure keys are sorted exactly (stability of sort doesn't matter) for (size_t i = 0; i < n; i++) { ASSERT_EQ(keysHost(i), gold(i)); @@ -283,8 +258,7 @@ struct TestTeamBitonicFunctor { template KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const { int i = t.league_rank(); - KokkosKernels::TeamBitonicSort( - values.data() + offsets(i), counts(i), t); + KokkosKernels::TeamBitonicSort(values.data() + offsets(i), counts(i), t); } ValView values; @@ -297,15 +271,14 @@ struct TestTeamBitonic2Functor { typedef typename 
KeyView::value_type Key; typedef typename ValView::value_type Value; - TestTeamBitonic2Functor(KeyView& keys_, ValView& values_, OrdView& counts_, - OrdView& offsets_) + TestTeamBitonic2Functor(KeyView& keys_, ValView& values_, OrdView& counts_, OrdView& offsets_) : keys(keys_), values(values_), counts(counts_), offsets(offsets_) {} template KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const { int i = t.league_rank(); - KokkosKernels::TeamBitonicSort2( - keys.data() + offsets(i), values.data() + offsets(i), counts(i), t); + KokkosKernels::TeamBitonicSort2(keys.data() + offsets(i), values.data() + offsets(i), + counts(i), t); } KeyView keys; @@ -324,25 +297,21 @@ void testTeamBitonicSort(size_t k, size_t subArraySize) { OrdView counts("Subarray Sizes", k); OrdView offsets("Subarray Offsets", k); // Generate k sub-array sizes, each with size about 20 - size_t n = generateRandomOffsets(counts, offsets, k, - subArraySize); + size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); ValView data("Bitonic sort testing data", n); fillRandom(data); Kokkos::View gold("Host sorted", n); Kokkos::deep_copy(gold, data); // Run the sorting on device in all sub-arrays in parallel - Kokkos::parallel_for( - Kokkos::TeamPolicy(k, Kokkos::AUTO()), - TestTeamBitonicFunctor(data, counts, offsets)); + Kokkos::parallel_for(Kokkos::TeamPolicy(k, Kokkos::AUTO()), + TestTeamBitonicFunctor(data, counts, offsets)); // Copy result to host auto dataHost = Kokkos::create_mirror_view(data); Kokkos::deep_copy(dataHost, data); // Sort using std::sort on host to do correctness test exec_space().fence(); - auto countsHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); - auto offsetsHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); + auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); + auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); for (size_t i = 0; i < k; i++) 
{ Scalar* begin = gold.data() + offsetsHost(i); Scalar* end = begin + countsHost(i); @@ -364,8 +333,7 @@ void testTeamBitonicSort2(size_t k, size_t subArraySize) { OrdView counts("Subarray Sizes", k); OrdView offsets("Subarray Offsets", k); // Generate k sub-array sizes, each with size about 20 - size_t n = generateRandomOffsets(counts, offsets, k, - subArraySize); + size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); KeyView keys("Bitonic test keys", n); ValView data("Bitonic test data", n); // The keys are randomized @@ -375,13 +343,10 @@ void testTeamBitonicSort2(size_t k, size_t subArraySize) { // Run the sorting on device in all sub-arrays in parallel, just using vector // loops Deliberately using a weird number for vector length Kokkos::parallel_for(Kokkos::TeamPolicy(k, Kokkos::AUTO()), - TestTeamBitonic2Functor( - keys, data, counts, offsets)); + TestTeamBitonic2Functor(keys, data, counts, offsets)); exec_space().fence(); - auto countsHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); - auto offsetsHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); + auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); + auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); // Sort using std::sort on host to do correctness test for (size_t i = 0; i < k; i++) { Key* begin = gold.data() + offsetsHost(i); @@ -389,10 +354,8 @@ void testTeamBitonicSort2(size_t k, size_t subArraySize) { std::sort(begin, end); } // Copy results to host - auto keysHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), keys); - auto dataHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), data); + auto keysHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), keys); + auto dataHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), data); // Make sure keys are sorted exactly (stability of sort doesn't matter) for (size_t i = 0; i < 
n; i++) { ASSERT_EQ(keysHost(i), gold(i)); @@ -423,8 +386,7 @@ void testBitonicSort(size_t n) { fillRandom(data); KokkosKernels::bitonicSort(data); int ordered = 1; - Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), - CheckSortedFunctor(data), + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), CheckSortedFunctor(data), Kokkos::Min(ordered)); ASSERT_TRUE(ordered); } @@ -443,10 +405,7 @@ struct CheckOrderedFunctor { template struct CompareDescending { - KOKKOS_INLINE_FUNCTION bool operator()(const Scalar lhs, - const Scalar rhs) const { - return lhs > rhs; - } + KOKKOS_INLINE_FUNCTION bool operator()(const Scalar lhs, const Scalar rhs) const { return lhs > rhs; } }; template @@ -462,15 +421,13 @@ void testBitonicSortDescending() { fillRandom(data); KokkosKernels::bitonicSort(data); int ordered = 1; - Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), - CheckOrderedFunctor(data), + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), CheckOrderedFunctor(data), Kokkos::Min(ordered)); ASSERT_TRUE(ordered); } struct LexCompare { - KOKKOS_INLINE_FUNCTION bool operator()(const Coordinates lhs, - const Coordinates rhs) const { + KOKKOS_INLINE_FUNCTION bool operator()(const Coordinates lhs, const Coordinates rhs) const { if (lhs.x < rhs.x) return true; else if (lhs.x > rhs.x) @@ -497,8 +454,7 @@ void testBitonicSortLexicographic() { fillRandom(data); KokkosKernels::bitonicSort(data); int ordered = 1; - Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), - CheckOrderedFunctor(data), + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), CheckOrderedFunctor(data), Kokkos::Min(ordered)); ASSERT_TRUE(ordered); } @@ -520,8 +476,7 @@ TEST_F(TestCategory, common_serial_radix2) { for (size_t arrayMax = 0; arrayMax < 1000; arrayMax = 1 + 4 * arrayMax) { testSerialRadixSort2(numArrays, arrayMax); testSerialRadixSort2(numArrays, arrayMax); - testSerialRadixSort2>(numArrays, - arrayMax); + testSerialRadixSort2>(numArrays, arrayMax); } } @@ -542,8 +497,7 @@ 
TEST_F(TestCategory, common_team_bitonic2) { for (size_t arrayMax = 0; arrayMax < 10000; arrayMax = 1 + 4 * arrayMax) { testTeamBitonicSort2(numArrays, arrayMax); testTeamBitonicSort2(numArrays, arrayMax); - testTeamBitonicSort2>(numArrays, - arrayMax); + testTeamBitonicSort2>(numArrays, arrayMax); } } diff --git a/common/unit_test/Test_Common_UpperBound.hpp b/common/unit_test/Test_Common_UpperBound.hpp index aace02a738..abd4cf655a 100644 --- a/common/unit_test/Test_Common_UpperBound.hpp +++ b/common/unit_test/Test_Common_UpperBound.hpp @@ -21,8 +21,7 @@ #include template -size_t std_upper_bound(const std::vector &haystack, - const Ordinal needle) { +size_t std_upper_bound(const std::vector &haystack, const Ordinal needle) { const auto it = std::upper_bound(haystack.begin(), haystack.end(), needle); return it - haystack.begin(); } @@ -33,9 +32,7 @@ struct ThreadUpperBoundFunctor { using hv_value_type = typename HaystackView::non_const_value_type; using hv_size_type = typename HaystackView::size_type; - ThreadUpperBoundFunctor(const hv_size_type &expected, - const HaystackView &haystack, - const hv_value_type &needle) + ThreadUpperBoundFunctor(const hv_size_type &expected, const HaystackView &haystack, const hv_value_type &needle) : expected_(expected), haystack_(haystack), needle_(needle) {} KOKKOS_INLINE_FUNCTION @@ -43,8 +40,7 @@ struct ThreadUpperBoundFunctor { if (0 == i) { hv_size_type idx = KokkosKernels::upper_bound_thread(haystack_, needle_); if (idx != expected_) { - Kokkos::printf("%s:%d thread %d expected %d got %d\n", __FILE__, - __LINE__, int(i), int(expected_), int(idx)); + Kokkos::printf("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, int(i), int(expected_), int(idx)); ++lerrCount; } } @@ -56,13 +52,11 @@ struct ThreadUpperBoundFunctor { }; template -void test_upper_bound_thread(const std::vector &_haystack, - const T &_needle) { +void test_upper_bound_thread(const std::vector &_haystack, const T &_needle) { using execution_space = 
typename Device::execution_space; using Policy = Kokkos::RangePolicy; using view_t = Kokkos::View; - using u_const_view_t = Kokkos::View>; + using u_const_view_t = Kokkos::View>; using hv_size_type = typename u_const_view_t::size_type; // get expected value @@ -76,9 +70,7 @@ void test_upper_bound_thread(const std::vector &_haystack, // test upper_bound search int errCount; // run a single thread - Kokkos::parallel_reduce(Policy(0, 1), - ThreadUpperBoundFunctor(expected, haystack, _needle), - errCount); + Kokkos::parallel_reduce(Policy(0, 1), ThreadUpperBoundFunctor(expected, haystack, _needle), errCount); EXPECT_EQ(0, errCount); } @@ -89,18 +81,14 @@ struct TeamUpperBoundFunctor { using hv_value_type = typename HaystackView::non_const_value_type; using hv_size_type = typename HaystackView::size_type; - TeamUpperBoundFunctor(const hv_size_type &expected, - const HaystackView &haystack, - const hv_value_type &needle) + TeamUpperBoundFunctor(const hv_size_type &expected, const HaystackView &haystack, const hv_value_type &needle) : expected_(expected), haystack_(haystack), needle_(needle) {} - KOKKOS_INLINE_FUNCTION void operator()(const Member &handle, - int &lerrCount) const { - hv_size_type idx = - KokkosKernels::upper_bound_team(handle, haystack_, needle_); + KOKKOS_INLINE_FUNCTION void operator()(const Member &handle, int &lerrCount) const { + hv_size_type idx = KokkosKernels::upper_bound_team(handle, haystack_, needle_); if (idx != expected_) { - Kokkos::printf("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, - int(handle.team_rank()), int(expected_), int(idx)); + Kokkos::printf("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, int(handle.team_rank()), + int(expected_), int(idx)); ++lerrCount; } } @@ -116,8 +104,7 @@ void test_upper_bound_team(const std::vector &_haystack, const T _needle) { using Policy = Kokkos::TeamPolicy; using Member = typename Policy::member_type; using view_t = Kokkos::View; - using u_const_view_t = Kokkos::View>; + 
using u_const_view_t = Kokkos::View>; using hv_size_type = typename u_const_view_t::size_type; // get expected value @@ -130,13 +117,10 @@ void test_upper_bound_team(const std::vector &_haystack, const T _needle) { // test upper_bound search const int leagueSize = 1; - const int teamSize = - KokkosKernels::Impl::kk_is_gpu_exec_space() ? 64 : 1; + const int teamSize = KokkosKernels::Impl::kk_is_gpu_exec_space() ? 64 : 1; int errCount; - Kokkos::parallel_reduce( - Policy(leagueSize, teamSize), - TeamUpperBoundFunctor(expected, haystack, _needle), - errCount); + Kokkos::parallel_reduce(Policy(leagueSize, teamSize), + TeamUpperBoundFunctor(expected, haystack, _needle), errCount); EXPECT_EQ(0, errCount); } @@ -209,38 +193,31 @@ void test_upper_bound() { } } -#define EXECUTE_TEST(T, DEVICE) \ - TEST_F(TestCategory, common##_##upper_bound##_##T##_##DEVICE) { \ - test_upper_bound(); \ - } +#define EXECUTE_TEST(T, DEVICE) \ + TEST_F(TestCategory, common##_##upper_bound##_##T##_##DEVICE) { test_upper_bound(); } #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(int64_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_FLOAT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(float, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_DOUBLE)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, TestDevice) #endif diff --git a/common/unit_test/Test_Common_Version.hpp b/common/unit_test/Test_Common_Version.hpp index cb5265cfef..e2a5faeee2 100644 --- a/common/unit_test/Test_Common_Version.hpp +++ b/common/unit_test/Test_Common_Version.hpp @@ -42,8 +42,7 @@ void test_version_info() { static_assert(false, "KOKKOSKERNELS_VERSION_PATCH macro is not defined!"); #endif - static_assert(KOKKOSKERNELS_VERSION == (KOKKOSKERNELS_VERSION_MAJOR * 10000 + - KOKKOSKERNELS_VERSION_MINOR * 100 + + static_assert(KOKKOSKERNELS_VERSION == (KOKKOSKERNELS_VERSION_MAJOR * 10000 + KOKKOSKERNELS_VERSION_MINOR * 100 + KOKKOSKERNELS_VERSION_PATCH)); } diff --git a/common/unit_test/Test_Common_float128.hpp b/common/unit_test/Test_Common_float128.hpp index 846a5ef879..063fd06d80 100644 --- a/common/unit_test/Test_Common_float128.hpp +++ b/common/unit_test/Test_Common_float128.hpp @@ -32,7 +32,7 @@ #include #include -//#include +// #include #include #include @@ -55,9 +55,8 @@ std::ostream& operator<<(std::ostream& out, const __float128& x) { const int numCharPrinted = quadmath_snprintf(buf, bufSize, "%.30Qe", x); if (static_cast(numCharPrinted) >= bufSize) { std::ostringstream os; - os << "Failed to print __float128 value: buffer has " << bufSize - << " characters, but quadmath_snprintf wanted " << numCharPrinted - << " characters!"; + os << "Failed to print __float128 value: buffer has " << bufSize << " characters, but quadmath_snprintf wanted " + << numCharPrinted << " characters!"; throw std::runtime_error(os.str()); } out << buf; @@ -79,8 +78,7 @@ void testfloat128() { << "y = " << y << endl << "z = " << z << endl << "(double) z = " << static_cast(z) << endl - << 
"z - (double) z = " - << (z - static_cast<__float128>(static_cast(z))) << endl; + << "z - (double) z = " << (z - static_cast<__float128>(static_cast(z))) << endl; // FIXME (mfh 04 Sep 2015) The results of printing could depend on // the locale. This works fine for the default locale on my system. @@ -89,8 +87,7 @@ void testfloat128() { os << x; if (os.str() != "1.000000000000000000000000000000e+00") { success = false; - cout << "'_float128 x = 1.0' does not print correctly! It prints as " - << os.str() << "." << endl; + cout << "'_float128 x = 1.0' does not print correctly! It prints as " << os.str() << "." << endl; } } { diff --git a/common/unit_test/Test_Common_set_bit_count.hpp b/common/unit_test/Test_Common_set_bit_count.hpp index 6e2c6e80b6..7b6c996390 100644 --- a/common/unit_test/Test_Common_set_bit_count.hpp +++ b/common/unit_test/Test_Common_set_bit_count.hpp @@ -37,21 +37,17 @@ template struct ppctest { view_type view; typename view_type::non_const_type out_view; - ppctest(view_type view_, typename view_type::non_const_type out_view_) - : view(view_), out_view(out_view_) {} + ppctest(view_type view_, typename view_type::non_const_type out_view_) : view(view_), out_view(out_view_) {} KOKKOS_INLINE_FUNCTION - void operator()(const size_t row) const { - out_view(row) = pop_count(view(row)); - } + void operator()(const size_t row) const { out_view(row) = pop_count(view(row)); } }; template struct ppccheck { view_type view; typename view_type::non_const_type out_view; - ppccheck(view_type view_, typename view_type::non_const_type out_view_) - : view(view_), out_view(out_view_) {} + ppccheck(view_type view_, typename view_type::non_const_type out_view_) : view(view_), out_view(out_view_) {} KOKKOS_INLINE_FUNCTION void operator()(const size_t row) const { @@ -69,8 +65,7 @@ view_type get_array_bit_count(view_type view) { typename view_type::non_const_type out_view("out", view.extent(0)); typedef Kokkos::RangePolicy my_exec_space; - 
Kokkos::parallel_for("KokkosKernels::Common::Test::GetArrayBitCount", - my_exec_space(0, view.extent(0)), + Kokkos::parallel_for("KokkosKernels::Common::Test::GetArrayBitCount", my_exec_space(0, view.extent(0)), ppctest(view, out_view)); Kokkos::fence(); return out_view; @@ -81,8 +76,7 @@ view_type check_array_bit_count(view_type view) { typename view_type::non_const_type out_view("out", view.extent(0)); typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_for("KokkosKernels::Common::Test::CheckArrayBitCount", - my_exec_space(0, view.extent(0)), + Kokkos::parallel_for("KokkosKernels::Common::Test::CheckArrayBitCount", my_exec_space(0, view.extent(0)), ppccheck(view, out_view)); Kokkos::fence(); return out_view; @@ -92,8 +86,7 @@ template struct ffstest { view_type view; typename view_type::non_const_type out_view; - ffstest(view_type view_, typename view_type::non_const_type out_view_) - : view(view_), out_view(out_view_) {} + ffstest(view_type view_, typename view_type::non_const_type out_view_) : view(view_), out_view(out_view_) {} KOKKOS_INLINE_FUNCTION void operator()(const size_t row) const { @@ -108,8 +101,7 @@ template struct ffscheck { view_type view; typename view_type::non_const_type out_view; - ffscheck(view_type view_, typename view_type::non_const_type out_view_) - : view(view_), out_view(out_view_) {} + ffscheck(view_type view_, typename view_type::non_const_type out_view_) : view(view_), out_view(out_view_) {} KOKKOS_INLINE_FUNCTION void operator()(const size_t row) const { @@ -130,8 +122,7 @@ view_type get_ffs(view_type view) { typename view_type::non_const_type out_view("out", view.extent(0)); typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_for("KokkosKernels::Common::Test::GetFFS", - my_exec_space(0, view.extent(0)), + Kokkos::parallel_for("KokkosKernels::Common::Test::GetFFS", my_exec_space(0, view.extent(0)), ffstest(view, out_view)); Kokkos::fence(); return out_view; @@ -142,8 +133,7 @@ view_type check_ffs(view_type view) 
{ typename view_type::non_const_type out_view("out", view.extent(0)); typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_for("KokkosKernels::Common::Test::CheckFFS", - my_exec_space(0, view.extent(0)), + Kokkos::parallel_for("KokkosKernels::Common::Test::CheckFFS", my_exec_space(0, view.extent(0)), ffscheck(view, out_view)); Kokkos::fence(); return out_view; @@ -159,8 +149,7 @@ void test_set_bit_count() { nonconstview count_bit_view("count_bit_view", array_size); - typename nonconstview::HostMirror hview = - Kokkos::create_mirror_view(count_bit_view); + typename nonconstview::HostMirror hview = Kokkos::create_mirror_view(count_bit_view); for (int i = 0; i < array_size; ++i) { hview(i) = lno_t(rand()) * lno_t(rand()); @@ -170,18 +159,13 @@ void test_set_bit_count() { // KokkosKernels::Impl::kk_print_1Dview(count_bit_view); - myview out1 = - Test::get_array_bit_count( - count_bit_view); - myview out2 = - Test::check_array_bit_count( - count_bit_view); + myview out1 = Test::get_array_bit_count(count_bit_view); + myview out2 = Test::check_array_bit_count(count_bit_view); // KokkosKernels::Impl::kk_print_1Dview(out1); // KokkosKernels::Impl::kk_print_1Dview(out2); - bool is_identical = KokkosKernels::Impl::kk_is_identical_view< - myview, myview, typename myview::value_type, - typename device::execution_space>(out1, out2, 0); + bool is_identical = KokkosKernels::Impl::kk_is_identical_view(out1, out2, 0); EXPECT_TRUE(is_identical); } @@ -193,8 +177,7 @@ void test_ffs() { nonconstview count_bit_view("count_bit_view", array_size); - typename nonconstview::HostMirror hview = - Kokkos::create_mirror_view(count_bit_view); + typename nonconstview::HostMirror hview = Kokkos::create_mirror_view(count_bit_view); for (int i = 0; i < array_size; ++i) { hview(i) = lno_t(rand()) * lno_t(rand()); @@ -204,16 +187,13 @@ void test_ffs() { // KokkosKernels::Impl::kk_print_1Dview(count_bit_view); - myview out1 = - Test::get_ffs(count_bit_view); - myview out2 = - 
Test::check_ffs(count_bit_view); + myview out1 = Test::get_ffs(count_bit_view); + myview out2 = Test::check_ffs(count_bit_view); // KokkosKernels::Impl::kk_print_1Dview(out1); // KokkosKernels::Impl::kk_print_1Dview(out2); - bool is_identical = KokkosKernels::Impl::kk_is_identical_view< - myview, myview, typename myview::value_type, - typename device::execution_space>(out1, out2, 0); + bool is_identical = KokkosKernels::Impl::kk_is_identical_view(out1, out2, 0); EXPECT_TRUE(is_identical); } diff --git a/example/batched_solve/examples_helper.hpp b/example/batched_solve/examples_helper.hpp index 3010f66ba8..2bbe93fdfb 100644 --- a/example/batched_solve/examples_helper.hpp +++ b/example/batched_solve/examples_helper.hpp @@ -62,12 +62,8 @@ /// template -void create_saddle_point_matrices(const MatrixViewType &A, - const VectorViewType &Y, - const int n_dim = 3) { - Kokkos::Random_XorShift64_Pool< - typename MatrixViewType::device_type::execution_space> - random(13718); +void create_saddle_point_matrices(const MatrixViewType &A, const VectorViewType &Y, const int n_dim = 3) { + Kokkos::Random_XorShift64_Pool random(13718); const int N = A.extent(0); const int n = A.extent(1); const int n_2 = n_dim + 1; @@ -76,12 +72,8 @@ void create_saddle_point_matrices(const MatrixViewType &A, MatrixViewType xs("xs", N, n_1, n_dim); VectorViewType ys("ys", N, n_1); - Kokkos::fill_random( - xs, random, - Kokkos::reduction_identity::prod()); - Kokkos::fill_random( - ys, random, - Kokkos::reduction_identity::prod()); + Kokkos::fill_random(xs, random, Kokkos::reduction_identity::prod()); + Kokkos::fill_random(ys, random, Kokkos::reduction_identity::prod()); auto xs_host = Kokkos::create_mirror_view(xs); auto ys_host = Kokkos::create_mirror_view(ys); @@ -94,8 +86,8 @@ void create_saddle_point_matrices(const MatrixViewType &A, for (int i = 0; i < n_1; ++i) { for (int j = 0; j < n_1; ++j) { for (int l = 0; l < N; ++l) { - auto xs_i = Kokkos::subview(xs_host, l, i, Kokkos::ALL); - auto xs_j = 
Kokkos::subview(xs_host, l, j, Kokkos::ALL); + auto xs_i = Kokkos::subview(xs_host, l, i, Kokkos::ALL); + auto xs_j = Kokkos::subview(xs_host, l, j, Kokkos::ALL); typename MatrixViewType::value_type d = 0; for (int k = 0; k < n_dim; ++k) d += Kokkos::pow(xs_i(k) - xs_j(k), 2); d = Kokkos::sqrt(d); @@ -125,21 +117,12 @@ void create_saddle_point_matrices(const MatrixViewType &A, } template -void create_tridiagonal_batched_matrices(const int nnz, const int BlkSize, - const int N, const IntView &r, - const IntView &c, - const VectorViewType &D, - const VectorViewType &X, +void create_tridiagonal_batched_matrices(const int nnz, const int BlkSize, const int N, const IntView &r, + const IntView &c, const VectorViewType &D, const VectorViewType &X, const VectorViewType &B) { - Kokkos::Random_XorShift64_Pool< - typename VectorViewType::device_type::execution_space> - random(13718); - Kokkos::fill_random( - X, random, - Kokkos::reduction_identity::prod()); - Kokkos::fill_random( - B, random, - Kokkos::reduction_identity::prod()); + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(X, random, Kokkos::reduction_identity::prod()); + Kokkos::fill_random(B, random, Kokkos::reduction_identity::prod()); auto D_host = Kokkos::create_mirror_view(D); auto r_host = Kokkos::create_mirror_view(r); @@ -181,8 +164,7 @@ void create_tridiagonal_batched_matrices(const int nnz, const int BlkSize, } template -void getInvDiagFromCRS(const VType &V, const IntType &r, const IntType &c, - const VType &diag) { +void getInvDiagFromCRS(const VType &V, const IntType &r, const IntType &c, const VType &diag) { auto diag_values_host = Kokkos::create_mirror_view(diag); auto values_host = Kokkos::create_mirror_view(V); auto row_ptr_host = Kokkos::create_mirror_view(r); @@ -197,8 +179,7 @@ void getInvDiagFromCRS(const VType &V, const IntType &r, const IntType &c, int BlkSize = diag.extent(1); for (int i = 0; i < BlkSize; ++i) { - for (current_index = row_ptr_host(i); current_index < 
row_ptr_host(i + 1); - ++current_index) { + for (current_index = row_ptr_host(i); current_index < row_ptr_host(i + 1); ++current_index) { if (colIndices_host(current_index) == i) break; } for (int j = 0; j < N; ++j) { diff --git a/example/batched_solve/static_pivoting.cpp b/example/batched_solve/static_pivoting.cpp index e8a25778fc..f8eabdee22 100644 --- a/example/batched_solve/static_pivoting.cpp +++ b/example/batched_solve/static_pivoting.cpp @@ -49,9 +49,7 @@ struct Functor_TeamTestStaticPivoting { const XYViewType _Y; KOKKOS_INLINE_FUNCTION - Functor_TeamTestStaticPivoting(const AViewType &A, const XYViewType &X, - const XYViewType &Y) - : _A(A), _X(X), _Y(Y) {} + Functor_TeamTestStaticPivoting(const AViewType &A, const XYViewType &X, const XYViewType &Y) : _A(A), _X(X), _Y(Y) {} template KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { @@ -61,22 +59,16 @@ struct Functor_TeamTestStaticPivoting { auto X = Kokkos::subview(_X, matrix_id, Kokkos::ALL); auto Y = Kokkos::subview(_Y, matrix_id, Kokkos::ALL); member.team_barrier(); - KokkosBatched::TeamGesv::invoke(member, - A, X, - Y); + KokkosBatched::TeamGesv::invoke(member, A, X, Y); member.team_barrier(); } inline void run() { std::string name("KokkosBatched::Test::StaticPivoting"); - Kokkos::TeamPolicy policy(_A.extent(0), Kokkos::AUTO(), - Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_A.extent(0), Kokkos::AUTO(), Kokkos::AUTO()); - using MatrixViewType = - Kokkos::View; + using MatrixViewType = Kokkos::View; const int n = _A.extent(1); size_t bytes_0 = MatrixViewType::shmem_size(n, n + 4); @@ -95,8 +87,7 @@ struct Functor_SerialTestStaticPivoting { const XYViewType _Y; KOKKOS_INLINE_FUNCTION - Functor_SerialTestStaticPivoting(const AViewType &A, const AViewType &tmp, - const XYViewType &X, const XYViewType &Y) + Functor_SerialTestStaticPivoting(const AViewType &A, const AViewType &tmp, const XYViewType &X, const XYViewType &Y) : _A(A), _tmp(tmp), _X(X), _Y(Y) {} KOKKOS_INLINE_FUNCTION 
void operator()(const int &matrix_id) const { @@ -104,8 +95,7 @@ struct Functor_SerialTestStaticPivoting { auto tmp = Kokkos::subview(_tmp, matrix_id, Kokkos::ALL, Kokkos::ALL); auto X = Kokkos::subview(_X, matrix_id, Kokkos::ALL); auto Y = Kokkos::subview(_Y, matrix_id, Kokkos::ALL); - KokkosBatched::SerialGesv::invoke( - A, X, Y, tmp); + KokkosBatched::SerialGesv::invoke(A, X, Y, tmp); } inline void run() { @@ -144,12 +134,9 @@ int main(int /*argc*/, char ** /*argv[]*/) { KokkosKernels::Impl::kk_write_3Dview_to_file(A, "A.txt"); KokkosKernels::Impl::kk_write_2Dview_to_file(Y, "Y.txt"); - Functor_SerialTestStaticPivoting(A, tmp, - X, Y) - .run(); + Functor_SerialTestStaticPivoting(A, tmp, X, Y).run(); KokkosKernels::Impl::kk_write_2Dview_to_file(X, "X_serial.txt"); - Functor_TeamTestStaticPivoting(A2, X, Y2) - .run(); + Functor_TeamTestStaticPivoting(A2, X, Y2).run(); KokkosKernels::Impl::kk_write_2Dview_to_file(X, "X_team.txt"); } Kokkos::finalize(); diff --git a/example/batched_solve/team_GMRES.cpp b/example/batched_solve/team_GMRES.cpp index b543ddaad6..ab14b4b07a 100644 --- a/example/batched_solve/team_GMRES.cpp +++ b/example/batched_solve/team_GMRES.cpp @@ -40,8 +40,8 @@ typedef Kokkos::DefaultExecutionSpace exec_space; -template +template struct Functor_TestBatchedTeamVectorGMRES { const ValuesViewType _values; const ValuesViewType _diag; @@ -53,10 +53,9 @@ struct Functor_TestBatchedTeamVectorGMRES { KrylovHandleType _handle; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorGMRES( - const ValuesViewType &values, const IntView &r, const IntView &c, - const VectorViewType &X, const VectorViewType &B, const int team_size, - const int vector_length, KrylovHandleType &handle) + Functor_TestBatchedTeamVectorGMRES(const ValuesViewType &values, const IntView &r, const IntView &c, + const VectorViewType &X, const VectorViewType &B, const int team_size, + const int vector_length, KrylovHandleType &handle) : _values(values), _r(r), _c(c), @@ -67,11 +66,9 @@ struct 
Functor_TestBatchedTeamVectorGMRES { _handle(handle) {} KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorGMRES( - const ValuesViewType &values, const ValuesViewType &diag, - const IntView &r, const IntView &c, const VectorViewType &X, - const VectorViewType &B, const int team_size, const int vector_length, - KrylovHandleType &handle) + Functor_TestBatchedTeamVectorGMRES(const ValuesViewType &values, const ValuesViewType &diag, const IntView &r, + const IntView &c, const VectorViewType &X, const VectorViewType &B, + const int team_size, const int vector_length, KrylovHandleType &handle) : _values(values), _diag(diag), _r(r), @@ -86,61 +83,42 @@ struct Functor_TestBatchedTeamVectorGMRES { KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { const int first_matrix = _handle.first_index(member.league_rank()); const int last_matrix = _handle.last_index(member.league_rank()); - using TeamVectorCopy1D = - KokkosBatched::TeamVectorCopy; - - auto d = Kokkos::subview( - _values, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - - using ScratchPadIntViewType = - Kokkos::View; - using ScratchPadValuesViewType = Kokkos::View< - typename ValuesViewType::non_const_value_type **, - typename ValuesViewType::array_layout, - typename ValuesViewType::execution_space::scratch_memory_space>; - - using Operator = - KokkosBatched::CrsMatrix; - - ScratchPadIntViewType tmp_1D_int(member.team_scratch(0), - _r.extent(0) + _c.extent(0)); - - auto r = - Kokkos::subview(tmp_1D_int, Kokkos::make_pair(0, (int)_r.extent(0))); - auto c = Kokkos::subview( - tmp_1D_int, - Kokkos::make_pair((int)_r.extent(0), (int)tmp_1D_int.extent(0))); + using TeamVectorCopy1D = KokkosBatched::TeamVectorCopy; + + auto d = Kokkos::subview(_values, Kokkos::make_pair(first_matrix, last_matrix), 
Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + + using ScratchPadIntViewType = Kokkos::View; + using ScratchPadValuesViewType = + Kokkos::View; + + using Operator = KokkosBatched::CrsMatrix; + + ScratchPadIntViewType tmp_1D_int(member.team_scratch(0), _r.extent(0) + _c.extent(0)); + + auto r = Kokkos::subview(tmp_1D_int, Kokkos::make_pair(0, (int)_r.extent(0))); + auto c = Kokkos::subview(tmp_1D_int, Kokkos::make_pair((int)_r.extent(0), (int)tmp_1D_int.extent(0))); TeamVectorCopy1D::invoke(member, _r, r); TeamVectorCopy1D::invoke(member, _c, c); Operator A(d, r, c); if (UsePrec) { - ScratchPadValuesViewType diag( - member.team_scratch(0), last_matrix - first_matrix, _diag.extent(1)); + ScratchPadValuesViewType diag(member.team_scratch(0), last_matrix - first_matrix, _diag.extent(1)); using PrecOperator = KokkosBatched::JacobiPrec; KokkosBatched::TeamVectorCopy::invoke( - member, - Kokkos::subview(_diag, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL), - diag); + member, Kokkos::subview(_diag, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL), diag); PrecOperator P(diag); P.setComputedInverse(); - KokkosBatched::TeamVectorGMRES::template invoke< - Operator, VectorViewType, PrecOperator, KrylovHandleType>( - member, A, b, x, P, _handle); + KokkosBatched::TeamVectorGMRES::template invoke(member, A, b, x, P, _handle); } else { - KokkosBatched::TeamVectorGMRES::template invoke< - Operator, VectorViewType>(member, A, b, x, _handle); + KokkosBatched::TeamVectorGMRES::template invoke(member, A, b, x, _handle); } } @@ -149,10 +127,8 @@ struct Functor_TestBatchedTeamVectorGMRES { Kokkos::Timer timer; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy auto_policy(_handle.get_number_of_teams(), - Kokkos::AUTO(), Kokkos::AUTO()); - Kokkos::TeamPolicy tuned_policy(_handle.get_number_of_teams(), 
- _team_size, _vector_length); + Kokkos::TeamPolicy auto_policy(_handle.get_number_of_teams(), Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy tuned_policy(_handle.get_number_of_teams(), _team_size, _vector_length); Kokkos::TeamPolicy policy; if (_team_size < 1) @@ -168,21 +144,17 @@ struct Functor_TestBatchedTeamVectorGMRES { using ViewType2D = Kokkos::View; - size_t bytes_1D = - ViewType2D::shmem_size(_handle.get_number_of_systems_per_team(), 1); + size_t bytes_1D = ViewType2D::shmem_size(_handle.get_number_of_systems_per_team(), 1); size_t bytes_row_ptr = IntView::shmem_size(_r.extent(0)); size_t bytes_col_idc = IntView::shmem_size(_c.extent(0)); - size_t bytes_2D_1 = ViewType2D::shmem_size( - _handle.get_number_of_systems_per_team(), _X.extent(1)); - size_t bytes_2D_2 = ViewType2D::shmem_size( - _handle.get_number_of_systems_per_team(), maximum_iteration + 1); + size_t bytes_2D_1 = ViewType2D::shmem_size(_handle.get_number_of_systems_per_team(), _X.extent(1)); + size_t bytes_2D_2 = ViewType2D::shmem_size(_handle.get_number_of_systems_per_team(), maximum_iteration + 1); size_t bytes_int = bytes_row_ptr + bytes_col_idc; size_t bytes_diag = bytes_2D_1; size_t bytes_tmp = 2 * bytes_2D_1 + 2 * bytes_1D + bytes_2D_2; - policy.set_scratch_size( - 0, Kokkos::PerTeam(bytes_tmp + bytes_diag + bytes_int)); + policy.set_scratch_size(0, Kokkos::PerTeam(bytes_tmp + bytes_diag + bytes_int)); exec_space().fence(); timer.reset(); @@ -221,8 +193,7 @@ int main(int /*argc*/, char ** /*argv*/) { printf("N = %d, Blk = %d, nnz = %d\n", N, Blk, nnz); - create_tridiagonal_batched_matrices(nnz, Blk, N, rowOffsets, colIndices, - values, x, y); + create_tridiagonal_batched_matrices(nnz, Blk, N, rowOffsets, colIndices, values, x, y); // Replace y by ones: Kokkos::deep_copy(y, 1.); @@ -242,9 +213,7 @@ int main(int /*argc*/, char ** /*argv*/) { using Scalar3DViewType = Kokkos::View; using IntViewType = Kokkos::View; - using KrylovHandleType = - KokkosBatched::KrylovHandle; + using 
KrylovHandleType = KokkosBatched::KrylovHandle; const int N_team = 2; const int n_iterations = 150; @@ -255,8 +224,7 @@ int main(int /*argc*/, char ** /*argv*/) { const int ortho_strategy = 0; KrylovHandleType handle(N, N_team, n_iterations, true); - handle.Arnoldi_view = - Scalar3DViewType("", N, n_iterations, Blk + n_iterations + 3); + handle.Arnoldi_view = Scalar3DViewType("", N, n_iterations, Blk + n_iterations + 3); handle.set_max_iteration(n_iterations); handle.set_tolerance(tol); @@ -265,37 +233,27 @@ int main(int /*argc*/, char ** /*argv*/) { handle.set_compute_last_residual(true); double time = - Functor_TestBatchedTeamVectorGMRES(values, diag, rowOffsets, - colIndices, x, y, team_size, - vector_length, handle) + Functor_TestBatchedTeamVectorGMRES( + values, diag, rowOffsets, colIndices, x, y, team_size, vector_length, handle) .run(); printf("times = %f secondes\n", time); for (int i = 0; i < N; ++i) { if (handle.is_converged_host(i)) { - std::cout - << "System " << i << " converged in " - << handle.get_iteration_host(i) - << " iterations, the initial absolute norm of the residual was " - << handle.get_norm_host(i, 0) << " and is now " - << handle.get_last_norm_host(i) << std::endl; + std::cout << "System " << i << " converged in " << handle.get_iteration_host(i) + << " iterations, the initial absolute norm of the residual was " << handle.get_norm_host(i, 0) + << " and is now " << handle.get_last_norm_host(i) << std::endl; } else { - std::cout - << "System " << i << " did not converge in " - << handle.get_max_iteration() - << " iterations, the initial absolute norm of the residual was " - << handle.get_norm_host(i, 0) << " and is now " - << handle.get_last_norm_host(i) << std::endl; + std::cout << "System " << i << " did not converge in " << handle.get_max_iteration() + << " iterations, the initial absolute norm of the residual was " << handle.get_norm_host(i, 0) + << " and is now " << handle.get_last_norm_host(i) << std::endl; } } if 
(handle.is_converged_host()) std::cout << "All the systems have converged." << std::endl; else - std::cout << "There is at least one system that did not converge." - << std::endl; + std::cout << "There is at least one system that did not converge." << std::endl; } Kokkos::finalize(); } diff --git a/example/gmres/ex_real_A.cpp b/example/gmres/ex_real_A.cpp index 14c4eaeb15..f18ccfd278 100644 --- a/example/gmres/ex_real_A.cpp +++ b/example/gmres/ex_real_A.cpp @@ -31,16 +31,14 @@ int main(int argc, char* argv[]) { using CRS = KokkosSparse::CrsMatrix; using ViewVectorType = Kokkos::View; - using KernelHandle = - KokkosKernels::Experimental::KokkosKernelsHandle; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle; std::string filename("bcsstk09.mtx"); // example matrix std::string ortho("CGS2"); // orthog type int m = 50; // Max subspace size before restarting. - double convTol = 1e-10; // Relative residual convergence tolerance. - int cycLim = 50; // Maximum number of times to restart the solver. - bool rand_rhs = false; // Generate random right-hand side. + double convTol = 1e-10; // Relative residual convergence tolerance. + int cycLim = 50; // Maximum number of times to restart the solver. + bool rand_rhs = false; // Generate random right-hand side. for (int i = 1; i < argc; ++i) { const std::string& token = argv[i]; @@ -51,29 +49,26 @@ int main(int argc, char* argv[]) { if (token == std::string("--ortho")) ortho = argv[++i]; if (token == std::string("--rand_rhs")) rand_rhs = true; if (token == std::string("--help") || token == std::string("-h")) { - std::cout - << "Kokkos GMRES solver options:" << std::endl - << "--filename : The name of a matrix market (.mtx) file for " - "matrix A (Default bcsstk09.mtx)." - << std::endl - << "--max-subsp : The maximum size of the Kyrlov subspace before " - "restarting (Default 50)." - << std::endl - << "--max-restarts: Maximum number of GMRES restarts (Default 50)." 
- << std::endl - << "--tol : Convergence tolerance. (Default 1e-10)." - << std::endl - << "--ortho : Type of orthogonalization. Use 'CGS2' or 'MGS'. " - "(Default 'CGS2')" - << std::endl - << "--rand_rhs : Generate a random right-hand side b. (Else, " - "default uses b = vector of ones.)" - << std::endl - << "--help -h : Display this help message." << std::endl - << "Example Call : ./Gmres.exe --filename Laplace3D100.mtx --tol " - "1e-5 --max-subsp 100 " - << std::endl - << std::endl; + std::cout << "Kokkos GMRES solver options:" << std::endl + << "--filename : The name of a matrix market (.mtx) file for " + "matrix A (Default bcsstk09.mtx)." + << std::endl + << "--max-subsp : The maximum size of the Kyrlov subspace before " + "restarting (Default 50)." + << std::endl + << "--max-restarts: Maximum number of GMRES restarts (Default 50)." << std::endl + << "--tol : Convergence tolerance. (Default 1e-10)." << std::endl + << "--ortho : Type of orthogonalization. Use 'CGS2' or 'MGS'. " + "(Default 'CGS2')" + << std::endl + << "--rand_rhs : Generate a random right-hand side b. (Else, " + "default uses b = vector of ones.)" + << std::endl + << "--help -h : Display this help message." << std::endl + << "Example Call : ./Gmres.exe --filename Laplace3D100.mtx --tol " + "1e-5 --max-subsp 100 " + << std::endl + << std::endl; return 0; } } @@ -98,10 +93,8 @@ int main(int argc, char* argv[]) { auto gmres_handle = kh.get_gmres_handle(); // Get full gmres handle type using decltype. Deferencing a pointer gives a // reference, so we need to strip that too. - using GMRESHandle = - typename std::remove_reference::type; - gmres_handle->set_ortho(ortho == "CGS2" ? GMRESHandle::Ortho::CGS2 - : GMRESHandle::Ortho::MGS); + using GMRESHandle = typename std::remove_reference::type; + gmres_handle->set_ortho(ortho == "CGS2" ? GMRESHandle::Ortho::CGS2 : GMRESHandle::Ortho::MGS); if (rand_rhs) { // Make rhs random. 
@@ -128,8 +121,7 @@ int main(int argc, char* argv[]) { std::cout << "=========================================" << std::endl; std::cout << "Verify from main: Ending residual is " << endRes << std::endl; std::cout << "Number of iterations is: " << numIters << std::endl; - std::cout << "Diff of residual from main - residual from solver: " - << endRelRes - endRes << std::endl; + std::cout << "Diff of residual from main - residual from solver: " << endRelRes - endRes << std::endl; std::cout << "Convergence flag is : " << convFlag << std::endl; } Kokkos::finalize(); diff --git a/example/gmres/test_prec.cpp b/example/gmres/test_prec.cpp index 8d1ff74b87..942dc176b6 100644 --- a/example/gmres/test_prec.cpp +++ b/example/gmres/test_prec.cpp @@ -27,14 +27,10 @@ int main(int argc, char* argv[]) { using OT = int; using EXSP = Kokkos::DefaultExecutionSpace; using MESP = typename EXSP::memory_space; - using CRS = - KokkosSparse::CrsMatrix, void, OT>; + using CRS = KokkosSparse::CrsMatrix, void, OT>; - using ViewVectorType = - Kokkos::View>; - using KernelHandle = - KokkosKernels::Experimental::KokkosKernelsHandle; + using ViewVectorType = Kokkos::View>; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle; std::string ortho("CGS2"); // orthog type int n = 1000; // Matrix size @@ -53,29 +49,26 @@ int main(int argc, char* argv[]) { if (token == std::string("--ortho")) ortho = argv[++i]; if (token == std::string("--rand_rhs")) rand_rhs = true; if (token == std::string("--help") || token == std::string("-h")) { - std::cout - << "Kokkos GMRES solver options:" << std::endl - << "--mat-size : The size of the nxn test matrix. (Default: " - "n=1000.)" - << std::endl - << "--max-subsp : The maximum size of the Kyrlov subspace before " - "restarting (Default 50)." - << std::endl - << "--max-restarts: Maximum number of GMRES restarts (Default 50)." - << std::endl - << "--tol : Convergence tolerance. (Default 1e-10)." 
- << std::endl - << "--ortho : Type of orthogonalization. Use 'CGS2' or 'MGS'. " - "(Default 'CGS2')" - << std::endl - << "--rand_rhs : Generate a random right-hand side b. (Else, " - "default uses b = vector of ones.)" - << std::endl - << "--help -h : Display this help message." << std::endl - << "Example Call : ./Gmres.exe --filename Laplace3D100.mtx --tol " - "1e-5 --max-subsp 100 " - << std::endl - << std::endl; + std::cout << "Kokkos GMRES solver options:" << std::endl + << "--mat-size : The size of the nxn test matrix. (Default: " + "n=1000.)" + << std::endl + << "--max-subsp : The maximum size of the Kyrlov subspace before " + "restarting (Default 50)." + << std::endl + << "--max-restarts: Maximum number of GMRES restarts (Default 50)." << std::endl + << "--tol : Convergence tolerance. (Default 1e-10)." << std::endl + << "--ortho : Type of orthogonalization. Use 'CGS2' or 'MGS'. " + "(Default 'CGS2')" + << std::endl + << "--rand_rhs : Generate a random right-hand side b. (Else, " + "default uses b = vector of ones.)" + << std::endl + << "--help -h : Display this help message." << std::endl + << "Example Call : ./Gmres.exe --filename Laplace3D100.mtx --tol " + "1e-5 --max-subsp 100 " + << std::endl + << std::endl; return 0; } } @@ -87,18 +80,16 @@ int main(int argc, char* argv[]) { auto gmres_handle = kh.get_gmres_handle(); // Get full gmres handle type using decltype. Deferencing a pointer gives a // reference, so we need to strip that too. - using GMRESHandle = - typename std::remove_reference::type; - gmres_handle->set_ortho(ortho == "CGS2" ? GMRESHandle::Ortho::CGS2 - : GMRESHandle::Ortho::MGS); + using GMRESHandle = typename std::remove_reference::type; + gmres_handle->set_ortho(ortho == "CGS2" ? GMRESHandle::Ortho::CGS2 : GMRESHandle::Ortho::MGS); // Initialize Kokkos AFTER parsing parameters: Kokkos::initialize(); { // Generate a diagonal matrix with entries 1, 2, ...., 1000 and its inverse. 
- CRS A = KokkosSparse::Impl::kk_generate_diag_matrix(n); - auto myPrec = new KokkosSparse::Experimental::MatrixPrec( - KokkosSparse::Impl::kk_generate_diag_matrix(n, true)); + CRS A = KokkosSparse::Impl::kk_generate_diag_matrix(n); + auto myPrec = + new KokkosSparse::Experimental::MatrixPrec(KokkosSparse::Impl::kk_generate_diag_matrix(n, true)); ViewVectorType X(Kokkos::view_alloc(Kokkos::WithoutInitializing, "X"), n); // Solution and initial guess @@ -107,9 +98,8 @@ int main(int argc, char* argv[]) { n); // right-hand side vec int rand_seed = 123; Kokkos::Random_XorShift64_Pool<> pool(rand_seed); - Kokkos::fill_random( - X, pool, -1, - 1); // Use non-zero initial guess to test GMRES properties. + Kokkos::fill_random(X, pool, -1, + 1); // Use non-zero initial guess to test GMRES properties. if (rand_rhs) { Kokkos::fill_random(B, pool, -1, 1); } else { @@ -131,8 +121,7 @@ int main(int argc, char* argv[]) { std::cout << "=========================================" << std::endl; std::cout << "Verify from main: Ending residual is " << endRes << std::endl; std::cout << "Number of iterations is: " << numIters << std::endl; - std::cout << "Diff of residual from main - residual from solver: " - << endRelRes - endRes << std::endl; + std::cout << "Diff of residual from main - residual from solver: " << endRelRes - endRes << std::endl; std::cout << "Convergence flag is : " << convFlag << std::endl; if (endRes < convTol && numIters == 1) { pass = true; diff --git a/example/graph/KokkosKernels_Example_Distance2GraphColor.cpp b/example/graph/KokkosKernels_Example_Distance2GraphColor.cpp index 9a5537ee5b..5506ce68d8 100644 --- a/example/graph/KokkosKernels_Example_Distance2GraphColor.cpp +++ b/example/graph/KokkosKernels_Example_Distance2GraphColor.cpp @@ -100,8 +100,7 @@ struct Parameters { } }; -void print_options(std::ostream& os, const char* app_name, - unsigned int indent = 0) { +void print_options(std::ostream& os, const char* app_name, unsigned int indent = 0) { 
std::string spaces(indent, ' '); os << "Usage:" << std::endl << spaces << " " << app_name << " [parameters]" << std::endl @@ -110,14 +109,11 @@ void print_options(std::ostream& os, const char* app_name, << spaces << " Parallelism (select one of the following):" << std::endl << spaces << " --serial Execute serially." << std::endl << spaces << " --threads Use N posix threads." << std::endl - << spaces << " --openmp Use OpenMP with N threads." - << std::endl + << spaces << " --openmp Use OpenMP with N threads." << std::endl << spaces << " --cuda Use CUDA" << std::endl << std::endl << spaces << " Required Parameters:" << std::endl - << spaces - << " --amtx Input file in Matrix Market format (.mtx)." - << std::endl + << spaces << " --amtx Input file in Matrix Market format (.mtx)." << std::endl << std::endl << spaces << " --algorithm Set the algorithm to use. " @@ -173,16 +169,12 @@ void print_options(std::ostream& os, const char* app_name, << " --verbose-level Set verbosity level [0..5] " "where N > 0 means print verbose messags." << std::endl - << spaces << " Default: 0" - << std::endl - << spaces - << " --help Print out command line help." - << std::endl + << spaces << " Default: 0" << std::endl + << spaces << " --help Print out command line help." 
<< std::endl << spaces << " " << std::endl; } -int parse_inputs(KokkosKernels::Example::Parameters& params, int argc, - char** argv) { +int parse_inputs(KokkosKernels::Example::Parameters& params, int argc, char** argv) { bool got_required_param_amtx = false; bool got_required_param_algorithm = false; @@ -208,40 +200,32 @@ int parse_inputs(KokkosKernels::Example::Parameters& params, int argc, params.verbose_level = atoi(argv[++i]); params.verbose_level = std::min(5, params.verbose_level); params.verbose_level = std::max(0, params.verbose_level); - } else if (0 == - Test::string_compare_no_case(argv[i], "--output-histogram")) { + } else if (0 == Test::string_compare_no_case(argv[i], "--output-histogram")) { params.output_histogram = 1; - } else if (0 == - Test::string_compare_no_case(argv[i], "--output-graphviz")) { + } else if (0 == Test::string_compare_no_case(argv[i], "--output-graphviz")) { params.output_graphviz = 1; - } else if (0 == Test::string_compare_no_case( - argv[i], "--output-graphviz-vert-max")) { + } else if (0 == Test::string_compare_no_case(argv[i], "--output-graphviz-vert-max")) { params.output_graphviz_vert_max = atoi(argv[++i]); } else if (0 == Test::string_compare_no_case(argv[i], "--algorithm")) { ++i; - if (0 == - Test::string_compare_no_case(argv[i], "COLORING_D2_MATRIX_SQUARED")) { + if (0 == Test::string_compare_no_case(argv[i], "COLORING_D2_MATRIX_SQUARED")) { params.algorithm = 1; got_required_param_algorithm = true; - } else if (0 == - Test::string_compare_no_case(argv[i], "COLORING_D2_SERIAL")) { + } else if (0 == Test::string_compare_no_case(argv[i], "COLORING_D2_SERIAL")) { params.algorithm = 2; got_required_param_algorithm = true; } else if (0 == Test::string_compare_no_case(argv[i], "COLORING_D2_VB") || 0 == Test::string_compare_no_case(argv[i], "COLORING_D2")) { params.algorithm = 3; got_required_param_algorithm = true; - } else if (0 == - Test::string_compare_no_case(argv[i], "COLORING_D2_VB_BIT")) { + } else if (0 == 
Test::string_compare_no_case(argv[i], "COLORING_D2_VB_BIT")) { params.algorithm = 4; got_required_param_algorithm = true; - } else if (0 == Test::string_compare_no_case(argv[i], - "COLORING_D2_VB_BIT_EF")) { + } else if (0 == Test::string_compare_no_case(argv[i], "COLORING_D2_VB_BIT_EF")) { params.algorithm = 5; got_required_param_algorithm = true; } else { - std::cerr << "2-Unrecognized command line argument #" << i << ": " - << argv[i] << std::endl; + std::cerr << "2-Unrecognized command line argument #" << i << ": " << argv[i] << std::endl; print_options(std::cout, argv[0]); return 1; } @@ -250,8 +234,7 @@ int parse_inputs(KokkosKernels::Example::Parameters& params, int argc, print_options(std::cout, argv[0]); return 1; } else { - std::cerr << "3-Unrecognized command line argument #" << i << ": " - << argv[i] << std::endl; + std::cerr << "3-Unrecognized command line argument #" << i << ": " << argv[i] << std::endl; print_options(std::cout, argv[0]); return 1; } @@ -263,21 +246,19 @@ int parse_inputs(KokkosKernels::Example::Parameters& params, int argc, return 1; } if (!got_required_param_algorithm) { - std::cout << "Missing required parameter algorithm" << std::endl - << std::endl; + std::cout << "Missing required parameter algorithm" << std::endl << std::endl; print_options(std::cout, argv[0]); return 1; } - if (!params.use_serial && !params.use_threads && !params.use_openmp && - !params.use_cuda) { + if (!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda) { print_options(std::cout, argv[0]); return 1; } return 0; } -template +template void run_example(CrsGraph_type crsGraph, DataType num_cols, Parameters params) { using namespace KokkosGraph; using namespace KokkosGraph::Experimental; @@ -285,14 +266,13 @@ void run_example(CrsGraph_type crsGraph, DataType num_cols, Parameters params) { int algorithm = params.algorithm; int shmemsize = params.shmemsize; - using lno_view_type = typename 
CrsGraph_type::row_map_type::non_const_type; - using lno_nnz_view_type = - typename CrsGraph_type::entries_type::non_const_type; + using lno_view_type = typename CrsGraph_type::row_map_type::non_const_type; + using lno_nnz_view_type = typename CrsGraph_type::entries_type::non_const_type; using size_type = typename lno_view_type::non_const_value_type; using lno_type = typename lno_nnz_view_type::non_const_value_type; - using KernelHandle_type = KokkosKernels::Experimental::KokkosKernelsHandle< - size_type, lno_type, kk_scalar_type, ExecSpace, TempMemSpace, - PersistentMemSpace>; + using KernelHandle_type = + KokkosKernels::Experimental::KokkosKernelsHandle; // Create a kernel handle KernelHandle_type kh; @@ -333,52 +313,39 @@ void run_example(CrsGraph_type crsGraph, DataType num_cols, Parameters params) { break; } - std::cout << std::endl - << "Run Graph Color D2 (" << label_algorithm << ")" << std::endl; + std::cout << std::endl << "Run Graph Color D2 (" << label_algorithm << ")" << std::endl; // ------------------------------------------ // Call the distance-2 graph coloring routine // ------------------------------------------ - graph_compute_distance2_color(&kh, crsGraph.numRows(), num_cols, - crsGraph.row_map, crsGraph.entries, - crsGraph.row_map, crsGraph.entries); + graph_compute_distance2_color(&kh, crsGraph.numRows(), num_cols, crsGraph.row_map, crsGraph.entries, crsGraph.row_map, + crsGraph.entries); // ------------------------------------------ // Get the results // ------------------------------------------ - size_t num_colors = - kh.get_distance2_graph_coloring_handle()->get_num_colors(); - size_t num_phases = - kh.get_distance2_graph_coloring_handle()->get_num_phases(); + size_t num_colors = kh.get_distance2_graph_coloring_handle()->get_num_colors(); + size_t num_phases = kh.get_distance2_graph_coloring_handle()->get_num_phases(); if (params.verbose_level > 0) { - std::cout - << "Total Time: " - << 
kh.get_distance2_graph_coloring_handle()->get_overall_coloring_time() - << std::endl - << "Num colors: " - << kh.get_distance2_graph_coloring_handle()->get_num_colors() - << std::endl - << "Num Phases: " - << kh.get_distance2_graph_coloring_handle()->get_num_phases() - << std::endl - << "Colors:\n\t"; - KokkosKernels::Impl::print_1Dview( - kh.get_distance2_graph_coloring_handle()->get_vertex_colors()); + std::cout << "Total Time: " << kh.get_distance2_graph_coloring_handle()->get_overall_coloring_time() << std::endl + << "Num colors: " << kh.get_distance2_graph_coloring_handle()->get_num_colors() << std::endl + << "Num Phases: " << kh.get_distance2_graph_coloring_handle()->get_num_phases() << std::endl + << "Colors:\n\t"; + KokkosKernels::Impl::print_1Dview(kh.get_distance2_graph_coloring_handle()->get_vertex_colors()); std::cout << std::endl; } // ------------------------------------------ // Save coloring to a GraphViz file // ------------------------------------------ - if (params.output_graphviz && - crsGraph.numRows() <= params.output_graphviz_vert_max) { + if (params.output_graphviz && crsGraph.numRows() <= params.output_graphviz_vert_max) { auto colors = kh.get_distance2_graph_coloring_handle()->get_vertex_colors(); std::ofstream os("G.dot", std::ofstream::out); - kh.get_distance2_graph_coloring_handle()->dump_graphviz( - os, crsGraph.numRows(), crsGraph.row_map, crsGraph.entries, colors); + kh.get_distance2_graph_coloring_handle()->dump_graphviz(os, crsGraph.numRows(), crsGraph.row_map, crsGraph.entries, + colors); } // ------------------------------------------ @@ -394,29 +361,22 @@ void run_example(CrsGraph_type crsGraph, DataType num_cols, Parameters params) { d2_coloring_is_valid = KokkosGraph::Impl::graph_verify_distance2_color( &kh, crsGraph.numRows(), // crsGraph.numCols(), - num_cols, crsGraph.row_map, crsGraph.entries, crsGraph.row_map, - crsGraph.entries, d2_coloring_validation_flags); + num_cols, crsGraph.row_map, crsGraph.entries, 
crsGraph.row_map, crsGraph.entries, d2_coloring_validation_flags); // Print out messages based on coloring validation check. if (d2_coloring_is_valid) { - std::cout << std::endl - << "Distance-2 Graph Coloring is VALID" << std::endl - << std::endl; + std::cout << std::endl << "Distance-2 Graph Coloring is VALID" << std::endl << std::endl; } else { str_color_is_valid = "INVALID"; std::cout << std::endl << "Distance-2 Graph Coloring is NOT VALID" << std::endl - << " - Vert(s) left uncolored : " - << d2_coloring_validation_flags[1] << std::endl - << " - Invalid D2 Coloring : " - << d2_coloring_validation_flags[2] << std::endl + << " - Vert(s) left uncolored : " << d2_coloring_validation_flags[1] << std::endl + << " - Invalid D2 Coloring : " << d2_coloring_validation_flags[2] << std::endl << std::endl; } if (d2_coloring_validation_flags[3]) { - std::cout << "Distance-2 Graph Coloring may have poor quality." - << std::endl - << " - Vert(s) have high color value : " - << d2_coloring_validation_flags[3] << std::endl + std::cout << "Distance-2 Graph Coloring may have poor quality." 
<< std::endl + << " - Vert(s) have high color value : " << d2_coloring_validation_flags[3] << std::endl << std::endl; } } @@ -425,27 +385,24 @@ void run_example(CrsGraph_type crsGraph, DataType num_cols, Parameters params) { // Print out a histogram of the colors // ------------------------------------------ if (0 != params.output_histogram) { - KokkosGraph::Impl::graph_print_distance2_color_histogram( - &kh, crsGraph.numRows(), num_cols, crsGraph.row_map, crsGraph.entries, - crsGraph.row_map, crsGraph.entries, false); + KokkosGraph::Impl::graph_print_distance2_color_histogram(&kh, crsGraph.numRows(), num_cols, crsGraph.row_map, + crsGraph.entries, crsGraph.row_map, crsGraph.entries, + false); } // ------------------------------------------ // Print out a summary // ------------------------------------------ std::string mtx_bin_file = params.mtx_bin_file; - mtx_bin_file = mtx_bin_file.substr(mtx_bin_file.find_last_of("/\\") + 1); + mtx_bin_file = mtx_bin_file.substr(mtx_bin_file.find_last_of("/\\") + 1); std::cout << "Summary" << std::endl << "-------" << std::endl - << " KExecSName : " << Kokkos::DefaultExecutionSpace::name() - << std::endl + << " KExecSName : " << Kokkos::DefaultExecutionSpace::name() << std::endl << " Filename : " << mtx_bin_file << std::endl << " Num Verts : " << crsGraph.numRows() << std::endl - << " Num Edges : " << crsGraph.entries.extent(0) - << std::endl - << " Concurrency : " - << Kokkos::DefaultExecutionSpace().concurrency() << std::endl + << " Num Edges : " << crsGraph.entries.extent(0) << std::endl + << " Concurrency : " << Kokkos::DefaultExecutionSpace().concurrency() << std::endl << " Algorithm : " << label_algorithm << std::endl << "Coloring Stats" << std::endl << " Num colors : " << num_colors << std::endl @@ -455,26 +412,21 @@ void run_example(CrsGraph_type crsGraph, DataType num_cols, Parameters params) { } // run_example() -template +template void driver(Parameters params) { using myExecSpace = exec_space; using myFastDevice = 
Kokkos::Device; - using crstmat_type = - typename KokkosSparse::CrsMatrix; - using graph_type = typename crstmat_type::StaticCrsGraphType; - using data_type = typename graph_type::data_type; + using crstmat_type = typename KokkosSparse::CrsMatrix; + using graph_type = typename crstmat_type::StaticCrsGraphType; + using data_type = typename graph_type::data_type; char* mat_file = params.mtx_bin_file; - crstmat_type crsmat = - KokkosKernels::Impl::read_kokkos_crst_matrix(mat_file); + crstmat_type crsmat = KokkosKernels::Impl::read_kokkos_crst_matrix(mat_file); graph_type crsgraph = crsmat.graph; data_type num_cols = crsmat.numCols(); - KokkosKernels::Example::run_example( + KokkosKernels::Example::run_example( crsgraph, num_cols, params); } // driver() @@ -494,13 +446,10 @@ int main(int argc, char* argv[]) { return 0; } - const int num_threads = - params.use_openmp; // Assumption is that use_openmp variable is provided - // as number of threads + const int num_threads = params.use_openmp; // Assumption is that use_openmp variable is provided + // as number of threads const int device_id = 0; - Kokkos::initialize(Kokkos::InitializationSettings() - .set_num_threads(num_threads) - .set_device_id(device_id)); + Kokkos::initialize(Kokkos::InitializationSettings().set_num_threads(num_threads).set_device_id(device_id)); // Print out information about the configuration of the run if verbose_level // >= 5 @@ -510,22 +459,19 @@ int main(int argc, char* argv[]) { #if defined(KOKKOS_ENABLE_OPENMP) if (params.use_openmp) { - KokkosKernels::Example::driver(params); + KokkosKernels::Example::driver(params); } #endif #if defined(KOKKOS_ENABLE_CUDA) if (params.use_cuda) { - KokkosKernels::Example::driver(params); + KokkosKernels::Example::driver(params); } #endif #if defined(KOKKOS_ENABLE_SERIAL) if (params.use_serial) { - KokkosKernels::Example::driver(params); + KokkosKernels::Example::driver(params); } #endif diff --git a/example/graph/PartitioningExample.cpp 
b/example/graph/PartitioningExample.cpp index 1bef46cd28..7f06b216d3 100644 --- a/example/graph/PartitioningExample.cpp +++ b/example/graph/PartitioningExample.cpp @@ -28,7 +28,7 @@ using std::cout; using std::vector; -//#include "../../src/sparse/impl/KokkosSparse_partitioning_impl.hpp" +// #include "../../src/sparse/impl/KokkosSparse_partitioning_impl.hpp" int main(int argc, char* argv[]) { /* diff --git a/example/half/xpy.cpp b/example/half/xpy.cpp index 238fdef187..cf3b5767f7 100644 --- a/example/half/xpy.cpp +++ b/example/half/xpy.cpp @@ -40,18 +40,15 @@ void do_xpy(size_t n, bool time_only = false) { View y_rand("y_rand", n); View expected("expected", n); - View relative_error( - "relative_error", n); + View relative_error("relative_error", n); typename ViewType::HostMirror x_host = create_mirror_view(x); typename ViewType::HostMirror y_host = create_mirror_view(y); // TODO: Report segfault in random_pool creation with: // typename ViewType::HostMirror y_host = create_mirror_view(y_host); Random_XorShift64_Pool random_pool(12345); - fill_random(x_rand, random_pool, ReferenceScalarType(1.0), - ReferenceScalarType(2.0)); - fill_random(y_rand, random_pool, ReferenceScalarType(1.0), - ReferenceScalarType(2.0)); + fill_random(x_rand, random_pool, ReferenceScalarType(1.0), ReferenceScalarType(2.0)); + fill_random(y_rand, random_pool, ReferenceScalarType(1.0), ReferenceScalarType(2.0)); ExecutionSpace().fence(); deep_copy(x, x_rand); @@ -72,22 +69,18 @@ void do_xpy(size_t n, bool time_only = false) { if (!time_only) { for (size_t i = 0; i < n; i++) - expected(i) = static_cast(y_host(i)) + - static_cast(x_host(i)); + expected(i) = static_cast(y_host(i)) + static_cast(x_host(i)); } deep_copy(x_host, x); ExecutionSpace().fence(); - std::cout << "n: " << n << ", " << typeid(ScalarType).name() - << " Runtime(s): " << s << std::endl; + std::cout << "n: " << n << ", " << typeid(ScalarType).name() << " Runtime(s): " << s << std::endl; if (!time_only) { - std::cout << "n: " 
<< n << ", " << typeid(ScalarType).name() - << " Relative Errors:" << std::endl; + std::cout << "n: " << n << ", " << typeid(ScalarType).name() << " Relative Errors:" << std::endl; for (size_t i = 0; i < n; i++) { - std::cout << ", " << std::abs(expected(i) - x_host(i)) / expected(i) - << std::endl; + std::cout << ", " << std::abs(expected(i) - x_host(i)) / expected(i) << std::endl; } std::cout << std::endl << std::endl; } diff --git a/example/hashmap_accumulator/KokkosKernels_Example_HashmapAccumulator.cpp b/example/hashmap_accumulator/KokkosKernels_Example_HashmapAccumulator.cpp index 065c103cef..52de73fe29 100644 --- a/example/hashmap_accumulator/KokkosKernels_Example_HashmapAccumulator.cpp +++ b/example/hashmap_accumulator/KokkosKernels_Example_HashmapAccumulator.cpp @@ -52,8 +52,7 @@ typedef struct params { namespace KokkosKernels { namespace Experiment { -template +template struct functorTestHashmapAccumulator { typedef ExecutionSpace execution_space; typedef typename Kokkos::View data_view_t; @@ -65,17 +64,12 @@ struct functorTestHashmapAccumulator { const size_t _max_hash_entries; const parameters_t& _params; - typedef Kokkos::Experimental::UniqueToken< - execution_space, Kokkos::Experimental::UniqueTokenScope::Global> + typedef Kokkos::Experimental::UniqueToken unique_token_t; unique_token_t tokens; - functorTestHashmapAccumulator(const size_t num_entries, - const data_view_t& data, - uniform_memory_pool_t memory_pool, - const size_t hash_size, - const size_t max_hash_entries, - const parameters_t& params) + functorTestHashmapAccumulator(const size_t num_entries, const data_view_t& data, uniform_memory_pool_t memory_pool, + const size_t hash_size, const size_t max_hash_entries, const parameters_t& params) : _num_entries(num_entries), _data(data), _memory_pool(memory_pool), @@ -104,9 +98,7 @@ struct functorTestHashmapAccumulator { } scalar_t* ptr_memory_pool_chunk = (scalar_t*)(ptr_temp); - KokkosKernels::Experimental::HashmapAccumulator< - hash_size_type, 
hash_key_type, hash_value_type> - hash_map; + KokkosKernels::Experimental::HashmapAccumulator hash_map; // Set pointer to hash indices scalar_t* used_hash_indices = (scalar_t*)(ptr_temp); @@ -145,9 +137,8 @@ struct functorTestHashmapAccumulator { // Compute the hash index using & instead of % (modulus is slower). scalar_t hash = key & hash_func_pow2; - int r = hash_map.sequential_insert_into_hash_TrackHashes( - hash, key, &used_hash_size, hash_map.max_value_size, &used_hash_count, - used_hash_indices); + int r = hash_map.sequential_insert_into_hash_TrackHashes(hash, key, &used_hash_size, hash_map.max_value_size, + &used_hash_count, used_hash_indices); // Check return code if (r) { @@ -180,9 +171,7 @@ struct functorTestHashmapAccumulator { template void experiment(const parameters_t& params) { - typedef - typename KokkosKernels::Impl::UniformMemoryPool - uniform_memory_pool_t; + typedef typename KokkosKernels::Impl::UniformMemoryPool uniform_memory_pool_t; typedef typename Kokkos::View data_view_t; typedef typename data_view_t::HostMirror data_view_hostmirror_t; @@ -224,9 +213,8 @@ void experiment(const parameters_t& params) { // Set Hash Table Parameters size_t max_hash_entries = max_value; // Max number of entries that can be // inserted (values allowed are 1..100) - size_t hash_size_hint = - max_value; // How many hash keys are allowed. The actual hash size will - // be set to the next power of 2 bigger than hash_size_hint. + size_t hash_size_hint = max_value; // How many hash keys are allowed. The actual hash size will + // be set to the next power of 2 bigger than hash_size_hint. // Set the hash_size as the next power of 2 bigger than hash_size_hint. 
// - hash_size must be a power of two since we use & rather than % (which is @@ -237,8 +225,7 @@ void experiment(const parameters_t& params) { } // Create Uniform Initialized Memory Pool - KokkosKernels::Impl::PoolType pool_type = - KokkosKernels::Impl::OneThread2OneChunk; + KokkosKernels::Impl::PoolType pool_type = KokkosKernels::Impl::OneThread2OneChunk; // Determine memory chunk size for UniformMemoryPool size_t mem_chunk_size = hash_size; // for hash indices @@ -254,16 +241,12 @@ void experiment(const parameters_t& params) { // KokkosKernels::Impl::UniformMemoryPool m_space(mem_chunk_count, mem_chunk_size, -1, pool_type); - uniform_memory_pool_t memory_pool(mem_chunk_count, mem_chunk_size, -1, - pool_type); + uniform_memory_pool_t memory_pool(mem_chunk_count, mem_chunk_size, -1, pool_type); - functorTestHashmapAccumulator - testHashmapAccumulator(num_entries, d_data, memory_pool, hash_size, - max_hash_entries, params); + functorTestHashmapAccumulator testHashmapAccumulator( + num_entries, d_data, memory_pool, hash_size, max_hash_entries, params); - Kokkos::parallel_for("testHashmapAccumulator", num_entries, - testHashmapAccumulator); + Kokkos::parallel_for("testHashmapAccumulator", num_entries, testHashmapAccumulator); if (params.verbose) { double t = timer.seconds(); @@ -275,8 +258,7 @@ void experiment(const parameters_t& params) { } // namespace Experiment } // namespace KokkosKernels -void print_options(std::ostream& os, const char* app_name, - unsigned int indent = 0) { +void print_options(std::ostream& os, const char* app_name, unsigned int indent = 0) { std::string spaces(indent, ' '); os << "Usage:" << std::endl << spaces << " " << app_name << " [parameters]" << std::endl @@ -285,15 +267,12 @@ void print_options(std::ostream& os, const char* app_name, << spaces << " Parallelism (select one of the following):" << std::endl << spaces << " --serial Execute serially." << std::endl << spaces << " --threads Use N posix threads." 
<< std::endl - << spaces << " --openmp Use OpenMP with N threads." - << std::endl + << spaces << " --openmp Use OpenMP with N threads." << std::endl << spaces << " --cuda Use CUDA" << std::endl << spaces << " Optional Parameters:" << std::endl - << spaces << " --problem-size Problem Size (Default: 20)" - << std::endl + << spaces << " --problem-size Problem Size (Default: 20)" << std::endl << spaces << " --verbose Verbose output" << std::endl - << spaces << " --help Print out command line help." - << std::endl + << spaces << " --help Print out command line help." << std::endl << spaces << " " << std::endl; } // print_options @@ -321,19 +300,16 @@ int parse_inputs(parameters_t& params, int argc, char** argv) { } else if (0 == Test::string_compare_no_case(argv[i], "--verbose") || 0 == Test::string_compare_no_case(argv[i], "-V")) { params.verbose = true; - } else if (0 == Test::string_compare_no_case(argv[i], "help") || - 0 == Test::string_compare_no_case(argv[i], "-h")) { + } else if (0 == Test::string_compare_no_case(argv[i], "help") || 0 == Test::string_compare_no_case(argv[i], "-h")) { print_options(std::cout, argv[0]); return 1; } else { - std::cerr << "3-Unrecognized command line argument #" << i << ": " - << argv[i] << std::endl; + std::cerr << "3-Unrecognized command line argument #" << i << ": " << argv[i] << std::endl; print_options(std::cout, argv[0]); return 1; } } - if (!params.use_serial && !params.use_threads && !params.use_openmp && - !params.use_cuda) { + if (!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda) { print_options(std::cout, argv[0]); return 1; } @@ -351,14 +327,11 @@ int main(int argc, char* argv[]) { return 1; } - const int device_id = 0; - const int num_threads = - params.use_openmp; // Assumption is that use_openmp variable is provided - // as number of threads + const int device_id = 0; + const int num_threads = params.use_openmp; // Assumption is that use_openmp variable is provided + // as number of 
threads - Kokkos::initialize(Kokkos::InitializationSettings() - .set_num_threads(num_threads) - .set_device_id(device_id)); + Kokkos::initialize(Kokkos::InitializationSettings().set_num_threads(num_threads).set_device_id(device_id)); if (params.verbose) { Kokkos::print_configuration(std::cout); diff --git a/example/wiki/blas/abs/abs.cpp b/example/wiki/blas/abs/abs.cpp index c5a1d39e15..a74d4e3555 100644 --- a/example/wiki/blas/abs/abs.cpp +++ b/example/wiki/blas/abs/abs.cpp @@ -29,8 +29,7 @@ int main(int argc, char* argv[]) { double sum = 0.0; Kokkos::parallel_reduce( - "CheckValue", N, - KOKKOS_LAMBDA(const int& i, double& lsum) { lsum += y(i); }, sum); + "CheckValue", N, KOKKOS_LAMBDA(const int& i, double& lsum) { lsum += y(i); }, sum); printf("Sum: %lf Expected: %lf Diff: %e\n", sum, 1.0 * N, sum - 1.0 * N); diff --git a/example/wiki/graph/KokkosGraph_wiki_9pt_stencil.hpp b/example/wiki/graph/KokkosGraph_wiki_9pt_stencil.hpp index 57f109f652..2137bf09e5 100644 --- a/example/wiki/graph/KokkosGraph_wiki_9pt_stencil.hpp +++ b/example/wiki/graph/KokkosGraph_wiki_9pt_stencil.hpp @@ -33,8 +33,8 @@ using DeviceSpace = typename ExecSpace::memory_space; using Kokkos::HostSpace; using RowmapType = Kokkos::View; using ColindsType = Kokkos::View; -using Handle = KokkosKernels::Experimental::KokkosKernelsHandle< - Offset, Ordinal, default_scalar, ExecSpace, DeviceSpace, DeviceSpace>; +using Handle = KokkosKernels::Experimental::KokkosKernelsHandle; namespace GraphDemo { Ordinal gridX = 15; @@ -124,10 +124,8 @@ void generate9pt(RowmapType& rowmapDevice, ColindsType& colindsDevice) { Offset numEdges = colinds.size(); // Now that the graph is formed, copy rowmap and colinds to Kokkos::Views in // device memory The nonowning host views just alias the std::vectors. 
- Kokkos::View> - rowmapHost(rowmap.data(), numVertices + 1); - Kokkos::View> - colindsHost(colinds.data(), numEdges); + Kokkos::View> rowmapHost(rowmap.data(), numVertices + 1); + Kokkos::View> colindsHost(colinds.data(), numEdges); // Allocate owning views on device with the correct size. rowmapDevice = RowmapType("Rowmap", numVertices + 1); colindsDevice = ColindsType("Colinds", numEdges); diff --git a/example/wiki/graph/KokkosGraph_wiki_coarsening.cpp b/example/wiki/graph/KokkosGraph_wiki_coarsening.cpp index 027ee0a057..409564a334 100644 --- a/example/wiki/graph/KokkosGraph_wiki_coarsening.cpp +++ b/example/wiki/graph/KokkosGraph_wiki_coarsening.cpp @@ -29,9 +29,8 @@ int main() { { std::cout << "Coarsened vertex labels:\n"; Ordinal numClusters = 0; - auto labels = - KokkosGraph::graph_mis2_aggregate( - rowmapDevice, colindsDevice, numClusters); + auto labels = KokkosGraph::graph_mis2_aggregate(rowmapDevice, colindsDevice, + numClusters); // coarsening labels can be printed in the same way as colors GraphDemo::printColoring(labels, numClusters); putchar('\n'); diff --git a/example/wiki/graph/KokkosGraph_wiki_coloring.cpp b/example/wiki/graph/KokkosGraph_wiki_coloring.cpp index ac62861e12..8ff0f6941d 100644 --- a/example/wiki/graph/KokkosGraph_wiki_coloring.cpp +++ b/example/wiki/graph/KokkosGraph_wiki_coloring.cpp @@ -42,10 +42,9 @@ int main() { // Use the default algorithm (chosen based on ExecSpace) handle.create_graph_coloring_handle(KokkosGraph::COLORING_DEFAULT); // Run coloring (graph is square and symmetric) - KokkosGraph::Experimental::graph_color(&handle, numVertices, numVertices, - rowmapDevice, colindsDevice); + KokkosGraph::Experimental::graph_color(&handle, numVertices, numVertices, rowmapDevice, colindsDevice); // Get the colors array, and the number of colors used from the handle. 
- auto colors = handle.get_graph_coloring_handle()->get_vertex_colors(); + auto colors = handle.get_graph_coloring_handle()->get_vertex_colors(); Ordinal numColors = handle.get_graph_coloring_handle()->get_num_colors(); printf("9-pt stencil: Distance-1 Colors (used %d):\n", (int)numColors); GraphDemo::printColoring(colors, numColors); @@ -57,16 +56,12 @@ int main() { { Handle handle; // Use the default algorithm (chosen based on ExecSpace) - handle.create_distance2_graph_coloring_handle( - KokkosGraph::COLORING_D2_DEFAULT); + handle.create_distance2_graph_coloring_handle(KokkosGraph::COLORING_D2_DEFAULT); // Run coloring - KokkosGraph::Experimental::graph_color_distance2( - &handle, numVertices, rowmapDevice, colindsDevice); + KokkosGraph::Experimental::graph_color_distance2(&handle, numVertices, rowmapDevice, colindsDevice); // Get the colors array, and the number of colors used from the handle. - auto colors = - handle.get_distance2_graph_coloring_handle()->get_vertex_colors(); - Ordinal numColors = - handle.get_distance2_graph_coloring_handle()->get_num_colors(); + auto colors = handle.get_distance2_graph_coloring_handle()->get_vertex_colors(); + Ordinal numColors = handle.get_distance2_graph_coloring_handle()->get_num_colors(); printf("9-pt stencil: Distance-2 Colors (used %d):\n", (int)numColors); GraphDemo::printColoring(colors, numColors); putchar('\n'); diff --git a/example/wiki/graph/KokkosGraph_wiki_mis2.cpp b/example/wiki/graph/KokkosGraph_wiki_mis2.cpp index 773930682f..2ee304d249 100644 --- a/example/wiki/graph/KokkosGraph_wiki_mis2.cpp +++ b/example/wiki/graph/KokkosGraph_wiki_mis2.cpp @@ -29,19 +29,16 @@ int main() { // algorithms { // Run coloring - auto misDevice = - KokkosGraph::graph_d2_mis( - rowmapDevice, colindsDevice, KokkosGraph::MIS2_FAST); - std::cout << "Distance-2 MIS, FAST algorithm: contains " - << misDevice.extent(0) << " out of " << GraphDemo::numVertices - << " vertices.\n"; + auto misDevice = KokkosGraph::graph_d2_mis(rowmapDevice, 
colindsDevice, + KokkosGraph::MIS2_FAST); + std::cout << "Distance-2 MIS, FAST algorithm: contains " << misDevice.extent(0) << " out of " + << GraphDemo::numVertices << " vertices.\n"; GraphDemo::printMIS(misDevice); putchar('\n'); - misDevice = KokkosGraph::graph_d2_mis( - rowmapDevice, colindsDevice, KokkosGraph::MIS2_QUALITY); - std::cout << "Distance-2 MIS, QUALITY algorithm: contains " - << misDevice.extent(0) << " out of " << GraphDemo::numVertices - << " vertices.\n"; + misDevice = KokkosGraph::graph_d2_mis(rowmapDevice, colindsDevice, + KokkosGraph::MIS2_QUALITY); + std::cout << "Distance-2 MIS, QUALITY algorithm: contains " << misDevice.extent(0) << " out of " + << GraphDemo::numVertices << " vertices.\n"; GraphDemo::printMIS(misDevice); putchar('\n'); } diff --git a/example/wiki/graph/KokkosGraph_wiki_rcm.cpp b/example/wiki/graph/KokkosGraph_wiki_rcm.cpp index d23a7de233..29fdf61312 100644 --- a/example/wiki/graph/KokkosGraph_wiki_rcm.cpp +++ b/example/wiki/graph/KokkosGraph_wiki_rcm.cpp @@ -17,19 +17,14 @@ #include "KokkosGraph_RCM.hpp" template -void printReorderedMatrix(const rowmap_t& rowmapIn, const entries_t& entriesIn, - const labels_t& invPermIn) { +void printReorderedMatrix(const rowmap_t& rowmapIn, const entries_t& entriesIn, const labels_t& invPermIn) { using size_type = typename rowmap_t::non_const_value_type; using lno_t = typename entries_t::non_const_value_type; - auto rowmap = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rowmapIn); - auto entries = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entriesIn); - auto invPerm = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), invPermIn); - lno_t numVerts = rowmap.extent(0) - 1; - decltype(invPerm) perm( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Perm"), numVerts); + auto rowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rowmapIn); + auto entries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entriesIn); + auto invPerm = 
Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), invPermIn); + lno_t numVerts = rowmap.extent(0) - 1; + decltype(invPerm) perm(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Perm"), numVerts); for (lno_t i = 0; i < numVerts; i++) perm(invPerm(i)) = i; std::vector neighbors; for (lno_t i = 0; i < numVerts; i++) { @@ -68,9 +63,7 @@ int main() { // Step 2: Run RCM and print the reordered matrix { auto rcmDevice = - KokkosGraph::Experimental::graph_rcm(rowmapDevice, - colindsDevice); + KokkosGraph::Experimental::graph_rcm(rowmapDevice, colindsDevice); std::cout << "Graph reordered by reverse Cuthill-McKee:\n"; printReorderedMatrix(rowmapDevice, colindsDevice, rcmDevice); } diff --git a/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix.cpp b/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix.cpp index eacf134f89..49721e595e 100644 --- a/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix.cpp +++ b/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix.cpp @@ -31,19 +31,14 @@ using Layout = default_layout; int main() { Kokkos::initialize(); - using device_type = typename Kokkos::Device< - Kokkos::DefaultExecutionSpace, - typename Kokkos::DefaultExecutionSpace::memory_space>; - using matrix_type = - typename KokkosSparse::CrsMatrix; - using b_matrix_type = - typename KokkosSparse::Experimental::BsrMatrix; - using graph_type = typename matrix_type::staticcrsgraph_type; - using row_map_type = typename graph_type::row_map_type; - using entries_type = typename graph_type::entries_type; - using values_type = typename matrix_type::values_type; + using device_type = + typename Kokkos::Device; + using matrix_type = typename KokkosSparse::CrsMatrix; + using b_matrix_type = typename KokkosSparse::Experimental::BsrMatrix; + using graph_type = typename matrix_type::staticcrsgraph_type; + using row_map_type = typename graph_type::row_map_type; + using entries_type = typename graph_type::entries_type; + using values_type = typename matrix_type::values_type; const Scalar SC_ONE = 
Kokkos::ArithTraits::one(); @@ -70,8 +65,7 @@ int main() { { // Build the row pointers and store numNNZ - typename row_map_type::HostMirror row_map_h = - Kokkos::create_mirror_view(row_map); + typename row_map_type::HostMirror row_map_h = Kokkos::create_mirror_view(row_map); for (Ordinal rowIdx = 1; rowIdx < numRows + 1; ++rowIdx) { if ((rowIdx == 1) || (rowIdx == numRows)) { row_map_h(rowIdx) = row_map_h(rowIdx - 1) + 2; @@ -82,15 +76,13 @@ int main() { Kokkos::deep_copy(row_map, row_map_h); if (row_map_h(numRows) != numNNZ) { std::ostringstream error_msg; - error_msg << "error: row_map(numRows) != numNNZ, row_map_h(numRows)=" - << row_map_h(numRows) << ", numNNZ=" << numNNZ; + error_msg << "error: row_map(numRows) != numNNZ, row_map_h(numRows)=" << row_map_h(numRows) + << ", numNNZ=" << numNNZ; throw std::runtime_error(error_msg.str()); } - typename entries_type::HostMirror entries_h = - Kokkos::create_mirror_view(entries); - typename values_type::HostMirror values_h = - Kokkos::create_mirror_view(values); + typename entries_type::HostMirror entries_h = Kokkos::create_mirror_view(entries); + typename values_type::HostMirror values_h = Kokkos::create_mirror_view(values); for (Ordinal rowIdx = 0; rowIdx < numRows; ++rowIdx) { if (rowIdx == 0) { entries_h(row_map_h(rowIdx)) = rowIdx; diff --git a/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix_2.cpp b/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix_2.cpp index 7ff56ff14a..527b0d56c4 100644 --- a/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix_2.cpp +++ b/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix_2.cpp @@ -43,8 +43,7 @@ struct bsr_fill { block_tmp(1, 0) = 0.0; block_tmp(1, 1) = 1.0; } else if (rowIdx == bsr_mat.numRows() - 1) { // Right boundary condition - auto block_tmp = - bsr_mat.unmanaged_block(bsr_mat.graph.row_map(rowIdx) + 1); + auto block_tmp = bsr_mat.unmanaged_block(bsr_mat.graph.row_map(rowIdx) + 1); block_tmp(0, 0) = 1.0; block_tmp(1, 1) = 1.0; } else { @@ -54,13 +53,13 @@ struct bsr_fill { 
block_tmp(1, 0) = 0.0; block_tmp(1, 1) = -1.0; - block_tmp = bsr_mat.unmanaged_block(bsr_mat.graph.row_map(rowIdx) + 1); + block_tmp = bsr_mat.unmanaged_block(bsr_mat.graph.row_map(rowIdx) + 1); block_tmp(0, 0) = 2.0; block_tmp(0, 1) = 0.0; block_tmp(1, 0) = 0.0; block_tmp(1, 1) = 2.0; - block_tmp = bsr_mat.unmanaged_block(bsr_mat.graph.row_map(rowIdx) + 2); + block_tmp = bsr_mat.unmanaged_block(bsr_mat.graph.row_map(rowIdx) + 2); block_tmp(0, 0) = -1.0; block_tmp(0, 1) = 1.0 / 2.0; block_tmp(1, 0) = 0.0; @@ -89,8 +88,7 @@ struct diagonal_extractor { KOKKOS_INLINE_FUNCTION void operator()(const int& rowIdx) const { - for (Offset entryIdx = row_map(rowIdx); entryIdx < row_map(rowIdx + 1); - ++entryIdx) { + for (Offset entryIdx = row_map(rowIdx); entryIdx < row_map(rowIdx + 1); ++entryIdx) { if (entries(entryIdx) == rowIdx) { bsr_block_type bsr_diag_block = bsr_mat.unmanaged_block(entryIdx); for (int i = 0; i < bsr_mat.blockDim(); ++i) { @@ -104,15 +102,12 @@ struct diagonal_extractor { }; int main(int argc, char* argv[]) { - using device_type = typename Kokkos::Device< - Kokkos::DefaultExecutionSpace, - typename Kokkos::DefaultExecutionSpace::memory_space>; - using bsrmatrix_type = - typename KokkosSparse::Experimental::BsrMatrix; - using graph_type = typename bsrmatrix_type::staticcrsgraph_type; - using row_map_type = typename graph_type::row_map_type; - using entries_type = typename graph_type::entries_type; + using device_type = + typename Kokkos::Device; + using bsrmatrix_type = typename KokkosSparse::Experimental::BsrMatrix; + using graph_type = typename bsrmatrix_type::staticcrsgraph_type; + using row_map_type = typename graph_type::row_map_type; + using entries_type = typename graph_type::entries_type; Kokkos::initialize(argc, argv); { @@ -143,16 +138,12 @@ int main(int argc, char* argv[]) { bsrmatrix_type bsr_mat; { - typename row_map_type::non_const_type row_map( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "row pointers"), - numRows + 1); - typename 
entries_type::non_const_type entries( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "column indices"), - numNNZ); - typename row_map_type::HostMirror row_map_h = - Kokkos::create_mirror_view(row_map); - typename entries_type::HostMirror entries_h = - Kokkos::create_mirror_view(entries); + typename row_map_type::non_const_type row_map(Kokkos::view_alloc(Kokkos::WithoutInitializing, "row pointers"), + numRows + 1); + typename entries_type::non_const_type entries(Kokkos::view_alloc(Kokkos::WithoutInitializing, "column indices"), + numNNZ); + typename row_map_type::HostMirror row_map_h = Kokkos::create_mirror_view(row_map); + typename entries_type::HostMirror entries_h = Kokkos::create_mirror_view(entries); // First Step: build the CrsGraph { @@ -181,8 +172,8 @@ int main(int argc, char* argv[]) { if (row_map_h(numRows) != numNNZ) { std::ostringstream error_msg; - error_msg << "error: row_map(numRows) != numNNZ, row_map_h(numRows)=" - << row_map_h(numRows) << ", numNNZ=" << numNNZ; + error_msg << "error: row_map(numRows) != numNNZ, row_map_h(numRows)=" << row_map_h(numRows) + << ", numNNZ=" << numNNZ; throw std::runtime_error(error_msg.str()); } Kokkos::deep_copy(row_map, row_map_h); @@ -204,16 +195,13 @@ int main(int argc, char* argv[]) { std::cout << " "; } std::cout << "*"; - for (Offset entryIdx = row_map_h(rowIdx); - entryIdx < row_map_h(rowIdx + 1) - 1; ++entryIdx) { - for (int colIdx = entries_h(entryIdx) + 1; - colIdx < entries_h(entryIdx + 1); ++colIdx) { + for (Offset entryIdx = row_map_h(rowIdx); entryIdx < row_map_h(rowIdx + 1) - 1; ++entryIdx) { + for (int colIdx = entries_h(entryIdx) + 1; colIdx < entries_h(entryIdx + 1); ++colIdx) { std::cout << " "; } std::cout << "*"; } - for (int colIdx = entries_h(row_map_h(rowIdx + 1) - 1) + 1; - colIdx < numRows; ++colIdx) { + for (int colIdx = entries_h(row_map_h(rowIdx + 1) - 1) + 1; colIdx < numRows; ++colIdx) { std::cout << " "; } std::cout << "]" << std::endl; @@ -221,24 +209,17 @@ int main(int argc, char* 
argv[]) { } // Extract diagonal block and store them in a rank-3 view - using diag_blocks_type = - Kokkos::View; - diag_blocks_type diag_blocks("diagonal blocks", numRows, blockSize, - blockSize); + using diag_blocks_type = Kokkos::View; + diag_blocks_type diag_blocks("diagonal blocks", numRows, blockSize, blockSize); diagonal_extractor myFunc(bsr_mat, diag_blocks); Kokkos::parallel_for(Kokkos::RangePolicy(0, numRows), myFunc); - auto diag_blocks_h = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, diag_blocks); + auto diag_blocks_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, diag_blocks); std::cout << "\nBsrMatrix diagonal blocks: " << std::endl; for (int blockId = 0; blockId < diag_blocks_h.extent_int(0); ++blockId) { - std::cout << " [" << diag_blocks_h(blockId, 0, 0) << ", " - << diag_blocks_h(blockId, 0, 1) << "]" << std::endl; - std::cout << " [" << diag_blocks_h(blockId, 1, 0) << ", " - << diag_blocks_h(blockId, 1, 1) << "]\n" - << std::endl; + std::cout << " [" << diag_blocks_h(blockId, 0, 0) << ", " << diag_blocks_h(blockId, 0, 1) << "]" << std::endl; + std::cout << " [" << diag_blocks_h(blockId, 1, 0) << ", " << diag_blocks_h(blockId, 1, 1) << "]\n" << std::endl; } } Kokkos::finalize(); diff --git a/example/wiki/sparse/KokkosSparse_wiki_crsmatrix.cpp b/example/wiki/sparse/KokkosSparse_wiki_crsmatrix.cpp index c8d6c805c1..21257d8034 100644 --- a/example/wiki/sparse/KokkosSparse_wiki_crsmatrix.cpp +++ b/example/wiki/sparse/KokkosSparse_wiki_crsmatrix.cpp @@ -29,12 +29,9 @@ using Layout = default_layout; int main() { Kokkos::initialize(); - using device_type = typename Kokkos::Device< - Kokkos::DefaultExecutionSpace, - typename Kokkos::DefaultExecutionSpace::memory_space>; - using matrix_type = - typename KokkosSparse::CrsMatrix; + using device_type = + typename Kokkos::Device; + using matrix_type = typename KokkosSparse::CrsMatrix; using graph_type = typename matrix_type::staticcrsgraph_type; using row_map_type = typename 
graph_type::row_map_type; using entries_type = typename graph_type::entries_type; @@ -52,8 +49,7 @@ int main() { { // Build the row pointers and store numNNZ - typename row_map_type::HostMirror row_map_h = - Kokkos::create_mirror_view(row_map); + typename row_map_type::HostMirror row_map_h = Kokkos::create_mirror_view(row_map); for (Ordinal rowIdx = 1; rowIdx < numRows + 1; ++rowIdx) { if ((rowIdx == 1) || (rowIdx == numRows)) { row_map_h(rowIdx) = row_map_h(rowIdx - 1) + 2; @@ -64,15 +60,13 @@ int main() { Kokkos::deep_copy(row_map, row_map_h); if (row_map_h(numRows) != numNNZ) { std::ostringstream error_msg; - error_msg << "error: row_map(numRows) != numNNZ, row_map_h(numRows)=" - << row_map_h(numRows) << ", numNNZ=" << numNNZ; + error_msg << "error: row_map(numRows) != numNNZ, row_map_h(numRows)=" << row_map_h(numRows) + << ", numNNZ=" << numNNZ; throw std::runtime_error(error_msg.str()); } - typename entries_type::HostMirror entries_h = - Kokkos::create_mirror_view(entries); - typename values_type::HostMirror values_h = - Kokkos::create_mirror_view(values); + typename entries_type::HostMirror entries_h = Kokkos::create_mirror_view(entries); + typename values_type::HostMirror values_h = Kokkos::create_mirror_view(values); for (Ordinal rowIdx = 0; rowIdx < numRows; ++rowIdx) { if (rowIdx == 0) { entries_h(row_map_h(rowIdx)) = rowIdx; diff --git a/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp b/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp index 3dd8bfd5e5..31ccea3b0a 100644 --- a/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp +++ b/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp @@ -37,10 +37,10 @@ int main() { using ExecSpace = Kokkos::DefaultExecutionSpace; using MemSpace = typename ExecSpace::memory_space; using Device = Kokkos::Device; - using Handle = KokkosKernels::Experimental::KokkosKernelsHandle< - Offset, Ordinal, default_scalar, ExecSpace, MemSpace, MemSpace>; - using Matrix = KokkosSparse::CrsMatrix; - using Vector = 
typename Matrix::values_type; + using Handle = + KokkosKernels::Experimental::KokkosKernelsHandle; + using Matrix = KokkosSparse::CrsMatrix; + using Vector = typename Matrix::values_type; constexpr Ordinal numRows = 10000; const Scalar one = Kokkos::ArithTraits::one(); const Mag magOne = Kokkos::ArithTraits::one(); @@ -52,32 +52,28 @@ int main() { // on which Gauss-Seidel should converge. Get approx. 20 entries per row // Diagonals are 2x the absolute sum of all other entries. Offset nnz = numRows * 20; - Matrix A = - KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< - Matrix>(numRows, numRows, nnz, 2, 100, 1.05 * one); - std::cout << "Generated a matrix with " << numRows << " rows/cols, and " - << nnz << " entries.\n"; + Matrix A = KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix(numRows, numRows, nnz, 2, 100, + 1.05 * one); + std::cout << "Generated a matrix with " << numRows << " rows/cols, and " << nnz << " entries.\n"; // Create a kernel handle, then a Gauss-Seidel handle with the default // algorithm Handle handle; handle.create_gs_handle(KokkosSparse::GS_DEFAULT); // Set up Gauss-Seidel for the graph (matrix sparsity pattern) - KokkosSparse::Experimental::gauss_seidel_symbolic( - &handle, numRows, numRows, A.graph.row_map, A.graph.entries, false); + KokkosSparse::Experimental::gauss_seidel_symbolic(&handle, numRows, numRows, A.graph.row_map, A.graph.entries, + false); // Set up Gauss-Seidel for the matrix values (numeric) // Another matrix with the same sparsity pattern could re-use the handle and // symbolic phase, and only call numeric. - KokkosSparse::Experimental::gauss_seidel_numeric( - &handle, numRows, numRows, A.graph.row_map, A.graph.entries, A.values, - false); + KokkosSparse::Experimental::gauss_seidel_numeric(&handle, numRows, numRows, A.graph.row_map, A.graph.entries, + A.values, false); // Now, preconditioner is ready to use. Set up an unknown vector // (uninitialized) and randomized right-hand-side vector. 
Vector x(Kokkos::view_alloc(Kokkos::WithoutInitializing, "x"), numRows); Vector b(Kokkos::view_alloc(Kokkos::WithoutInitializing, "b"), numRows); Vector res(Kokkos::view_alloc(Kokkos::WithoutInitializing, "res"), numRows); auto bHost = Kokkos::create_mirror_view(b); - for (Ordinal i = 0; i < numRows; i++) - bHost(i) = 3 * ((one * rand()) / RAND_MAX); + for (Ordinal i = 0; i < numRows; i++) bHost(i) = 3 * ((one * rand()) / RAND_MAX); Kokkos::deep_copy(b, bHost); // Measure initial residual norm ||Ax - b||, where x is 0 Mag initialRes = KokkosBlas::nrm2(b); @@ -92,8 +88,7 @@ int main() { // * that b has changed since the previous apply (since there was no // previous apply) KokkosSparse::Experimental::forward_sweep_gauss_seidel_apply( - &handle, numRows, numRows, A.graph.row_map, A.graph.entries, A.values, - x, b, firstIter, firstIter, one, 1); + &handle, numRows, numRows, A.graph.row_map, A.graph.entries, A.values, x, b, firstIter, firstIter, one, 1); firstIter = false; // Now, compute the new residual norm using SPMV Kokkos::deep_copy(res, b); @@ -102,8 +97,7 @@ int main() { // Recompute the scaled norm scaledResNorm = KokkosBlas::nrm2(res) / initialRes; numIters++; - std::cout << "Iteration " << numIters - << " scaled residual norm: " << scaledResNorm << '\n'; + std::cout << "Iteration " << numIters << " scaled residual norm: " << scaledResNorm << '\n'; } std::cout << "SUCCESS: converged in " << numIters << " iterations.\n"; } diff --git a/example/wiki/sparse/KokkosSparse_wiki_spadd.cpp b/example/wiki/sparse/KokkosSparse_wiki_spadd.cpp index 841e3b9eb3..c9edd7bc0c 100644 --- a/example/wiki/sparse/KokkosSparse_wiki_spadd.cpp +++ b/example/wiki/sparse/KokkosSparse_wiki_spadd.cpp @@ -28,14 +28,11 @@ using Layout = default_layout; int main() { Kokkos::initialize(); - using device_type = typename Kokkos::Device< - Kokkos::DefaultExecutionSpace, - typename Kokkos::DefaultExecutionSpace::memory_space>; + using device_type = + typename Kokkos::Device; using 
execution_space = typename device_type::execution_space; using memory_space = typename device_type::memory_space; - using matrix_type = - typename KokkosSparse::CrsMatrix; + using matrix_type = typename KokkosSparse::CrsMatrix; int return_value = 0; @@ -47,8 +44,7 @@ int main() { // In each row the first entry is the number of grid point in // that direction, the second and third entries are used to apply // BCs in that direction. - Kokkos::View mat_structure( - "Matrix Structure", 2); + Kokkos::View mat_structure("Matrix Structure", 2); mat_structure(0, 0) = 10; // Request 10 grid point in 'x' direction mat_structure(0, 1) = 1; // Add BC to the left mat_structure(0, 2) = 1; // Add BC to the right @@ -56,15 +52,13 @@ int main() { mat_structure(1, 1) = 1; // Add BC to the bottom mat_structure(1, 2) = 1; // Add BC to the top - matrix_type A = - Test::generate_structured_matrix2D("FD", mat_structure); - matrix_type B = - Test::generate_structured_matrix2D("FE", mat_structure); + matrix_type A = Test::generate_structured_matrix2D("FD", mat_structure); + matrix_type B = Test::generate_structured_matrix2D("FE", mat_structure); matrix_type C; // Create KokkosKernelHandle - using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< - Offset, Ordinal, Scalar, execution_space, memory_space, memory_space>; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle; KernelHandle kh; kh.create_spadd_handle(false); diff --git a/example/wiki/sparse/KokkosSparse_wiki_spgemm.cpp b/example/wiki/sparse/KokkosSparse_wiki_spgemm.cpp index 56a628ffd5..2b3ccd13d2 100644 --- a/example/wiki/sparse/KokkosSparse_wiki_spgemm.cpp +++ b/example/wiki/sparse/KokkosSparse_wiki_spgemm.cpp @@ -28,12 +28,9 @@ using Layout = default_layout; int main() { Kokkos::initialize(); - using device_type = typename Kokkos::Device< - Kokkos::DefaultExecutionSpace, - typename Kokkos::DefaultExecutionSpace::memory_space>; - using matrix_type = - typename KokkosSparse::CrsMatrix; + using 
device_type = + typename Kokkos::Device; + using matrix_type = typename KokkosSparse::CrsMatrix; int return_value = 0; @@ -45,8 +42,7 @@ int main() { // In each row the first entry is the number of grid point in // that direction, the second and third entries are used to apply // BCs in that direction. - Kokkos::View mat_structure( - "Matrix Structure", 2); + Kokkos::View mat_structure("Matrix Structure", 2); mat_structure(0, 0) = 10; // Request 10 grid point in 'x' direction mat_structure(0, 1) = 1; // Add BC to the left mat_structure(0, 2) = 1; // Add BC to the right @@ -54,15 +50,13 @@ int main() { mat_structure(1, 1) = 1; // Add BC to the bottom mat_structure(1, 2) = 1; // Add BC to the top - matrix_type A = - Test::generate_structured_matrix2D("FD", mat_structure); - matrix_type B = - Test::generate_structured_matrix2D("FE", mat_structure); + matrix_type A = Test::generate_structured_matrix2D("FD", mat_structure); + matrix_type B = Test::generate_structured_matrix2D("FE", mat_structure); matrix_type C = KokkosSparse::spgemm(A, false, B, false); - std::cout << "Ran spgemm: product C is " << C.numRows() << 'x' - << C.numCols() << " and has " << C.nnz() << " nonzeros.\n"; + std::cout << "Ran spgemm: product C is " << C.numRows() << 'x' << C.numCols() << " and has " << C.nnz() + << " nonzeros.\n"; } Kokkos::finalize(); diff --git a/example/wiki/sparse/KokkosSparse_wiki_spmv.cpp b/example/wiki/sparse/KokkosSparse_wiki_spmv.cpp index 8b876e5bfc..5778684a8a 100644 --- a/example/wiki/sparse/KokkosSparse_wiki_spmv.cpp +++ b/example/wiki/sparse/KokkosSparse_wiki_spmv.cpp @@ -44,12 +44,9 @@ struct check_spmv_functor { int main() { Kokkos::initialize(); - using device_type = typename Kokkos::Device< - Kokkos::DefaultExecutionSpace, - typename Kokkos::DefaultExecutionSpace::memory_space>; - using matrix_type = - typename KokkosSparse::CrsMatrix; + using device_type = + typename Kokkos::Device; + using matrix_type = typename KokkosSparse::CrsMatrix; using values_type = 
typename matrix_type::values_type; int return_value = 0; @@ -66,8 +63,7 @@ int main() { // BCs in that direction, BC=0 means Neumann BC is applied, // BC=1 means Dirichlet BC is applied by zeroing out the row and putting // one on the diagonal. - Kokkos::View mat_structure( - "Matrix Structure", 2); + Kokkos::View mat_structure("Matrix Structure", 2); mat_structure(0, 0) = 10; // Request 10 grid point in 'x' direction mat_structure(0, 1) = 0; // Add BC to the left mat_structure(0, 2) = 0; // Add BC to the right @@ -75,8 +71,7 @@ int main() { mat_structure(1, 1) = 0; // Add BC to the bottom mat_structure(1, 2) = 0; // Add BC to the top - matrix_type myMatrix = - Test::generate_structured_matrix2D("FD", mat_structure); + matrix_type myMatrix = Test::generate_structured_matrix2D("FD", mat_structure); const Ordinal numRows = myMatrix.numRows(); @@ -92,15 +87,12 @@ int main() { Ordinal count_errors = 0; check_spmv_functor check_spmv(y); - Kokkos::parallel_reduce(Kokkos::RangePolicy(0, numRows), - check_spmv, count_errors); + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, numRows), check_spmv, count_errors); if (count_errors > 0) { return_value = 1; - std::cout << "Found " << count_errors << " errors in y vector!" - << std::endl; + std::cout << "Found " << count_errors << " errors in y vector!" 
<< std::endl; } else { - std::cout << "spmv was performed correctly: y = beta*y + alpha*A*x" - << std::endl; + std::cout << "spmv was performed correctly: y = beta*y + alpha*A*x" << std::endl; } } diff --git a/graph/impl/KokkosGraph_BFS_impl.hpp b/graph/impl/KokkosGraph_BFS_impl.hpp index 9ea5d63e07..34cb3c9179 100644 --- a/graph/impl/KokkosGraph_BFS_impl.hpp +++ b/graph/impl/KokkosGraph_BFS_impl.hpp @@ -39,10 +39,8 @@ struct SerialRCM { SerialRCM(const rowmap_t& rowmap_, const entries_t& entries_) : numVerts(std::max(rowmap_.extent_int(0), 1) - 1), - rowmap(Kokkos::view_alloc(Kokkos::WithoutInitializing, "HostRowmap"), - rowmap_.extent(0)), - entries(Kokkos::view_alloc(Kokkos::WithoutInitializing, "HostEntries"), - entries_.extent(0)) { + rowmap(Kokkos::view_alloc(Kokkos::WithoutInitializing, "HostRowmap"), rowmap_.extent(0)), + entries(Kokkos::view_alloc(Kokkos::WithoutInitializing, "HostEntries"), entries_.extent(0)) { Kokkos::deep_copy(rowmap, rowmap_); Kokkos::deep_copy(entries, entries_); } @@ -51,11 +49,8 @@ struct SerialRCM { // Given a label L, labelReverse - L gives the reversed label (as in reverse // Cuthill McKee) lno_t labelReverse = numVerts - 1; - host_lno_view_t q(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Queue"), - numVerts); - host_lno_view_t label( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Permutation"), - numVerts); + host_lno_view_t q(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Queue"), numVerts); + host_lno_view_t label(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Permutation"), numVerts); for (lno_t i = 0; i < numVerts; i++) label(i) = -1; lno_t qhead = 0; lno_t qtail = 0; @@ -63,16 +58,12 @@ struct SerialRCM { // (heuristic for best to worst starting vertex for RCM). // If the graph has multiple connected components, restart at the first // unlabeled vertex in this list. 
- host_lno_view_t allVertices( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "allVertices"), - numVerts); + host_lno_view_t allVertices(Kokkos::view_alloc(Kokkos::WithoutInitializing, "allVertices"), numVerts); for (lno_t i = 0; i < numVerts; i++) allVertices(i) = i; - std::sort(allVertices.data(), allVertices.data() + numVerts, - [&](lno_t n1, lno_t n2) -> bool { - // return true if n1 has a lower degree than n2 - return (rowmap(n1 + 1) - rowmap(n1)) < - (rowmap(n2 + 1) - rowmap(n2)); - }); + std::sort(allVertices.data(), allVertices.data() + numVerts, [&](lno_t n1, lno_t n2) -> bool { + // return true if n1 has a lower degree than n2 + return (rowmap(n1 + 1) - rowmap(n1)) < (rowmap(n2 + 1) - rowmap(n2)); + }); lno_t allVerticesIter = 0; // Start RCM with the first vertex in allVertices lno_t start = allVertices(allVerticesIter++); @@ -90,12 +81,10 @@ struct SerialRCM { neighbors.push_back(nei); } } - std::sort(neighbors.begin(), neighbors.end(), - [&](lno_t n1, lno_t n2) -> bool { - // return true if n1 has a lower degree than n2 - return (rowmap(n1 + 1) - rowmap(n1)) < - (rowmap(n2 + 1) - rowmap(n2)); - }); + std::sort(neighbors.begin(), neighbors.end(), [&](lno_t n1, lno_t n2) -> bool { + // return true if n1 has a lower degree than n2 + return (rowmap(n1 + 1) - rowmap(n1)) < (rowmap(n2 + 1) - rowmap(n2)); + }); // label and enqueue all unlabeled neighbors for (lno_t nei : neighbors) { label(nei) = labelReverse - qtail; @@ -112,9 +101,7 @@ struct SerialRCM { q(qtail++) = restart; } } - lno_view_t labelOut( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "RCM Permutation"), - numVerts); + lno_view_t labelOut(Kokkos::view_alloc(Kokkos::WithoutInitializing, "RCM Permutation"), numVerts); Kokkos::deep_copy(labelOut, label); return labelOut; } diff --git a/graph/impl/KokkosGraph_Distance1Color_impl.hpp b/graph/impl/KokkosGraph_Distance1Color_impl.hpp index 6bd1c022ae..2abc5c76e4 100644 --- a/graph/impl/KokkosGraph_Distance1Color_impl.hpp +++ 
b/graph/impl/KokkosGraph_Distance1Color_impl.hpp @@ -36,8 +36,7 @@ namespace Impl { * General aim is to find the minimum number of colors, minimum number of * independent sets. */ -template +template class GraphColor { public: typedef lno_row_view_t_ in_lno_row_view_t; @@ -49,19 +48,15 @@ class GraphColor { typedef typename HandleType::size_type size_type; typedef typename HandleType::nnz_lno_t nnz_lno_t; - typedef typename in_lno_row_view_t::HostMirror - row_lno_host_view_t; // Host view type + typedef typename in_lno_row_view_t::HostMirror row_lno_host_view_t; // Host view type - typedef typename in_lno_nnz_view_t::HostMirror - nnz_lno_host_view_t; // Host view type + typedef typename in_lno_nnz_view_t::HostMirror nnz_lno_host_view_t; // Host view type - typedef typename HandleType::color_host_view_t - color_host_view_t; // Host view type + typedef typename HandleType::color_host_view_t color_host_view_t; // Host view type typedef typename HandleType::HandleExecSpace MyExecSpace; typedef typename HandleType::HandleTempMemorySpace MyTempMemorySpace; - typedef - typename HandleType::HandlePersistentMemorySpace MyPersistentMemorySpace; + typedef typename HandleType::HandlePersistentMemorySpace MyPersistentMemorySpace; typedef typename HandleType::const_size_type const_size_type; typedef typename lno_row_view_t_::const_type const_lno_row_view_t; @@ -70,8 +65,8 @@ class GraphColor { typedef typename lno_nnz_view_t_::non_const_type non_const_lno_nnz_view_t; protected: - nnz_lno_t nv; //# vertices - size_type ne; //# edges + nnz_lno_t nv; // # vertices + size_type ne; // # edges const_lno_row_view_t xadj; // rowmap const_lno_nnz_view_t adj; // entries const_lno_nnz_view_t kok_src, kok_dst; // Edge list storage of the graph @@ -87,25 +82,13 @@ class GraphColor { * \param coloring_handle: GraphColoringHandle object that holds the * specification about the graph coloring, including parameters. 
*/ - GraphColor(nnz_lno_t nv_, size_type ne_, const_lno_row_view_t row_map, - const_lno_nnz_view_t entries, HandleType *coloring_handle) - : nv(nv_), - ne(ne_), - xadj(row_map), - adj(entries), - kok_src(), - kok_dst(), - cp(coloring_handle) { - static_assert( - std::is_same< - size_type, - typename const_lno_row_view_t::non_const_value_type>::value, - "Row map element type does not match handle's size_type."); - static_assert( - std::is_same< - nnz_lno_t, - typename const_lno_nnz_view_t::non_const_value_type>::value, - "Entries element type does not match handle's nnz_lno_t."); + GraphColor(nnz_lno_t nv_, size_type ne_, const_lno_row_view_t row_map, const_lno_nnz_view_t entries, + HandleType *coloring_handle) + : nv(nv_), ne(ne_), xadj(row_map), adj(entries), kok_src(), kok_dst(), cp(coloring_handle) { + static_assert(std::is_same::value, + "Row map element type does not match handle's size_type."); + static_assert(std::is_same::value, + "Entries element type does not match handle's nnz_lno_t."); } /** \brief GraphColor destructor. @@ -125,11 +108,9 @@ class GraphColor { virtual void color_graph(color_view_t d_colors, int &num_phases) { num_phases = 1; - color_host_view_t colors = Kokkos::create_mirror_view(d_colors); - typename const_lno_row_view_t::HostMirror h_xadj = - Kokkos::create_mirror_view(this->xadj); - typename const_lno_nnz_view_t::HostMirror h_adj = - Kokkos::create_mirror_view(this->adj); + color_host_view_t colors = Kokkos::create_mirror_view(d_colors); + typename const_lno_row_view_t::HostMirror h_xadj = Kokkos::create_mirror_view(this->xadj); + typename const_lno_nnz_view_t::HostMirror h_adj = Kokkos::create_mirror_view(this->adj); // typename nnz_lno_host_view_t::HostMirror::HostMirror::HostMirror h_adj = // tmp; @@ -185,10 +166,8 @@ class GraphColor { * based algorithms. VBCS: Speculative parallel vertex based using color set * implementation. 
*/ -template -class GraphColor_VB - : public GraphColor { +template +class GraphColor_VB : public GraphColor { public: typedef long long int ban_type; @@ -202,32 +181,24 @@ class GraphColor_VB typedef typename HandleType::nnz_lno_t nnz_lno_t; typedef typename HandleType::color_t color_t; - typedef typename HandleType::color_host_view_t - color_host_view_t; // Host view type + typedef typename HandleType::color_host_view_t color_host_view_t; // Host view type typedef typename HandleType::HandleExecSpace MyExecSpace; typedef typename HandleType::HandleTempMemorySpace MyTempMemorySpace; - typedef - typename HandleType::HandlePersistentMemorySpace MyPersistentMemorySpace; + typedef typename HandleType::HandlePersistentMemorySpace MyPersistentMemorySpace; - typedef typename Kokkos::View - single_dim_index_view_type; + typedef typename Kokkos::View single_dim_index_view_type; // typedef typename Kokkos::View // um_array_type; - typedef typename single_dim_index_view_type::HostMirror - single_dim_index_host_view_type; // Host view type + typedef typename single_dim_index_view_type::HostMirror single_dim_index_host_view_type; // Host view type typedef Kokkos::RangePolicy my_exec_space; - typedef typename HandleType::size_type_temp_work_view_t - size_type_temp_work_view_t; - typedef typename HandleType::size_type_persistent_work_view_t - size_type_persistent_work_view_t; + typedef typename HandleType::size_type_temp_work_view_t size_type_temp_work_view_t; + typedef typename HandleType::size_type_persistent_work_view_t size_type_persistent_work_view_t; - typedef - typename HandleType::nnz_lno_temp_work_view_t nnz_lno_temp_work_view_t; - typedef typename HandleType::nnz_lno_persistent_work_view_t - nnz_lno_persistent_work_view_t; + typedef typename HandleType::nnz_lno_temp_work_view_t nnz_lno_temp_work_view_t; + typedef typename HandleType::nnz_lno_persistent_work_view_t nnz_lno_persistent_work_view_t; typedef typename in_lno_row_view_t::const_type const_lno_row_view_t; @@ 
-240,21 +211,21 @@ class GraphColor_VB bool _serialConflictResolution; // if true use serial conflict resolution bool _ticToc; // if true print info in each step - ConflictList _conflict_scheme; // Enum: COLORING_NOCONFLICT, COLORING_ATOMIC, - // COLORING_PPS + ConflictList _conflict_scheme; // Enum: COLORING_NOCONFLICT, COLORING_ATOMIC, + // COLORING_PPS - double _pps_ratio; // the minimum number of reduction on the size of the - // conflictlist to create a new conflictlist + double _pps_ratio; // the minimum number of reduction on the size of the + // conflictlist to create a new conflictlist nnz_lno_t _min_vertex_cut_off; // minimum number of vertices to reduce the // conflictlist further. - bool _edge_filtering; // if true, edge-filtering is applied by swaps on - // adjacency array. - int _chunkSize; // the size of the minimum work unit assigned to threads. - // Changes the convergence on GPUs - char _use_color_set; // the VB algorithm type. - // 0 for VB: - // 1: for VBCS - // 2: for VBBIT + bool _edge_filtering; // if true, edge-filtering is applied by swaps on + // adjacency array. + int _chunkSize; // the size of the minimum work unit assigned to threads. + // Changes the convergence on GPUs + char _use_color_set; // the VB algorithm type. + // 0 for VB: + // 1: for VBCS + // 2: for VBBIT int _max_num_iterations; @@ -268,17 +239,14 @@ class GraphColor_VB * \param coloring_handle: GraphColoringHandle object that holds the * specification about the graph coloring, including parameters. 
*/ - GraphColor_VB(nnz_lno_t nv_, size_type ne_, const_lno_row_view_t row_map, - const_lno_nnz_view_t entries, HandleType *coloring_handle) - : GraphColor( - nv_, ne_, row_map, entries, coloring_handle), - _serialConflictResolution( - coloring_handle->get_serial_conflict_resolution()), + GraphColor_VB(nnz_lno_t nv_, size_type ne_, const_lno_row_view_t row_map, const_lno_nnz_view_t entries, + HandleType *coloring_handle) + : GraphColor(nv_, ne_, row_map, entries, coloring_handle), + _serialConflictResolution(coloring_handle->get_serial_conflict_resolution()), _ticToc(coloring_handle->get_tictoc()), _conflict_scheme(coloring_handle->get_conflict_list_type()), _pps_ratio(coloring_handle->get_min_reduction_for_conflictlist()), - _min_vertex_cut_off( - coloring_handle->get_min_elements_for_conflictlist()), + _min_vertex_cut_off(coloring_handle->get_min_elements_for_conflictlist()), _edge_filtering(coloring_handle->get_vb_edge_filtering()), _chunkSize(coloring_handle->get_vb_chunk_size()), _use_color_set(), @@ -309,20 +277,15 @@ class GraphColor_VB virtual void color_graph(color_view_type colors, int &num_loops) { if (this->_ticToc) { std::cout << "\tVB params:" << std::endl - << "\tuseConflictList:" << int(this->_conflict_scheme) - << std::endl + << "\tuseConflictList:" << int(this->_conflict_scheme) << std::endl << "\talgorithm:" << (int)this->_use_color_set << std::endl - << "\tserialConflictResolution:" - << (int)this->_serialConflictResolution << std::endl + << "\tserialConflictResolution:" << (int)this->_serialConflictResolution << std::endl << "\tticToc:" << (int)this->_ticToc << std::endl << "\tuse_color_set:" << (int)this->_use_color_set << std::endl << "\tpps_ratio:" << this->_pps_ratio << std::endl - << "\tmin_vertex_cut_off:" << this->_min_vertex_cut_off - << std::endl - << "\tedge_filtering:" << (int)this->_edge_filtering - << std::endl - << "\tmax_num_iterations:" << this->_max_num_iterations - << std::endl + << "\tmin_vertex_cut_off:" << 
this->_min_vertex_cut_off << std::endl + << "\tedge_filtering:" << (int)this->_edge_filtering << std::endl + << "\tmax_num_iterations:" << this->_max_num_iterations << std::endl << "\tchunkSize:" << this->_chunkSize << std::endl; } @@ -334,9 +297,7 @@ class GraphColor_VB // We need to copy the adjacency array so that we dont harm the original // one. if (this->_edge_filtering) { - adj_copy = nnz_lno_temp_work_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "adj copy"), - this->ne); + adj_copy = nnz_lno_temp_work_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "adj copy"), this->ne); Kokkos::deep_copy(adj_copy, this->adj); } @@ -348,9 +309,8 @@ class GraphColor_VB } // the conflictlist - nnz_lno_temp_work_view_t current_vertexList = nnz_lno_temp_work_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "vertexList"), - this->nv); + nnz_lno_temp_work_view_t current_vertexList = + nnz_lno_temp_work_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "vertexList"), this->nv); nnz_lno_t current_vertexListLength = this->nv; if (this->cp->get_use_vtx_list()) { @@ -359,9 +319,8 @@ class GraphColor_VB current_vertexListLength = this->cp->get_vertex_list_size(); } else { // init vertexList sequentially. - Kokkos::parallel_for( - "KokkosGraph::GraphColoring::InitList", my_exec_space(0, this->nv), - functorInitList(current_vertexList)); + Kokkos::parallel_for("KokkosGraph::GraphColoring::InitList", my_exec_space(0, this->nv), + functorInitList(current_vertexList)); } // the next iteration's conflict list @@ -374,11 +333,9 @@ class GraphColor_VB // if a conflictlist is used if (this->_conflict_scheme != COLORING_NOCONFLICT) { // Vertices to recolor. Will swap with vertexList. 
- next_iteration_recolorList = nnz_lno_temp_work_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "recolorList"), - this->nv); - next_iteration_recolorListLength = - single_dim_index_view_type("recolorListLength"); + next_iteration_recolorList = + nnz_lno_temp_work_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "recolorList"), this->nv); + next_iteration_recolorListLength = single_dim_index_view_type("recolorListLength"); } nnz_lno_t numUncolored = this->nv; @@ -394,13 +351,13 @@ class GraphColor_VB if (this->_edge_filtering) { // First color greedy speculatively, // some conflicts expected - this->colorGreedyEF(this->xadj, adj_copy, colors, vertex_color_set, - current_vertexList, current_vertexListLength); + this->colorGreedyEF(this->xadj, adj_copy, colors, vertex_color_set, current_vertexList, + current_vertexListLength); } else { // First color greedy speculatively, // some conflicts expected - this->colorGreedy(this->xadj, this->adj, colors, vertex_color_set, - current_vertexList, current_vertexListLength); + this->colorGreedy(this->xadj, this->adj, colors, vertex_color_set, current_vertexList, + current_vertexListLength); } MyExecSpace().fence(); @@ -408,22 +365,19 @@ class GraphColor_VB if (this->_ticToc) { double t = timer.seconds(); total_time_greedy_phase += t; - std::cout << "\tTime speculative greedy phase " << iter << " : " << t - << std::endl; + std::cout << "\tTime speculative greedy phase " << iter << " : " << t << std::endl; timer.reset(); } bool swap_work_arrays = true; if (this->_edge_filtering) { - numUncolored = this->findConflicts( - swap_work_arrays, this->xadj, adj_copy, colors, vertex_color_set, - current_vertexList, current_vertexListLength, - next_iteration_recolorList, next_iteration_recolorListLength); + numUncolored = + this->findConflicts(swap_work_arrays, this->xadj, adj_copy, colors, vertex_color_set, current_vertexList, + current_vertexListLength, next_iteration_recolorList, next_iteration_recolorListLength); } else 
{ - numUncolored = this->findConflicts( - swap_work_arrays, this->xadj, this->adj, colors, vertex_color_set, - current_vertexList, current_vertexListLength, - next_iteration_recolorList, next_iteration_recolorListLength); + numUncolored = + this->findConflicts(swap_work_arrays, this->xadj, this->adj, colors, vertex_color_set, current_vertexList, + current_vertexListLength, next_iteration_recolorList, next_iteration_recolorListLength); } MyExecSpace().fence(); @@ -431,41 +385,34 @@ class GraphColor_VB if (_ticToc) { double t = timer.seconds(); total_time_find_conflicts += t; - std::cout << "\tTime conflict detection " << iter << " : " << t - << std::endl; + std::cout << "\tTime conflict detection " << iter << " : " << t << std::endl; timer.reset(); } - if (this->_serialConflictResolution) - break; // Break after first iteration. - if (this->_conflict_scheme != COLORING_NOCONFLICT && swap_work_arrays && - (iter + 1 < this->_max_num_iterations)) { + if (this->_serialConflictResolution) break; // Break after first iteration. + if (this->_conflict_scheme != COLORING_NOCONFLICT && swap_work_arrays && (iter + 1 < this->_max_num_iterations)) { // Swap recolorList and vertexList - nnz_lno_temp_work_view_t temp = current_vertexList; - current_vertexList = next_iteration_recolorList; - next_iteration_recolorList = temp; - current_vertexListLength = numUncolored; - next_iteration_recolorListLength = - single_dim_index_view_type("recolorListLength"); + nnz_lno_temp_work_view_t temp = current_vertexList; + current_vertexList = next_iteration_recolorList; + next_iteration_recolorList = temp; + current_vertexListLength = numUncolored; + next_iteration_recolorListLength = single_dim_index_view_type("recolorListLength"); } } // if VBCS algorithm is used, the colors are converted back to original // form. 
if (this->_use_color_set == 1) { - Kokkos::parallel_for("KokkosGraph::GraphColoring::SetFinalColors", - my_exec_space(0, this->nv), + Kokkos::parallel_for("KokkosGraph::GraphColoring::SetFinalColors", my_exec_space(0, this->nv), set_final_colors(colors, vertex_color_set)); } if (numUncolored > 0) { if (this->_edge_filtering) { // Resolve conflicts by recoloring in serial - this->resolveConflicts(this->nv, this->xadj, adj_copy, colors, - current_vertexList, current_vertexListLength); + this->resolveConflicts(this->nv, this->xadj, adj_copy, colors, current_vertexList, current_vertexListLength); } else { // Resolve conflicts by recoloring in serial - this->resolveConflicts(this->nv, this->xadj, this->adj, colors, - current_vertexList, current_vertexListLength); + this->resolveConflicts(this->nv, this->xadj, this->adj, colors, current_vertexList, current_vertexListLength); } MyExecSpace().fence(); if (_ticToc) { @@ -478,8 +425,7 @@ class GraphColor_VB this->cp->add_to_overall_coloring_time_phase1(total_time_greedy_phase); this->cp->add_to_overall_coloring_time_phase2(total_time_find_conflicts); - this->cp->add_to_overall_coloring_time_phase3( - total_time_serial_conflict_resolution); + this->cp->add_to_overall_coloring_time_phase3(total_time_serial_conflict_resolution); } // color_graph (end) private: @@ -491,13 +437,10 @@ class GraphColor_VB * \param current_vertexList_: current conflictlist * \param current_vertexListLength_: size of current conflictlist */ - void colorGreedy(const_lno_row_view_t xadj_, const_lno_nnz_view_t adj_, - color_view_type vertex_colors_, - nnz_lno_temp_work_view_t vertex_color_set, - nnz_lno_temp_work_view_t current_vertexList_, + void colorGreedy(const_lno_row_view_t xadj_, const_lno_nnz_view_t adj_, color_view_type vertex_colors_, + nnz_lno_temp_work_view_t vertex_color_set, nnz_lno_temp_work_view_t current_vertexList_, nnz_lno_t current_vertexListLength_) { - nnz_lno_t chunkSize_ = - this->_chunkSize; // Process chunkSize vertices in one 
chunk + nnz_lno_t chunkSize_ = this->_chunkSize; // Process chunkSize vertices in one chunk if (current_vertexListLength_ < 100 * chunkSize_) chunkSize_ = 1; @@ -505,34 +448,28 @@ class GraphColor_VB if (this->_use_color_set == 2) { // std::cout << ">>> functorGreedyColor_IMPLOG" << std::endl; // // WCMCLEN - functorGreedyColor_IMPLOG gc(this->nv, xadj_, adj_, vertex_colors_, - current_vertexList_, + functorGreedyColor_IMPLOG gc(this->nv, xadj_, adj_, vertex_colors_, current_vertexList_, current_vertexListLength_, chunkSize_); - Kokkos::parallel_for( - "KokkosGraph::GraphColoring::GreedyColor_IMPLOG", - my_exec_space(0, current_vertexListLength_ / chunkSize_ + 1), gc); + Kokkos::parallel_for("KokkosGraph::GraphColoring::GreedyColor_IMPLOG", + my_exec_space(0, current_vertexListLength_ / chunkSize_ + 1), gc); } // VBCS algorithm else if (this->_use_color_set == 1) { // std::cout << ">>> functorGreedyColor_IMP" << std::endl; // WCMCLEN - functorGreedyColor_IMP gc(this->nv, xadj_, adj_, vertex_colors_, - vertex_color_set, current_vertexList_, + functorGreedyColor_IMP gc(this->nv, xadj_, adj_, vertex_colors_, vertex_color_set, current_vertexList_, current_vertexListLength_, chunkSize_); - Kokkos::parallel_for( - "KokkosGraph::GraphColoring::GreedyColor_IMP", - my_exec_space(0, current_vertexListLength_ / chunkSize_ + 1), gc); + Kokkos::parallel_for("KokkosGraph::GraphColoring::GreedyColor_IMP", + my_exec_space(0, current_vertexListLength_ / chunkSize_ + 1), gc); } // VB algorithm else if (this->_use_color_set == 0) { // std::cout << ">>> functorGreedyColor" << std::endl; // WCMCLEN - functorGreedyColor gc(this->nv, xadj_, adj_, vertex_colors_, - current_vertexList_, current_vertexListLength_, + functorGreedyColor gc(this->nv, xadj_, adj_, vertex_colors_, current_vertexList_, current_vertexListLength_, chunkSize_); - Kokkos::parallel_for( - "KokkosGraph::GraphColoring::GreedyColor", - my_exec_space(0, current_vertexListLength_ / chunkSize_ + 1), gc); + 
Kokkos::parallel_for("KokkosGraph::GraphColoring::GreedyColor", + my_exec_space(0, current_vertexListLength_ / chunkSize_ + 1), gc); } } // colorGreedy (end) @@ -544,13 +481,10 @@ class GraphColor_VB * \param current_vertexList_: current conflictlist * \param current_vertexListLength_: size of current conflictlist */ - void colorGreedyEF(const_lno_row_view_t xadj_, nnz_lno_temp_work_view_t adj_, - color_view_type vertex_colors_, - nnz_lno_temp_work_view_t vertex_color_set, - nnz_lno_temp_work_view_t current_vertexList_, + void colorGreedyEF(const_lno_row_view_t xadj_, nnz_lno_temp_work_view_t adj_, color_view_type vertex_colors_, + nnz_lno_temp_work_view_t vertex_color_set, nnz_lno_temp_work_view_t current_vertexList_, nnz_lno_t current_vertexListLength_) { - nnz_lno_t chunkSize_ = - this->_chunkSize; // Process chunkSize vertices in one chunk + nnz_lno_t chunkSize_ = this->_chunkSize; // Process chunkSize vertices in one chunk if (current_vertexListLength_ < 100 * chunkSize_) chunkSize_ = 1; @@ -559,34 +493,28 @@ class GraphColor_VB // If edge filtering is applied // std::cout << ">>> functorGreedyColor_IMPLOG_EF" << std::endl; // // WCMCLEN - functorGreedyColor_IMPLOG_EF gc(this->nv, xadj_, adj_, vertex_colors_, - current_vertexList_, + functorGreedyColor_IMPLOG_EF gc(this->nv, xadj_, adj_, vertex_colors_, current_vertexList_, current_vertexListLength_, chunkSize_); - Kokkos::parallel_for( - "KokkosGraph::GraphColoring::GreedyColor_IMPLOG_EF", - my_exec_space(0, current_vertexListLength_ / chunkSize_ + 1), gc); + Kokkos::parallel_for("KokkosGraph::GraphColoring::GreedyColor_IMPLOG_EF", + my_exec_space(0, current_vertexListLength_ / chunkSize_ + 1), gc); } // VBCS algorithm else if (this->_use_color_set == 1) { // std::cout << ">>> functorGreedyColor_IMP_EF" << std::endl; // // WCMCLEN - functorGreedyColor_IMP_EF gc(this->nv, xadj_, adj_, vertex_colors_, - vertex_color_set, current_vertexList_, + functorGreedyColor_IMP_EF gc(this->nv, xadj_, adj_, vertex_colors_, 
vertex_color_set, current_vertexList_, current_vertexListLength_, chunkSize_); - Kokkos::parallel_for( - "KokkosGraph::GraphColoring::GreedyColor_IMP_EF", - my_exec_space(0, current_vertexListLength_ / chunkSize_ + 1), gc); + Kokkos::parallel_for("KokkosGraph::GraphColoring::GreedyColor_IMP_EF", + my_exec_space(0, current_vertexListLength_ / chunkSize_ + 1), gc); } // VB algorithm else if (this->_use_color_set == 0) { // std::cout << ">>> functorGreedyColor_EF" << std::endl; // WCMCLEN - functorGreedyColor_EF gc(this->nv, xadj_, adj_, vertex_colors_, - current_vertexList_, current_vertexListLength_, + functorGreedyColor_EF gc(this->nv, xadj_, adj_, vertex_colors_, current_vertexList_, current_vertexListLength_, chunkSize_); - Kokkos::parallel_for( - "KokkosGraph::GraphColoring::GreedyColor_EF", - my_exec_space(0, current_vertexListLength_ / chunkSize_ + 1), gc); + Kokkos::parallel_for("KokkosGraph::GraphColoring::GreedyColor_EF", + my_exec_space(0, current_vertexListLength_ / chunkSize_ + 1), gc); } } @@ -601,85 +529,63 @@ class GraphColor_VB * \param next_iteration_recolorListLength_: size of next conflictlist */ template - nnz_lno_t findConflicts( - bool &swap_work_arrays, const_lno_row_view_t xadj_, adj_view_t adj_, - color_view_type vertex_colors_, - nnz_lno_temp_work_view_t vertex_color_set_, - nnz_lno_temp_work_view_t current_vertexList_, - nnz_lno_t current_vertexListLength_, - nnz_lno_temp_work_view_t next_iteration_recolorList_, - single_dim_index_view_type next_iteration_recolorListLength_) { + nnz_lno_t findConflicts(bool &swap_work_arrays, const_lno_row_view_t xadj_, adj_view_t adj_, + color_view_type vertex_colors_, nnz_lno_temp_work_view_t vertex_color_set_, + nnz_lno_temp_work_view_t current_vertexList_, nnz_lno_t current_vertexListLength_, + nnz_lno_temp_work_view_t next_iteration_recolorList_, + single_dim_index_view_type next_iteration_recolorListLength_) { swap_work_arrays = true; nnz_lno_t numUncolored = 0; if (this->_conflict_scheme == 
COLORING_NOCONFLICT) { if (this->_use_color_set == 0 || this->_use_color_set == 2) { - functorFindConflicts_No_Conflist conf(this->nv, xadj_, adj_, - vertex_colors_); - Kokkos::parallel_reduce( - "KokkosGraph::GraphColoring::FindConflicts::CaseA", - my_exec_space(0, current_vertexListLength_), conf, numUncolored); + functorFindConflicts_No_Conflist conf(this->nv, xadj_, adj_, vertex_colors_); + Kokkos::parallel_reduce("KokkosGraph::GraphColoring::FindConflicts::CaseA", + my_exec_space(0, current_vertexListLength_), conf, numUncolored); } else { - functorFindConflicts_No_Conflist_IMP conf( - this->nv, xadj_, adj_, vertex_colors_, vertex_color_set_); - Kokkos::parallel_reduce( - "KokkosGraph::GraphColoring::FindConflicts::CaseB", - my_exec_space(0, current_vertexListLength_), conf, numUncolored); + functorFindConflicts_No_Conflist_IMP conf(this->nv, xadj_, adj_, vertex_colors_, vertex_color_set_); + Kokkos::parallel_reduce("KokkosGraph::GraphColoring::FindConflicts::CaseB", + my_exec_space(0, current_vertexListLength_), conf, numUncolored); } } else if (this->_conflict_scheme == COLORING_PPS) { if (this->_use_color_set == 0 || this->_use_color_set == 2) { // Check for conflicts. Compute numUncolored == numConflicts. 
- functorFindConflicts_PPS conf( - this->nv, xadj_, adj_, vertex_colors_, current_vertexList_); - Kokkos::parallel_reduce( - "KokkosGraph::GraphColoring::FindConflicts::CaseC", - my_exec_space(0, current_vertexListLength_), conf, numUncolored); + functorFindConflicts_PPS conf(this->nv, xadj_, adj_, vertex_colors_, current_vertexList_); + Kokkos::parallel_reduce("KokkosGraph::GraphColoring::FindConflicts::CaseC", + my_exec_space(0, current_vertexListLength_), conf, numUncolored); } else { - functorFindConflicts_PPS_IMP conf( - this->nv, xadj_, adj_, vertex_colors_, vertex_color_set_, - current_vertexList_); - Kokkos::parallel_reduce( - "KokkosGraph::GraphColoring::FindConflicts::CaseD", - my_exec_space(0, current_vertexListLength_), conf, numUncolored); + functorFindConflicts_PPS_IMP conf(this->nv, xadj_, adj_, vertex_colors_, vertex_color_set_, + current_vertexList_); + Kokkos::parallel_reduce("KokkosGraph::GraphColoring::FindConflicts::CaseD", + my_exec_space(0, current_vertexListLength_), conf, numUncolored); } - if (numUncolored && - (current_vertexListLength_ >= this->_min_vertex_cut_off) && - (double(numUncolored) / current_vertexListLength_ < - (1.0 - this->_pps_ratio))) { + if (numUncolored && (current_vertexListLength_ >= this->_min_vertex_cut_off) && + (double(numUncolored) / current_vertexListLength_ < (1.0 - this->_pps_ratio))) { if (this->_ticToc) { - std::cout - << "\tcreating work array with pps current_vertexListLength_:" - << current_vertexListLength_ - << " params->min_vertex_cut_off:" << this->_min_vertex_cut_off - << std::endl; + std::cout << "\tcreating work array with pps current_vertexListLength_:" << current_vertexListLength_ + << " params->min_vertex_cut_off:" << this->_min_vertex_cut_off << std::endl; } single_dim_index_host_view_type h_numUncolored(&numUncolored); Kokkos::deep_copy(next_iteration_recolorListLength_, h_numUncolored); Kokkos::parallel_scan( - "KokkosGraph::GraphColoring::PrefixSum", - my_exec_space(0, 
current_vertexListLength_), - ppsWorklistFunctorVB( - this->nv, current_vertexList_, next_iteration_recolorList_)); + "KokkosGraph::GraphColoring::PrefixSum", my_exec_space(0, current_vertexListLength_), + ppsWorklistFunctorVB(this->nv, current_vertexList_, next_iteration_recolorList_)); } else { swap_work_arrays = false; } } else { // worklist scheme COLORING_ATOMIC if (this->_use_color_set == 0 || this->_use_color_set == 2) { // Check for conflicts. Compute numUncolored == numConflicts. - functorFindConflicts_Atomic conf( - this->nv, xadj_, adj_, vertex_colors_, current_vertexList_, - next_iteration_recolorList_, next_iteration_recolorListLength_); - Kokkos::parallel_reduce( - "KokkosGraph::GraphColoring::FindConflictsAtomic", - my_exec_space(0, current_vertexListLength_), conf, numUncolored); + functorFindConflicts_Atomic conf(this->nv, xadj_, adj_, vertex_colors_, current_vertexList_, + next_iteration_recolorList_, next_iteration_recolorListLength_); + Kokkos::parallel_reduce("KokkosGraph::GraphColoring::FindConflictsAtomic", + my_exec_space(0, current_vertexListLength_), conf, numUncolored); } else { - functorFindConflicts_Atomic_IMP conf( - this->nv, xadj_, adj_, vertex_colors_, vertex_color_set_, - current_vertexList_, next_iteration_recolorList_, - next_iteration_recolorListLength_); - Kokkos::parallel_reduce( - "KokkosGraph::GraphColoring::FindConflictsAtomic_IMP", - my_exec_space(0, current_vertexListLength_), conf, numUncolored); + functorFindConflicts_Atomic_IMP conf(this->nv, xadj_, adj_, vertex_colors_, vertex_color_set_, + current_vertexList_, next_iteration_recolorList_, + next_iteration_recolorListLength_); + Kokkos::parallel_reduce("KokkosGraph::GraphColoring::FindConflictsAtomic_IMP", + my_exec_space(0, current_vertexListLength_), conf, numUncolored); } } if (this->_ticToc) { @@ -697,10 +603,8 @@ class GraphColor_VB * \param current_vertexListLength_: size of current conflictlist */ template - void resolveConflicts(nnz_lno_t _nv, 
const_lno_row_view_t xadj_, - adj_view_t adj_, color_view_type vertex_colors_, - nnz_lno_temp_work_view_t current_vertexList_, - size_type current_vertexListLength_) { + void resolveConflicts(nnz_lno_t _nv, const_lno_row_view_t xadj_, adj_view_t adj_, color_view_type vertex_colors_, + nnz_lno_temp_work_view_t current_vertexList_, size_type current_vertexListLength_) { color_t *forbidden = new color_t[_nv]; nnz_lno_t i = 0; nnz_lno_t end = _nv; @@ -711,10 +615,9 @@ class GraphColor_VB h_recolor_list = Kokkos::create_mirror_view(current_vertexList_); Kokkos::deep_copy(h_recolor_list, current_vertexList_); } - color_host_view_t h_colors = Kokkos::create_mirror_view(vertex_colors_); - typename const_lno_row_view_t::HostMirror h_idx = - Kokkos::create_mirror_view(xadj_); - typename adj_view_t::HostMirror h_adj = Kokkos::create_mirror_view(adj_); + color_host_view_t h_colors = Kokkos::create_mirror_view(vertex_colors_); + typename const_lno_row_view_t::HostMirror h_idx = Kokkos::create_mirror_view(xadj_); + typename adj_view_t::HostMirror h_adj = Kokkos::create_mirror_view(adj_); Kokkos::deep_copy(h_colors, vertex_colors_); Kokkos::deep_copy(h_idx, xadj_); @@ -756,12 +659,9 @@ class GraphColor_VB nnz_lno_t _vertexListLength; nnz_lno_t _chunkSize; - functorGreedyColor_IMPLOG_EF(nnz_lno_t nv_, const_lno_row_view_t xadj_, - nnz_lno_temp_work_view_t adj_, - color_view_type colors, - nnz_lno_temp_work_view_t vertexList, - nnz_lno_t vertexListLength, - nnz_lno_t chunkSize) + functorGreedyColor_IMPLOG_EF(nnz_lno_t nv_, const_lno_row_view_t xadj_, nnz_lno_temp_work_view_t adj_, + color_view_type colors, nnz_lno_temp_work_view_t vertexList, + nnz_lno_t vertexListLength, nnz_lno_t chunkSize) : nv(nv_), _idx(xadj_), _adj(adj_), @@ -794,8 +694,7 @@ class GraphColor_VB // we parse the neigborlist multiple times, // each time we look for a certain range of colors. 
- for (; (offset <= degree + VBBIT_COLORING_FORBIDDEN_SIZE); - offset += VBBIT_COLORING_FORBIDDEN_SIZE) { + for (; (offset <= degree + VBBIT_COLORING_FORBIDDEN_SIZE); offset += VBBIT_COLORING_FORBIDDEN_SIZE) { // Forbidden colors // we use a single (long) int for forbidden colors ban_type forbidden = 0; @@ -867,10 +766,9 @@ class GraphColor_VB nnz_lno_t _vertexListLength; nnz_lno_t _chunkSize; - functorGreedyColor_IMPLOG(nnz_lno_t nv_, const_lno_row_view_t xadj_, - const_lno_nnz_view_t adj_, color_view_type colors, - nnz_lno_temp_work_view_t vertexList, - nnz_lno_t vertexListLength, nnz_lno_t chunkSize) + functorGreedyColor_IMPLOG(nnz_lno_t nv_, const_lno_row_view_t xadj_, const_lno_nnz_view_t adj_, + color_view_type colors, nnz_lno_temp_work_view_t vertexList, nnz_lno_t vertexListLength, + nnz_lno_t chunkSize) : nv(nv_), _idx(xadj_), _adj(adj_), @@ -896,8 +794,7 @@ class GraphColor_VB color_t degree = my_xadj_end - xadjbegin; // My degree color_t offset = 0; - for (; (offset <= degree + VBBIT_COLORING_FORBIDDEN_SIZE); - offset += VBBIT_COLORING_FORBIDDEN_SIZE) { + for (; (offset <= degree + VBBIT_COLORING_FORBIDDEN_SIZE); offset += VBBIT_COLORING_FORBIDDEN_SIZE) { ban_type forbidden = 0; // Forbidden colors // Check nbors, fill forbidden array. 
@@ -950,12 +847,9 @@ class GraphColor_VB nnz_lno_t _vertexListLength; nnz_lno_t _chunkSize; - functorGreedyColor_IMP_EF(nnz_lno_t nv_, const_lno_row_view_t xadj_, - nnz_lno_temp_work_view_t adj_, - color_view_type colors, - nnz_lno_temp_work_view_t color_set, - nnz_lno_temp_work_view_t vertexList, - nnz_lno_t vertexListLength, nnz_lno_t chunkSize) + functorGreedyColor_IMP_EF(nnz_lno_t nv_, const_lno_row_view_t xadj_, nnz_lno_temp_work_view_t adj_, + color_view_type colors, nnz_lno_temp_work_view_t color_set, + nnz_lno_temp_work_view_t vertexList, nnz_lno_t vertexListLength, nnz_lno_t chunkSize) : nv(nv_), _xadj(xadj_), _adj(adj_), @@ -1033,10 +927,8 @@ class GraphColor_VB nnz_lno_t _vertexListLength; nnz_lno_t _chunkSize; - functorGreedyColor_IMP(nnz_lno_t nv_, const_lno_row_view_t xadj_, - const_lno_nnz_view_t adj_, color_view_type colors, - nnz_lno_temp_work_view_t color_set, - nnz_lno_temp_work_view_t vertexList, + functorGreedyColor_IMP(nnz_lno_t nv_, const_lno_row_view_t xadj_, const_lno_nnz_view_t adj_, color_view_type colors, + nnz_lno_temp_work_view_t color_set, nnz_lno_temp_work_view_t vertexList, nnz_lno_t vertexListLength, nnz_lno_t chunkSize) : nv(nv_), _xadj(xadj_), @@ -1105,10 +997,9 @@ class GraphColor_VB nnz_lno_t _vertexListLength; nnz_lno_t _chunkSize; - functorGreedyColor_EF(nnz_lno_t nv_, const_lno_row_view_t xadj_, - nnz_lno_temp_work_view_t adj_, color_view_type colors, - nnz_lno_temp_work_view_t vertexList, - nnz_lno_t vertexListLength, nnz_lno_t chunkSize) + functorGreedyColor_EF(nnz_lno_t nv_, const_lno_row_view_t xadj_, nnz_lno_temp_work_view_t adj_, + color_view_type colors, nnz_lno_temp_work_view_t vertexList, nnz_lno_t vertexListLength, + nnz_lno_t chunkSize) : nv(nv_), _idx(xadj_), _adj(adj_), @@ -1150,8 +1041,7 @@ class GraphColor_VB color_t offset = 0; size_type xadjbegin = _idx(i); - for (; (offset <= degree + VB_COLORING_FORBIDDEN_SIZE) && (!foundColor); - offset += VB_COLORING_FORBIDDEN_SIZE) { + for (; (offset <= degree + 
VB_COLORING_FORBIDDEN_SIZE) && (!foundColor); offset += VB_COLORING_FORBIDDEN_SIZE) { // initialize for (int j = 0; j < VB_COLORING_FORBIDDEN_SIZE; j++) { forbidden[j] = false; @@ -1211,10 +1101,8 @@ class GraphColor_VB nnz_lno_t _vertexListLength; nnz_lno_t _chunkSize; - functorGreedyColor(nnz_lno_t nv_, const_lno_row_view_t xadj_, - const_lno_nnz_view_t adj_, color_view_type colors, - nnz_lno_temp_work_view_t vertexList, - nnz_lno_t vertexListLength, nnz_lno_t chunkSize) + functorGreedyColor(nnz_lno_t nv_, const_lno_row_view_t xadj_, const_lno_nnz_view_t adj_, color_view_type colors, + nnz_lno_temp_work_view_t vertexList, nnz_lno_t vertexListLength, nnz_lno_t chunkSize) : nv(nv_), _idx(xadj_), _adj(adj_), @@ -1253,8 +1141,7 @@ class GraphColor_VB // Do multiple passes if array is too small. color_t degree = _idx(i + 1) - _idx(i); // My degree color_t offset = 1; - for (; (offset <= degree + VB_COLORING_FORBIDDEN_SIZE) && (!foundColor); - offset += VB_COLORING_FORBIDDEN_SIZE) { + for (; (offset <= degree + VB_COLORING_FORBIDDEN_SIZE) && (!foundColor); offset += VB_COLORING_FORBIDDEN_SIZE) { // initialize for (int j = 0; j < VB_COLORING_FORBIDDEN_SIZE; j++) { forbidden[j] = false; @@ -1271,8 +1158,7 @@ class GraphColor_VB // foundColor = true; // return; //} - if ((c >= offset) && (c - offset < VB_COLORING_FORBIDDEN_SIZE)) - forbidden[c - offset] = true; + if ((c >= offset) && (c - offset < VB_COLORING_FORBIDDEN_SIZE)) forbidden[c - offset] = true; } // color vertex i with smallest available color (FirstFit) @@ -1302,8 +1188,7 @@ class GraphColor_VB adj_view_t _adj; color_view_type _colors; - functorFindConflicts_No_Conflist(nnz_lno_t nv_, const_lno_row_view_t xadj_, - adj_view_t adj_, color_view_type colors) + functorFindConflicts_No_Conflist(nnz_lno_t nv_, const_lno_row_view_t xadj_, adj_view_t adj_, color_view_type colors) : nv(nv_), _idx(xadj_), _adj(adj_), _colors(colors) {} KOKKOS_INLINE_FUNCTION @@ -1323,9 +1208,8 @@ class GraphColor_VB #endif 
_colors(neighbor) == my_color #ifdef DEGREECOMP - && - (myDegree < _idx(neighbor + 1) - _idx(neighbor) || - (myDegree == _idx(neighbor + 1) - _idx(neighbor) && ii < neighbor)) + && (myDegree < _idx(neighbor + 1) - _idx(neighbor) || + (myDegree == _idx(neighbor + 1) - _idx(neighbor) && ii < neighbor)) #endif ) { // std::cout << "me:" << ii << " n:" << neighbor << " color:" << @@ -1350,14 +1234,9 @@ class GraphColor_VB color_view_type _colors; nnz_lno_temp_work_view_t _vertexList; - functorFindConflicts_PPS(nnz_lno_t nv_, const_lno_row_view_t xadj_, - adj_view_t adj_, color_view_type colors, + functorFindConflicts_PPS(nnz_lno_t nv_, const_lno_row_view_t xadj_, adj_view_t adj_, color_view_type colors, nnz_lno_temp_work_view_t vertexList) - : nv(nv_), - _idx(xadj_), - _adj(adj_), - _colors(colors), - _vertexList(vertexList) {} + : nv(nv_), _idx(xadj_), _adj(adj_), _colors(colors), _vertexList(vertexList) {} KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t ii, nnz_lno_t &numConflicts) const { @@ -1378,9 +1257,8 @@ class GraphColor_VB #endif _colors(neighbor) == my_color #ifdef DEGREECOMP - && - (myDegree < _idx(neighbor + 1) - _idx(neighbor) || - (myDegree == _idx(neighbor + 1) - _idx(neighbor) && i < neighbor)) + && (myDegree < _idx(neighbor + 1) - _idx(neighbor) || + (myDegree == _idx(neighbor + 1) - _idx(neighbor) && i < neighbor)) #endif ) { _colors(i) = 0; // Uncolor vertex i @@ -1405,10 +1283,8 @@ class GraphColor_VB nnz_lno_temp_work_view_t _recolorList; single_dim_index_view_type _recolorListLength; - functorFindConflicts_Atomic(nnz_lno_t nv_, const_lno_row_view_t xadj_, - adj_view_t adj_, color_view_type colors, - nnz_lno_temp_work_view_t vertexList, - nnz_lno_temp_work_view_t recolorList, + functorFindConflicts_Atomic(nnz_lno_t nv_, const_lno_row_view_t xadj_, adj_view_t adj_, color_view_type colors, + nnz_lno_temp_work_view_t vertexList, nnz_lno_temp_work_view_t recolorList, single_dim_index_view_type recolorListLength) : nv(nv_), _idx(xadj_), @@ 
-1420,9 +1296,7 @@ class GraphColor_VB KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t ii, nnz_lno_t &numConflicts) const { - typedef - typename std::remove_reference::type - atomic_incr_type; + typedef typename std::remove_reference::type atomic_incr_type; nnz_lno_t i = _vertexList(ii); color_t my_color = _colors(i); @@ -1441,15 +1315,13 @@ class GraphColor_VB #endif _colors(neighbor) == my_color #ifdef DEGREECOMP - && - (myDegree < _idx(neighbor + 1) - _idx(neighbor) || - (myDegree == _idx(neighbor + 1) - _idx(neighbor) && i < neighbor)) + && (myDegree < _idx(neighbor + 1) - _idx(neighbor) || + (myDegree == _idx(neighbor + 1) - _idx(neighbor) && i < neighbor)) #endif ) { _colors(i) = 0; // Uncolor vertex i // Atomically add vertex i to recolorList - const nnz_lno_t k = Kokkos::atomic_fetch_add(&_recolorListLength(), - atomic_incr_type(1)); + const nnz_lno_t k = Kokkos::atomic_fetch_add(&_recolorListLength(), atomic_incr_type(1)); _recolorList(k) = i; numConflicts += 1; break; // Once i is uncolored and marked conflict @@ -1470,16 +1342,9 @@ class GraphColor_VB color_view_type _colors; nnz_lno_temp_work_view_t _color_sets; - functorFindConflicts_No_Conflist_IMP(nnz_lno_t nv_, - const_lno_row_view_t xadj_, - adj_view_t adj_, - color_view_type colors, - nnz_lno_temp_work_view_t color_sets) - : nv(nv_), - _xadj(xadj_), - _adj(adj_), - _colors(colors), - _color_sets(color_sets) {} + functorFindConflicts_No_Conflist_IMP(nnz_lno_t nv_, const_lno_row_view_t xadj_, adj_view_t adj_, + color_view_type colors, nnz_lno_temp_work_view_t color_sets) + : nv(nv_), _xadj(xadj_), _adj(adj_), _colors(colors), _color_sets(color_sets) {} KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t ii, nnz_lno_t &numConflicts) const { @@ -1504,12 +1369,10 @@ class GraphColor_VB #ifndef DEGREECOMP ii < neighbor && neighbor < nv && #endif - _colors(neighbor) == my_color && - my_color_set == _color_sets(neighbor) + _colors(neighbor) == my_color && my_color_set == _color_sets(neighbor) 
#ifdef DEGREECOMP && (myDegree < _xadj(neighbor + 1) - _xadj(neighbor) || - (myDegree == _xadj(neighbor + 1) - _xadj(neighbor) && - ii < neighbor)) + (myDegree == _xadj(neighbor + 1) - _xadj(neighbor) && ii < neighbor)) #endif ) { _colors(ii) = 0; // Uncolor vertex i @@ -1535,16 +1398,9 @@ class GraphColor_VB nnz_lno_temp_work_view_t _color_sets; nnz_lno_temp_work_view_t _vertexList; - functorFindConflicts_PPS_IMP(nnz_lno_t nv_, const_lno_row_view_t xadj_, - adj_view_t adj_, color_view_type colors, - nnz_lno_temp_work_view_t color_sets, - nnz_lno_temp_work_view_t vertexList) - : nv(nv_), - _xadj(xadj_), - _adj(adj_), - _colors(colors), - _color_sets(color_sets), - _vertexList(vertexList) {} + functorFindConflicts_PPS_IMP(nnz_lno_t nv_, const_lno_row_view_t xadj_, adj_view_t adj_, color_view_type colors, + nnz_lno_temp_work_view_t color_sets, nnz_lno_temp_work_view_t vertexList) + : nv(nv_), _xadj(xadj_), _adj(adj_), _colors(colors), _color_sets(color_sets), _vertexList(vertexList) {} KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t ii, nnz_lno_t &numConflicts) const { @@ -1570,12 +1426,10 @@ class GraphColor_VB #ifndef DEGREECOMP i < neighbor && neighbor < nv && #endif - _colors(neighbor) == my_color && - my_color_set == _color_sets(neighbor) + _colors(neighbor) == my_color && my_color_set == _color_sets(neighbor) #ifdef DEGREECOMP && (myDegree < _xadj(neighbor + 1) - _xadj(neighbor) || - (myDegree == _xadj(neighbor + 1) - _xadj(neighbor) && - i < neighbor)) + (myDegree == _xadj(neighbor + 1) - _xadj(neighbor) && i < neighbor)) #endif ) { _colors(i) = 0; // Uncolor vertex i @@ -1603,12 +1457,9 @@ class GraphColor_VB nnz_lno_temp_work_view_t _recolorList; single_dim_index_view_type _recolorListLength; - functorFindConflicts_Atomic_IMP( - nnz_lno_t nv_, const_lno_row_view_t xadj_, adj_view_t adj_, - color_view_type colors, nnz_lno_temp_work_view_t color_sets, - nnz_lno_temp_work_view_t vertexList, - nnz_lno_temp_work_view_t recolorList, - 
single_dim_index_view_type recolorListLength) + functorFindConflicts_Atomic_IMP(nnz_lno_t nv_, const_lno_row_view_t xadj_, adj_view_t adj_, color_view_type colors, + nnz_lno_temp_work_view_t color_sets, nnz_lno_temp_work_view_t vertexList, + nnz_lno_temp_work_view_t recolorList, single_dim_index_view_type recolorListLength) : nv(nv_), _xadj(xadj_), _adj(adj_), @@ -1620,16 +1471,13 @@ class GraphColor_VB KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t ii, nnz_lno_t &numConflicts) const { - typedef - typename std::remove_reference::type - atomic_incr_type; + typedef typename std::remove_reference::type atomic_incr_type; nnz_lno_t i = _vertexList(ii); color_t my_color = _colors(i); if (my_color == 0) { // this should only happen when one_color_set_per_iteration is set to // true. - const nnz_lno_t k = Kokkos::atomic_fetch_add(&_recolorListLength(), - atomic_incr_type(1)); + const nnz_lno_t k = Kokkos::atomic_fetch_add(&_recolorListLength(), atomic_incr_type(1)); _recolorList(k) = i; numConflicts++; } else { @@ -1647,19 +1495,16 @@ class GraphColor_VB #ifndef DEGREECOMP i < neighbor && neighbor < nv && #endif - _colors(neighbor) == my_color && - my_color_set == _color_sets(neighbor) + _colors(neighbor) == my_color && my_color_set == _color_sets(neighbor) #ifdef DEGREECOMP && (myDegree < _xadj(neighbor + 1) - _xadj(neighbor) || - (myDegree == _xadj(neighbor + 1) - _xadj(neighbor) && - i < neighbor)) + (myDegree == _xadj(neighbor + 1) - _xadj(neighbor) && i < neighbor)) #endif ) { _colors(i) = 0; // Uncolor vertex i _color_sets(i) = 0; // Atomically add vertex i to recolorList - const nnz_lno_t k = Kokkos::atomic_fetch_add(&_recolorListLength(), - atomic_incr_type(1)); + const nnz_lno_t k = Kokkos::atomic_fetch_add(&_recolorListLength(), atomic_incr_type(1)); _recolorList(k) = i; numConflicts++; break; // Once i is uncolored and marked conflict @@ -1690,8 +1535,7 @@ class GraphColor_VB view_type _vertexList; view_type _recolorList; - ppsWorklistFunctorVB(nnz_lno_t 
nv_, const view_type &vertexList, - const view_type &recolorList) + ppsWorklistFunctorVB(nnz_lno_t nv_, const view_type &vertexList, const view_type &recolorList) : _nv(nv_), _vertexList(vertexList), _recolorList(recolorList) {} KOKKOS_INLINE_FUNCTION @@ -1709,9 +1553,8 @@ class GraphColor_VB */ struct set_final_colors { color_view_type kokcol; - nnz_lno_temp_work_view_t - kokcolset; // the colors that are represented with bits, and the colors - // set that the color is in. + nnz_lno_temp_work_view_t kokcolset; // the colors that are represented with bits, and the colors + // set that the color is in. color_t color_size; /** \brief functor constructor. @@ -1720,11 +1563,8 @@ class GraphColor_VB * color_set_ together is used to represent the colors e.g. color_set_(v) * * (numbits_in_idx-1) + set_bit_position_in_kokcolors_(v) */ - set_final_colors(color_view_type kokcol_, - nnz_lno_temp_work_view_t kokcolset_) - : kokcol(kokcol_), - kokcolset(kokcolset_), - color_size(sizeof(color_t) * 8) {} + set_final_colors(color_view_type kokcol_, nnz_lno_temp_work_view_t kokcolset_) + : kokcol(kokcol_), kokcolset(kokcolset_), color_size(sizeof(color_t) * 8) {} KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t &ii) const { @@ -1747,10 +1587,8 @@ class GraphColor_VB /*! \brief Class for the deterministic vertex based graph coloring algorithms. 
*/ -template -class GraphColor_VBD - : public GraphColor { +template +class GraphColor_VBD : public GraphColor { public: typedef long long int ban_type; @@ -1764,30 +1602,22 @@ class GraphColor_VBD typedef typename HandleType::nnz_lno_t nnz_lno_t; typedef typename HandleType::color_t color_t; - typedef typename HandleType::color_host_view_t - color_host_view_t; // Host view type + typedef typename HandleType::color_host_view_t color_host_view_t; // Host view type typedef typename HandleType::HandleExecSpace MyExecSpace; typedef typename HandleType::HandleTempMemorySpace MyTempMemorySpace; - typedef - typename HandleType::HandlePersistentMemorySpace MyPersistentMemorySpace; + typedef typename HandleType::HandlePersistentMemorySpace MyPersistentMemorySpace; - typedef typename Kokkos::View - single_dim_index_view_type; - typedef typename single_dim_index_view_type::HostMirror - single_dim_index_host_view_type; // Host view type + typedef typename Kokkos::View single_dim_index_view_type; + typedef typename single_dim_index_view_type::HostMirror single_dim_index_host_view_type; // Host view type typedef Kokkos::RangePolicy my_exec_space; - typedef typename HandleType::size_type_temp_work_view_t - size_type_temp_work_view_t; - typedef typename HandleType::size_type_persistent_work_view_t - size_type_persistent_work_view_t; + typedef typename HandleType::size_type_temp_work_view_t size_type_temp_work_view_t; + typedef typename HandleType::size_type_persistent_work_view_t size_type_persistent_work_view_t; - typedef - typename HandleType::nnz_lno_temp_work_view_t nnz_lno_temp_work_view_t; - typedef typename HandleType::nnz_lno_persistent_work_view_t - nnz_lno_persistent_work_view_t; + typedef typename HandleType::nnz_lno_temp_work_view_t nnz_lno_temp_work_view_t; + typedef typename HandleType::nnz_lno_persistent_work_view_t nnz_lno_persistent_work_view_t; typedef typename in_lno_row_view_t::const_type const_lno_row_view_t; @@ -1795,9 +1625,9 @@ class GraphColor_VBD typedef 
typename lno_nnz_view_t_::non_const_type non_const_lno_nnz_view_t; protected: - bool _ticToc; // if true print info in each step - int _chunkSize; // the size of the minimum work unit assigned to threads. - // Changes the convergence on GPUs + bool _ticToc; // if true print info in each step + int _chunkSize; // the size of the minimum work unit assigned to threads. + // Changes the convergence on GPUs char _use_color_set; // the VBD algorithm type. // 0 for VBD: @@ -1811,10 +1641,9 @@ class GraphColor_VBD * \param coloring_handle: GraphColoringHandle object that holds the * specification about the graph coloring, including parameters. */ - GraphColor_VBD(nnz_lno_t nv_, size_type ne_, const_lno_row_view_t row_map, - const_lno_nnz_view_t entries, HandleType *coloring_handle) - : GraphColor( - nv_, ne_, row_map, entries, coloring_handle), + GraphColor_VBD(nnz_lno_t nv_, size_type ne_, const_lno_row_view_t row_map, const_lno_nnz_view_t entries, + HandleType *coloring_handle) + : GraphColor(nv_, ne_, row_map, entries, coloring_handle), _ticToc(coloring_handle->get_tictoc()), _chunkSize(coloring_handle->get_vb_chunk_size()), _use_color_set() { @@ -1850,15 +1679,13 @@ class GraphColor_VBD nnz_lno_t numVertices = this->nv; - size_type maxColors = 0; - nnz_lno_persistent_work_view_t score = nnz_lno_persistent_work_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "score"), this->nv); - functorScoreCalculation scoreCalculation( - score, this->xadj); + size_type maxColors = 0; + nnz_lno_persistent_work_view_t score = + nnz_lno_persistent_work_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "score"), this->nv); + functorScoreCalculation scoreCalculation(score, this->xadj); - Kokkos::parallel_reduce("Deterministic Coloring: compute initial scores", - my_exec_space(0, this->nv), scoreCalculation, - Kokkos::Max(maxColors)); + Kokkos::parallel_reduce("Deterministic Coloring: compute initial scores", my_exec_space(0, this->nv), + scoreCalculation, 
Kokkos::Max(maxColors)); if (this->_ticToc) { std::cout << "maxColors: " << maxColors << std::endl; @@ -1867,18 +1694,17 @@ class GraphColor_VBD // Create the dependency list of the graph nnz_lno_persistent_work_view_t dependency("dependency", numVertices); Kokkos::View frontierSize("frontierSize"); - typename Kokkos::View::HostMirror - host_frontierSize = Kokkos::create_mirror_view(frontierSize); - Kokkos::View newFrontierSize( - "newFrontierSize"); - typename Kokkos::View::HostMirror - host_newFrontierSize = Kokkos::create_mirror_view(newFrontierSize); + typename Kokkos::View::HostMirror host_frontierSize = + Kokkos::create_mirror_view(frontierSize); + Kokkos::View newFrontierSize("newFrontierSize"); + typename Kokkos::View::HostMirror host_newFrontierSize = + Kokkos::create_mirror_view(newFrontierSize); nnz_lno_temp_work_view_t frontier("frontier", numVertices); nnz_lno_temp_work_view_t newFrontier("newFrontier", numVertices); - functorInitialDependency myInitialDependency( - this->xadj, this->adj, score, dependency, newFrontier, newFrontierSize); - Kokkos::parallel_for("Deterministic Coloring: compute dependency list", - my_exec_space(0, numVertices), myInitialDependency); + functorInitialDependency myInitialDependency(this->xadj, this->adj, score, dependency, newFrontier, + newFrontierSize); + Kokkos::parallel_for("Deterministic Coloring: compute dependency list", my_exec_space(0, numVertices), + myInitialDependency); Kokkos::deep_copy(host_newFrontierSize, newFrontierSize); while (host_newFrontierSize() > 0) { @@ -1886,8 +1712,7 @@ class GraphColor_VBD // First swap fontier with newFrontier and fontierSize with // newFrontierSize reset newFrontierSize functorSwapOnDevice mySwapOnDevice(frontierSize, newFrontierSize); - Kokkos::parallel_for("Swap frontier sizes", my_exec_space(0, 1), - mySwapOnDevice); + Kokkos::parallel_for("Swap frontier sizes", my_exec_space(0, 1), mySwapOnDevice); Kokkos::deep_copy(host_frontierSize, frontierSize); { auto swap_tmp = 
frontier; @@ -1898,11 +1723,9 @@ class GraphColor_VBD // Loop over nodes in the frontier // First variant without bit array, easier to understand/program if (this->_use_color_set == 0) { - functorDeterministicColoring myDeterministicColoring( - this->xadj, this->adj, dependency, frontier, frontierSize, - newFrontier, newFrontierSize, maxColors, colors); - Kokkos::parallel_for("Deterministic Coloring: color nodes in frontier", - my_exec_space(0, host_frontierSize()), + functorDeterministicColoring myDeterministicColoring(this->xadj, this->adj, dependency, frontier, frontierSize, + newFrontier, newFrontierSize, maxColors, colors); + Kokkos::parallel_for("Deterministic Coloring: color nodes in frontier", my_exec_space(0, host_frontierSize()), myDeterministicColoring); } else if (this->_use_color_set == 1) { @@ -1911,12 +1734,9 @@ class GraphColor_VBD // we need to use successive color ranges of width 64 // to represent all the possible colors on the graph. functorDeterministicColoringBitArray myDeterministicColoringBitArray( - this->xadj, this->adj, dependency, frontier, frontierSize, - newFrontier, newFrontierSize, maxColors, colors); - Kokkos::parallel_for( - "Deterministic Coloring: color nodes in frontier", - my_exec_space(0, host_frontierSize()), - myDeterministicColoringBitArray); // Loop over current frontier + this->xadj, this->adj, dependency, frontier, frontierSize, newFrontier, newFrontierSize, maxColors, colors); + Kokkos::parallel_for("Deterministic Coloring: color nodes in frontier", my_exec_space(0, host_frontierSize()), + myDeterministicColoringBitArray); // Loop over current frontier } Kokkos::deep_copy(host_newFrontierSize, newFrontierSize); } // while newFrontierSize @@ -1928,14 +1748,13 @@ class GraphColor_VBD nnz_lno_persistent_work_view_t score_; const_lno_row_view_t numNeighbors_; - functorScoreCalculation(nnz_lno_persistent_work_view_t &score, - const_lno_row_view_t &numNeighbors) + functorScoreCalculation(nnz_lno_persistent_work_view_t &score, 
const_lno_row_view_t &numNeighbors) : score_(score), numNeighbors_(numNeighbors) {} KOKKOS_INLINE_FUNCTION void operator()(const int i, size_type &update) const { score_(i) = numNeighbors_(i + 1) - numNeighbors_(i); - update = ((size_type)score_(i) < update ? update : (size_type)score_(i)); + update = ((size_type)score_(i) < update ? update : (size_type)score_(i)); } }; // functorScoreCalculation() @@ -1943,9 +1762,8 @@ class GraphColor_VBD Kokkos::View frontierSize_; Kokkos::View newFrontierSize_; - functorSwapOnDevice( - Kokkos::View frontierSize, - Kokkos::View newFrontierSize) + functorSwapOnDevice(Kokkos::View frontierSize, + Kokkos::View newFrontierSize) : frontierSize_(frontierSize), newFrontierSize_(newFrontierSize) {} KOKKOS_INLINE_FUNCTION @@ -1964,12 +1782,10 @@ class GraphColor_VBD nnz_lno_temp_work_view_t newFrontier_; Kokkos::View newFrontierSize_; - functorInitialDependency( - const_lno_row_view_t rowPtr, const_lno_nnz_view_t colInd, - nnz_lno_persistent_work_view_t score, - nnz_lno_persistent_work_view_t dependency, - nnz_lno_temp_work_view_t newFrontier, - Kokkos::View newFrontierSize) + functorInitialDependency(const_lno_row_view_t rowPtr, const_lno_nnz_view_t colInd, + nnz_lno_persistent_work_view_t score, nnz_lno_persistent_work_view_t dependency, + nnz_lno_temp_work_view_t newFrontier, + Kokkos::View newFrontierSize) : xadj_(rowPtr), adj_(colInd), score_(score), @@ -1979,8 +1795,7 @@ class GraphColor_VBD KOKKOS_INLINE_FUNCTION void operator()(const int node) const { - typedef typename std::remove_reference::type - atomic_incr_type; + typedef typename std::remove_reference::type atomic_incr_type; int myScore = score_(node); int numNeighs = xadj_(node + 1) - xadj_(node); nnz_lno_t numVerts = xadj_.extent(0) - 1; @@ -1996,9 +1811,8 @@ class GraphColor_VBD } } if (dependency_(node) == 0) { - const size_type newFrontierIdx = - Kokkos::atomic_fetch_add(&newFrontierSize_(), atomic_incr_type(1)); - newFrontier_(newFrontierIdx) = node; + const size_type 
newFrontierIdx = Kokkos::atomic_fetch_add(&newFrontierSize_(), atomic_incr_type(1)); + newFrontier_(newFrontierIdx) = node; } } @@ -2016,14 +1830,12 @@ class GraphColor_VBD color_view_type colors_; Kokkos::View bannedColors_; - functorDeterministicColoring( - const_lno_row_view_t rowPtr, const_lno_nnz_view_t colInd, - nnz_lno_persistent_work_view_t dependency, - nnz_lno_temp_work_view_t frontier, - Kokkos::View frontierSize, - nnz_lno_temp_work_view_t newFrontier, - Kokkos::View newFrontierSize, - size_type maxColors, color_view_type colors) + functorDeterministicColoring(const_lno_row_view_t rowPtr, const_lno_nnz_view_t colInd, + nnz_lno_persistent_work_view_t dependency, nnz_lno_temp_work_view_t frontier, + Kokkos::View frontierSize, + nnz_lno_temp_work_view_t newFrontier, + Kokkos::View newFrontierSize, size_type maxColors, + color_view_type colors) : xadj_(rowPtr), adj_(colInd), dependency_(dependency), @@ -2033,14 +1845,12 @@ class GraphColor_VBD newFrontierSize_(newFrontierSize), maxColors_(maxColors), colors_(colors), - bannedColors_("KokkosKernels::bannedColors", frontier.size(), - maxColors_) {} + bannedColors_("KokkosKernels::bannedColors", frontier.size(), maxColors_) {} KOKKOS_INLINE_FUNCTION void operator()(const size_type frontierIdx) const { nnz_lno_t numVerts = xadj_.extent(0) - 1; - typedef typename std::remove_reference::type - atomic_incr_type; + typedef typename std::remove_reference::type atomic_incr_type; size_type frontierNode = frontier_(frontierIdx); for (size_type colorIdx = 0; colorIdx < maxColors_; ++colorIdx) { bannedColors_(frontierIdx, colorIdx) = 0; @@ -2048,8 +1858,7 @@ class GraphColor_VBD // Loop over neighbors, find banned colors, decrement dependency and // update newFrontier - for (size_type i = xadj_(frontierNode); i < xadj_(frontierNode + 1); - ++i) { + for (size_type i = xadj_(frontierNode); i < xadj_(frontierNode + 1); ++i) { nnz_lno_t neigh = adj_(i); // Skip remote edges (in case this is part of a distributed graph) if 
(neigh >= numVerts) continue; @@ -2059,13 +1868,11 @@ class GraphColor_VBD // so let's check that the node is not already colored, i.e. // its dependency is not -1. if (dependency_(neigh) >= 0) { - nnz_lno_t myDependency = - Kokkos::atomic_fetch_add(&dependency_(neigh), -1); + nnz_lno_t myDependency = Kokkos::atomic_fetch_add(&dependency_(neigh), -1); // dependency(neigh) = dependency(neigh) - 1; if (myDependency - 1 == 0) { - const size_type newFrontierIdx = Kokkos::atomic_fetch_add( - &newFrontierSize_(), atomic_incr_type(1)); - newFrontier_(newFrontierIdx) = neigh; + const size_type newFrontierIdx = Kokkos::atomic_fetch_add(&newFrontierSize_(), atomic_incr_type(1)); + newFrontier_(newFrontierIdx) = neigh; } } } // Loop over neighbors @@ -2090,14 +1897,12 @@ class GraphColor_VBD size_type maxColors_; color_view_type colors_; - functorDeterministicColoringBitArray( - const_lno_row_view_t rowPtr, const_lno_nnz_view_t colInd, - nnz_lno_persistent_work_view_t dependency, - nnz_lno_temp_work_view_t frontier, - Kokkos::View frontierSize, - nnz_lno_temp_work_view_t newFrontier, - Kokkos::View newFrontierSize, - size_type maxColors, color_view_type colors) + functorDeterministicColoringBitArray(const_lno_row_view_t rowPtr, const_lno_nnz_view_t colInd, + nnz_lno_persistent_work_view_t dependency, nnz_lno_temp_work_view_t frontier, + Kokkos::View frontierSize, + nnz_lno_temp_work_view_t newFrontier, + Kokkos::View newFrontierSize, + size_type maxColors, color_view_type colors) : xadj_(rowPtr), adj_(colInd), dependency_(dependency), @@ -2110,8 +1915,7 @@ class GraphColor_VBD KOKKOS_INLINE_FUNCTION void operator()(const size_type frontierIdx) const { - typedef typename std::remove_reference::type - atomic_incr_type; + typedef typename std::remove_reference::type atomic_incr_type; nnz_lno_t numVerts = xadj_.extent(0) - 1; size_type frontierNode = frontier_(frontierIdx); // Initialize bit array to all bits = 0 @@ -2121,8 +1925,7 @@ class GraphColor_VBD while (myColor == 0) { 
// Loop over neighbors, find banned colors in the range: // [colorOffset + 1, colorOffset + 64] - for (size_type i = xadj_(frontierNode); i < xadj_(frontierNode + 1); - ++i) { + for (size_type i = xadj_(frontierNode); i < xadj_(frontierNode + 1); ++i) { nnz_lno_t neigh = adj_(i); if (neigh >= numVerts) continue; color_t neighColor = colors_(neigh); @@ -2136,12 +1939,10 @@ class GraphColor_VBD // so let's check that the node is not already colored, i.e. // its dependency is not -1. if (colorOffset == 0 && dependency_(neigh) >= 0) { - nnz_lno_t myDependency = - Kokkos::atomic_fetch_add(&dependency_(neigh), -1); + nnz_lno_t myDependency = Kokkos::atomic_fetch_add(&dependency_(neigh), -1); if (myDependency - 1 == 0) { - const size_type newFrontierIdx = Kokkos::atomic_fetch_add( - &newFrontierSize_(), atomic_incr_type(1)); - newFrontier_(newFrontierIdx) = neigh; + const size_type newFrontierIdx = Kokkos::atomic_fetch_add(&newFrontierSize_(), atomic_incr_type(1)); + newFrontier_(newFrontierIdx) = neigh; } } } // Loop over neighbors @@ -2169,10 +1970,8 @@ class GraphColor_VBD * Performs a edge_base coloring, with the hope of better load balance * as well as better memory accesses on GPUs. */ -template -class GraphColor_EB : public GraphColor { +template +class GraphColor_EB : public GraphColor { // FIXME SYCL: This does not work, returns colors with conflicts. 
public: typedef long long int ban_type; @@ -2187,41 +1986,30 @@ class GraphColor_EB : public GraphColor - single_dim_index_view_type; + typedef typename Kokkos::View single_dim_index_view_type; - typedef typename single_dim_index_view_type::HostMirror - single_dim_index_host_view_type; // Host view type + typedef typename single_dim_index_view_type::HostMirror single_dim_index_host_view_type; // Host view type typedef Kokkos::RangePolicy my_exec_space; - typedef typename HandleType::size_type_temp_work_view_t - size_type_temp_work_view_t; - typedef typename HandleType::size_type_persistent_work_view_t - size_type_persistent_work_view_t; + typedef typename HandleType::size_type_temp_work_view_t size_type_temp_work_view_t; + typedef typename HandleType::size_type_persistent_work_view_t size_type_persistent_work_view_t; - typedef - typename HandleType::nnz_lno_temp_work_view_t nnz_lno_temp_work_view_t; - typedef typename HandleType::nnz_lno_persistent_work_view_t - nnz_lno_persistent_work_view_t; + typedef typename HandleType::nnz_lno_temp_work_view_t nnz_lno_temp_work_view_t; + typedef typename HandleType::nnz_lno_persistent_work_view_t nnz_lno_persistent_work_view_t; - typedef typename Kokkos::View - color_temp_work_view_type; + typedef typename Kokkos::View color_temp_work_view_type; typedef Kokkos::View char_temp_work_view_type; - typedef typename char_temp_work_view_type::HostMirror - char_temp_work_host_view_type; // Host view type + typedef typename char_temp_work_view_type::HostMirror char_temp_work_host_view_type; // Host view type typedef typename in_row_index_view_type::const_type const_lno_row_view_t; - typedef typename in_nonzero_index_view_type::const_type - const_nonzero_index_view_type; + typedef typename in_nonzero_index_view_type::const_type const_nonzero_index_view_type; public: /** @@ -2231,12 +2019,10 @@ class GraphColor_EB : public GraphColor(nv_, ne_, row_map, entries, - coloring_handle) {} + : GraphColor(nv_, ne_, row_map, entries, + 
coloring_handle) {} /** * \brief Class Destructor. @@ -2256,7 +2042,7 @@ class GraphColor_EB : public GraphColorcp->get_eb_num_initial_colors(); double pps_cutoff = this->cp->get_min_reduction_for_conflictlist(); size_type ps_min = this->cp->get_min_elements_for_conflictlist(); - bool use_pps = (this->cp->get_conflict_list_type() == COLORING_PPS); + bool use_pps = (this->cp->get_conflict_list_type() == COLORING_PPS); bool tictoc = this->cp->get_tictoc(); @@ -2264,53 +2050,40 @@ class GraphColor_EB : public GraphColorcp->get_lower_diagonal_edge_list(this->nv, this->ne, this->xadj, - this->adj, numEdges, _kok_src, - _kok_dst); + this->cp->get_lower_diagonal_edge_list(this->nv, this->ne, this->xadj, this->adj, numEdges, _kok_src, _kok_dst); size_type num_work_edges = numEdges; // allocate memory for vertex ban colors, and tentative bans - color_temp_work_view_type color_ban( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "color_ban"), this->nv); - color_temp_work_view_type tentative_color_ban( - "tentative_color_ban", this->nv); // views are initialized with zero + color_temp_work_view_type color_ban(Kokkos::view_alloc(Kokkos::WithoutInitializing, "color_ban"), this->nv); + color_temp_work_view_type tentative_color_ban("tentative_color_ban", this->nv); // views are initialized with zero // allocate memory for vertex color set shifts. nnz_lno_temp_work_view_t color_set("color_set", this->nv); // initialized with zero. 
// initialize colors, color bans - Kokkos::parallel_for( - "KokkosGraph::GraphColoring::initColors", my_exec_space(0, this->nv), - init_colors(kok_colors, color_ban, numInitialColors, color_set)); + Kokkos::parallel_for("KokkosGraph::GraphColoring::initColors", my_exec_space(0, this->nv), + init_colors(kok_colors, color_ban, numInitialColors, color_set)); // std::cout << "nv:" << this->nv << " init_colors" << std::endl; // worklist size_type_temp_work_view_t edge_conflict_indices( - Kokkos::view_alloc(Kokkos::WithoutInitializing, - "edge_conflict_indices"), - num_work_edges); + Kokkos::view_alloc(Kokkos::WithoutInitializing, "edge_conflict_indices"), num_work_edges); // next iterations conflict list size_type_temp_work_view_t new_edge_conflict_indices( - Kokkos::view_alloc(Kokkos::WithoutInitializing, - "new_edge_conflict_indices"), - num_work_edges); + Kokkos::view_alloc(Kokkos::WithoutInitializing, "new_edge_conflict_indices"), num_work_edges); char_temp_work_view_type edge_conflict_marker( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "edge_conflict_marker"), - num_work_edges); + Kokkos::view_alloc(Kokkos::WithoutInitializing, "edge_conflict_marker"), num_work_edges); // initialize the worklist sequentiall, and markers as 1. 
- Kokkos::parallel_for( - "KokkosGraph::GraphColoring::InitWorkArrays", - my_exec_space(0, num_work_edges), - init_work_arrays(edge_conflict_indices, edge_conflict_marker)); + Kokkos::parallel_for("KokkosGraph::GraphColoring::InitWorkArrays", my_exec_space(0, num_work_edges), + init_work_arrays(edge_conflict_indices, edge_conflict_marker)); MyExecSpace().fence(); // std::cout << "nv:" << this->nv << " init_work_arrays" << std::endl; @@ -2319,8 +2092,7 @@ class GraphColor_EB : public GraphColorseconds(); timer->reset(); } - double mc_time = 0, cnt_time = 0, ban_time = 0, expand_ban_time = 0, - color_time = 0, pps_time = 0; + double mc_time = 0, cnt_time = 0, ban_time = 0, expand_ban_time = 0, color_time = 0, pps_time = 0; size_type i = 0; @@ -2340,12 +2112,9 @@ class GraphColor_EB : public GraphColornv << " i:" << i << " num_work_edges:" << // num_work_edges<< std::endl; conflict detection mark conflicts as color // 0. update their bans - Kokkos::parallel_for( - "KokkosGraph::GraphColoring::HalfEdgeMarkConflicts", - my_exec_space(0, num_work_edges), - halfedge_mark_conflicts(_kok_src, _kok_dst, kok_colors, color_set, - color_ban, tentative_color_ban, - edge_conflict_indices)); + Kokkos::parallel_for("KokkosGraph::GraphColoring::HalfEdgeMarkConflicts", my_exec_space(0, num_work_edges), + halfedge_mark_conflicts(_kok_src, _kok_dst, kok_colors, color_set, color_ban, + tentative_color_ban, edge_conflict_indices)); MyExecSpace().fence(); // std::cout << "nv:" << this->nv << " i:" << i << " @@ -2361,13 +2130,10 @@ class GraphColor_EB : public GraphColor 0) - Kokkos::parallel_reduce( - "KokkosGraph::GraphColoring::HalfEdgeConflictsCount", - my_exec_space(0, num_work_edges), - halfedge_conflict_count(_kok_src, _kok_dst, kok_colors, color_set, - edge_conflict_indices, - edge_conflict_marker), - num_conflict_reduction); + Kokkos::parallel_reduce("KokkosGraph::GraphColoring::HalfEdgeConflictsCount", my_exec_space(0, num_work_edges), + halfedge_conflict_count(_kok_src, 
_kok_dst, kok_colors, color_set, + edge_conflict_indices, edge_conflict_marker), + num_conflict_reduction); MyExecSpace().fence(); @@ -2396,26 +2162,19 @@ class GraphColor_EB : public GraphColor ps_min && - num_conflict_reduction / double(num_work_edges) > pps_cutoff) { + if (num_work_edges > ps_min && num_conflict_reduction / double(num_work_edges) > pps_cutoff) { // use_pps = false; if (use_pps) { - Kokkos::parallel_scan("KokkosGraph::GraphColoring::CalcEdgePositions", - my_exec_space(0, num_work_edges), - ppsWorklistFunctorEB(edge_conflict_indices, - new_edge_conflict_indices, - edge_conflict_marker)); + Kokkos::parallel_scan( + "KokkosGraph::GraphColoring::CalcEdgePositions", my_exec_space(0, num_work_edges), + ppsWorklistFunctorEB(edge_conflict_indices, new_edge_conflict_indices, edge_conflict_marker)); } else { // create new worklist - single_dim_index_view_type new_index = - single_dim_index_view_type("recolorListLength"); + single_dim_index_view_type new_index = single_dim_index_view_type("recolorListLength"); ; - Kokkos::parallel_for( - "KokkosGraph::GraphColoring::CreateNewWorkArrayAtomic", - my_exec_space(0, num_work_edges), - atomic_create_new_work_array(new_index, edge_conflict_indices, - edge_conflict_marker, - new_edge_conflict_indices)); + Kokkos::parallel_for("KokkosGraph::GraphColoring::CreateNewWorkArrayAtomic", my_exec_space(0, num_work_edges), + atomic_create_new_work_array(new_index, edge_conflict_indices, edge_conflict_marker, + new_edge_conflict_indices)); MyExecSpace().fence(); } @@ -2433,12 +2192,9 @@ class GraphColor_EB : public GraphColorseconds(); @@ -2463,27 +2217,22 @@ class GraphColor_EB : public GraphColornv), - choose_colors(kok_colors, color_set, color_ban, tentative_color_ban)); + Kokkos::parallel_for("KokkosGraph::GraphColoring::ChooseColors", my_exec_space(0, this->nv), + choose_colors(kok_colors, color_set, color_ban, tentative_color_ban)); if (tictoc) { color_time += timer->seconds(); timer->reset(); } } if (tictoc) { - 
std::cout << "\tinit_time:" << inittime << " mc:" << mc_time - << " cnt_time:" << cnt_time << " ban_time:" << ban_time - << " expand ban time:" << expand_ban_time - << " pps time:" << pps_time << " color time:" << color_time - << std::endl + std::cout << "\tinit_time:" << inittime << " mc:" << mc_time << " cnt_time:" << cnt_time + << " ban_time:" << ban_time << " expand ban time:" << expand_ban_time << " pps time:" << pps_time + << " color time:" << color_time << std::endl << std::endl; } // set the final colors. - Kokkos::parallel_for("KokkosGraph::GraphColoring::SetFinalColors", - my_exec_space(0, this->nv), + Kokkos::parallel_for("KokkosGraph::GraphColoring::SetFinalColors", my_exec_space(0, this->nv), set_final_colors(kok_colors, color_set)); num_loops = i; @@ -2500,7 +2249,7 @@ class GraphColor_EB : public GraphColor - _color_set(s))) || // if source is colored, and destination - // color set is larger than source - (dc && (_color_set(s) > - _color_set(d))) // or if destionation is colored, and the - // source color set is larger + if ((dc && sc) || // if both colored + (sc && (_color_set(d) > _color_set(s))) || // if source is colored, and destination + // color set is larger than source + (dc && (_color_set(s) > _color_set(d))) // or if destionation is colored, and the + // source color set is larger ) { // then no need to look at this edge anymore. 
_edge_conflict_marker(w) = 0; @@ -2696,8 +2430,7 @@ class GraphColor_EB : public GraphColor::type - atomic_incr_type; + typedef typename std::remove_reference::type atomic_incr_type; size_type w = _edge_conflict_indices(ii); if (_edge_conflict_marker(w)) { - const size_type future_index = - Kokkos::atomic_fetch_add(&_new_index(), atomic_incr_type(1)); + const size_type future_index = Kokkos::atomic_fetch_add(&_new_index(), atomic_incr_type(1)); _new_edge_conflict_indices(future_index) = w; } } @@ -2751,11 +2480,9 @@ class GraphColor_EB : public GraphColor(&(color_ban(uncolored_vertex)), - src_col | dst_col); + Kokkos::atomic_fetch_or(&(color_ban(uncolored_vertex)), src_col | dst_col); edge_conflict_marker(work_index) = 0; } } @@ -2821,9 +2544,8 @@ class GraphColor_EB : public GraphColor dst_id) ? src_id : dst_id; - nnz_lno_t smaller_index = - dst_id; // TODO which one is better? this seems to be not - // much changing + nnz_lno_t smaller_index = dst_id; // TODO which one is better? this seems to be not + // much changing // idx smaller_index = src_id; // then both have been colored tentavitely. propoagate the color // of src to dst. 
- Kokkos::atomic_fetch_or( - &(tentative_color_ban(smaller_index)), -src_col); - nnz_lno_t banned_colors = ~(color_ban(smaller_index) | - tentative_color_ban(smaller_index)); + Kokkos::atomic_fetch_or(&(tentative_color_ban(smaller_index)), -src_col); + nnz_lno_t banned_colors = ~(color_ban(smaller_index) | tentative_color_ban(smaller_index)); nnz_lno_t larger_col = banned_colors & (-banned_colors); kokcolors(smaller_index) = -(larger_col); } @@ -2909,16 +2627,14 @@ class GraphColor_EB : public GraphColor(&(color_ban(dst_id)), // -src_col); - Kokkos::atomic_fetch_or(&(tentative_color_ban(dst_id)), - -src_col); + Kokkos::atomic_fetch_or(&(tentative_color_ban(dst_id)), -src_col); } else if (dst_col != 0) { // if it is dst tentatively colors, but src is not colored, // then we send the dst color info to src's tentative_ban // Kokkos::atomic_fetch_or(&(color_ban(src_id)), // -dst_col); - Kokkos::atomic_fetch_or(&(tentative_color_ban(src_id)), - -dst_col); + Kokkos::atomic_fetch_or(&(tentative_color_ban(src_id)), -dst_col); } else { // idx smaller_index = src_id < dst_id > 0 ? src_id: dst_id; // idx larger_index = src_id < dst_id > 0 ? 
dst_id : src_id; @@ -2937,16 +2653,14 @@ class GraphColor_EB : public GraphColor( - &(tentative_color_ban(larger_index)), src_col); + Kokkos::atomic_fetch_or(&(tentative_color_ban(larger_index)), src_col); // Kokkos::atomic_fetch_or(&(color_ban(dst_id)), // src_col); } @@ -2961,15 +2675,13 @@ class GraphColor_EB : public GraphColor -void graph_color_impl(KernelHandle *handle, - typename KernelHandle::nnz_lno_t num_rows, - lno_row_view_t_ row_map, lno_nnz_view_t_ entries) { +template +void graph_color_impl(KernelHandle *handle, typename KernelHandle::nnz_lno_t num_rows, lno_row_view_t_ row_map, + lno_nnz_view_t_ entries) { Kokkos::Timer timer; - typename KernelHandle::GraphColoringHandleType *gch = - handle->get_graph_coloring_handle(); + typename KernelHandle::GraphColoringHandleType *gch = handle->get_graph_coloring_handle(); ColoringAlgorithm algorithm = gch->get_coloring_algo_type(); - typedef typename KernelHandle::GraphColoringHandleType::color_view_t - color_view_type; + typedef typename KernelHandle::GraphColoringHandleType::color_view_t color_view_type; gch->set_tictoc(handle->get_verbose()); @@ -3119,46 +2820,35 @@ void graph_color_impl(KernelHandle *handle, colors_out = color_view_type("Graph Colors", num_rows); } - typedef - typename Impl::GraphColor - BaseGraphColoring; + typedef typename Impl::GraphColor + BaseGraphColoring; BaseGraphColoring *gc = NULL; switch (algorithm) { - case COLORING_SERIAL: - gc = new BaseGraphColoring(num_rows, entries.extent(0), row_map, entries, - gch); - break; + case COLORING_SERIAL: gc = new BaseGraphColoring(num_rows, entries.extent(0), row_map, entries, gch); break; case COLORING_VB: case COLORING_VBBIT: case COLORING_VBCS: - typedef typename Impl::GraphColor_VB< - typename KernelHandle::GraphColoringHandleType, lno_row_view_t_, - lno_nnz_view_t_> - VBGraphColoring; - gc = new VBGraphColoring(num_rows, entries.extent(0), row_map, entries, - gch); + typedef + typename Impl::GraphColor_VB + VBGraphColoring; + gc = new 
VBGraphColoring(num_rows, entries.extent(0), row_map, entries, gch); break; case COLORING_VBD: case COLORING_VBDBIT: - typedef typename Impl::GraphColor_VBD< - typename KernelHandle::GraphColoringHandleType, lno_row_view_t_, - lno_nnz_view_t_> + typedef typename Impl::GraphColor_VBD VBDGraphColoring; - gc = new VBDGraphColoring(num_rows, entries.extent(0), row_map, entries, - gch); + gc = new VBDGraphColoring(num_rows, entries.extent(0), row_map, entries, gch); break; case COLORING_EB: - typedef typename Impl::GraphColor_EB< - typename KernelHandle::GraphColoringHandleType, lno_row_view_t_, - lno_nnz_view_t_> - EBGraphColoring; - gc = new EBGraphColoring(num_rows, entries.extent(0), row_map, entries, - gch); + typedef + typename Impl::GraphColor_EB + EBGraphColoring; + gc = new EBGraphColoring(num_rows, entries.extent(0), row_map, entries, gch); break; case COLORING_DEFAULT: break; diff --git a/graph/impl/KokkosGraph_Distance2Color_impl.hpp b/graph/impl/KokkosGraph_Distance2Color_impl.hpp index 58b6d79ebb..cfa5186283 100644 --- a/graph/impl/KokkosGraph_Distance2Color_impl.hpp +++ b/graph/impl/KokkosGraph_Distance2Color_impl.hpp @@ -53,8 +53,7 @@ namespace Impl { * Distance-1 conflicts will not be checked. * */ -template +template class GraphColorDistance2 { // Need mutable entries type for edge filtering using nc_entries_t = typename entries_t::non_const_type; @@ -109,9 +108,8 @@ class GraphColorDistance2 { * \param handle: GraphColoringHandle object that holds the specification * about the graph coloring, including parameters. 
*/ - GraphColorDistance2(lno_t nr_, lno_t nc_, rowmap_t row_map, entries_t entries, - rowmap_t t_row_map, entries_t t_entries, - HandleType* handle) + GraphColorDistance2(lno_t nr_, lno_t nc_, rowmap_t row_map, entries_t entries, rowmap_t t_row_map, + entries_t t_entries, HandleType* handle) : nr(nr_), nc(nc_), ne(entries.extent(0)), @@ -163,9 +161,8 @@ class GraphColorDistance2 { case COLORING_D2_NB_BIT: compute_d2_coloring_nb(colors_out); break; case COLORING_D2_SERIAL: compute_d2_coloring_serial(colors_out); break; default: - throw std::runtime_error( - std::string("D2 coloring handle has invalid algorithm: ") + - std::to_string((int)this->gc_handle->get_coloring_algo_type())); + throw std::runtime_error(std::string("D2 coloring handle has invalid algorithm: ") + + std::to_string((int)this->gc_handle->get_coloring_algo_type())); } } @@ -179,16 +176,11 @@ class GraphColorDistance2 { // adjacency list ) if (this->_ticToc) { std::cout << "\tcolor_graph_d2 params:" << std::endl - << "\t algorithm : " - << this->gc_handle->getD2AlgorithmName() << std::endl - << "\t ticToc : " << this->_ticToc - << std::endl - << "\t max_num_iterations : " - << this->_max_num_iterations << std::endl - << "\t chunkSize : " << this->_chunkSize - << std::endl - << "\t Edge Filtering Pass? : " - << (int)using_edge_filtering << std::endl + << "\t algorithm : " << this->gc_handle->getD2AlgorithmName() << std::endl + << "\t ticToc : " << this->_ticToc << std::endl + << "\t max_num_iterations : " << this->_max_num_iterations << std::endl + << "\t chunkSize : " << this->_chunkSize << std::endl + << "\t Edge Filtering Pass? : " << (int)using_edge_filtering << std::endl << "\tgraph information:" << std::endl << "\t nr : " << this->nr << std::endl << "\t ne : " << this->ne << std::endl; @@ -203,9 +195,7 @@ class GraphColorDistance2 { // conflictlist - store conflicts that can happen when we're coloring in // parallel. 
- lno_view_t current_vertexList( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "vertexList"), - this->nr); + lno_view_t current_vertexList(Kokkos::view_alloc(Kokkos::WithoutInitializing, "vertexList"), this->nr); lno_t current_vertexListLength = this->nr; @@ -215,13 +205,10 @@ class GraphColorDistance2 { current_vertexListLength = this->gc_handle->get_vertex_list_size(); } else { // init conflictlist sequentially. - Kokkos::parallel_for("InitList", range_policy_type(0, this->nr), - functorInitList(current_vertexList)); + Kokkos::parallel_for("InitList", range_policy_type(0, this->nr), functorInitList(current_vertexList)); } // Next iteratons's conflictList - lno_view_t next_iteration_recolorList( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "recolorList"), - this->nr); + lno_view_t next_iteration_recolorList(Kokkos::view_alloc(Kokkos::WithoutInitializing, "recolorList"), this->nr); // Size the next iteration conflictList single_lno_view_t next_iteration_recolorListLength("recolorListLength"); @@ -251,15 +238,11 @@ class GraphColorDistance2 { // entries_t, // so that it has the same type as adj // * on the other hand, t_adj is not actually modified by EF functor - lno_view_t adj_copy( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "adj copy"), - this->ne); + lno_view_t adj_copy(Kokkos::view_alloc(Kokkos::WithoutInitializing, "adj copy"), this->ne); Kokkos::deep_copy(adj_copy, this->adj); - this->colorGreedyEF(this->xadj, adj_copy, this->t_xadj, this->t_adj, - colors_out); + this->colorGreedyEF(this->xadj, adj_copy, this->t_xadj, this->t_adj, colors_out); } else { - this->colorGreedy(this->xadj, this->adj, this->t_xadj, this->t_adj, - colors_out, current_vertexList, + this->colorGreedy(this->xadj, this->adj, this->t_xadj, this->t_adj, colors_out, current_vertexList, current_vertexListLength); } @@ -269,10 +252,8 @@ class GraphColorDistance2 { time = timer.seconds(); total_time += time; std::cout << "\tIteration: " << iter << std::endl - << "\t - Time 
speculative greedy phase : " << time - << std::endl - << "\t - Num Uncolored (greedy-color) : " << numUncolored - << std::endl; + << "\t - Time speculative greedy phase : " << time << std::endl + << "\t - Num Uncolored (greedy-color) : " << numUncolored << std::endl; gc_handle->add_to_overall_coloring_time_phase1(time); @@ -289,20 +270,17 @@ class GraphColorDistance2 { // NOTE: not using colorset algorithm in this so we don't include colorset // data - numUncolored = this->findConflicts( - swap_work_arrays, this->xadj, this->adj, this->t_xadj, this->t_adj, - colors_out, current_vertexList, current_vertexListLength, - next_iteration_recolorList, next_iteration_recolorListLength); + numUncolored = this->findConflicts(swap_work_arrays, this->xadj, this->adj, this->t_xadj, this->t_adj, colors_out, + current_vertexList, current_vertexListLength, next_iteration_recolorList, + next_iteration_recolorListLength); execution_space().fence(); if (_ticToc) { time = timer.seconds(); total_time += time; - std::cout << "\t - Time conflict detection : " << time - << std::endl; - std::cout << "\t - Num Uncolored (conflicts) : " << numUncolored - << std::endl; + std::cout << "\t - Time conflict detection : " << time << std::endl; + std::cout << "\t - Num Uncolored (conflicts) : " << numUncolored << std::endl; gc_handle->add_to_overall_coloring_time_phase2(time); timer.reset(); } @@ -315,9 +293,8 @@ class GraphColorDistance2 { current_vertexList = next_iteration_recolorList; next_iteration_recolorList = temp; - current_vertexListLength = numUncolored; - next_iteration_recolorListLength = - single_lno_view_t("recolorListLength"); + current_vertexListLength = numUncolored; + next_iteration_recolorListLength = single_lno_view_t("recolorListLength"); } } @@ -331,8 +308,7 @@ class GraphColorDistance2 { // clean up in serial (resolveConflictsSerial) // ------------------------------------------ if (numUncolored > 0) { - this->resolveConflictsSerial(this->xadj, this->adj, this->t_xadj, - 
this->t_adj, colors_out, current_vertexList, + this->resolveConflictsSerial(this->xadj, this->adj, this->t_xadj, this->t_adj, colors_out, current_vertexList, current_vertexListLength); } @@ -341,10 +317,8 @@ class GraphColorDistance2 { if (_ticToc) { time = timer.seconds(); total_time += time; - std::cout << "\tTime serial conflict resolution : " << time - << std::endl; - std::cout << "\tTotal time for coloring : " << total_time - << std::endl; + std::cout << "\tTime serial conflict resolution : " << time << std::endl; + std::cout << "\tTotal time for coloring : " << total_time << std::endl; gc_handle->add_to_overall_coloring_time_phase3(time); } @@ -356,11 +330,9 @@ class GraphColorDistance2 { template struct NB_Coloring { - NB_Coloring(const lno_view_t& worklist_, const single_lno_view_t& worklen_, - color_type colorBase_, const forbidden_view& forbidden_, - color_view_type colors_, const rowmap_t& Vrowmap_, - const entries_t& Vcolinds_, lno_t vertsPerThread_, - lno_t numCols_) + NB_Coloring(const lno_view_t& worklist_, const single_lno_view_t& worklen_, color_type colorBase_, + const forbidden_view& forbidden_, color_view_type colors_, const rowmap_t& Vrowmap_, + const entries_t& Vcolinds_, lno_t vertsPerThread_, lno_t numCols_) : worklist(worklist_), worklen(worklen_), colorBase(colorBase_), @@ -387,8 +359,7 @@ class GraphColorDistance2 { for (size_type j = rowBegin; j < rowEnd; j++) { lno_t nei = Vcolinds(j); if (nei < numCols) { - for (int b = 0; b < batch; b++) - forbid[b] |= forbidden(nei * batch + b); + for (int b = 0; b < batch; b++) forbid[b] |= forbidden(nei * batch + b); } } // Find the first 0 bit in forbid @@ -405,27 +376,22 @@ class GraphColorDistance2 { break; } } - if (color && (colors(v) == 0 || colors(v) == CONFLICTED || - colors(v) == UNCOLORABLE)) { + if (color && (colors(v) == 0 || colors(v) == CONFLICTED || colors(v) == UNCOLORABLE)) { // Color v colors(v) = color; if (!doing_bipartite) { // Update forbidden for v (preventing dist-1 
conflicts) - if (v < numCols) - Kokkos::atomic_fetch_or(&forbidden(v * batch + colorWord), - (uint32_t)1 << colorBit); + if (v < numCols) Kokkos::atomic_fetch_or(&forbidden(v * batch + colorWord), (uint32_t)1 << colorBit); } // Update forbidden for all of v's neighbors for (size_type j = rowBegin; j < rowEnd; j++) { lno_t nei = Vcolinds(j); if (nei < numCols) { // Update column forbidden - Kokkos::atomic_fetch_or(&forbidden(nei * batch + colorWord), - (uint32_t)1 << colorBit); + Kokkos::atomic_fetch_or(&forbidden(nei * batch + colorWord), (uint32_t)1 << colorBit); } } - } else if (colors(v) == 0 || colors(v) == CONFLICTED || - colors(v) == UNCOLORABLE) { + } else if (colors(v) == 0 || colors(v) == CONFLICTED || colors(v) == UNCOLORABLE) { colors(v) = UNCOLORABLE; } } @@ -444,9 +410,8 @@ class GraphColorDistance2 { template struct NB_Conflict { - NB_Conflict(color_type colorBase_, const forbidden_view& forbidden_, - const color_view_type& colors_, const rowmap_t& Crowmap_, - const entries_t& Ccolinds_, lno_t numVerts_) + NB_Conflict(color_type colorBase_, const forbidden_view& forbidden_, const color_view_type& colors_, + const rowmap_t& Crowmap_, const entries_t& Ccolinds_, lno_t numVerts_) : colorBase(colorBase_), forbidden(forbidden_), colors(colors_), @@ -513,10 +478,8 @@ class GraphColorDistance2 { template struct NB_RefreshForbidden { - NB_RefreshForbidden(color_type colorBase_, const forbidden_view& forbidden_, - const color_view_type& colors_, - const rowmap_t& Crowmap_, const entries_t& Ccolinds_, - lno_t numVerts_) + NB_RefreshForbidden(color_type colorBase_, const forbidden_view& forbidden_, const color_view_type& colors_, + const rowmap_t& Crowmap_, const entries_t& Ccolinds_, lno_t numVerts_) : colorBase(colorBase_), colorEnd(colorBase + 32 * batch), forbidden(forbidden_), @@ -563,12 +526,11 @@ class GraphColorDistance2 { }; struct NB_Worklist { - NB_Worklist(const color_view_type colors_, const lno_view_t& worklist_, - const single_lno_view_t& 
worklen_, lno_t nr_) + NB_Worklist(const color_view_type colors_, const lno_view_t& worklist_, const single_lno_view_t& worklen_, + lno_t nr_) : colors(colors_), worklist(worklist_), worklen(worklen_), nr(nr_) {} - KOKKOS_INLINE_FUNCTION void operator()(const lno_t v, lno_t& lnum, - bool finalPass) const { + KOKKOS_INLINE_FUNCTION void operator()(const lno_t v, lno_t& lnum, bool finalPass) const { if (colors(v) == CONFLICTED) { if (finalPass) worklist(lnum) = v; lnum++; @@ -587,12 +549,11 @@ class GraphColorDistance2 { }; struct NB_UpdateBatch { - NB_UpdateBatch(const color_view_type& colors_, const lno_view_t& worklist_, - const single_lno_view_t& worklen_, lno_t nr_) + NB_UpdateBatch(const color_view_type& colors_, const lno_view_t& worklist_, const single_lno_view_t& worklen_, + lno_t nr_) : colors(colors_), worklist(worklist_), worklen(worklen_), nr(nr_) {} - KOKKOS_INLINE_FUNCTION void operator()(const lno_t v, lno_t& lnum, - bool finalPass) const { + KOKKOS_INLINE_FUNCTION void operator()(const lno_t v, lno_t& lnum, bool finalPass) const { if (colors(v) == UNCOLORABLE) { if (finalPass) worklist(lnum) = v; lnum++; @@ -630,8 +591,7 @@ class GraphColorDistance2 { Kokkos::deep_copy(worklen, this->nr); // init conflictlist sequentially. - Kokkos::parallel_for("InitList", range_policy_type(0, this->nr), - functorInitList(worklist)); + Kokkos::parallel_for("InitList", range_policy_type(0, this->nr), functorInitList(worklist)); // Estimate the number of colors that will be needed // The algorithm can't use more colors than the max distance-2 degree, @@ -670,7 +630,7 @@ class GraphColorDistance2 { // for batch size while (currentWork) { lno_t vertsPerThread = 1; - lno_t workBatches = (currentWork + vertsPerThread - 1) / vertsPerThread; + lno_t workBatches = (currentWork + vertsPerThread - 1) / vertsPerThread; timer.reset(); // if still using this color set, refresh forbidden. 
// This avoids using too many colors, by relying on forbidden from @@ -681,26 +641,22 @@ class GraphColorDistance2 { case 1: Kokkos::parallel_for( "NB D2 Forbidden", range_policy_type(0, numCols), - NB_RefreshForbidden<1>(colorBase, forbidden, colors_out, - this->t_xadj, this->t_adj, numVerts)); + NB_RefreshForbidden<1>(colorBase, forbidden, colors_out, this->t_xadj, this->t_adj, numVerts)); break; case 2: Kokkos::parallel_for( "NB D2 Forbidden", range_policy_type(0, numCols), - NB_RefreshForbidden<2>(colorBase, forbidden, colors_out, - this->t_xadj, this->t_adj, numVerts)); + NB_RefreshForbidden<2>(colorBase, forbidden, colors_out, this->t_xadj, this->t_adj, numVerts)); break; case 4: Kokkos::parallel_for( "NB D2 Forbidden", range_policy_type(0, numCols), - NB_RefreshForbidden<4>(colorBase, forbidden, colors_out, - this->t_xadj, this->t_adj, numVerts)); + NB_RefreshForbidden<4>(colorBase, forbidden, colors_out, this->t_xadj, this->t_adj, numVerts)); break; case 8: Kokkos::parallel_for( "NB D2 Forbidden", range_policy_type(0, numCols), - NB_RefreshForbidden<8>(colorBase, forbidden, colors_out, - this->t_xadj, this->t_adj, numVerts)); + NB_RefreshForbidden<8>(colorBase, forbidden, colors_out, this->t_xadj, this->t_adj, numVerts)); break; default:; } @@ -709,62 +665,46 @@ class GraphColorDistance2 { switch (batch) { case 1: timer.reset(); - Kokkos::parallel_for( - "NB D2 Coloring", range_policy_type(0, workBatches), - NB_Coloring<1>(worklist, worklen, colorBase, forbidden, - colors_out, this->xadj, this->adj, - vertsPerThread, numCols)); + Kokkos::parallel_for("NB D2 Coloring", range_policy_type(0, workBatches), + NB_Coloring<1>(worklist, worklen, colorBase, forbidden, colors_out, this->xadj, + this->adj, vertsPerThread, numCols)); colorTime += timer.seconds(); timer.reset(); - Kokkos::parallel_for( - "NB D2 Conflict Resolution", range_policy_type(0, numCols), - NB_Conflict<1>(colorBase, forbidden, colors_out, this->t_xadj, - this->t_adj, numVerts)); + 
Kokkos::parallel_for("NB D2 Conflict Resolution", range_policy_type(0, numCols), + NB_Conflict<1>(colorBase, forbidden, colors_out, this->t_xadj, this->t_adj, numVerts)); conflictTime += timer.seconds(); break; case 2: timer.reset(); - Kokkos::parallel_for( - "NB D2 Coloring", range_policy_type(0, workBatches), - NB_Coloring<2>(worklist, worklen, colorBase, forbidden, - colors_out, this->xadj, this->adj, - vertsPerThread, numCols)); + Kokkos::parallel_for("NB D2 Coloring", range_policy_type(0, workBatches), + NB_Coloring<2>(worklist, worklen, colorBase, forbidden, colors_out, this->xadj, + this->adj, vertsPerThread, numCols)); colorTime += timer.seconds(); timer.reset(); - Kokkos::parallel_for( - "NB D2 Conflict Resolution", range_policy_type(0, numCols), - NB_Conflict<2>(colorBase, forbidden, colors_out, this->t_xadj, - this->t_adj, numVerts)); + Kokkos::parallel_for("NB D2 Conflict Resolution", range_policy_type(0, numCols), + NB_Conflict<2>(colorBase, forbidden, colors_out, this->t_xadj, this->t_adj, numVerts)); conflictTime += timer.seconds(); break; case 4: timer.reset(); - Kokkos::parallel_for( - "NB D2 Coloring", range_policy_type(0, workBatches), - NB_Coloring<4>(worklist, worklen, colorBase, forbidden, - colors_out, this->xadj, this->adj, - vertsPerThread, numCols)); + Kokkos::parallel_for("NB D2 Coloring", range_policy_type(0, workBatches), + NB_Coloring<4>(worklist, worklen, colorBase, forbidden, colors_out, this->xadj, + this->adj, vertsPerThread, numCols)); colorTime += timer.seconds(); timer.reset(); - Kokkos::parallel_for( - "NB D2 Conflict Resolution", range_policy_type(0, numCols), - NB_Conflict<4>(colorBase, forbidden, colors_out, this->t_xadj, - this->t_adj, numVerts)); + Kokkos::parallel_for("NB D2 Conflict Resolution", range_policy_type(0, numCols), + NB_Conflict<4>(colorBase, forbidden, colors_out, this->t_xadj, this->t_adj, numVerts)); conflictTime += timer.seconds(); break; case 8: timer.reset(); - Kokkos::parallel_for( - "NB D2 Coloring", 
range_policy_type(0, workBatches), - NB_Coloring<8>(worklist, worklen, colorBase, forbidden, - colors_out, this->xadj, this->adj, - vertsPerThread, numCols)); + Kokkos::parallel_for("NB D2 Coloring", range_policy_type(0, workBatches), + NB_Coloring<8>(worklist, worklen, colorBase, forbidden, colors_out, this->xadj, + this->adj, vertsPerThread, numCols)); colorTime += timer.seconds(); timer.reset(); - Kokkos::parallel_for( - "NB D2 Conflict Resolution", range_policy_type(0, numCols), - NB_Conflict<8>(colorBase, forbidden, colors_out, this->t_xadj, - this->t_adj, numVerts)); + Kokkos::parallel_for("NB D2 Conflict Resolution", range_policy_type(0, numCols), + NB_Conflict<8>(colorBase, forbidden, colors_out, this->t_xadj, this->t_adj, numVerts)); conflictTime += timer.seconds(); break; default: @@ -774,17 +714,15 @@ class GraphColorDistance2 { } timer.reset(); // Then build the next worklist - Kokkos::parallel_scan( - "NB D2 worklist", range_policy_type(0, numVerts), - NB_Worklist(colors_out, worklist, worklen, numVerts), currentWork); + Kokkos::parallel_scan("NB D2 worklist", range_policy_type(0, numVerts), + NB_Worklist(colors_out, worklist, worklen, numVerts), currentWork); worklistTime += timer.seconds(); timer.reset(); iter++; } // Will need to run with a different color base, so rebuild the work list - Kokkos::parallel_scan( - "NB D2 Worklist Rebuild", range_policy_type(0, numVerts), - NB_UpdateBatch(colors_out, worklist, worklen, numVerts)); + Kokkos::parallel_scan("NB D2 Worklist Rebuild", range_policy_type(0, numVerts), + NB_UpdateBatch(colors_out, worklist, worklen, numVerts)); Kokkos::deep_copy(currentWork, worklen); worklistTime += timer.seconds(); timer.reset(); @@ -802,9 +740,7 @@ class GraphColorDistance2 { std::cout << "Conflict: " << conflictTime << '\n'; std::cout << "Forbidden: " << forbiddenTime << '\n'; std::cout << "Worklist: " << worklistTime << '\n'; - std::cout << "** Total: " - << colorTime + conflictTime + forbiddenTime + worklistTime - << 
"\n\n"; + std::cout << "** Total: " << colorTime + conflictTime + forbiddenTime + worklistTime << "\n\n"; } if (this->_ticToc) { gc_handle->add_to_overall_coloring_time_phase1(timer.seconds()); @@ -838,8 +774,8 @@ class GraphColorDistance2 { Kokkos::View Vcolinds = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), this->adj); // Create worklist - Kokkos::View worklist( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Worklist"), this->nr); + Kokkos::View worklist(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Worklist"), + this->nr); int iter = 0; Kokkos::Timer timer; lno_t currentWork = this->nr; @@ -898,10 +834,8 @@ class GraphColorDistance2 { // GraphColorDistance2::colorGreedy() // // ----------------------------------------------------------------- - void colorGreedy(rowmap_t xadj_, entries_t adj_, rowmap_t t_xadj_, - entries_t t_adj_, color_view_type vertex_colors_, - lno_view_t current_vertexList_, - lno_t current_vertexListLength_) { + void colorGreedy(rowmap_t xadj_, entries_t adj_, rowmap_t t_xadj_, entries_t t_adj_, color_view_type vertex_colors_, + lno_view_t current_vertexList_, lno_t current_vertexListLength_) { lno_t chunkSize_ = this->_chunkSize; if (current_vertexListLength_ < 100 * chunkSize_) { @@ -917,11 +851,9 @@ class GraphColorDistance2 { // 3. [S] loop over vertex neighbors // 4. [S] loop over vertex neighbors of neighbors case COLORING_D2_VB: { - functorGreedyColorVB gc(this->nr, this->nc, xadj_, adj_, t_xadj_, - t_adj_, vertex_colors_, current_vertexList_, + functorGreedyColorVB gc(this->nr, this->nc, xadj_, adj_, t_xadj_, t_adj_, vertex_colors_, current_vertexList_, current_vertexListLength_); - Kokkos::parallel_for("LoopOverChunks", range_policy_type(0, this->nr), - gc); + Kokkos::parallel_for("LoopOverChunks", range_policy_type(0, this->nr), gc); } break; // One level Perallelism, BIT Array for coloring @@ -930,11 +862,9 @@ class GraphColorDistance2 { // 3. [S] loop over vertex neighbors // 4. 
[S] loop over vertex neighbors of neighbors case COLORING_D2_VB_BIT: { - functorGreedyColorVB_BIT gc(this->nr, this->nc, xadj_, adj_, t_xadj_, - t_adj_, vertex_colors_, current_vertexList_, - current_vertexListLength_); - Kokkos::parallel_for("LoopOverChunks", range_policy_type(0, this->nr), - gc); + functorGreedyColorVB_BIT gc(this->nr, this->nc, xadj_, adj_, t_xadj_, t_adj_, vertex_colors_, + current_vertexList_, current_vertexListLength_); + Kokkos::parallel_for("LoopOverChunks", range_policy_type(0, this->nr), gc); } break; default: @@ -950,8 +880,8 @@ class GraphColorDistance2 { // GraphColorDistance2::colorGreedyEF() // // ----------------------------------------------------------------- - void colorGreedyEF(rowmap_t xadj_, lno_view_t adj_copy_, rowmap_t t_xadj_, - entries_t t_adj_copy_, color_view_type vertex_colors_) { + void colorGreedyEF(rowmap_t xadj_, lno_view_t adj_copy_, rowmap_t t_xadj_, entries_t t_adj_copy_, + color_view_type vertex_colors_) { // Pick the right coloring algorithm to use based on which algorithm we're // using switch (this->gc_handle->get_coloring_algo_type()) { @@ -961,10 +891,8 @@ class GraphColorDistance2 { // 3. [S] loop over vertex neighbors // 4. 
[S] loop over vertex neighbors of neighbors case COLORING_D2_VB_BIT_EF: { - functorGreedyColorVB_BIT_EF gc(this->nr, this->nc, xadj_, adj_copy_, - t_xadj_, t_adj_copy_, vertex_colors_); - Kokkos::parallel_for("LoopOverChunks", range_policy_type(0, this->nr), - gc); + functorGreedyColorVB_BIT_EF gc(this->nr, this->nc, xadj_, adj_copy_, t_xadj_, t_adj_copy_, vertex_colors_); + Kokkos::parallel_for("LoopOverChunks", range_policy_type(0, this->nr), gc); // prettyPrint1DView(vertex_colors_, "COLORS_GC_VB_BIT",500); } break; @@ -980,23 +908,17 @@ class GraphColorDistance2 { // GraphColorDistance2::findConflicts() // // ----------------------------------------------------------------- - lno_t findConflicts(bool& swap_work_arrays, rowmap_t xadj_, entries_t adj_, - rowmap_t t_xadj_, entries_t t_adj_, - color_view_type vertex_colors_, - lno_view_t current_vertexList_, - lno_t current_vertexListLength_, - lno_view_t next_iteration_recolorList_, - single_lno_view_t next_iteration_recolorListLength_) { + lno_t findConflicts(bool& swap_work_arrays, rowmap_t xadj_, entries_t adj_, rowmap_t t_xadj_, entries_t t_adj_, + color_view_type vertex_colors_, lno_view_t current_vertexList_, lno_t current_vertexListLength_, + lno_view_t next_iteration_recolorList_, single_lno_view_t next_iteration_recolorListLength_) { swap_work_arrays = true; lno_t output_numUncolored = 0; - functorFindConflicts_Atomic conf( - this->nr, this->nc, xadj_, adj_, t_xadj_, t_adj_, vertex_colors_, - current_vertexList_, next_iteration_recolorList_, - next_iteration_recolorListLength_); - Kokkos::parallel_reduce("FindConflicts", - range_policy_type(0, current_vertexListLength_), - conf, output_numUncolored); + functorFindConflicts_Atomic conf(this->nr, this->nc, xadj_, adj_, t_xadj_, t_adj_, vertex_colors_, + current_vertexList_, next_iteration_recolorList_, + next_iteration_recolorListLength_); + Kokkos::parallel_reduce("FindConflicts", range_policy_type(0, current_vertexListLength_), conf, + 
output_numUncolored); return output_numUncolored; } // findConflicts (end) @@ -1005,9 +927,8 @@ class GraphColorDistance2 { // GraphColorDistance2::resolveConflictsSerial() // // ----------------------------------------------------------------- - void resolveConflictsSerial(rowmap_t xadj_, entries_t adj_, rowmap_t t_xadj_, - entries_t t_adj_, color_view_type vertex_colors_, - lno_view_t current_vertexList_, + void resolveConflictsSerial(rowmap_t xadj_, entries_t adj_, rowmap_t t_xadj_, entries_t t_adj_, + color_view_type vertex_colors_, lno_view_t current_vertexList_, size_type current_vertexListLength_) { color_type* forbidden = new color_type[nr]; for (lno_t i = 0; i < nr; i++) forbidden[i] = nr; @@ -1042,16 +963,14 @@ class GraphColorDistance2 { if (h_colors(vid) > 0) continue; // loop over distance-1 neighbors of vid - for (size_type vid_d1_adj = h_idx(vid); vid_d1_adj < h_idx(vid + 1); - vid_d1_adj++) { + for (size_type vid_d1_adj = h_idx(vid); vid_d1_adj < h_idx(vid + 1); vid_d1_adj++) { lno_t vid_d1 = h_adj(vid_d1_adj); if (vid_d1 < nc) { if (!doing_bipartite && vid_d1 != vid) { forbidden[h_colors(vid_d1)] = vid; } // loop over neighbors of vid_d1 (distance-2 from vid) - for (size_type vid_d2_adj = h_t_idx(vid_d1); - vid_d2_adj < h_t_idx(vid_d1 + 1); vid_d2_adj++) { + for (size_type vid_d2_adj = h_t_idx(vid_d1); vid_d2_adj < h_t_idx(vid_d1 + 1); vid_d2_adj++) { lno_t vid_d2 = h_t_adj(vid_d2_adj); // skip over loops vid -- x -- vid, and filter out-of-bounds @@ -1076,8 +995,7 @@ class GraphColorDistance2 { public: // pretty-print a 1D View with label template - void prettyPrint1DView(kokkos_view_t& view, const char* label, - const size_t max_entries = 500) const { + void prettyPrint1DView(kokkos_view_t& view, const char* label, const size_t max_entries = 500) const { int max_per_line = 20; int line_count = 1; std::cout << label << " = [ \n\t"; @@ -1132,10 +1050,8 @@ class GraphColorDistance2 { lno_t _vertexListLength; // lno_t _chunkSize; // - 
functorGreedyColorVB(lno_t nr_, lno_t nc_, rowmap_t xadj_, entries_t adj_, - rowmap_t t_xadj_, entries_t t_adj_, - color_view_type colors, lno_view_t vertexList, - lno_t vertexListLength) + functorGreedyColorVB(lno_t nr_, lno_t nc_, rowmap_t xadj_, entries_t adj_, rowmap_t t_xadj_, entries_t t_adj_, + color_view_type colors, lno_view_t vertexList, lno_t vertexListLength) : nr(nr_), nc(nc_), _idx(xadj_), @@ -1173,15 +1089,13 @@ class GraphColorDistance2 { // but in distance-2 we'd need the total vertices at distance-2 which // we don't easily have aprioi. This could be as big as all the // vertices in the graph if diameter(G)=2... - for (color_type offset = 1; offset <= nr; - offset += VB_D2_COLORING_FORBIDDEN_SIZE) { + for (color_type offset = 1; offset <= nr; offset += VB_D2_COLORING_FORBIDDEN_SIZE) { // initialize for (int i = 0; i < VB_D2_COLORING_FORBIDDEN_SIZE; i++) { forbidden[i] = false; } // Check neighbors, fill forbidden array. - for (size_type vid_adj = vid_adj_begin; vid_adj < vid_adj_end; - vid_adj++) { + for (size_type vid_adj = vid_adj_begin; vid_adj < vid_adj_end; vid_adj++) { const lno_t vid_d1 = _adj(vid_adj); if (vid_d1 < nc) { if (!doing_bipartite) // note: compile-time branch (template @@ -1189,23 +1103,20 @@ class GraphColorDistance2 { { if (vid_d1 != vid) { const color_type c = _colors(vid_d1); - if ((c >= offset) && - (c - offset < VB_D2_COLORING_FORBIDDEN_SIZE)) { + if ((c >= offset) && (c - offset < VB_D2_COLORING_FORBIDDEN_SIZE)) { forbidden[c - offset] = true; } } } const size_type vid_d1_adj_begin = _t_idx(vid_d1); const size_type vid_d1_adj_end = _t_idx(vid_d1 + 1); - for (size_type vid_d1_adj = vid_d1_adj_begin; - vid_d1_adj < vid_d1_adj_end; vid_d1_adj++) { + for (size_type vid_d1_adj = vid_d1_adj_begin; vid_d1_adj < vid_d1_adj_end; vid_d1_adj++) { const lno_t vid_d2 = _t_adj(vid_d1_adj); // Skip distance-2-self-loops if (vid_d2 != vid && vid_d2 < nr) { const color_type c = _colors(vid_d2); - if ((c >= offset) && - (c - offset < 
VB_D2_COLORING_FORBIDDEN_SIZE)) { + if ((c >= offset) && (c - offset < VB_D2_COLORING_FORBIDDEN_SIZE)) { forbidden[c - offset] = true; } } @@ -1240,10 +1151,8 @@ class GraphColorDistance2 { lno_view_t _vertexList; // lno_t _vertexListLength; // - functorGreedyColorVB_BIT(lno_t nr_, lno_t nc_, rowmap_t xadj_, - entries_t adj_, rowmap_t t_xadj_, entries_t t_adj_, - color_view_type colors, lno_view_t vertexList, - lno_t vertexListLength) + functorGreedyColorVB_BIT(lno_t nr_, lno_t nc_, rowmap_t xadj_, entries_t adj_, rowmap_t t_xadj_, entries_t t_adj_, + color_view_type colors, lno_view_t vertexList, lno_t vertexListLength) : nr(nr_), nc(nc_), _idx(xadj_), @@ -1270,8 +1179,7 @@ class GraphColorDistance2 { const size_type vid_adj_begin = _idx(vid); const size_type vid_adj_end = _idx(vid + 1); - for (color_type offset = 1; - offset <= (nr + VBBIT_D2_COLORING_FORBIDDEN_SIZE); + for (color_type offset = 1; offset <= (nr + VBBIT_D2_COLORING_FORBIDDEN_SIZE); offset += VBBIT_D2_COLORING_FORBIDDEN_SIZE) { // Forbidden colors // - single long int for forbidden colors @@ -1282,8 +1190,7 @@ class GraphColorDistance2 { bool break_out = false; // Loop over distance-1 neighbors of vid - for (size_type vid_adj = vid_adj_begin; - !break_out && vid_adj < vid_adj_end; ++vid_adj) { + for (size_type vid_adj = vid_adj_begin; !break_out && vid_adj < vid_adj_end; ++vid_adj) { const lno_t vid_d1 = _adj(vid_adj); if (vid_d1 < nc) { if (!doing_bipartite) // note: compile-time branch (template @@ -1293,8 +1200,7 @@ class GraphColorDistance2 { if (vid_d1 != vid) { const color_type color = _colors(vid_d1); const color_type color_offset = color - offset; - if (color && - color_offset <= VBBIT_D2_COLORING_FORBIDDEN_SIZE) { + if (color && color_offset <= VBBIT_D2_COLORING_FORBIDDEN_SIZE) { // if it is in the current range, then add the color to the // banned colors if (color > offset) { @@ -1313,8 +1219,7 @@ class GraphColorDistance2 { const size_type vid_d1_adj_end = _t_idx(vid_d1 + 1); // Loop 
over distance-2 neighbors of vid - for (size_type vid_d1_adj = vid_d1_adj_begin; - !break_out && vid_d1_adj < vid_d1_adj_end; ++vid_d1_adj) { + for (size_type vid_d1_adj = vid_d1_adj_begin; !break_out && vid_d1_adj < vid_d1_adj_end; ++vid_d1_adj) { const lno_t vid_d2 = _t_adj(vid_d1_adj); // Ignore Distance-2 Self Loops @@ -1324,8 +1229,7 @@ class GraphColorDistance2 { // if color is within the current range, or if its color is in // a previously traversed range - if (offset <= color && - color_offset < VBBIT_D2_COLORING_FORBIDDEN_SIZE) { + if (offset <= color && color_offset < VBBIT_D2_COLORING_FORBIDDEN_SIZE) { // if it is in the current range, then add the color to the // banned colors forbidden |= (bit_64_forbidden_type(1) << color_offset); @@ -1343,9 +1247,8 @@ class GraphColorDistance2 { // check if an available color exists. if (~forbidden) { - bit_64_forbidden_type color_offset = - KokkosKernels::Impl::least_set_bit(~forbidden) - 1; - _colors(vid) = offset + color_offset; + bit_64_forbidden_type color_offset = KokkosKernels::Impl::least_set_bit(~forbidden) - 1; + _colors(vid) = offset + color_offset; return; } } // for offset <= (nr + VBBIT_D2_COLORING_FORBIDDEN_SIZE) @@ -1366,16 +1269,9 @@ class GraphColorDistance2 { entries_t _t_adj; // transpose vertex adjacency list (NOT modified) color_view_type _colors; // vertex colors - functorGreedyColorVB_BIT_EF(lno_t nr_, lno_t nc_, rowmap_t xadj_, - lno_view_t adj_, rowmap_t t_xadj_, + functorGreedyColorVB_BIT_EF(lno_t nr_, lno_t nc_, rowmap_t xadj_, lno_view_t adj_, rowmap_t t_xadj_, entries_t t_adj_, color_view_type colors) - : _nr(nr_), - _nc(nc_), - _idx(xadj_), - _adj(adj_), - _t_idx(t_xadj_), - _t_adj(t_adj_), - _colors(colors) {} + : _nr(nr_), _nc(nc_), _idx(xadj_), _adj(adj_), _t_idx(t_xadj_), _t_adj(t_adj_), _colors(colors) {} // Color vertex i with smallest available color. 
// @@ -1394,8 +1290,7 @@ class GraphColorDistance2 { size_type vid_adj_end = _idx(vid + 1); bool foundColor = false; - for (color_type offset = 0; - !foundColor && offset <= (_nr + VBBIT_D2_COLORING_FORBIDDEN_SIZE); + for (color_type offset = 0; !foundColor && offset <= (_nr + VBBIT_D2_COLORING_FORBIDDEN_SIZE); offset += VBBIT_D2_COLORING_FORBIDDEN_SIZE) { // Forbidden colors // - single long int for forbidden colors @@ -1406,8 +1301,7 @@ class GraphColorDistance2 { bool offset_colors_full = false; // Loop over distance-1 neighbors of vid - for (size_type vid_adj = vid_adj_begin; - !offset_colors_full && vid_adj < vid_adj_end; ++vid_adj) { + for (size_type vid_adj = vid_adj_begin; !offset_colors_full && vid_adj < vid_adj_end; ++vid_adj) { const lno_t vid_d1 = _adj(vid_adj); if (vid_d1 < _nc) { if (!doing_bipartite) // note: compile-time branch (template @@ -1419,21 +1313,20 @@ class GraphColorDistance2 { color_type color_offset = color - offset; // if color is within the current range, or if its color is in // a previously traversed range - if (color && offset < color && - color_offset <= VBBIT_D2_COLORING_FORBIDDEN_SIZE) { + if (color && offset < color && color_offset <= VBBIT_D2_COLORING_FORBIDDEN_SIZE) { // if it is in the current range, then add the color to the // banned colors convert color to bit representation bit_64_forbidden_type ban_color_bit = 1; - ban_color_bit = ban_color_bit << (color_offset - 1); + ban_color_bit = ban_color_bit << (color_offset - 1); // add it to forbidden colors forbidden = forbidden | ban_color_bit; } } } - size_type vid_d1_adj_begin = _t_idx(vid_d1); - const size_type vid_d1_adj_end = _t_idx(vid_d1 + 1); - const size_type degree_vid_d1 = vid_d1_adj_end - vid_d1_adj_begin; + size_type vid_d1_adj_begin = _t_idx(vid_d1); + const size_type vid_d1_adj_end = _t_idx(vid_d1 + 1); + const size_type degree_vid_d1 = vid_d1_adj_end - vid_d1_adj_begin; size_type num_vid_d2_colored_in_range = 0; // Store the maximum color value found in the 
vertices adjacent to @@ -1441,26 +1334,22 @@ class GraphColorDistance2 { color_type max_color_adj_to_d1 = 0; // Loop over distance-2 neighbors of vid - for (size_type vid_d1_adj = vid_d1_adj_begin; - !offset_colors_full && vid_d1_adj < vid_d1_adj_end; + for (size_type vid_d1_adj = vid_d1_adj_begin; !offset_colors_full && vid_d1_adj < vid_d1_adj_end; ++vid_d1_adj) { const lno_t vid_d2 = _t_adj(vid_d1_adj); // Ignore Distance-2 Self Loops if (vid_d2 != vid && vid_d2 < _nr) { - color_type color = _colors(vid_d2); - color_type color_offset = - color - offset; // color_offset < 0 means color is from a - // previous offset. + color_type color = _colors(vid_d2); + color_type color_offset = color - offset; // color_offset < 0 means color is from a + // previous offset. // Update maximum color adjacent to vid_d1 found so far. - max_color_adj_to_d1 = - color > max_color_adj_to_d1 ? color : max_color_adj_to_d1; + max_color_adj_to_d1 = color > max_color_adj_to_d1 ? color : max_color_adj_to_d1; // if color is within the current range, or if its color is in // a previously traversed range - if (color && - color_offset <= VBBIT_D2_COLORING_FORBIDDEN_SIZE) { + if (color && color_offset <= VBBIT_D2_COLORING_FORBIDDEN_SIZE) { num_vid_d2_colored_in_range++; // if it is in the current range, then add the color to the @@ -1543,10 +1432,8 @@ class GraphColorDistance2 { lno_view_t _recolorList; single_lno_view_t _recolorListLength; - functorFindConflicts_Atomic(lno_t nr_, lno_t nc_, rowmap_t xadj_, - entries_t adj_, rowmap_t t_xadj_, - entries_t t_adj_, color_view_type colors, - lno_view_t vertexList, lno_view_t recolorList, + functorFindConflicts_Atomic(lno_t nr_, lno_t nc_, rowmap_t xadj_, entries_t adj_, rowmap_t t_xadj_, + entries_t t_adj_, color_view_type colors, lno_view_t vertexList, lno_view_t recolorList, single_lno_view_t recolorListLength) : nr(nr_), nc(nc_), @@ -1566,8 +1453,7 @@ class GraphColorDistance2 { const size_type vid_d1_adj_begin = _idx(vid); const size_type 
vid_d1_adj_end = _idx(vid + 1); // If vid is a valid column (vid < nc), check for column->vert conflicts - for (size_type vid_d1_adj = vid_d1_adj_begin; vid_d1_adj < vid_d1_adj_end; - vid_d1_adj++) { + for (size_type vid_d1_adj = vid_d1_adj_begin; vid_d1_adj < vid_d1_adj_end; vid_d1_adj++) { lno_t vid_d1 = _adj(vid_d1_adj); if (vid_d1 < nc) { if (!doing_bipartite) // note: compile-time branch (template param) @@ -1576,8 +1462,7 @@ class GraphColorDistance2 { if (vid_d1 != vid && _colors(vid_d1) == my_color) { _colors(vid) = 0; // uncolor vertex // Atomically add vertex to recolorList - const lno_t k = - Kokkos::atomic_fetch_add(&_recolorListLength(), lno_t(1)); + const lno_t k = Kokkos::atomic_fetch_add(&_recolorListLength(), lno_t(1)); _recolorList(k) = vid; numConflicts++; return; @@ -1585,16 +1470,14 @@ class GraphColorDistance2 { } const size_type d2_adj_begin = _t_idx(vid_d1); const size_type d2_adj_end = _t_idx(vid_d1 + 1); - for (size_type vid_d2_adj = d2_adj_begin; vid_d2_adj < d2_adj_end; - vid_d2_adj++) { + for (size_type vid_d2_adj = d2_adj_begin; vid_d2_adj < d2_adj_end; vid_d2_adj++) { const lno_t vid_d2 = _t_adj(vid_d2_adj); if (vid != vid_d2 && vid_d2 < nr) { if (_colors(vid_d2) == my_color) { _colors(vid) = 0; // uncolor vertex // Atomically add vertex to recolorList - const lno_t k = - Kokkos::atomic_fetch_add(&_recolorListLength(), lno_t(1)); + const lno_t k = Kokkos::atomic_fetch_add(&_recolorListLength(), lno_t(1)); _recolorList(k) = vid; numConflicts++; return; @@ -1634,8 +1517,7 @@ class GraphColorDistance2 { * @return nothing */ template -void graph_print_distance2_color_histogram(KernelHandle* handle, - bool csv = false) { +void graph_print_distance2_color_histogram(KernelHandle* handle, bool csv = false) { using lno_view_t = typename KernelHandle::nnz_lno_temp_work_view_t; using lno_t = typename KernelHandle::nnz_lno_t; using execution_space = typename KernelHandle::HandleExecSpace; @@ -1647,11 +1529,8 @@ void 
graph_print_distance2_color_histogram(KernelHandle* handle, color_view_t colors = gch_d2->get_vertex_colors(); lno_t num_colors = gch_d2->get_num_colors(); lno_view_t histogram("histogram", num_colors + 1); - KokkosKernels::Impl::kk_get_histogram(colors.extent(0), - colors, histogram); - auto h_histogram = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), histogram); + KokkosKernels::Impl::kk_get_histogram(colors.extent(0), colors, histogram); + auto h_histogram = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), histogram); // note: both modes ignore color 0 in output, since we assume the coloring is // valid if (csv) { @@ -1661,8 +1540,7 @@ void graph_print_distance2_color_histogram(KernelHandle* handle, } std::cout << h_histogram(i); } else { - auto histogram_slice = Kokkos::subview( - histogram, std::make_pair((size_t)1, histogram.extent(0))); + auto histogram_slice = Kokkos::subview(histogram, std::make_pair((size_t)1, histogram.extent(0))); std::cout << "Distance-2 Color Histogram (1..N): " << std::endl; KokkosKernels::Impl::kk_print_1Dview(histogram_slice); std::cout << std::endl; diff --git a/graph/impl/KokkosGraph_Distance2MIS_impl.hpp b/graph/impl/KokkosGraph_Distance2MIS_impl.hpp index a359956a23..e39e1e7ad3 100644 --- a/graph/impl/KokkosGraph_Distance2MIS_impl.hpp +++ b/graph/impl/KokkosGraph_Distance2MIS_impl.hpp @@ -26,8 +26,7 @@ namespace KokkosGraph { namespace Impl { -template +template struct D2_MIS_RandomPriority { using exec_space = typename device_t::execution_space; using mem_space = typename device_t::memory_space; @@ -66,17 +65,14 @@ struct D2_MIS_RandomPriority { // adjacent to the column. 
// This counts up monotonically as vertices are eliminated (given status // OUT_SET) - rowStatus = status_view_t( - Kokkos::ViewAllocateWithoutInitializing("RowStatus"), numVerts); - colStatus = status_view_t( - Kokkos::ViewAllocateWithoutInitializing("ColStatus"), numVerts); + rowStatus = status_view_t(Kokkos::ViewAllocateWithoutInitializing("RowStatus"), numVerts); + colStatus = status_view_t(Kokkos::ViewAllocateWithoutInitializing("ColStatus"), numVerts); allWorklists = Kokkos::View( Kokkos::ViewAllocateWithoutInitializing("AllWorklists"), numVerts, 3); } struct RefreshRowStatus { - RefreshRowStatus(const status_view_t& rowStatus_, - const worklist_t& worklist_, lno_t nvBits_, int round) + RefreshRowStatus(const status_view_t& rowStatus_, const worklist_t& worklist_, lno_t nvBits_, int round) : rowStatus(rowStatus_), worklist(worklist_), nvBits(nvBits_) { hashedRound = KokkosKernels::Impl::xorshiftHash(round); } @@ -85,8 +81,8 @@ struct D2_MIS_RandomPriority { lno_t i = worklist(w); // Combine vertex and round to get some pseudorandom priority bits that // change each round - status_t priority = KokkosKernels::Impl::xorshiftHash( - KokkosKernels::Impl::xorshiftHash(i) ^ hashedRound); + status_t priority = + KokkosKernels::Impl::xorshiftHash(KokkosKernels::Impl::xorshiftHash(i) ^ hashedRound); // Generate unique status per row, with IN_SET < status < OUT_SET, status_t newStatus = (status_t)(i + 1) | (priority << nvBits); if (newStatus == OUT_SET) newStatus--; @@ -100,10 +96,8 @@ struct D2_MIS_RandomPriority { }; struct RefreshColStatus { - RefreshColStatus(const status_view_t& colStatus_, - const worklist_t& worklist_, - const status_view_t& rowStatus_, const rowmap_t& rowmap_, - const entries_t& entries_, lno_t nv_, lno_t worklistLen_) + RefreshColStatus(const status_view_t& colStatus_, const worklist_t& worklist_, const status_view_t& rowStatus_, + const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_, lno_t worklistLen_) : colStatus(colStatus_), 
worklist(worklist_), rowStatus(rowStatus_), @@ -167,10 +161,8 @@ struct D2_MIS_RandomPriority { }; struct DecideSetFunctor { - DecideSetFunctor(const status_view_t& rowStatus_, - const status_view_t& colStatus_, const rowmap_t& rowmap_, - const entries_t& entries_, lno_t nv_, - const worklist_t& worklist_, lno_t worklistLen_) + DecideSetFunctor(const status_view_t& rowStatus_, const status_view_t& colStatus_, const rowmap_t& rowmap_, + const entries_t& entries_, lno_t nv_, const worklist_t& worklist_, lno_t worklistLen_) : rowStatus(rowStatus_), colStatus(colStatus_), rowmap(rowmap_), @@ -275,8 +267,7 @@ struct D2_MIS_RandomPriority { struct CompactInSet { CompactInSet(const status_view_t& rowStatus_, const lno_view_t& setList_) : rowStatus(rowStatus_), setList(setList_) {} - KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lNumInSet, - bool finalPass) const { + KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lNumInSet, bool finalPass) const { if (rowStatus(i) == IN_SET) { if (finalPass) setList(lNumInSet) = i; lNumInSet++; @@ -287,11 +278,9 @@ struct D2_MIS_RandomPriority { }; struct MaskedWorklist { - MaskedWorklist(const lno_view_t& mask_, const worklist_t& worklist_) - : mask(mask_), worklist(worklist_) {} + MaskedWorklist(const lno_view_t& mask_, const worklist_t& worklist_) : mask(mask_), worklist(worklist_) {} - KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lNumInList, - bool finalPass) const { + KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lNumInList, bool finalPass) const { if (mask(i) < 0) { if (finalPass) worklist(lNumInList) = i; lNumInList++; @@ -302,12 +291,10 @@ struct D2_MIS_RandomPriority { }; struct CompactWorklistFunctor { - CompactWorklistFunctor(const worklist_t& src_, const worklist_t& dst_, - const status_view_t& status_) + CompactWorklistFunctor(const worklist_t& src_, const worklist_t& dst_, const status_view_t& status_) : src(src_), dst(dst_), status(status_) {} - KOKKOS_INLINE_FUNCTION void operator()(lno_t 
w, lno_t& lNumInSet, - bool finalPass) const { + KOKKOS_INLINE_FUNCTION void operator()(lno_t w, lno_t& lNumInSet, bool finalPass) const { lno_t i = src(w); status_t s = status(i); if (s != IN_SET && s != OUT_SET) { @@ -329,15 +316,12 @@ struct D2_MIS_RandomPriority { KokkosKernels::Impl::sequential_fill(rowWorklist); KokkosKernels::Impl::sequential_fill(colWorklist); worklist_t thirdWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 2); - auto execSpaceEnum = - KokkosKernels::Impl::kk_get_exec_space_type(); - bool useTeams = KokkosKernels::Impl::kk_is_gpu_exec_space() && - (entries.extent(0) / numVerts >= 16); - int vectorLength = KokkosKernels::Impl::kk_get_suggested_vector_size( - numVerts, entries.extent(0), execSpaceEnum); - int round = 0; - lno_t rowWorkLen = numVerts; - lno_t colWorkLen = numVerts; + auto execSpaceEnum = KokkosKernels::Impl::kk_get_exec_space_type(); + bool useTeams = KokkosKernels::Impl::kk_is_gpu_exec_space() && (entries.extent(0) / numVerts >= 16); + int vectorLength = KokkosKernels::Impl::kk_get_suggested_vector_size(numVerts, entries.extent(0), execSpaceEnum); + int round = 0; + lno_t rowWorkLen = numVerts; + lno_t colWorkLen = numVerts; int refreshColTeamSize = 0; int decideSetTeamSize = 0; if (useTeams) { @@ -345,71 +329,54 @@ struct D2_MIS_RandomPriority { // Compute the recommended team size for RefreshColStatus and // DecideSetFunctor (will be constant) { - RefreshColStatus refreshCol(colStatus, colWorklist, rowStatus, rowmap, - entries, numVerts, colWorkLen); - refreshColTeamSize = - dummyPolicy.team_size_max(refreshCol, Kokkos::ParallelForTag()); + RefreshColStatus refreshCol(colStatus, colWorklist, rowStatus, rowmap, entries, numVerts, colWorkLen); + refreshColTeamSize = dummyPolicy.team_size_max(refreshCol, Kokkos::ParallelForTag()); } { - DecideSetFunctor decideSet(rowStatus, colStatus, rowmap, entries, - numVerts, rowWorklist, rowWorkLen); - decideSetTeamSize = - dummyPolicy.team_size_max(decideSet, 
Kokkos::ParallelForTag()); + DecideSetFunctor decideSet(rowStatus, colStatus, rowmap, entries, numVerts, rowWorklist, rowWorkLen); + decideSetTeamSize = dummyPolicy.team_size_max(decideSet, Kokkos::ParallelForTag()); } } while (true) { // Compute new row statuses - Kokkos::parallel_for( - range_pol(0, rowWorkLen), - RefreshRowStatus(rowStatus, rowWorklist, nvBits, round)); + Kokkos::parallel_for(range_pol(0, rowWorkLen), RefreshRowStatus(rowStatus, rowWorklist, nvBits, round)); // Compute new col statuses { - RefreshColStatus refreshCol(colStatus, colWorklist, rowStatus, rowmap, - entries, numVerts, colWorkLen); + RefreshColStatus refreshCol(colStatus, colWorklist, rowStatus, rowmap, entries, numVerts, colWorkLen); if (useTeams) - Kokkos::parallel_for(team_pol((colWorkLen + refreshColTeamSize - 1) / - refreshColTeamSize, - refreshColTeamSize, vectorLength), - refreshCol); + Kokkos::parallel_for( + team_pol((colWorkLen + refreshColTeamSize - 1) / refreshColTeamSize, refreshColTeamSize, vectorLength), + refreshCol); else Kokkos::parallel_for(range_pol(0, colWorkLen), refreshCol); } // Decide row statuses where enough information is available { - DecideSetFunctor decideSet(rowStatus, colStatus, rowmap, entries, - numVerts, rowWorklist, rowWorkLen); + DecideSetFunctor decideSet(rowStatus, colStatus, rowmap, entries, numVerts, rowWorklist, rowWorkLen); if (useTeams) Kokkos::parallel_for( - team_pol((rowWorkLen + decideSetTeamSize - 1) / decideSetTeamSize, - decideSetTeamSize, vectorLength), + team_pol((rowWorkLen + decideSetTeamSize - 1) / decideSetTeamSize, decideSetTeamSize, vectorLength), decideSet); else Kokkos::parallel_for(range_pol(0, rowWorkLen), decideSet); } round++; // Compact row worklist - Kokkos::parallel_scan( - range_pol(0, rowWorkLen), - CompactWorklistFunctor(rowWorklist, thirdWorklist, rowStatus), - rowWorkLen); + Kokkos::parallel_scan(range_pol(0, rowWorkLen), CompactWorklistFunctor(rowWorklist, thirdWorklist, rowStatus), + rowWorkLen); if 
(rowWorkLen == 0) break; std::swap(rowWorklist, thirdWorklist); // Compact col worklist - Kokkos::parallel_scan( - range_pol(0, colWorkLen), - CompactWorklistFunctor(colWorklist, thirdWorklist, colStatus), - colWorkLen); + Kokkos::parallel_scan(range_pol(0, colWorkLen), CompactWorklistFunctor(colWorklist, thirdWorklist, colStatus), + colWorkLen); std::swap(colWorklist, thirdWorklist); } // now that every vertex has been decided IN_SET/OUT_SET, // build a compact list of the vertices which are IN_SET. lno_t numInSet = 0; - Kokkos::parallel_reduce(range_pol(0, numVerts), CountInSet(rowStatus), - numInSet); - lno_view_t setList(Kokkos::ViewAllocateWithoutInitializing("D2MIS"), - numInSet); - Kokkos::parallel_scan(range_pol(0, numVerts), - CompactInSet(rowStatus, setList)); + Kokkos::parallel_reduce(range_pol(0, numVerts), CountInSet(rowStatus), numInSet); + lno_view_t setList(Kokkos::ViewAllocateWithoutInitializing("D2MIS"), numInSet); + Kokkos::parallel_scan(range_pol(0, numVerts), CompactInSet(rowStatus, setList)); return setList; } @@ -422,20 +389,16 @@ struct D2_MIS_RandomPriority { lno_t rowWorkLen = numVerts; lno_t colWorkLen = numVerts; // Row worklist: initially only the non-masked vertices - Kokkos::parallel_scan(range_pol(0, numVerts), - MaskedWorklist(mask, rowWorklist), rowWorkLen); + Kokkos::parallel_scan(range_pol(0, numVerts), MaskedWorklist(mask, rowWorklist), rowWorkLen); KokkosKernels::Impl::sequential_fill(colWorklist); // Need to fill rowStatus with OUT_SET initially so that vertices not in the // worklist don't affect algorithm Kokkos::deep_copy(rowStatus, ~(status_t(0))); worklist_t thirdWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 2); - auto execSpaceEnum = - KokkosKernels::Impl::kk_get_exec_space_type(); - bool useTeams = KokkosKernels::Impl::kk_is_gpu_exec_space() && - (entries.extent(0) / numVerts >= 16); - int vectorLength = KokkosKernels::Impl::kk_get_suggested_vector_size( - numVerts, entries.extent(0), execSpaceEnum); - int 
round = 0; + auto execSpaceEnum = KokkosKernels::Impl::kk_get_exec_space_type(); + bool useTeams = KokkosKernels::Impl::kk_is_gpu_exec_space() && (entries.extent(0) / numVerts >= 16); + int vectorLength = KokkosKernels::Impl::kk_get_suggested_vector_size(numVerts, entries.extent(0), execSpaceEnum); + int round = 0; int refreshColTeamSize = 0; int decideSetTeamSize = 0; if (useTeams) { @@ -443,71 +406,54 @@ struct D2_MIS_RandomPriority { // Compute the recommended team size for RefreshColStatus and // DecideSetFunctor (will be constant) { - RefreshColStatus refreshCol(colStatus, colWorklist, rowStatus, rowmap, - entries, numVerts, colWorkLen); - refreshColTeamSize = - dummyPolicy.team_size_max(refreshCol, Kokkos::ParallelForTag()); + RefreshColStatus refreshCol(colStatus, colWorklist, rowStatus, rowmap, entries, numVerts, colWorkLen); + refreshColTeamSize = dummyPolicy.team_size_max(refreshCol, Kokkos::ParallelForTag()); } { - DecideSetFunctor decideSet(rowStatus, colStatus, rowmap, entries, - numVerts, rowWorklist, rowWorkLen); - decideSetTeamSize = - dummyPolicy.team_size_max(decideSet, Kokkos::ParallelForTag()); + DecideSetFunctor decideSet(rowStatus, colStatus, rowmap, entries, numVerts, rowWorklist, rowWorkLen); + decideSetTeamSize = dummyPolicy.team_size_max(decideSet, Kokkos::ParallelForTag()); } } while (true) { // Compute new row statuses - Kokkos::parallel_for( - range_pol(0, rowWorkLen), - RefreshRowStatus(rowStatus, rowWorklist, nvBits, round)); + Kokkos::parallel_for(range_pol(0, rowWorkLen), RefreshRowStatus(rowStatus, rowWorklist, nvBits, round)); // Compute new col statuses { - RefreshColStatus refreshCol(colStatus, colWorklist, rowStatus, rowmap, - entries, numVerts, colWorkLen); + RefreshColStatus refreshCol(colStatus, colWorklist, rowStatus, rowmap, entries, numVerts, colWorkLen); if (useTeams) - Kokkos::parallel_for(team_pol((colWorkLen + refreshColTeamSize - 1) / - refreshColTeamSize, - refreshColTeamSize, vectorLength), - refreshCol); + 
Kokkos::parallel_for( + team_pol((colWorkLen + refreshColTeamSize - 1) / refreshColTeamSize, refreshColTeamSize, vectorLength), + refreshCol); else Kokkos::parallel_for(range_pol(0, colWorkLen), refreshCol); } // Decide row statuses where enough information is available { - DecideSetFunctor decideSet(rowStatus, colStatus, rowmap, entries, - numVerts, rowWorklist, rowWorkLen); + DecideSetFunctor decideSet(rowStatus, colStatus, rowmap, entries, numVerts, rowWorklist, rowWorkLen); if (useTeams) Kokkos::parallel_for( - team_pol((rowWorkLen + decideSetTeamSize - 1) / decideSetTeamSize, - decideSetTeamSize, vectorLength), + team_pol((rowWorkLen + decideSetTeamSize - 1) / decideSetTeamSize, decideSetTeamSize, vectorLength), decideSet); else Kokkos::parallel_for(range_pol(0, rowWorkLen), decideSet); } round++; // Compact row worklist - Kokkos::parallel_scan( - range_pol(0, rowWorkLen), - CompactWorklistFunctor(rowWorklist, thirdWorklist, rowStatus), - rowWorkLen); + Kokkos::parallel_scan(range_pol(0, rowWorkLen), CompactWorklistFunctor(rowWorklist, thirdWorklist, rowStatus), + rowWorkLen); if (rowWorkLen == 0) break; std::swap(rowWorklist, thirdWorklist); // Compact col worklist - Kokkos::parallel_scan( - range_pol(0, colWorkLen), - CompactWorklistFunctor(colWorklist, thirdWorklist, colStatus), - colWorkLen); + Kokkos::parallel_scan(range_pol(0, colWorkLen), CompactWorklistFunctor(colWorklist, thirdWorklist, colStatus), + colWorkLen); std::swap(colWorklist, thirdWorklist); } // now that every vertex has been decided IN_SET/OUT_SET, // build a compact list of the vertices which are IN_SET. 
lno_t numInSet = 0; - Kokkos::parallel_reduce(range_pol(0, numVerts), CountInSet(rowStatus), - numInSet); - lno_view_t setList(Kokkos::ViewAllocateWithoutInitializing("D2MIS"), - numInSet); - Kokkos::parallel_scan(range_pol(0, numVerts), - CompactInSet(rowStatus, setList)); + Kokkos::parallel_reduce(range_pol(0, numVerts), CountInSet(rowStatus), numInSet); + lno_view_t setList(Kokkos::ViewAllocateWithoutInitializing("D2MIS"), numInSet); + Kokkos::parallel_scan(range_pol(0, numVerts), CompactInSet(rowStatus, setList)); return setList; } @@ -523,8 +469,7 @@ struct D2_MIS_RandomPriority { int nvBits; }; -template +template struct D2_MIS_FixedPriority { using exec_space = typename device_t::execution_space; using mem_space = typename device_t::memory_space; @@ -551,10 +496,8 @@ struct D2_MIS_FixedPriority { entries(entries_), numVerts(rowmap.extent(0) - 1), colUpdateBitset(numVerts), - worklist1(Kokkos::view_alloc(Kokkos::WithoutInitializing, "WL1"), - numVerts), - worklist2(Kokkos::view_alloc(Kokkos::WithoutInitializing, "WL2"), - numVerts) { + worklist1(Kokkos::view_alloc(Kokkos::WithoutInitializing, "WL1"), numVerts), + worklist2(Kokkos::view_alloc(Kokkos::WithoutInitializing, "WL2"), numVerts) { status_t i = numVerts + 1; nvBits = 0; while (i) { @@ -566,25 +509,19 @@ struct D2_MIS_FixedPriority { // adjacent to the column. 
// This counts up monotonically as vertices are eliminated (given status // OUT_SET) - rowStatus = status_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "RowStatus"), numVerts); - colStatus = status_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "ColStatus"), numVerts); - KokkosSparse::Impl::graph_min_max_degree( - rowmap, minDegree, maxDegree); + rowStatus = status_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "RowStatus"), numVerts); + colStatus = status_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "ColStatus"), numVerts); + KokkosSparse::Impl::graph_min_max_degree(rowmap, minDegree, maxDegree); // Compute row statuses Kokkos::parallel_for(range_pol(0, numVerts), - InitRowStatus(rowStatus, rowmap, numVerts, nvBits, - minDegree, maxDegree)); + InitRowStatus(rowStatus, rowmap, numVerts, nvBits, minDegree, maxDegree)); // Compute col statuses - Kokkos::parallel_for( - range_pol(0, numVerts), - InitColStatus(colStatus, rowStatus, rowmap, entries, numVerts)); + Kokkos::parallel_for(range_pol(0, numVerts), InitColStatus(colStatus, rowStatus, rowmap, entries, numVerts)); } struct InitRowStatus { - InitRowStatus(const status_view_t& rowStatus_, const rowmap_t& rowmap_, - lno_t nv_, lno_t nvBits_, lno_t minDeg_, lno_t maxDeg_) + InitRowStatus(const status_view_t& rowStatus_, const rowmap_t& rowmap_, lno_t nv_, lno_t nvBits_, lno_t minDeg_, + lno_t maxDeg_) : rowStatus(rowStatus_), rowmap(rowmap_), nv(nv_), @@ -605,8 +542,7 @@ struct D2_MIS_FixedPriority { status_t maxDegRange = (((status_t)1) << degBits) - 2; lno_t deg = rowmap(i + 1) - rowmap(i); float degScore = (float)(deg - minDeg) * invDegRange; - rowStatus(i) = - (status_t)(i + 1) + (((status_t)(degScore * maxDegRange)) << nvBits); + rowStatus(i) = (status_t)(i + 1) + (((status_t)(degScore * maxDegRange)) << nvBits); } status_view_t rowStatus; @@ -619,14 +555,9 @@ struct D2_MIS_FixedPriority { }; struct InitColStatus { - InitColStatus(const status_view_t& colStatus_, - const 
status_view_t& rowStatus_, const rowmap_t& rowmap_, + InitColStatus(const status_view_t& colStatus_, const status_view_t& rowStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_) - : colStatus(colStatus_), - rowStatus(rowStatus_), - rowmap(rowmap_), - entries(entries_), - nv(nv_) {} + : colStatus(colStatus_), rowStatus(rowStatus_), rowmap(rowmap_), entries(entries_), nv(nv_) {} KOKKOS_INLINE_FUNCTION void operator()(lno_t i) const { // iterate over {i} union the neighbors of i, to find @@ -652,10 +583,8 @@ struct D2_MIS_FixedPriority { }; struct IterateStatusFunctor { - IterateStatusFunctor(const status_view_t& rowStatus_, - const status_view_t& colStatus_, - const rowmap_t& rowmap_, const entries_t& entries_, - lno_t nv_, const lno_view_t& worklist_, + IterateStatusFunctor(const status_view_t& rowStatus_, const status_view_t& colStatus_, const rowmap_t& rowmap_, + const entries_t& entries_, lno_t nv_, const lno_view_t& worklist_, const bitset_t& colUpdateBitset_) : rowStatus(rowStatus_), colStatus(colStatus_), @@ -715,15 +644,11 @@ struct D2_MIS_FixedPriority { }; struct UpdateWorklistFunctor { - UpdateWorklistFunctor(const status_view_t& rowStatus_, - const lno_view_t& oldWorklist_, + UpdateWorklistFunctor(const status_view_t& rowStatus_, const lno_view_t& oldWorklist_, const lno_view_t& newWorklist_) - : rowStatus(rowStatus_), - oldWorklist(oldWorklist_), - newWorklist(newWorklist_) {} + : rowStatus(rowStatus_), oldWorklist(oldWorklist_), newWorklist(newWorklist_) {} - KOKKOS_INLINE_FUNCTION void operator()(lno_t w, lno_t& lcount, - bool finalPass) const { + KOKKOS_INLINE_FUNCTION void operator()(lno_t w, lno_t& lcount, bool finalPass) const { // processing row i lno_t i = oldWorklist(w); // Bit i will be set when it's decided IN_SET/OUT_SET. 
@@ -741,12 +666,10 @@ struct D2_MIS_FixedPriority { }; struct ColRefreshWorklist { - ColRefreshWorklist(const bitset_t& colUpdateBitset_, - const lno_view_t& refreshList_) + ColRefreshWorklist(const bitset_t& colUpdateBitset_, const lno_view_t& refreshList_) : colUpdateBitset(colUpdateBitset_), refreshList(refreshList_) {} - KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lindex, - bool finalPass) const { + KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lindex, bool finalPass) const { if (colUpdateBitset.test(i)) { if (finalPass) { refreshList(lindex) = i; @@ -761,10 +684,8 @@ struct D2_MIS_FixedPriority { }; struct RefreshColStatus { - RefreshColStatus(const lno_view_t& worklist_, - const status_view_t& rowStatus_, - const status_view_t& colStatus_, const rowmap_t& rowmap_, - const entries_t& entries_, lno_t nv_) + RefreshColStatus(const lno_view_t& worklist_, const status_view_t& rowStatus_, const status_view_t& colStatus_, + const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_) : worklist(worklist_), rowStatus(rowStatus_), colStatus(colStatus_), @@ -812,8 +733,7 @@ struct D2_MIS_FixedPriority { struct CompactInSet { CompactInSet(const status_view_t& rowStatus_, const lno_view_t& setList_) : rowStatus(rowStatus_), setList(setList_) {} - KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lNumInSet, - bool finalPass) const { + KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lNumInSet, bool finalPass) const { if (rowStatus(i) == IN_SET) { if (finalPass) setList(lNumInSet) = i; lNumInSet++; @@ -825,30 +745,22 @@ struct D2_MIS_FixedPriority { lno_view_t compute() { // Initialize first worklist to 0...numVerts - Kokkos::parallel_for(range_pol(0, numVerts), - InitWorklistFunctor(worklist1)); + Kokkos::parallel_for(range_pol(0, numVerts), InitWorklistFunctor(worklist1)); lno_t workRemain = numVerts; while (workRemain) { // do another iteration - Kokkos::parallel_for( - range_pol(0, workRemain), - IterateStatusFunctor(rowStatus, 
colStatus, rowmap, entries, numVerts, - worklist1, colUpdateBitset)); + Kokkos::parallel_for(range_pol(0, workRemain), IterateStatusFunctor(rowStatus, colStatus, rowmap, entries, + numVerts, worklist1, colUpdateBitset)); // And refresh the column statuses using the other worklist. lno_t colsToRefresh; - Kokkos::parallel_scan(range_pol(0, numVerts), - ColRefreshWorklist(colUpdateBitset, worklist2), - colsToRefresh); + Kokkos::parallel_scan(range_pol(0, numVerts), ColRefreshWorklist(colUpdateBitset, worklist2), colsToRefresh); Kokkos::parallel_for(range_pol(0, colsToRefresh), - RefreshColStatus(worklist2, rowStatus, colStatus, - rowmap, entries, numVerts)); + RefreshColStatus(worklist2, rowStatus, colStatus, rowmap, entries, numVerts)); // then build the next worklist with a scan. Also get the length of the // next worklist. lno_t newWorkRemain = 0; - Kokkos::parallel_scan( - range_pol(0, workRemain), - UpdateWorklistFunctor(rowStatus, worklist1, worklist2), - newWorkRemain); + Kokkos::parallel_scan(range_pol(0, workRemain), UpdateWorklistFunctor(rowStatus, worklist1, worklist2), + newWorkRemain); // Finally, flip the worklists std::swap(worklist1, worklist2); workRemain = newWorkRemain; @@ -856,12 +768,9 @@ struct D2_MIS_FixedPriority { // now that every vertex has been decided IN_SET/OUT_SET, // build a compact list of the vertices which are IN_SET. 
lno_t numInSet = 0; - Kokkos::parallel_reduce(range_pol(0, numVerts), CountInSet(rowStatus), - numInSet); - lno_view_t setList(Kokkos::view_alloc(Kokkos::WithoutInitializing, "D2MIS"), - numInSet); - Kokkos::parallel_scan(range_pol(0, numVerts), - CompactInSet(rowStatus, setList)); + Kokkos::parallel_reduce(range_pol(0, numVerts), CountInSet(rowStatus), numInSet); + lno_view_t setList(Kokkos::view_alloc(Kokkos::WithoutInitializing, "D2MIS"), numInSet); + Kokkos::parallel_scan(range_pol(0, numVerts), CompactInSet(rowStatus, setList)); return setList; } @@ -883,8 +792,7 @@ struct D2_MIS_FixedPriority { lno_view_t worklist2; }; -template +template struct D2_MIS_Aggregation { using exec_space = typename device_t::execution_space; using mem_space = typename device_t::memory_space; @@ -904,15 +812,13 @@ struct D2_MIS_Aggregation { : rowmap(rowmap_), entries(entries_), numVerts(rowmap.extent(0) - 1), - labels(Kokkos::ViewAllocateWithoutInitializing("AggregateLabels"), - numVerts), + labels(Kokkos::ViewAllocateWithoutInitializing("AggregateLabels"), numVerts), roots("Root Status", numVerts) { Kokkos::deep_copy(labels, (lno_t)-1); } struct Phase1Functor { - Phase1Functor(lno_t numVerts__, const mis2_view& m1__, - const rowmap_t& rowmap__, const entries_t& entries__, + Phase1Functor(lno_t numVerts__, const mis2_view& m1__, const rowmap_t& rowmap__, const entries_t& entries__, const labels_t& labels__, const char_view_t& roots__) : numVerts_(numVerts__), m1_(m1__), @@ -943,21 +849,16 @@ struct D2_MIS_Aggregation { void createPrimaryAggregates() { // Compute an MIS-2 - D2_MIS_RandomPriority d2mis( - rowmap, entries); + D2_MIS_RandomPriority d2mis(rowmap, entries); mis2_view m1 = d2mis.compute(); // Construct initial aggregates using roots and all direct neighbors - Kokkos::parallel_for( - range_pol(0, m1.extent(0)), - Phase1Functor(numVerts, m1, rowmap, entries, labels, roots)); + Kokkos::parallel_for(range_pol(0, m1.extent(0)), Phase1Functor(numVerts, m1, rowmap, entries, 
labels, roots)); numAggs = m1.extent(0); } struct CandAggSizesFunctor { - CandAggSizesFunctor(lno_t numVerts__, const labels_t& m2__, - const rowmap_t& rowmap__, const entries_t& entries__, - const labels_t& labels__, - const labels_t& candAggSizes__) + CandAggSizesFunctor(lno_t numVerts__, const labels_t& m2__, const rowmap_t& rowmap__, const entries_t& entries__, + const labels_t& labels__, const labels_t& candAggSizes__) : numVerts_(numVerts__), m2_(m2__), rowmap_(rowmap__), @@ -988,11 +889,8 @@ struct D2_MIS_Aggregation { }; struct ChoosePhase2AggsFunctor { - ChoosePhase2AggsFunctor(lno_t numVerts__, lno_t numAggs__, - const labels_t& m2__, const rowmap_t& rowmap__, - const entries_t& entries__, - const labels_t& labels__, - const labels_t& candAggSizes__, + ChoosePhase2AggsFunctor(lno_t numVerts__, lno_t numAggs__, const labels_t& m2__, const rowmap_t& rowmap__, + const entries_t& entries__, const labels_t& labels__, const labels_t& candAggSizes__, const char_view_t& roots__) : numVerts_(numVerts__), numAggs_(numAggs__), @@ -1003,8 +901,7 @@ struct D2_MIS_Aggregation { candAggSizes_(candAggSizes__), roots_(roots__) {} - KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lid, - bool finalPass) const { + KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lid, bool finalPass) const { lno_t aggSize = candAggSizes_(i); if (aggSize < 3) return; if (finalPass) { @@ -1035,36 +932,27 @@ struct D2_MIS_Aggregation { }; void createSecondaryAggregates() { - labels_t candAggSizes( - Kokkos::ViewAllocateWithoutInitializing("Phase2 Candidate Agg Sizes"), - numVerts); + labels_t candAggSizes(Kokkos::ViewAllocateWithoutInitializing("Phase2 Candidate Agg Sizes"), numVerts); // Compute a new MIS-2 from only unaggregated nodes - D2_MIS_RandomPriority d2mis( - rowmap, entries); + D2_MIS_RandomPriority d2mis(rowmap, entries); labels_t m2 = d2mis.compute(labels); lno_t numCandRoots = m2.extent(0); // Compute the sizes of would-be aggregates. 
Kokkos::parallel_for(range_pol(0, numCandRoots), - CandAggSizesFunctor(numVerts, m2, rowmap, entries, - labels, candAggSizes)); + CandAggSizesFunctor(numVerts, m2, rowmap, entries, labels, candAggSizes)); // Now, filter out the candidate aggs which are big enough, and create those // aggregates. Using a scan for this assigns IDs deterministically (unlike // an atomic counter). lno_t numNewAggs = 0; - Kokkos::parallel_scan( - range_pol(0, numCandRoots), - ChoosePhase2AggsFunctor(numVerts, numAggs, m2, rowmap, entries, labels, - candAggSizes, roots), - numNewAggs); + Kokkos::parallel_scan(range_pol(0, numCandRoots), + ChoosePhase2AggsFunctor(numVerts, numAggs, m2, rowmap, entries, labels, candAggSizes, roots), + numNewAggs); numAggs += numNewAggs; } struct SizeAndConnectivityFunctor { - SizeAndConnectivityFunctor(lno_t numVerts__, const rowmap_t& rowmap__, - const entries_t& entries__, - const labels_t& labels__, - const labels_t& connectivities__, - const labels_t& aggSizes__) + SizeAndConnectivityFunctor(lno_t numVerts__, const rowmap_t& rowmap__, const entries_t& entries__, + const labels_t& labels__, const labels_t& connectivities__, const labels_t& aggSizes__) : numVerts_(numVerts__), rowmap_(rowmap__), entries_(entries__), @@ -1100,12 +988,9 @@ struct D2_MIS_Aggregation { }; struct AssignLeftoverFunctor { - AssignLeftoverFunctor(lno_t numVerts__, const rowmap_t& rowmap__, - const entries_t& entries__, const labels_t& labels__, - const labels_t& labelsOld__, - const labels_t& connectivities__, - const labels_t& aggSizes__, - const char_view_t& roots__) + AssignLeftoverFunctor(lno_t numVerts__, const rowmap_t& rowmap__, const entries_t& entries__, + const labels_t& labels__, const labels_t& labelsOld__, const labels_t& connectivities__, + const labels_t& aggSizes__, const char_view_t& roots__) : numVerts_(numVerts__), rowmap_(rowmap__), entries_(entries__), @@ -1167,8 +1052,7 @@ struct D2_MIS_Aggregation { // Priorities: adjacent to root > connect > size if 
(trackedRootAdj[k] > bestRootAdj || (trackedRootAdj[k] == bestRootAdj && - ((trackedConnect[k] > bestConnect) || - (trackedConnect[k] == bestConnect && s < bestSize)))) { + ((trackedConnect[k] > bestConnect) || (trackedConnect[k] == bestConnect && s < bestSize)))) { bestRootAdj = trackedRootAdj[k]; bestConnect = trackedConnect[k]; bestSize = s; @@ -1195,18 +1079,13 @@ struct D2_MIS_Aggregation { // neighboring aggregate. labels_t labelsOld("old", numVerts); Kokkos::deep_copy(labelsOld, labels); - labels_t connectivities(Kokkos::ViewAllocateWithoutInitializing("connect"), - numVerts); + labels_t connectivities(Kokkos::ViewAllocateWithoutInitializing("connect"), numVerts); labels_t aggSizes("Phase3 Agg Sizes", numAggs); - Kokkos::parallel_for( - range_pol(0, numVerts), - SizeAndConnectivityFunctor(numVerts, rowmap, entries, labels, - connectivities, aggSizes)); + Kokkos::parallel_for(range_pol(0, numVerts), + SizeAndConnectivityFunctor(numVerts, rowmap, entries, labels, connectivities, aggSizes)); // Now, join vertices to aggregates - Kokkos::parallel_for( - range_pol(0, numVerts), - AssignLeftoverFunctor(numVerts, rowmap, entries, labels, labelsOld, - connectivities, aggSizes, roots)); + Kokkos::parallel_for(range_pol(0, numVerts), AssignLeftoverFunctor(numVerts, rowmap, entries, labels, labelsOld, + connectivities, aggSizes, roots)); } // phase 2 creates new aggregates in between the initial MIS-2 neighborhoods. 
diff --git a/graph/impl/KokkosGraph_ExplicitCoarsening_impl.hpp b/graph/impl/KokkosGraph_ExplicitCoarsening_impl.hpp index 464880c932..dc0e802485 100644 --- a/graph/impl/KokkosGraph_ExplicitCoarsening_impl.hpp +++ b/graph/impl/KokkosGraph_ExplicitCoarsening_impl.hpp @@ -20,10 +20,8 @@ namespace KokkosGraph { namespace Impl { -template +template struct ExplicitGraphCoarsening { using exec_space = typename device_t::execution_space; using range_pol = Kokkos::RangePolicy; @@ -33,29 +31,23 @@ struct ExplicitGraphCoarsening { using const_bitset_t = Kokkos::ConstBitset; struct ClusterSizeFunctor { - ClusterSizeFunctor(const ordinal_view_t& counts_, - const labels_t& vertClusters_) + ClusterSizeFunctor(const ordinal_view_t& counts_, const labels_t& vertClusters_) : counts(counts_), vertClusters(vertClusters_) {} - KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const { - Kokkos::atomic_increment(&counts(vertClusters(i))); - } + KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const { Kokkos::atomic_increment(&counts(vertClusters(i))); } ordinal_view_t counts; labels_t vertClusters; }; struct FillClusterVertsFunctor { - FillClusterVertsFunctor(const ordinal_view_t& clusterOffsets_, - const ordinal_view_t& clusterVerts_, - const labels_t& vertClusters_, - const ordinal_view_t& insertCounts_) + FillClusterVertsFunctor(const ordinal_view_t& clusterOffsets_, const ordinal_view_t& clusterVerts_, + const labels_t& vertClusters_, const ordinal_view_t& insertCounts_) : clusterOffsets(clusterOffsets_), clusterVerts(clusterVerts_), vertClusters(vertClusters_), insertCounts(insertCounts_) {} KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const { - lno_t cluster = vertClusters(i); - lno_t offset = clusterOffsets(cluster) + - Kokkos::atomic_fetch_add(&insertCounts(cluster), 1); + lno_t cluster = vertClusters(i); + lno_t offset = clusterOffsets(cluster) + Kokkos::atomic_fetch_add(&insertCounts(cluster), 1); clusterVerts(offset) = i; } ordinal_view_t clusterOffsets; @@ 
-65,12 +57,9 @@ struct ExplicitGraphCoarsening { }; struct BuildCrossClusterMaskFunctor { - BuildCrossClusterMaskFunctor(const fine_rowmap_t& rowmap_, - const fine_entries_t& colinds_, - const ordinal_view_t& clusterOffsets_, - const ordinal_view_t& clusterVerts_, - const labels_t& vertClusters_, - const bitset_t& mask_) + BuildCrossClusterMaskFunctor(const fine_rowmap_t& rowmap_, const fine_entries_t& colinds_, + const ordinal_view_t& clusterOffsets_, const ordinal_view_t& clusterVerts_, + const labels_t& vertClusters_, const bitset_t& mask_) : numRows(rowmap_.extent(0) - 1), rowmap(rowmap_), colinds(colinds_), @@ -106,13 +95,10 @@ struct ExplicitGraphCoarsening { // Try to insert the edge between cluster (team's cluster) and neighbor // (neighboring cluster) by inserting nei into the table. - KOKKOS_INLINE_FUNCTION bool insert(lno_t cluster, lno_t nei, - int* table) const { + KOKKOS_INLINE_FUNCTION bool insert(lno_t cluster, lno_t nei, int* table) const { unsigned h = xorshiftHash(nei); for (unsigned i = h; i < h + 2; i++) { - if (Kokkos::atomic_compare_exchange_strong(&table[i % tableSize()], - cluster, nei)) - return true; + if (Kokkos::atomic_compare_exchange_strong(&table[i % tableSize()], cluster, nei)) return true; } return false; } @@ -127,40 +113,35 @@ struct ExplicitGraphCoarsening { // thread handles a cluster int* table = (int*)t.team_shmem().get_shmem(tableSize() * sizeof(int)); // mark every entry as cluster (self-loop) to represent free/empty - Kokkos::parallel_for(Kokkos::TeamVectorRange(t, tableSize()), - [&](const lno_t i) { table[i] = cluster; }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(t, tableSize()), [&](const lno_t i) { table[i] = cluster; }); t.team_barrier(); // now, for each row belonging to the cluster, iterate through the // neighbors - Kokkos::parallel_for( - Kokkos::TeamThreadRange(t, clusterSize), [&](const lno_t i) { - lno_t row = clusterVerts(clusterOffsets(cluster) + i); - lno_t rowDeg = rowmap(row + 1) - rowmap(row); - 
Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, rowDeg), - [&](const lno_t j) { - lno_t nei = colinds(rowmap(row) + j); - // Remote neighbors are not included - if (nei >= numRows) return; - lno_t neiCluster = vertClusters(nei); - if (neiCluster != cluster) { - // Have a neighbor. Try to find it in the - // table. - if (!lookup(neiCluster, table)) { - // Not in the table. Try to insert it. - insert(cluster, neiCluster, table); - // Whether or not insertion succeeded, - // this is a cross-cluster edge possibly - // not seen before - mask.set(rowmap(row) + j); - } - } - }); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clusterSize), [&](const lno_t i) { + lno_t row = clusterVerts(clusterOffsets(cluster) + i); + lno_t rowDeg = rowmap(row + 1) - rowmap(row); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, rowDeg), [&](const lno_t j) { + lno_t nei = colinds(rowmap(row) + j); + // Remote neighbors are not included + if (nei >= numRows) return; + lno_t neiCluster = vertClusters(nei); + if (neiCluster != cluster) { + // Have a neighbor. Try to find it in the + // table. + if (!lookup(neiCluster, table)) { + // Not in the table. Try to insert it. 
+ insert(cluster, neiCluster, table); + // Whether or not insertion succeeded, + // this is a cross-cluster edge possibly + // not seen before + mask.set(rowmap(row) + j); + } + } + }); + }); } - size_t team_shmem_size(int /*teamSize*/) const { - return tableSize() * sizeof(int); - } + size_t team_shmem_size(int /*teamSize*/) const { return tableSize() * sizeof(int); } lno_t numRows; fine_rowmap_t rowmap; @@ -172,14 +153,10 @@ struct ExplicitGraphCoarsening { }; struct FillClusterEntriesFunctor { - FillClusterEntriesFunctor(const fine_rowmap_t& rowmap_, - const fine_entries_t& colinds_, - const coarse_rowmap_t& clusterRowmap_, - const coarse_entries_t& clusterEntries_, - const ordinal_view_t& clusterOffsets_, - const ordinal_view_t& clusterVerts_, - const labels_t& vertClusters_, - const bitset_t& edgeMask_) + FillClusterEntriesFunctor(const fine_rowmap_t& rowmap_, const fine_entries_t& colinds_, + const coarse_rowmap_t& clusterRowmap_, const coarse_entries_t& clusterEntries_, + const ordinal_view_t& clusterOffsets_, const ordinal_view_t& clusterVerts_, + const labels_t& vertClusters_, const bitset_t& edgeMask_) : rowmap(rowmap_), colinds(colinds_), clusterRowmap(clusterRowmap_), @@ -189,8 +166,7 @@ struct ExplicitGraphCoarsening { vertClusters(vertClusters_), edgeMask(edgeMask_) {} // Run this scan over entries in clusterVerts (reordered point rows) - KOKKOS_INLINE_FUNCTION void operator()(const lno_t i, lno_t& lcount, - const bool& finalPass) const { + KOKKOS_INLINE_FUNCTION void operator()(const lno_t i, lno_t& lcount, const bool& finalPass) const { lno_t numRows = rowmap.extent(0) - 1; lno_t row = clusterVerts(i); size_type rowStart = rowmap(row); @@ -238,9 +214,8 @@ struct ExplicitGraphCoarsening { // Constructor just does the computation and outputs to coarseRowmap, // coarseEntries. 
- ExplicitGraphCoarsening(const fine_rowmap_t& fineRowmap, - const fine_entries_t& fineEntries, - const labels_t& labels, lno_t numCoarseVerts) { + ExplicitGraphCoarsening(const fine_rowmap_t& fineRowmap, const fine_entries_t& fineEntries, const labels_t& labels, + lno_t numCoarseVerts) { lno_t numFineVerts = fineRowmap.extent(0); if (numFineVerts <= 1) { coarseRowmap = coarse_rowmap_t(); @@ -249,54 +224,39 @@ struct ExplicitGraphCoarsening { } numFineVerts--; clusterOffsets = ordinal_view_t("Cluster offsets", numCoarseVerts + 1); - clusterVerts = ordinal_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Cluster verts"), - numFineVerts); - Kokkos::parallel_for(range_pol(0, numFineVerts), - ClusterSizeFunctor(clusterOffsets, labels)); - KokkosKernels::Impl::exclusive_parallel_prefix_sum( - numCoarseVerts + 1, clusterOffsets); + clusterVerts = ordinal_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Cluster verts"), numFineVerts); + Kokkos::parallel_for(range_pol(0, numFineVerts), ClusterSizeFunctor(clusterOffsets, labels)); + KokkosKernels::Impl::exclusive_parallel_prefix_sum(numCoarseVerts + 1, clusterOffsets); { - ordinal_view_t tempInsertCounts("Temporary cluster insert counts", - numCoarseVerts); + ordinal_view_t tempInsertCounts("Temporary cluster insert counts", numCoarseVerts); Kokkos::parallel_for(range_pol(0, numFineVerts), - FillClusterVertsFunctor(clusterOffsets, clusterVerts, - labels, tempInsertCounts)); + FillClusterVertsFunctor(clusterOffsets, clusterVerts, labels, tempInsertCounts)); } // Determine the set of edges (in the point graph) that cross between two // distinct clusters int vectorSize = KokkosKernels::Impl::kk_get_suggested_vector_size( - numFineVerts, fineEntries.extent(0), - KokkosKernels::Impl::kk_get_exec_space_type()); + numFineVerts, fineEntries.extent(0), KokkosKernels::Impl::kk_get_exec_space_type()); bitset_t crossClusterEdgeMask(fineEntries.extent(0)); size_type numClusterEdges; { - BuildCrossClusterMaskFunctor 
buildEdgeMask(fineRowmap, fineEntries, - clusterOffsets, clusterVerts, - labels, crossClusterEdgeMask); - int sharedPerTeam = buildEdgeMask.team_shmem_size( - 0); // using team-size = 0 for since no per-thread shared is used. - int teamSize = KokkosKernels::Impl::get_suggested_team_size( - buildEdgeMask, vectorSize, sharedPerTeam, 0); + BuildCrossClusterMaskFunctor buildEdgeMask(fineRowmap, fineEntries, clusterOffsets, clusterVerts, labels, + crossClusterEdgeMask); + int sharedPerTeam = + buildEdgeMask.team_shmem_size(0); // using team-size = 0 for since no per-thread shared is used. + int teamSize = + KokkosKernels::Impl::get_suggested_team_size(buildEdgeMask, vectorSize, sharedPerTeam, 0); Kokkos::parallel_for( - team_pol(numCoarseVerts, teamSize, vectorSize) - .set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam)), + team_pol(numCoarseVerts, teamSize, vectorSize).set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam)), buildEdgeMask); numClusterEdges = crossClusterEdgeMask.count(); } - coarseRowmap = coarse_rowmap_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Cluster graph rowmap"), - numCoarseVerts + 1); + coarseRowmap = + coarse_rowmap_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Cluster graph rowmap"), numCoarseVerts + 1); coarseEntries = - coarse_entries_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, - "Cluster graph colinds"), - numClusterEdges); - Kokkos::parallel_scan( - range_pol(0, numFineVerts), - FillClusterEntriesFunctor(fineRowmap, fineEntries, coarseRowmap, - coarseEntries, clusterOffsets, clusterVerts, - labels, crossClusterEdgeMask)); + coarse_entries_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Cluster graph colinds"), numClusterEdges); + Kokkos::parallel_scan(range_pol(0, numFineVerts), + FillClusterEntriesFunctor(fineRowmap, fineEntries, coarseRowmap, coarseEntries, + clusterOffsets, clusterVerts, labels, crossClusterEdgeMask)); } coarse_rowmap_t coarseRowmap; diff --git a/graph/impl/KokkosGraph_color_d1_spec.hpp 
b/graph/impl/KokkosGraph_color_d1_spec.hpp index 5d66240763..178fdd9182 100644 --- a/graph/impl/KokkosGraph_color_d1_spec.hpp +++ b/graph/impl/KokkosGraph_color_d1_spec.hpp @@ -36,21 +36,17 @@ struct color_d1_eti_spec_avail { } // namespace Impl } // namespace KokkosGraph -#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_AVAIL(SCALAR_TYPE, ORDINAL_TYPE, \ - OFFSET_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - template <> \ - struct color_d1_eti_spec_avail< \ - KokkosKernels::Experimental::KokkosKernelsHandle< \ - const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_AVAIL(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template <> \ + struct color_d1_eti_spec_avail< \ + KokkosKernels::Experimental::KokkosKernelsHandle, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -63,24 +59,19 @@ namespace Impl { /// \brief Implementation of KokkosGraph::graph_color (distance-1 greedy /// coloring) -template ::value> +template ::value> struct COLOR_D1 { - static void color_d1(KernelHandle *handle, - typename lno_view_t::non_const_value_type num_rows, - size_view_t rowmap, lno_view_t entries); + static void color_d1(KernelHandle *handle, typename lno_view_t::non_const_value_type num_rows, size_view_t rowmap, + lno_view_t entries); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY template -struct COLOR_D1 { - static void color_d1(KernelHandle *handle, - typename lno_view_t::non_const_value_type num_rows, - size_view_t rowmap, lno_view_t entries) { +struct COLOR_D1 { + static void color_d1(KernelHandle *handle, typename 
lno_view_t::non_const_value_type num_rows, size_view_t rowmap, + lno_view_t entries) { KokkosGraph::Impl::graph_color_impl(handle, num_rows, rowmap, entries); } }; @@ -90,34 +81,26 @@ struct COLOR_D1, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_DECL(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + extern template struct COLOR_D1< \ + typename KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ false, true>; -#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, \ - OFFSET_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - template struct COLOR_D1< \ - KokkosKernels::Experimental::KokkosKernelsHandle< \ - const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template struct COLOR_D1< \ + KokkosKernels::Experimental::KokkosKernelsHandle, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ false, true>; #endif diff --git a/graph/src/KokkosGraph_CoarsenConstruct.hpp b/graph/src/KokkosGraph_CoarsenConstruct.hpp index 28de59979e..8e1cce3ddb 100644 --- a/graph/src/KokkosGraph_CoarsenConstruct.hpp +++ b/graph/src/KokkosGraph_CoarsenConstruct.hpp @@ -31,8 +31,7 @@ namespace KokkosSparse { namespace Impl { -template +template struct SortLowDegreeCrsMatrixFunctor { using size_type = typename rowmap_t::non_const_value_type; using lno_t = typename entries_t::non_const_value_type; @@ -40,27 +39,17 @@ 
struct SortLowDegreeCrsMatrixFunctor { using team_mem = typename Kokkos::TeamPolicy::member_type; using value_type = lno_t; - SortLowDegreeCrsMatrixFunctor(bool usingRangePol, const rowmap_t& _rowmap, - const entries_t& _entries, - const values_t& _values, - const lno_t _degreeLimit) - : rowmap(_rowmap), - entries(_entries), - values(_values), - degreeLimit(_degreeLimit) { + SortLowDegreeCrsMatrixFunctor(bool usingRangePol, const rowmap_t& _rowmap, const entries_t& _entries, + const values_t& _values, const lno_t _degreeLimit) + : rowmap(_rowmap), entries(_entries), values(_values), degreeLimit(_degreeLimit) { if (usingRangePol) { - entriesAux = - entries_t(Kokkos::ViewAllocateWithoutInitializing("Entries aux"), - entries.extent(0)); - valuesAux = - values_t(Kokkos::ViewAllocateWithoutInitializing("Values aux"), - values.extent(0)); + entriesAux = entries_t(Kokkos::ViewAllocateWithoutInitializing("Entries aux"), entries.extent(0)); + valuesAux = values_t(Kokkos::ViewAllocateWithoutInitializing("Values aux"), values.extent(0)); } // otherwise, aux arrays won't be allocated (sorting in place) } - KOKKOS_INLINE_FUNCTION void operator()(const lno_t i, - value_type& reducer) const { + KOKKOS_INLINE_FUNCTION void operator()(const lno_t i, value_type& reducer) const { size_type rowStart = rowmap(i); size_type rowEnd = rowmap(i + 1); lno_t rowNum = rowEnd - rowStart; @@ -71,13 +60,11 @@ struct SortLowDegreeCrsMatrixFunctor { // Radix sort requires unsigned keys for comparison using unsigned_lno_t = typename std::make_unsigned::type; KokkosKernels::SerialRadixSort2( - (unsigned_lno_t*)entries.data() + rowStart, - (unsigned_lno_t*)entriesAux.data() + rowStart, values.data() + rowStart, - valuesAux.data() + rowStart, rowNum); + (unsigned_lno_t*)entries.data() + rowStart, (unsigned_lno_t*)entriesAux.data() + rowStart, + values.data() + rowStart, valuesAux.data() + rowStart, rowNum); } - KOKKOS_INLINE_FUNCTION void operator()(const team_mem t, - value_type& reducer) const { + 
KOKKOS_INLINE_FUNCTION void operator()(const team_mem t, value_type& reducer) const { size_type i = t.league_rank(); size_type rowStart = rowmap(i); size_type rowEnd = rowmap(i + 1); @@ -86,8 +73,8 @@ struct SortLowDegreeCrsMatrixFunctor { Kokkos::single(Kokkos::PerTeam(t), [&]() { reducer++; }); return; } - KokkosKernels::TeamBitonicSort2( - entries.data() + rowStart, values.data() + rowStart, rowNum, t); + KokkosKernels::TeamBitonicSort2(entries.data() + rowStart, + values.data() + rowStart, rowNum, t); } rowmap_t rowmap; @@ -103,23 +90,19 @@ struct SortLowDegreeCrsMatrixFunctor { // Sort a CRS matrix: within each row, sort entries ascending by column. // At the same time, permute the values. // Only modifies rows below the degreeLimit -template +template typename entries_t::non_const_value_type sort_low_degree_rows_crs_matrix( const rowmap_t& rowmap, const entries_t& entries, const values_t& values, const typename entries_t::non_const_value_type degreeLimit) { using lno_t = typename entries_t::non_const_value_type; using team_pol = Kokkos::TeamPolicy; - bool useRadix = !KokkosKernels::Impl::kk_is_gpu_exec_space(); - Impl::SortLowDegreeCrsMatrixFunctor - funct(useRadix, rowmap, entries, values, degreeLimit); + bool useRadix = !KokkosKernels::Impl::kk_is_gpu_exec_space(); + Impl::SortLowDegreeCrsMatrixFunctor funct(useRadix, rowmap, entries, + values, degreeLimit); lno_t numRows = rowmap.extent(0) ? 
rowmap.extent(0) - 1 : 0; lno_t notSorted = 0; if (useRadix) { - Kokkos::parallel_reduce("sort_crs_matrix", - Kokkos::RangePolicy(0, numRows), - funct, notSorted); + Kokkos::parallel_reduce("sort_crs_matrix", Kokkos::RangePolicy(0, numRows), funct, notSorted); } else { // Try to get teamsize to be largest power of 2 not greater than avg entries // per row @@ -136,10 +119,8 @@ typename entries_t::non_const_value_type sort_low_degree_rows_crs_matrix( teamSize *= 2; } team_pol temp(numRows, teamSize); - teamSize = std::min(teamSize, - temp.team_size_max(funct, Kokkos::ParallelReduceTag())); - Kokkos::parallel_reduce("sort_crs_matrix", team_pol(numRows, teamSize), - funct, notSorted); + teamSize = std::min(teamSize, temp.team_size_max(funct, Kokkos::ParallelReduceTag())); + Kokkos::parallel_reduce("sort_crs_matrix", team_pol(numRows, teamSize), funct, notSorted); } return notSorted; } @@ -156,30 +137,27 @@ template class coarse_builder { public: // define internal types - using matrix_t = crsMat; - using exec_space = typename matrix_t::execution_space; - using mem_space = typename matrix_t::memory_space; - using Device = typename matrix_t::device_type; - using ordinal_t = typename matrix_t::ordinal_type; - using edge_offset_t = typename matrix_t::size_type; - using scalar_t = typename matrix_t::value_type; - using vtx_view_t = Kokkos::View; - using wgt_view_t = Kokkos::View; - using edge_view_t = Kokkos::View; - using edge_subview_t = Kokkos::View; - using graph_type = typename matrix_t::staticcrsgraph_type; - using policy_t = Kokkos::RangePolicy; - using dyn_policy_t = - Kokkos::RangePolicy, exec_space>; - using team_policy_t = Kokkos::TeamPolicy; - using dyn_team_policy_t = - Kokkos::TeamPolicy, exec_space>; - using member = typename team_policy_t::member_type; - using spgemm_kernel_handle = KokkosKernels::Experimental::KokkosKernelsHandle< - edge_offset_t, ordinal_t, scalar_t, exec_space, mem_space, mem_space>; - using uniform_memory_pool_t = - 
KokkosKernels::Impl::UniformMemoryPool; - using mapper_t = coarsen_heuristics; + using matrix_t = crsMat; + using exec_space = typename matrix_t::execution_space; + using mem_space = typename matrix_t::memory_space; + using Device = typename matrix_t::device_type; + using ordinal_t = typename matrix_t::ordinal_type; + using edge_offset_t = typename matrix_t::size_type; + using scalar_t = typename matrix_t::value_type; + using vtx_view_t = Kokkos::View; + using wgt_view_t = Kokkos::View; + using edge_view_t = Kokkos::View; + using edge_subview_t = Kokkos::View; + using graph_type = typename matrix_t::staticcrsgraph_type; + using policy_t = Kokkos::RangePolicy; + using dyn_policy_t = Kokkos::RangePolicy, exec_space>; + using team_policy_t = Kokkos::TeamPolicy; + using dyn_team_policy_t = Kokkos::TeamPolicy, exec_space>; + using member = typename team_policy_t::member_type; + using spgemm_kernel_handle = KokkosKernels::Experimental::KokkosKernelsHandle; + using uniform_memory_pool_t = KokkosKernels::Impl::UniformMemoryPool; + using mapper_t = coarsen_heuristics; static constexpr ordinal_t get_null_val() { // this value must line up with the null value used by the hashmap // accumulator @@ -189,10 +167,9 @@ class coarse_builder { return std::numeric_limits::max(); } } - static constexpr ordinal_t ORD_MAX = get_null_val(); - static constexpr bool is_host_space = std::is_same< - typename exec_space::memory_space, - typename Kokkos::DefaultHostExecutionSpace::memory_space>::value; + static constexpr ordinal_t ORD_MAX = get_null_val(); + static constexpr bool is_host_space = + std::is_same::value; static constexpr bool scal_eq_ord = std::is_same::value; // contains matrix and vertex weights corresponding to current level // interp matrix maps previous level to this level @@ -222,9 +199,7 @@ class coarse_builder { }; // determine if dynamic scheduling should be used - static bool should_use_dyn( - const ordinal_t n, const Kokkos::View work, - int t_count) { + static bool 
should_use_dyn(const ordinal_t n, const Kokkos::View work, int t_count) { bool use_dyn = false; edge_offset_t max = 0; edge_offset_t min = std::numeric_limits::max(); @@ -252,19 +227,16 @@ class coarse_builder { // build the course graph according to ((B^T A) B) or (B^T (A B)), where B is // aggregator matrix - static coarse_level_triple build_coarse_graph_spgemm( - coarsen_handle& handle, const coarse_level_triple level, - const matrix_t interp_mtx) { + static coarse_level_triple build_coarse_graph_spgemm(coarsen_handle& handle, const coarse_level_triple level, + const matrix_t interp_mtx) { vtx_view_t f_vtx_w = level.vtx_wgts; matrix_t g = level.mtx; - if (!KokkosSparse::Impl::isCrsGraphSorted(g.graph.row_map, g.graph.entries)) - KokkosSparse::sort_crs_matrix(g); + if (!KokkosSparse::Impl::isCrsGraphSorted(g.graph.row_map, g.graph.entries)) KokkosSparse::sort_crs_matrix(g); ordinal_t n = g.numRows(); ordinal_t nc = interp_mtx.numCols(); - matrix_t interp_transpose = - KokkosSparse::Impl::transpose_matrix(interp_mtx); + matrix_t interp_transpose = KokkosSparse::Impl::transpose_matrix(interp_mtx); KokkosSparse::sort_crs_matrix(interp_transpose); spgemm_kernel_handle kh; @@ -278,78 +250,60 @@ class coarse_builder { if (handle.b == Spgemm_transpose_first) { kh.create_spgemm_handle(); edge_view_t row_map_p1("rows_partial", nc + 1); - KokkosSparse::Experimental::spgemm_symbolic( - &kh, nc, n, n, interp_transpose.graph.row_map, - interp_transpose.graph.entries, false, g.graph.row_map, - g.graph.entries, false, row_map_p1); + KokkosSparse::Experimental::spgemm_symbolic(&kh, nc, n, n, interp_transpose.graph.row_map, + interp_transpose.graph.entries, false, g.graph.row_map, + g.graph.entries, false, row_map_p1); // partial-result matrix - vtx_view_t entries_p1("adjacencies_partial", - kh.get_spgemm_handle()->get_c_nnz()); - wgt_view_t values_p1("weights_partial", - kh.get_spgemm_handle()->get_c_nnz()); + vtx_view_t entries_p1("adjacencies_partial", 
kh.get_spgemm_handle()->get_c_nnz()); + wgt_view_t values_p1("weights_partial", kh.get_spgemm_handle()->get_c_nnz()); KokkosSparse::Experimental::spgemm_numeric( - &kh, nc, n, n, interp_transpose.graph.row_map, - interp_transpose.graph.entries, interp_transpose.values, false, - g.graph.row_map, g.graph.entries, g.values, false, row_map_p1, - entries_p1, values_p1); + &kh, nc, n, n, interp_transpose.graph.row_map, interp_transpose.graph.entries, interp_transpose.values, false, + g.graph.row_map, g.graph.entries, g.values, false, row_map_p1, entries_p1, values_p1); kh.destroy_spgemm_handle(); row_map_coarse = edge_view_t("rows_coarse", nc + 1); kh.create_spgemm_handle(); - KokkosSparse::Experimental::spgemm_symbolic( - &kh, nc, n, nc, row_map_p1, entries_p1, false, - interp_mtx.graph.row_map, interp_mtx.graph.entries, false, - row_map_coarse); + KokkosSparse::Experimental::spgemm_symbolic(&kh, nc, n, nc, row_map_p1, entries_p1, false, + interp_mtx.graph.row_map, interp_mtx.graph.entries, false, + row_map_coarse); // coarse-graph adjacency matrix - adj_coarse = - vtx_view_t("adjacencies_coarse", kh.get_spgemm_handle()->get_c_nnz()); - wgt_coarse = - wgt_view_t("weights_coarse", kh.get_spgemm_handle()->get_c_nnz()); + adj_coarse = vtx_view_t("adjacencies_coarse", kh.get_spgemm_handle()->get_c_nnz()); + wgt_coarse = wgt_view_t("weights_coarse", kh.get_spgemm_handle()->get_c_nnz()); - KokkosSparse::Experimental::spgemm_numeric( - &kh, nc, n, nc, row_map_p1, entries_p1, values_p1, false, - interp_mtx.graph.row_map, interp_mtx.graph.entries, interp_mtx.values, - false, row_map_coarse, adj_coarse, wgt_coarse); + KokkosSparse::Experimental::spgemm_numeric(&kh, nc, n, nc, row_map_p1, entries_p1, values_p1, false, + interp_mtx.graph.row_map, interp_mtx.graph.entries, interp_mtx.values, + false, row_map_coarse, adj_coarse, wgt_coarse); kh.destroy_spgemm_handle(); } else { edge_view_t row_map_p1("rows_partial", n + 1); kh.create_spgemm_handle(); - 
KokkosSparse::Experimental::spgemm_symbolic( - &kh, n, n, nc, g.graph.row_map, g.graph.entries, false, - interp_mtx.graph.row_map, interp_mtx.graph.entries, false, - row_map_p1); + KokkosSparse::Experimental::spgemm_symbolic(&kh, n, n, nc, g.graph.row_map, g.graph.entries, false, + interp_mtx.graph.row_map, interp_mtx.graph.entries, false, + row_map_p1); // partial-result matrix - vtx_view_t entries_p1("adjacencies_partial", - kh.get_spgemm_handle()->get_c_nnz()); - wgt_view_t values_p1("weights_partial", - kh.get_spgemm_handle()->get_c_nnz()); + vtx_view_t entries_p1("adjacencies_partial", kh.get_spgemm_handle()->get_c_nnz()); + wgt_view_t values_p1("weights_partial", kh.get_spgemm_handle()->get_c_nnz()); - KokkosSparse::Experimental::spgemm_numeric( - &kh, n, n, nc, g.graph.row_map, g.graph.entries, g.values, false, - interp_mtx.graph.row_map, interp_mtx.graph.entries, interp_mtx.values, - false, row_map_p1, entries_p1, values_p1); + KokkosSparse::Experimental::spgemm_numeric(&kh, n, n, nc, g.graph.row_map, g.graph.entries, g.values, false, + interp_mtx.graph.row_map, interp_mtx.graph.entries, interp_mtx.values, + false, row_map_p1, entries_p1, values_p1); kh.destroy_spgemm_handle(); row_map_coarse = edge_view_t("rows_coarse", nc + 1); kh.create_spgemm_handle(); - KokkosSparse::Experimental::spgemm_symbolic( - &kh, nc, n, nc, interp_transpose.graph.row_map, - interp_transpose.graph.entries, false, row_map_p1, entries_p1, false, - row_map_coarse); + KokkosSparse::Experimental::spgemm_symbolic(&kh, nc, n, nc, interp_transpose.graph.row_map, + interp_transpose.graph.entries, false, row_map_p1, entries_p1, false, + row_map_coarse); // coarse-graph adjacency matrix - adj_coarse = - vtx_view_t("adjacencies_coarse", kh.get_spgemm_handle()->get_c_nnz()); - wgt_coarse = - wgt_view_t("weights_coarse", kh.get_spgemm_handle()->get_c_nnz()); + adj_coarse = vtx_view_t("adjacencies_coarse", kh.get_spgemm_handle()->get_c_nnz()); + wgt_coarse = wgt_view_t("weights_coarse", 
kh.get_spgemm_handle()->get_c_nnz()); KokkosSparse::Experimental::spgemm_numeric( - &kh, nc, n, nc, interp_transpose.graph.row_map, - interp_transpose.graph.entries, interp_transpose.values, false, - row_map_p1, entries_p1, values_p1, false, row_map_coarse, adj_coarse, - wgt_coarse); + &kh, nc, n, nc, interp_transpose.graph.row_map, interp_transpose.graph.entries, interp_transpose.values, + false, row_map_p1, entries_p1, values_p1, false, row_map_coarse, adj_coarse, wgt_coarse); kh.destroy_spgemm_handle(); } @@ -362,8 +316,7 @@ class coarse_builder { Kokkos::parallel_for( policy_t(0, nc), KOKKOS_LAMBDA(ordinal_t u) { - for (edge_offset_t j = row_map_coarse(u); j < row_map_coarse(u + 1); - j++) { + for (edge_offset_t j = row_map_coarse(u); j < row_map_coarse(u + 1); j++) { if (adj_coarse(j) != u) { nonLoops(u)++; } @@ -373,8 +326,7 @@ class coarse_builder { edge_view_t row_map_nonloop("nonloop row map", nc + 1); Kokkos::parallel_scan( - policy_t(0, nc), KOKKOS_LAMBDA(const ordinal_t i, edge_offset_t& update, - const bool final) { + policy_t(0, nc), KOKKOS_LAMBDA(const ordinal_t i, edge_offset_t& update, const bool final) { const edge_offset_t val_i = nonLoops(i); update += val_i; if (final) { @@ -394,8 +346,7 @@ class coarse_builder { Kokkos::parallel_for( policy_t(0, nc), KOKKOS_LAMBDA(const ordinal_t u) { - for (edge_offset_t j = row_map_coarse(u); j < row_map_coarse(u + 1); - j++) { + for (edge_offset_t j = row_map_coarse(u); j < row_map_coarse(u + 1); j++) { if (adj_coarse(j) != u) { edge_offset_t offset = row_map_nonloop(u) + nonLoops(u)++; entries_nonloop(offset) = adj_coarse(j); @@ -412,8 +363,7 @@ class coarse_builder { vtx_view_t c_vtx_w("coarse vtx weights", interp_mtx.numCols()); Kokkos::parallel_for( - "compute coarse vtx wgts", policy_t(0, n), - KOKKOS_LAMBDA(const ordinal_t i) { + "compute coarse vtx wgts", policy_t(0, n), KOKKOS_LAMBDA(const ordinal_t i) { ordinal_t u = interp_mtx.graph.entries(i); Kokkos::atomic_add(&c_vtx_w(u), f_vtx_w(i)); }); @@ 
-431,12 +381,10 @@ class coarse_builder { vtx_view_t input; edge_view_t output; - prefix_sum(vtx_view_t _input, edge_view_t _output) - : input(_input), output(_output) {} + prefix_sum(vtx_view_t _input, edge_view_t _output) : input(_input), output(_output) {} KOKKOS_INLINE_FUNCTION - void operator()(const ordinal_t i, edge_offset_t& update, - const bool final) const { + void operator()(const ordinal_t i, edge_offset_t& update, const bool final) const { const edge_offset_t val_i = input(i); update += val_i; if (final) { @@ -455,11 +403,8 @@ class coarse_builder { vtx_view_t dedupe_edge_count; ordinal_t degreeLimit; - functorDedupeLowDegreeAfterSort(edge_view_t _row_map, vtx_view_t _entries, - vtx_view_t _entriesOut, wgt_view_t _wgts, - wgt_view_t _wgtsOut, - vtx_view_t _dedupe_edge_count, - ordinal_t _degreeLimit_) + functorDedupeLowDegreeAfterSort(edge_view_t _row_map, vtx_view_t _entries, vtx_view_t _entriesOut, wgt_view_t _wgts, + wgt_view_t _wgtsOut, vtx_view_t _dedupe_edge_count, ordinal_t _degreeLimit_) : row_map(_row_map), entries(_entries), entriesOut(_entriesOut), @@ -477,31 +422,28 @@ class coarse_builder { if (degree > degreeLimit) { return; } - Kokkos::parallel_scan( - Kokkos::TeamThreadRange(thread, start, end), - [&](const edge_offset_t& i, edge_offset_t& update, const bool final) { - if (i == start) { - update += 1; - } else if (entries(i) != entries(i - 1)) { - update += 1; - } - if (final) { - entriesOut(start + update - 1) = entries(i); - // requires that wgtsOut be initialized to 0 - Kokkos::atomic_add(&wgtsOut(start + update - 1), wgts(i)); - if (i + 1 == end) { - dedupe_edge_count(u) = update; - } - } - }); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(thread, start, start + dedupe_edge_count(u)), - [&](const edge_offset_t& i) { - entries(i) = entriesOut(i); - wgts(i) = wgtsOut(i); - }); - Kokkos::single(Kokkos::PerTeam(thread), - [&]() { thread_sum += dedupe_edge_count(u); }); + Kokkos::parallel_scan(Kokkos::TeamThreadRange(thread, start, 
end), + [&](const edge_offset_t& i, edge_offset_t& update, const bool final) { + if (i == start) { + update += 1; + } else if (entries(i) != entries(i - 1)) { + update += 1; + } + if (final) { + entriesOut(start + update - 1) = entries(i); + // requires that wgtsOut be initialized to 0 + Kokkos::atomic_add(&wgtsOut(start + update - 1), wgts(i)); + if (i + 1 == end) { + dedupe_edge_count(u) = update; + } + } + }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, start, start + dedupe_edge_count(u)), + [&](const edge_offset_t& i) { + entries(i) = entriesOut(i); + wgts(i) = wgtsOut(i); + }); + Kokkos::single(Kokkos::PerTeam(thread), [&]() { thread_sum += dedupe_edge_count(u); }); } KOKKOS_INLINE_FUNCTION @@ -536,8 +478,7 @@ class coarse_builder { wgt_view_t wgts, wgtsOut; vtx_view_t dedupe_edge_count; - functorDedupeAfterSort(edge_view_t _row_map, vtx_view_t _entries, - vtx_view_t _entriesOut, wgt_view_t _wgts, + functorDedupeAfterSort(edge_view_t _row_map, vtx_view_t _entries, vtx_view_t _entriesOut, wgt_view_t _wgts, wgt_view_t _wgtsOut, vtx_view_t _dedupe_edge_count) : row_map(_row_map), entries(_entries), @@ -551,25 +492,23 @@ class coarse_builder { ordinal_t u = thread.league_rank(); edge_offset_t start = row_map(u); edge_offset_t end = row_map(u + 1); - Kokkos::parallel_scan( - Kokkos::TeamThreadRange(thread, start, end), - [&](const edge_offset_t& i, edge_offset_t& update, const bool final) { - if (i == start) { - update += 1; - } else if (entries(i) != entries(i - 1)) { - update += 1; - } - if (final) { - entriesOut(start + update - 1) = entries(i); - // requires that wgtsOut be initialized to 0 - Kokkos::atomic_add(&wgtsOut(start + update - 1), wgts(i)); - if (i + 1 == end) { - dedupe_edge_count(u) = update; - } - } - }); - Kokkos::single(Kokkos::PerTeam(thread), - [&]() { thread_sum += dedupe_edge_count(u); }); + Kokkos::parallel_scan(Kokkos::TeamThreadRange(thread, start, end), + [&](const edge_offset_t& i, edge_offset_t& update, const bool final) { + 
if (i == start) { + update += 1; + } else if (entries(i) != entries(i - 1)) { + update += 1; + } + if (final) { + entriesOut(start + update - 1) = entries(i); + // requires that wgtsOut be initialized to 0 + Kokkos::atomic_add(&wgtsOut(start + update - 1), wgts(i)); + if (i + 1 == end) { + dedupe_edge_count(u) = update; + } + } + }); + Kokkos::single(Kokkos::PerTeam(thread), [&]() { thread_sum += dedupe_edge_count(u); }); } KOKKOS_INLINE_FUNCTION @@ -601,11 +540,10 @@ class coarse_builder { const wgt_view_t source_wgts; wgt_view_t target_wgts; - functorCollapseDirectedToUndirected( - const edge_view_t _source_row_map, const edge_view_t _target_row_map, - const vtx_view_t _source_edge_counts, vtx_view_t _target_edge_counts, - const vtx_view_t _source_destinations, vtx_view_t _target_destinations, - const wgt_view_t _source_wgts, wgt_view_t _target_wgts) + functorCollapseDirectedToUndirected(const edge_view_t _source_row_map, const edge_view_t _target_row_map, + const vtx_view_t _source_edge_counts, vtx_view_t _target_edge_counts, + const vtx_view_t _source_destinations, vtx_view_t _target_destinations, + const wgt_view_t _source_wgts, wgt_view_t _target_wgts) : source_row_map(_source_row_map), target_row_map(_target_row_map), source_edge_counts(_source_edge_counts), @@ -620,24 +558,18 @@ class coarse_builder { ordinal_t u = thread.league_rank(); edge_offset_t u_origin = source_row_map(u); edge_offset_t u_dest_offset = target_row_map(u); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(thread, source_edge_counts(u)), - [&](const edge_offset_t u_idx) { - ordinal_t v = source_destinations(u_origin + u_idx); - scalar_t wgt = source_wgts(u_origin + u_idx); - edge_offset_t v_dest_offset = target_row_map(v); - edge_offset_t v_dest = - v_dest_offset + - Kokkos::atomic_fetch_add(&target_edge_counts(v), 1); - edge_offset_t u_dest = - u_dest_offset + - Kokkos::atomic_fetch_add(&target_edge_counts(u), 1); - - target_destinations(u_dest) = v; - target_wgts(u_dest) = wgt; - 
target_destinations(v_dest) = u; - target_wgts(v_dest) = wgt; - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, source_edge_counts(u)), [&](const edge_offset_t u_idx) { + ordinal_t v = source_destinations(u_origin + u_idx); + scalar_t wgt = source_wgts(u_origin + u_idx); + edge_offset_t v_dest_offset = target_row_map(v); + edge_offset_t v_dest = v_dest_offset + Kokkos::atomic_fetch_add(&target_edge_counts(v), 1); + edge_offset_t u_dest = u_dest_offset + Kokkos::atomic_fetch_add(&target_edge_counts(u), 1); + + target_destinations(u_dest) = v; + target_wgts(u_dest) = wgt; + target_destinations(v_dest) = u; + target_wgts(v_dest) = wgt; + }); } }; @@ -654,14 +586,10 @@ class coarse_builder { vtx_view_t remaining; bool use_out; - functorHashmapAccumulator(edge_view_t _row_map, vtx_view_t _entries_in, - vtx_view_t _entries_out, wgt_view_t _wgts_in, - wgt_view_t _wgts_out, - vtx_view_t _dedupe_edge_count, - uniform_memory_pool_t _memory_pool, - const ordinal_t _hash_size, - const ordinal_t _max_hash_entries, - vtx_view_t _remaining, bool _use_out) + functorHashmapAccumulator(edge_view_t _row_map, vtx_view_t _entries_in, vtx_view_t _entries_out, + wgt_view_t _wgts_in, wgt_view_t _wgts_out, vtx_view_t _dedupe_edge_count, + uniform_memory_pool_t _memory_pool, const ordinal_t _hash_size, + const ordinal_t _max_hash_entries, vtx_view_t _remaining, bool _use_out) : row_map(_row_map), entries_in(_entries_in), entries_out(_entries_out), @@ -680,12 +608,10 @@ class coarse_builder { if (std::is_same::value) return 0; #endif #if defined(KOKKOS_ENABLE_OPENMP) - if (std::is_same::value) - return Kokkos::OpenMP::impl_hardware_thread_id(); + if (std::is_same::value) return Kokkos::OpenMP::impl_hardware_thread_id(); #endif #if defined(KOKKOS_ENABLE_THREADS) - if (std::is_same::value) - return Kokkos::Threads::impl_hardware_thread_id(); + if (std::is_same::value) return Kokkos::Threads::impl_hardware_thread_id(); #endif return row_index; } @@ -745,17 +671,15 @@ class 
coarse_builder { // Set pointer to hash values scalar_t* values = (scalar_t*)wgts_out.data() + row_map(idx); - KokkosKernels::Experimental::HashmapAccumulator< - hash_size_type, hash_key_type, hash_value_type, - KokkosKernels::Experimental::HashOpType::bitwiseAnd> - hash_map(hash_size, hash_func_pow2, hash_begins, hash_nexts, keys, - values); + KokkosKernels::Experimental::HashmapAccumulator + hash_map(hash_size, hash_func_pow2, hash_begins, hash_nexts, keys, values); for (edge_offset_t i = row_map(idx); i < row_map(idx + 1); i++) { ordinal_t key = entries_in(i); scalar_t value = wgts_in(i); - hash_map.sequential_insert_into_hash_mergeAdd_TrackHashes( - key, value, used_hash_size, used_hash_count, used_hash_indices); + hash_map.sequential_insert_into_hash_mergeAdd_TrackHashes(key, value, used_hash_size, used_hash_count, + used_hash_indices); }; // Reset the Begins values to -1 before releasing the memory pool chunk. @@ -797,8 +721,7 @@ class coarse_builder { // Acquire a chunk from the memory pool using a spin-loop. 
ptr_write = nullptr; while (nullptr == ptr_write) { - ptr_write = (volatile ordinal_t*)(memory_pool.allocate_chunk( - thread.league_rank())); + ptr_write = (volatile ordinal_t*)(memory_pool.allocate_chunk(thread.league_rank())); } }, ptr_temp); @@ -848,29 +771,23 @@ class coarse_builder { values = (scalar_t*)(ptr_temp); } - KokkosKernels::Experimental::HashmapAccumulator< - hash_size_type, hash_key_type, hash_value_type, - KokkosKernels::Experimental::HashOpType::bitwiseAnd> - hash_map(hash_size, hash_func_pow2, hash_begins, hash_nexts, keys, - values); - - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(thread, row_map(idx), row_map(idx + 1)), - [&](const edge_offset_t& i) { - ordinal_t key = entries_in(i); - scalar_t value = wgts_in(i); - // duplicate keys may be inserted simultaneously, this causes - // problems we must handle later - int r = - hash_map - .vector_atomic_insert_into_hash_mergeAtomicAdd_TrackHashes( - key, value, used_hash_size, used_hash_count, - used_hash_indices); - - // Check return code - if (r) { - } - }); + KokkosKernels::Experimental::HashmapAccumulator + hash_map(hash_size, hash_func_pow2, hash_begins, hash_nexts, keys, values); + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(thread, row_map(idx), row_map(idx + 1)), + [&](const edge_offset_t& i) { + ordinal_t key = entries_in(i); + scalar_t value = wgts_in(i); + // duplicate keys may be inserted simultaneously, this causes + // problems we must handle later + int r = hash_map.vector_atomic_insert_into_hash_mergeAtomicAdd_TrackHashes( + key, value, used_hash_size, used_hash_count, used_hash_indices); + + // Check return code + if (r) { + } + }); thread.team_barrier(); // Reset the Begins values to -1 before releasing the memory pool chunk. 
@@ -879,72 +796,49 @@ class coarse_builder { // there can be duplicate key insertions (these are hopefully rare or else // performance will suffer) This did not work as a TeamThreadRange, don't // know why (possibly issues with atomic addition on write_idx) - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(thread, (ordinal_t)0, *used_hash_count), - [&](const ordinal_t& i) { - ordinal_t dirty_hash = used_hash_indices[i]; - - ordinal_t bucket = hash_begins[dirty_hash]; - - // ascending-key bubble-sort the linked list - // it really do be like that sometimes - ordinal_t end_inner = ORD_MAX; - while (end_inner != bucket) { - ordinal_t last_idx = bucket; - ordinal_t last_key = keys[last_idx]; - scalar_t last_val = values[last_idx]; - bool is_sorted = true; - // bubble-up - for (ordinal_t k = hash_nexts[bucket]; k != end_inner; - k = hash_nexts[k]) { - // swap - if (keys[k] < last_key) { - keys[last_idx] = keys[k]; - values[last_idx] = values[k]; - keys[k] = last_key; - values[k] = last_val; - is_sorted = false; - } - // increment last - last_key = keys[k]; - last_val = values[k]; - last_idx = k; - } - end_inner = last_idx; - if (is_sorted) { - // end the outer loop - end_inner = bucket; - } - } - ordinal_t key = keys[bucket]; - scalar_t val = values[bucket]; - ordinal_t last = bucket; - // merge linked list and write out - for (ordinal_t j = hash_nexts[bucket]; j != ORD_MAX; - j = hash_nexts[j]) { - if (keys[j] == key) { - val += values[j]; - } else { - ordinal_t write_at = - row_map(idx) + Kokkos::atomic_fetch_add(write_idx, 1); - entries_out(write_at) = key; - if (use_out) { - // reuse wgts_in as scratch space because we are overwriting - // working memory if we use wgts_out - wgts_in(write_at) = val; - } else { - wgts_out(write_at) = val; - } - key = keys[j]; - val = values[j]; - } - hash_nexts[last] = ORD_MAX; - last = j; + Kokkos::parallel_for(Kokkos::ThreadVectorRange(thread, (ordinal_t)0, *used_hash_count), [&](const ordinal_t& i) { + ordinal_t dirty_hash = 
used_hash_indices[i]; + + ordinal_t bucket = hash_begins[dirty_hash]; + + // ascending-key bubble-sort the linked list + // it really do be like that sometimes + ordinal_t end_inner = ORD_MAX; + while (end_inner != bucket) { + ordinal_t last_idx = bucket; + ordinal_t last_key = keys[last_idx]; + scalar_t last_val = values[last_idx]; + bool is_sorted = true; + // bubble-up + for (ordinal_t k = hash_nexts[bucket]; k != end_inner; k = hash_nexts[k]) { + // swap + if (keys[k] < last_key) { + keys[last_idx] = keys[k]; + values[last_idx] = values[k]; + keys[k] = last_key; + values[k] = last_val; + is_sorted = false; } - hash_nexts[last] = ORD_MAX; - // write out the final entry in linked list - ordinal_t write_at = - row_map(idx) + Kokkos::atomic_fetch_add(write_idx, 1); + // increment last + last_key = keys[k]; + last_val = values[k]; + last_idx = k; + } + end_inner = last_idx; + if (is_sorted) { + // end the outer loop + end_inner = bucket; + } + } + ordinal_t key = keys[bucket]; + scalar_t val = values[bucket]; + ordinal_t last = bucket; + // merge linked list and write out + for (ordinal_t j = hash_nexts[bucket]; j != ORD_MAX; j = hash_nexts[j]) { + if (keys[j] == key) { + val += values[j]; + } else { + ordinal_t write_at = row_map(idx) + Kokkos::atomic_fetch_add(write_idx, 1); entries_out(write_at) = key; if (use_out) { // reuse wgts_in as scratch space because we are overwriting @@ -953,17 +847,31 @@ class coarse_builder { } else { wgts_out(write_at) = val; } - hash_begins[dirty_hash] = ORD_MAX; - }); + key = keys[j]; + val = values[j]; + } + hash_nexts[last] = ORD_MAX; + last = j; + } + hash_nexts[last] = ORD_MAX; + // write out the final entry in linked list + ordinal_t write_at = row_map(idx) + Kokkos::atomic_fetch_add(write_idx, 1); + entries_out(write_at) = key; + if (use_out) { + // reuse wgts_in as scratch space because we are overwriting + // working memory if we use wgts_out + wgts_in(write_at) = val; + } else { + wgts_out(write_at) = val; + } + 
hash_begins[dirty_hash] = ORD_MAX; + }); thread.team_barrier(); // need to copy from wgts_in to wgts_out if we used wgts_in as scratch // space if (use_out) { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(thread, (ordinal_t)0, *write_idx), - [&](const ordinal_t& i) { - wgts_out(row_map(idx) + i) = wgts_in(row_map(idx) + i); - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(thread, (ordinal_t)0, *write_idx), + [&](const ordinal_t& i) { wgts_out(row_map(idx) + i) = wgts_in(row_map(idx) + i); }); } Kokkos::single(Kokkos::PerTeam(thread), [&]() { @@ -978,14 +886,11 @@ class coarse_builder { }; // functorHashmapAccumulator - static void getHashmapSizeAndCount( - coarsen_handle& handle, const ordinal_t n, - const ordinal_t remaining_count, vtx_view_t remaining, - vtx_view_t edges_per_source, ordinal_t& hash_size, ordinal_t& max_entries, - ordinal_t& mem_chunk_size, ordinal_t& mem_chunk_count) { + static void getHashmapSizeAndCount(coarsen_handle& handle, const ordinal_t n, const ordinal_t remaining_count, + vtx_view_t remaining, vtx_view_t edges_per_source, ordinal_t& hash_size, + ordinal_t& max_entries, ordinal_t& mem_chunk_size, ordinal_t& mem_chunk_count) { ordinal_t avg_entries = 0; - if (!is_host_space && - static_cast(remaining_count) / static_cast(n) > 0.01) { + if (!is_host_space && static_cast(remaining_count) / static_cast(n) > 0.01) { Kokkos::parallel_reduce( "calc average among remaining", policy_t(0, remaining_count), KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& thread_sum) { @@ -1024,12 +929,11 @@ class coarse_builder { } // Determine memory chunk size for UniformMemoryPool - mem_chunk_size = hash_size; // for hash indices - mem_chunk_size += hash_size; // for hash begins - mem_chunk_size += - 3 * max_entries; // for hash nexts, keys, and values (unless scalar_t - // != ordinal_t, in which case memory is unused) - mem_chunk_size += 10; // for metadata + mem_chunk_size = hash_size; // for hash indices + mem_chunk_size += hash_size; // for hash 
begins + mem_chunk_size += 3 * max_entries; // for hash nexts, keys, and values (unless scalar_t + // != ordinal_t, in which case memory is unused) + mem_chunk_size += 10; // for metadata mem_chunk_count = exec_space().concurrency(); if (mem_chunk_count > remaining_count) { mem_chunk_count = remaining_count + 1; @@ -1037,34 +941,27 @@ class coarse_builder { if (!is_host_space) { // decrease number of mem_chunks to reduce memory usage if necessary - size_t mem_needed = static_cast(mem_chunk_count) * - static_cast(mem_chunk_size) * - sizeof(ordinal_t); + size_t mem_needed = + static_cast(mem_chunk_count) * static_cast(mem_chunk_size) * sizeof(ordinal_t); //~500MB size_t max_mem_allowed = handle.max_mem_allowed; if (mem_needed > max_mem_allowed) { size_t chunk_dif = mem_needed - max_mem_allowed; - chunk_dif = chunk_dif / - (static_cast(mem_chunk_size) * sizeof(ordinal_t)); + chunk_dif = chunk_dif / (static_cast(mem_chunk_size) * sizeof(ordinal_t)); chunk_dif++; mem_chunk_count -= chunk_dif; } } } - static void deduplicate_graph(coarsen_handle& handle, const ordinal_t n, - const bool use_team, - vtx_view_t edges_per_source, - vtx_view_t dest_by_source, - wgt_view_t wgt_by_source, - const edge_view_t source_bucket_offset, - edge_offset_t& gc_nedges) { + static void deduplicate_graph(coarsen_handle& handle, const ordinal_t n, const bool use_team, + vtx_view_t edges_per_source, vtx_view_t dest_by_source, wgt_view_t wgt_by_source, + const edge_view_t source_bucket_offset, edge_offset_t& gc_nedges) { if (handle.b == Hashmap || is_host_space) { ordinal_t remaining_count = n; vtx_view_t remaining("remaining vtx", n); Kokkos::parallel_for( - policy_t(0, n), - KOKKOS_LAMBDA(const ordinal_t i) { remaining(i) = i; }); + policy_t(0, n), KOKKOS_LAMBDA(const ordinal_t i) { remaining(i) = i; }); // deduplicate rows in phases starting with the small degree rows so we // can use small hashmaps increase the hashmap size each phase to the // necessary size for twice the average of 
remaining rows @@ -1076,12 +973,10 @@ class coarse_builder { do { // determine size for hashmap ordinal_t hash_size, max_entries, mem_chunk_size, mem_chunk_count; - getHashmapSizeAndCount(handle, n, remaining_count, remaining, - edges_per_source, hash_size, max_entries, + getHashmapSizeAndCount(handle, n, remaining_count, remaining, edges_per_source, hash_size, max_entries, mem_chunk_size, mem_chunk_count); // Create Uniform Initialized Memory Pool - KokkosKernels::Impl::PoolType pool_type = - KokkosKernels::Impl::ManyThread2OneChunk; + KokkosKernels::Impl::PoolType pool_type = KokkosKernels::Impl::ManyThread2OneChunk; if (is_host_space) { pool_type = KokkosKernels::Impl::OneThread2OneChunk; @@ -1089,29 +984,23 @@ class coarse_builder { bool use_dyn = should_use_dyn(n, source_bucket_offset, mem_chunk_count); - uniform_memory_pool_t memory_pool(mem_chunk_count, mem_chunk_size, - ORD_MAX, pool_type); + uniform_memory_pool_t memory_pool(mem_chunk_count, mem_chunk_size, ORD_MAX, pool_type); - functorHashmapAccumulator hashmapAccumulator( - source_bucket_offset, dest_by_source, dest_by_source, wgt_by_source, - wgt_out, edges_per_source, memory_pool, hash_size, max_entries, - remaining, !scal_eq_ord); + functorHashmapAccumulator hashmapAccumulator(source_bucket_offset, dest_by_source, dest_by_source, + wgt_by_source, wgt_out, edges_per_source, memory_pool, hash_size, + max_entries, remaining, !scal_eq_ord); ordinal_t old_remaining_count = remaining_count; if (!is_host_space && max_entries >= 128) { - Kokkos::parallel_reduce("hashmap time", - team_policy_t(old_remaining_count, 1, 64), - hashmapAccumulator, remaining_count); + Kokkos::parallel_reduce("hashmap time", team_policy_t(old_remaining_count, 1, 64), hashmapAccumulator, + remaining_count); } else { if (use_dyn) { - Kokkos::parallel_reduce( - "hashmap time", - dyn_policy_t(0, old_remaining_count, Kokkos::ChunkSize(128)), - hashmapAccumulator, remaining_count); - } else { - Kokkos::parallel_reduce("hashmap time", - 
policy_t(0, old_remaining_count), + Kokkos::parallel_reduce("hashmap time", dyn_policy_t(0, old_remaining_count, Kokkos::ChunkSize(128)), hashmapAccumulator, remaining_count); + } else { + Kokkos::parallel_reduce("hashmap time", policy_t(0, old_remaining_count), hashmapAccumulator, + remaining_count); } } @@ -1120,8 +1009,7 @@ class coarse_builder { Kokkos::parallel_scan( "move remaining vertices", policy_t(0, old_remaining_count), - KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, - const bool final) { + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, const bool final) { ordinal_t u = remaining(i); if (edges_per_source(u) >= max_entries) { if (final) { @@ -1135,39 +1023,31 @@ class coarse_builder { } } while (remaining_count > 0); Kokkos::parallel_reduce( - policy_t(0, n), - KOKKOS_LAMBDA(const ordinal_t i, edge_offset_t& sum) { - sum += edges_per_source(i); - }, + policy_t(0, n), KOKKOS_LAMBDA(const ordinal_t i, edge_offset_t& sum) { sum += edges_per_source(i); }, gc_nedges); if (!scal_eq_ord && !is_host_space) { Kokkos::deep_copy(wgt_by_source, wgt_out); } } else if (handle.b == Sort) { // sort the (implicit) crs matrix - KokkosSparse::sort_crs_matrix(source_bucket_offset, - dest_by_source, wgt_by_source); + KokkosSparse::sort_crs_matrix(source_bucket_offset, + dest_by_source, wgt_by_source); // combine adjacent entries that are equal if (use_team) { // thread team version wgt_view_t wgts_out("wgts after dedupe", wgt_by_source.extent(0)); vtx_view_t dest_out("dest after dedupe", dest_by_source.extent(0)); - functorDedupeAfterSort deduper(source_bucket_offset, dest_by_source, - dest_out, wgt_by_source, wgts_out, + functorDedupeAfterSort deduper(source_bucket_offset, dest_by_source, dest_out, wgt_by_source, wgts_out, edges_per_source); - Kokkos::parallel_reduce("deduplicated sorted", team_policy_t(n, 64), - deduper, gc_nedges); + Kokkos::parallel_reduce("deduplicated sorted", team_policy_t(n, 64), deduper, gc_nedges); Kokkos::deep_copy(wgt_by_source, 
wgts_out); Kokkos::deep_copy(dest_by_source, dest_out); } else { // no thread team version - functorDedupeAfterSort deduper(source_bucket_offset, dest_by_source, - dest_by_source, wgt_by_source, + functorDedupeAfterSort deduper(source_bucket_offset, dest_by_source, dest_by_source, wgt_by_source, wgt_by_source, edges_per_source); - Kokkos::parallel_reduce("deduplicated sorted", policy_t(0, n), deduper, - gc_nedges); + Kokkos::parallel_reduce("deduplicated sorted", policy_t(0, n), deduper, gc_nedges); } } else if (handle.b == Hybrid) { @@ -1179,23 +1059,19 @@ class coarse_builder { ordinal_t limit = 128; // sort the (implicit) crs matrix, but only the low degree rows ordinal_t remaining_count = - KokkosSparse::sort_low_degree_rows_crs_matrix( + KokkosSparse::sort_low_degree_rows_crs_matrix( source_bucket_offset, dest_by_source, wgt_by_source, limit); // combine adjacent entries that are equal { // no thread team version - functorDedupeLowDegreeAfterSort deduper( - source_bucket_offset, dest_by_source, dest_by_source, wgt_by_source, - wgt_out, edges_per_source, limit); - Kokkos::parallel_reduce("deduplicated sorted", policy_t(0, n), deduper, - gc_nedges); + functorDedupeLowDegreeAfterSort deduper(source_bucket_offset, dest_by_source, dest_by_source, wgt_by_source, + wgt_out, edges_per_source, limit); + Kokkos::parallel_reduce("deduplicated sorted", policy_t(0, n), deduper, gc_nedges); } vtx_view_t remaining("remaining vtx", remaining_count); Kokkos::parallel_scan( "move remaining vertices", policy_t(0, n), - KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, - const bool final) { + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, const bool final) { if (edges_per_source(i) > limit) { if (final) { remaining(update) = i; @@ -1209,34 +1085,28 @@ class coarse_builder { while (remaining_count > 0) { // determine size for hashmap ordinal_t hash_size, max_entries, mem_chunk_size, mem_chunk_count; - getHashmapSizeAndCount(handle, n, remaining_count, remaining, - 
edges_per_source, hash_size, max_entries, + getHashmapSizeAndCount(handle, n, remaining_count, remaining, edges_per_source, hash_size, max_entries, mem_chunk_size, mem_chunk_count); // Create Uniform Initialized Memory Pool - KokkosKernels::Impl::PoolType pool_type = - KokkosKernels::Impl::ManyThread2OneChunk; + KokkosKernels::Impl::PoolType pool_type = KokkosKernels::Impl::ManyThread2OneChunk; if (is_host_space) { pool_type = KokkosKernels::Impl::OneThread2OneChunk; } - uniform_memory_pool_t memory_pool(mem_chunk_count, mem_chunk_size, - ORD_MAX, pool_type); + uniform_memory_pool_t memory_pool(mem_chunk_count, mem_chunk_size, ORD_MAX, pool_type); - functorHashmapAccumulator hashmapAccumulator( - source_bucket_offset, dest_by_source, dest_by_source, wgt_by_source, - wgt_out, edges_per_source, memory_pool, hash_size, max_entries, - remaining, !scal_eq_ord); + functorHashmapAccumulator hashmapAccumulator(source_bucket_offset, dest_by_source, dest_by_source, + wgt_by_source, wgt_out, edges_per_source, memory_pool, hash_size, + max_entries, remaining, !scal_eq_ord); ordinal_t old_remaining_count = remaining_count; if (!is_host_space && max_entries >= 128) { - Kokkos::parallel_reduce("hashmap time", - dyn_team_policy_t(old_remaining_count, 1, 64), - hashmapAccumulator, remaining_count); + Kokkos::parallel_reduce("hashmap time", dyn_team_policy_t(old_remaining_count, 1, 64), hashmapAccumulator, + remaining_count); } else { - Kokkos::parallel_reduce("hashmap time", - dyn_policy_t(0, old_remaining_count), - hashmapAccumulator, remaining_count); + Kokkos::parallel_reduce("hashmap time", dyn_policy_t(0, old_remaining_count), hashmapAccumulator, + remaining_count); } if (remaining_count > 0) { @@ -1244,8 +1114,7 @@ class coarse_builder { Kokkos::parallel_scan( "move remaining vertices", policy_t(0, old_remaining_count), - KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, - const bool final) { + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, const bool final) { 
ordinal_t u = remaining(i); if (edges_per_source(u) >= max_entries) { if (final) { @@ -1260,10 +1129,7 @@ class coarse_builder { } gc_nedges = 0; Kokkos::parallel_reduce( - policy_t(0, n), - KOKKOS_LAMBDA(const ordinal_t i, edge_offset_t& sum) { - sum += edges_per_source(i); - }, + policy_t(0, n), KOKKOS_LAMBDA(const ordinal_t i, edge_offset_t& sum) { sum += edges_per_source(i); }, gc_nedges); if (!scal_eq_ord && !is_host_space) { Kokkos::deep_copy(wgt_by_source, wgt_out); @@ -1279,10 +1145,8 @@ class coarse_builder { wgt_view_t wgts_out; ordinal_t workLength; - translationFunctor(matrix_t _vcmap, matrix_t _g, vtx_view_t _mapped_edges, - vtx_view_t _edges_per_source, - edge_view_t _source_bucket_offset, vtx_view_t _edges_out, - wgt_view_t _wgts_out) + translationFunctor(matrix_t _vcmap, matrix_t _g, vtx_view_t _mapped_edges, vtx_view_t _edges_per_source, + edge_view_t _source_bucket_offset, vtx_view_t _edges_out, wgt_view_t _wgts_out) : vcmap(_vcmap), g(_g), mapped_edges(_mapped_edges), @@ -1299,20 +1163,18 @@ class coarse_builder { ordinal_t u = vcmap.graph.entries(i); edge_offset_t start = g.graph.row_map(i); edge_offset_t end = g.graph.row_map(i + 1); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, start, end), - [&](const edge_offset_t idx) { - ordinal_t v = mapped_edges(idx); - if (u != v) { - // fix this, inefficient - edge_offset_t offset = Kokkos::atomic_fetch_add( - &edges_per_source(u), 1); - - offset += source_bucket_offset(u); - - edges_out(offset) = v; - wgts_out(offset) = g.values(idx); - } - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, start, end), [&](const edge_offset_t idx) { + ordinal_t v = mapped_edges(idx); + if (u != v) { + // fix this, inefficient + edge_offset_t offset = Kokkos::atomic_fetch_add(&edges_per_source(u), 1); + + offset += source_bucket_offset(u); + + edges_out(offset) = v; + wgts_out(offset) = g.values(idx); + } + }); } KOKKOS_INLINE_FUNCTION @@ -1324,8 +1186,7 @@ class coarse_builder { ordinal_t v = 
mapped_edges(idx); if (u != v) { // fix this - edge_offset_t offset = - Kokkos::atomic_fetch_add(&edges_per_source(u), 1); + edge_offset_t offset = Kokkos::atomic_fetch_add(&edges_per_source(u), 1); offset += source_bucket_offset(u); @@ -1337,18 +1198,14 @@ class coarse_builder { }; // optimized for regular distribution low degree rows - static coarse_level_triple build_nonskew(coarsen_handle& handle, - const matrix_t g, - const matrix_t vcmap, - vtx_view_t mapped_edges, - vtx_view_t edges_per_source) { + static coarse_level_triple build_nonskew(coarsen_handle& handle, const matrix_t g, const matrix_t vcmap, + vtx_view_t mapped_edges, vtx_view_t edges_per_source) { ordinal_t n = g.numRows(); ordinal_t nc = vcmap.numCols(); edge_view_t source_bucket_offset("source_bucket_offsets", nc + 1); edge_offset_t gc_nedges = 0; - Kokkos::parallel_scan("calc source offsets", policy_t(0, nc), - prefix_sum(edges_per_source, source_bucket_offset)); + Kokkos::parallel_scan("calc source offsets", policy_t(0, nc), prefix_sum(edges_per_source, source_bucket_offset)); Kokkos::deep_copy(edges_per_source, static_cast(0)); @@ -1360,39 +1217,30 @@ class coarse_builder { wgt_view_t wgt_by_source("wgt_by_source", nnz_pre_dedupe); // translates fine entries into coarse entries and writes into coarse rows - translationFunctor translateF(vcmap, g, mapped_edges, edges_per_source, - source_bucket_offset, dest_by_source, + translationFunctor translateF(vcmap, g, mapped_edges, edges_per_source, source_bucket_offset, dest_by_source, wgt_by_source); if (is_host_space) { - bool use_dyn = - should_use_dyn(n, g.graph.row_map, exec_space().concurrency()); + bool use_dyn = should_use_dyn(n, g.graph.row_map, exec_space().concurrency()); if (use_dyn) { - Kokkos::parallel_for("move edges to coarse matrix", dyn_policy_t(0, n), - translateF); + Kokkos::parallel_for("move edges to coarse matrix", dyn_policy_t(0, n), translateF); } else { - Kokkos::parallel_for("move edges to coarse matrix", policy_t(0, n), - 
translateF); + Kokkos::parallel_for("move edges to coarse matrix", policy_t(0, n), translateF); } } else { - auto execSpaceEnum = - KokkosKernels::Impl::kk_get_exec_space_type(); - int vectorLength = KokkosKernels::Impl::kk_get_suggested_vector_size( - n, g.nnz(), execSpaceEnum); + auto execSpaceEnum = KokkosKernels::Impl::kk_get_exec_space_type(); + int vectorLength = KokkosKernels::Impl::kk_get_suggested_vector_size(n, g.nnz(), execSpaceEnum); team_policy_t dummy(1, 1, vectorLength); int teamSize = dummy.team_size_max(translateF, Kokkos::ParallelForTag()); - Kokkos::parallel_for( - "move edges to coarse matrix", - team_policy_t((n + teamSize - 1) / teamSize, teamSize, vectorLength), - translateF); + Kokkos::parallel_for("move edges to coarse matrix", + team_policy_t((n + teamSize - 1) / teamSize, teamSize, vectorLength), translateF); } - deduplicate_graph(handle, nc, false, edges_per_source, dest_by_source, - wgt_by_source, source_bucket_offset, gc_nedges); + deduplicate_graph(handle, nc, false, edges_per_source, dest_by_source, wgt_by_source, source_bucket_offset, + gc_nedges); edge_view_t source_offsets("source_offsets", nc + 1); - Kokkos::parallel_scan("calc source offsets again", policy_t(0, nc), - prefix_sum(edges_per_source, source_offsets)); + Kokkos::parallel_scan("calc source offsets again", policy_t(0, nc), prefix_sum(edges_per_source, source_offsets)); edge_subview_t edge_total_subview = Kokkos::subview(source_offsets, nc); Kokkos::deep_copy(gc_nedges, edge_total_subview); @@ -1401,12 +1249,10 @@ class coarse_builder { wgt_view_t wgts("wgts", gc_nedges); if (is_host_space) { - bool use_dyn = - should_use_dyn(nc, source_offsets, exec_space().concurrency()); + bool use_dyn = should_use_dyn(nc, source_offsets, exec_space().concurrency()); if (use_dyn) { Kokkos::parallel_for( - "move deduped edges to new coarse matrix", dyn_policy_t(0, nc), - KOKKOS_LAMBDA(const ordinal_t& u) { + "move deduped edges to new coarse matrix", dyn_policy_t(0, nc), 
KOKKOS_LAMBDA(const ordinal_t& u) { edge_offset_t start_origin = source_bucket_offset(u); edge_offset_t start_dest = source_offsets(u); for (ordinal_t idx = 0; idx < edges_per_source(u); idx++) { @@ -1416,8 +1262,7 @@ class coarse_builder { }); } else { Kokkos::parallel_for( - "move deduped edges to new coarse matrix", policy_t(0, nc), - KOKKOS_LAMBDA(const ordinal_t& u) { + "move deduped edges to new coarse matrix", policy_t(0, nc), KOKKOS_LAMBDA(const ordinal_t& u) { edge_offset_t start_origin = source_bucket_offset(u); edge_offset_t start_dest = source_offsets(u); for (ordinal_t idx = 0; idx < edges_per_source(u); idx++) { @@ -1428,18 +1273,15 @@ class coarse_builder { } } else { Kokkos::parallel_for( - "move deduped edges to new coarse matrix", - team_policy_t(nc, Kokkos::AUTO), KOKKOS_LAMBDA(const member& thread) { + "move deduped edges to new coarse matrix", team_policy_t(nc, Kokkos::AUTO), + KOKKOS_LAMBDA(const member& thread) { ordinal_t u = thread.league_rank(); edge_offset_t start_origin = source_bucket_offset(u); edge_offset_t start_dest = source_offsets(u); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(thread, edges_per_source(u)), - [=](const ordinal_t idx) { - dest_idx(start_dest + idx) = - dest_by_source(start_origin + idx); - wgts(start_dest + idx) = wgt_by_source(start_origin + idx); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, edges_per_source(u)), [=](const ordinal_t idx) { + dest_idx(start_dest + idx) = dest_by_source(start_origin + idx); + wgts(start_dest + idx) = wgt_by_source(start_origin + idx); + }); }); } @@ -1452,37 +1294,33 @@ class coarse_builder { } // forms the explicit matrix created by symmetrizing the implicit matrix - static matrix_t collapse_directed_to_undirected( - const ordinal_t nc, const vtx_view_t source_edge_counts, - const edge_view_t source_row_map, const vtx_view_t source_destinations, - const wgt_view_t source_wgts) { + static matrix_t collapse_directed_to_undirected(const ordinal_t nc, const 
vtx_view_t source_edge_counts, + const edge_view_t source_row_map, + const vtx_view_t source_destinations, const wgt_view_t source_wgts) { vtx_view_t coarse_degree("coarse degree", nc); Kokkos::deep_copy(coarse_degree, source_edge_counts); Kokkos::parallel_for( - "count directed edges owned by opposite endpoint", - team_policy_t(nc, Kokkos::AUTO), KOKKOS_LAMBDA(const member& thread) { + "count directed edges owned by opposite endpoint", team_policy_t(nc, Kokkos::AUTO), + KOKKOS_LAMBDA(const member& thread) { ordinal_t u = thread.league_rank(); edge_offset_t start = source_row_map(u); edge_offset_t end = start + source_edge_counts(u); - Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, start, end), - [=](const edge_offset_t idx) { - ordinal_t v = source_destinations(idx); - // increment other vertex - Kokkos::atomic_fetch_add(&coarse_degree(v), 1); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, start, end), [=](const edge_offset_t idx) { + ordinal_t v = source_destinations(idx); + // increment other vertex + Kokkos::atomic_fetch_add(&coarse_degree(v), 1); + }); }); edge_view_t target_row_map("target row map", nc + 1); - Kokkos::parallel_scan("calc target row map", policy_t(0, nc), - prefix_sum(coarse_degree, target_row_map)); + Kokkos::parallel_scan("calc target row map", policy_t(0, nc), prefix_sum(coarse_degree, target_row_map)); Kokkos::deep_copy(coarse_degree, static_cast(0)); - edge_offset_t coarse_edges_total = 0; - edge_subview_t coarse_edge_total_subview = - Kokkos::subview(target_row_map, nc); + edge_offset_t coarse_edges_total = 0; + edge_subview_t coarse_edge_total_subview = Kokkos::subview(target_row_map, nc); Kokkos::deep_copy(coarse_edges_total, coarse_edge_total_subview); vtx_view_t dest_idx("dest_idx", coarse_edges_total); @@ -1490,9 +1328,8 @@ class coarse_builder { Kokkos::parallel_for( "move edges into correct size matrix", team_policy_t(nc, Kokkos::AUTO), - functorCollapseDirectedToUndirected( - source_row_map, target_row_map, 
source_edge_counts, coarse_degree, - source_destinations, dest_idx, source_wgts, wgts)); + functorCollapseDirectedToUndirected(source_row_map, target_row_map, source_edge_counts, coarse_degree, + source_destinations, dest_idx, source_wgts, wgts)); graph_type gc_graph(dest_idx, target_row_map); matrix_t gc("gc", nc, wgts, gc_graph); @@ -1500,10 +1337,8 @@ class coarse_builder { } // optimized for skewed degree distributions - static coarse_level_triple build_skew(coarsen_handle& handle, - const matrix_t g, const matrix_t vcmap, - vtx_view_t mapped_edges, - vtx_view_t degree_initial) { + static coarse_level_triple build_skew(coarsen_handle& handle, const matrix_t g, const matrix_t vcmap, + vtx_view_t mapped_edges, vtx_view_t degree_initial) { ordinal_t n = g.numRows(); ordinal_t nc = vcmap.numCols(); edge_offset_t gc_nedges = 0; @@ -1513,8 +1348,7 @@ class coarse_builder { // recount with edges only belonging to coarse vertex of smaller degree // matrix becomes directed Kokkos::parallel_for( - "recount edges", team_policy_t(n, Kokkos::AUTO), - KOKKOS_LAMBDA(const member& thread) { + "recount edges", team_policy_t(n, Kokkos::AUTO), KOKKOS_LAMBDA(const member& thread) { ordinal_t outer_idx = thread.league_rank(); ordinal_t u = vcmap.graph.entries(outer_idx); edge_offset_t start = g.graph.row_map(outer_idx); @@ -1531,15 +1365,13 @@ class coarse_builder { } }, nonLoopEdgesTotal); - Kokkos::single(Kokkos::PerTeam(thread), [=]() { - Kokkos::atomic_add(&edges_per_source(u), nonLoopEdgesTotal); - }); + Kokkos::single(Kokkos::PerTeam(thread), + [=]() { Kokkos::atomic_add(&edges_per_source(u), nonLoopEdgesTotal); }); }); edge_view_t source_bucket_offset("source_bucket_offsets", nc + 1); - Kokkos::parallel_scan("calc source offsets", policy_t(0, nc), - prefix_sum(edges_per_source, source_bucket_offset)); + Kokkos::parallel_scan("calc source offsets", policy_t(0, nc), prefix_sum(edges_per_source, source_bucket_offset)); edge_subview_t sbo_subview = 
Kokkos::subview(source_bucket_offset, nc); edge_offset_t nnz_pre_dedupe = 0; Kokkos::deep_copy(nnz_pre_dedupe, sbo_subview); @@ -1548,38 +1380,33 @@ class coarse_builder { vtx_view_t dest_by_source("dest by source", nnz_pre_dedupe); wgt_view_t wgt_by_source("wgt by source", nnz_pre_dedupe); Kokkos::parallel_for( - "combine fine rows", team_policy_t(n, Kokkos::AUTO), - KOKKOS_LAMBDA(const member& thread) { + "combine fine rows", team_policy_t(n, Kokkos::AUTO), KOKKOS_LAMBDA(const member& thread) { ordinal_t outer_idx = thread.league_rank(); ordinal_t u = vcmap.graph.entries(outer_idx); edge_offset_t start = g.graph.row_map(outer_idx); edge_offset_t end = g.graph.row_map(outer_idx + 1); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(thread, start, end), - [=](const edge_offset_t idx) { - ordinal_t v = mapped_edges(idx); - bool degree_less = degree_initial(u) < degree_initial(v); - bool degree_equal = degree_initial(u) == degree_initial(v); - if (degree_less || (degree_equal && u < v)) { - edge_offset_t offset = - Kokkos::atomic_fetch_add(&edges_per_source(u), 1); + Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, start, end), [=](const edge_offset_t idx) { + ordinal_t v = mapped_edges(idx); + bool degree_less = degree_initial(u) < degree_initial(v); + bool degree_equal = degree_initial(u) == degree_initial(v); + if (degree_less || (degree_equal && u < v)) { + edge_offset_t offset = Kokkos::atomic_fetch_add(&edges_per_source(u), 1); - offset += source_bucket_offset(u); + offset += source_bucket_offset(u); - dest_by_source(offset) = v; - wgt_by_source(offset) = g.values(idx); - } - }); + dest_by_source(offset) = v; + wgt_by_source(offset) = g.values(idx); + } + }); }); gc_nedges = 0; - deduplicate_graph(handle, nc, true, edges_per_source, dest_by_source, - wgt_by_source, source_bucket_offset, gc_nedges); + deduplicate_graph(handle, nc, true, edges_per_source, dest_by_source, wgt_by_source, source_bucket_offset, + gc_nedges); // form the final coarse graph, which 
requires symmetrizing the matrix - matrix_t gc = collapse_directed_to_undirected( - nc, edges_per_source, source_bucket_offset, dest_by_source, - wgt_by_source); + matrix_t gc = + collapse_directed_to_undirected(nc, edges_per_source, source_bucket_offset, dest_by_source, wgt_by_source); coarse_level_triple next_level; next_level.mtx = gc; @@ -1591,11 +1418,8 @@ class coarse_builder { // deduplicates within each fine row // combines fine rows into coarse rows // deduplicates within each coarse row - static coarse_level_triple build_high_duplicity(coarsen_handle& handle, - const matrix_t g, - const matrix_t vcmap, - vtx_view_t mapped_edges, - vtx_view_t degree_initial) { + static coarse_level_triple build_high_duplicity(coarsen_handle& handle, const matrix_t g, const matrix_t vcmap, + vtx_view_t mapped_edges, vtx_view_t degree_initial) { ordinal_t n = g.numRows(); ordinal_t nc = vcmap.numCols(); edge_offset_t gc_nedges = 0; @@ -1606,8 +1430,7 @@ class coarse_builder { // recount fine row sizes with edges only belonging to fine vertex of coarse // vertex of smaller degree matrix becomes directed Kokkos::parallel_for( - "recount edges", team_policy_t(n, Kokkos::AUTO), - KOKKOS_LAMBDA(const member& thread) { + "recount edges", team_policy_t(n, Kokkos::AUTO), KOKKOS_LAMBDA(const member& thread) { ordinal_t outer_idx = thread.league_rank(); ordinal_t u = vcmap.graph.entries(outer_idx); edge_offset_t start = g.graph.row_map(outer_idx); @@ -1624,13 +1447,10 @@ class coarse_builder { } }, nonLoopEdgesTotal); - Kokkos::single(Kokkos::PerTeam(thread), [=]() { - dedupe_count(outer_idx) = nonLoopEdgesTotal; - }); + Kokkos::single(Kokkos::PerTeam(thread), [=]() { dedupe_count(outer_idx) = nonLoopEdgesTotal; }); }); - Kokkos::parallel_scan("calc source offsets", policy_t(0, n), - prefix_sum(dedupe_count, row_map_copy)); + Kokkos::parallel_scan("calc source offsets", policy_t(0, n), prefix_sum(dedupe_count, row_map_copy)); // reset counters to 0 Kokkos::deep_copy(dedupe_count, 
static_cast(0)); @@ -1643,35 +1463,30 @@ class coarse_builder { // create a new directed version of the fine matrix Kokkos::parallel_for( - "move edges to new matrix", team_policy_t(n, Kokkos::AUTO), - KOKKOS_LAMBDA(const member& thread) { + "move edges to new matrix", team_policy_t(n, Kokkos::AUTO), KOKKOS_LAMBDA(const member& thread) { ordinal_t outer_idx = thread.league_rank(); ordinal_t u = vcmap.graph.entries(outer_idx); edge_offset_t start = g.graph.row_map(outer_idx); edge_offset_t end = g.graph.row_map(outer_idx + 1); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(thread, start, end), - [=](const edge_offset_t idx) { - ordinal_t v = mapped_edges(idx); - bool degree_less = degree_initial(u) < degree_initial(v); - bool degree_equal = degree_initial(u) == degree_initial(v); - if (u != v && (degree_less || (degree_equal && u < v))) { - edge_offset_t offset = - Kokkos::atomic_fetch_add(&dedupe_count(outer_idx), 1); + Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, start, end), [=](const edge_offset_t idx) { + ordinal_t v = mapped_edges(idx); + bool degree_less = degree_initial(u) < degree_initial(v); + bool degree_equal = degree_initial(u) == degree_initial(v); + if (u != v && (degree_less || (degree_equal && u < v))) { + edge_offset_t offset = Kokkos::atomic_fetch_add(&dedupe_count(outer_idx), 1); - offset += row_map_copy(outer_idx); + offset += row_map_copy(outer_idx); - dest_fine(offset) = v; - wgt_fine(offset) = g.values(idx); - } - }); + dest_fine(offset) = v; + wgt_fine(offset) = g.values(idx); + } + }); }); //"delete" these views Kokkos::resize(mapped_edges, 0); // deduplicate coarse adjacencies within each fine row - deduplicate_graph(handle, n, true, dedupe_count, dest_fine, wgt_fine, - row_map_copy, gc_nedges); + deduplicate_graph(handle, n, true, dedupe_count, dest_fine, wgt_fine, row_map_copy, gc_nedges); edge_view_t source_bucket_offset("source_bucket_offsets", nc + 1); vtx_view_t edges_per_source("edges_per_source", nc); @@ -1681,46 
+1496,40 @@ class coarse_builder { ordinal_t u = vcmap.graph.entries(i); Kokkos::atomic_fetch_add(&edges_per_source(u), dedupe_count(i)); }); - Kokkos::parallel_scan("calc source offsets", policy_t(0, nc), - prefix_sum(edges_per_source, source_bucket_offset)); + Kokkos::parallel_scan("calc source offsets", policy_t(0, nc), prefix_sum(edges_per_source, source_bucket_offset)); Kokkos::deep_copy(edges_per_source, static_cast(0)); vtx_view_t dest_by_source("dest by source", gc_nedges); wgt_view_t wgt_by_source("wgt by source", gc_nedges); Kokkos::parallel_for( - "combine deduped fine rows", team_policy_t(n, Kokkos::AUTO), - KOKKOS_LAMBDA(const member& thread) { + "combine deduped fine rows", team_policy_t(n, Kokkos::AUTO), KOKKOS_LAMBDA(const member& thread) { ordinal_t outer_idx = thread.league_rank(); ordinal_t u = vcmap.graph.entries(outer_idx); edge_offset_t start = row_map_copy(outer_idx); edge_offset_t end = start + dedupe_count(outer_idx); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(thread, start, end), - [=](const edge_offset_t idx) { - ordinal_t v = dest_fine(idx); - bool degree_less = degree_initial(u) < degree_initial(v); - bool degree_equal = degree_initial(u) == degree_initial(v); - if (degree_less || (degree_equal && u < v)) { - edge_offset_t offset = - Kokkos::atomic_fetch_add(&edges_per_source(u), 1); + Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, start, end), [=](const edge_offset_t idx) { + ordinal_t v = dest_fine(idx); + bool degree_less = degree_initial(u) < degree_initial(v); + bool degree_equal = degree_initial(u) == degree_initial(v); + if (degree_less || (degree_equal && u < v)) { + edge_offset_t offset = Kokkos::atomic_fetch_add(&edges_per_source(u), 1); - offset += source_bucket_offset(u); + offset += source_bucket_offset(u); - dest_by_source(offset) = v; - wgt_by_source(offset) = wgt_fine(idx); - } - }); + dest_by_source(offset) = v; + wgt_by_source(offset) = wgt_fine(idx); + } + }); }); gc_nedges = 0; Kokkos::resize(dest_fine, 
0); Kokkos::resize(wgt_fine, 0); - deduplicate_graph(handle, nc, true, edges_per_source, dest_by_source, - wgt_by_source, source_bucket_offset, gc_nedges); + deduplicate_graph(handle, nc, true, edges_per_source, dest_by_source, wgt_by_source, source_bucket_offset, + gc_nedges); // form the final coarse graph, which requires symmetrizing the matrix - matrix_t gc = collapse_directed_to_undirected( - nc, edges_per_source, source_bucket_offset, dest_by_source, - wgt_by_source); + matrix_t gc = + collapse_directed_to_undirected(nc, edges_per_source, source_bucket_offset, dest_by_source, wgt_by_source); coarse_level_triple next_level; next_level.mtx = gc; @@ -1735,9 +1544,8 @@ class coarse_builder { vtx_view_t c_vtx_w, f_vtx_w; ordinal_t workLength; - countingFunctor(matrix_t _vcmap, matrix_t _g, vtx_view_t _mapped_edges, - vtx_view_t _degree_initial, vtx_view_t _c_vtx_w, - vtx_view_t _f_vtx_w) + countingFunctor(matrix_t _vcmap, matrix_t _g, vtx_view_t _mapped_edges, vtx_view_t _degree_initial, + vtx_view_t _c_vtx_w, vtx_view_t _f_vtx_w) : vcmap(_vcmap), g(_g), mapped_edges(_mapped_edges), @@ -1788,8 +1596,7 @@ class coarse_builder { } }; - static coarse_level_triple build_coarse_graph(coarsen_handle& handle, - const coarse_level_triple level, + static coarse_level_triple build_coarse_graph(coarsen_handle& handle, const coarse_level_triple level, const matrix_t vcmap) { if (handle.b == Spgemm || handle.b == Spgemm_transpose_first) { return build_coarse_graph_spgemm(handle, level, vcmap); @@ -1807,24 +1614,18 @@ class coarse_builder { // count non-self loop edges per coarse vertex // also computes coarse vertex weights - countingFunctor countF(vcmap, g, mapped_edges, degree_initial, c_vtx_w, - f_vtx_w); + countingFunctor countF(vcmap, g, mapped_edges, degree_initial, c_vtx_w, f_vtx_w); if (is_host_space) { - Kokkos::parallel_for( - "count edges per coarse vertex (also compute coarse vertex weights)", - policy_t(0, n), countF); + Kokkos::parallel_for("count edges per 
coarse vertex (also compute coarse vertex weights)", policy_t(0, n), + countF); } else { - auto execSpaceEnum = - KokkosKernels::Impl::kk_get_exec_space_type(); - int vectorLength = KokkosKernels::Impl::kk_get_suggested_vector_size( - n, g.nnz(), execSpaceEnum); + auto execSpaceEnum = KokkosKernels::Impl::kk_get_exec_space_type(); + int vectorLength = KokkosKernels::Impl::kk_get_suggested_vector_size(n, g.nnz(), execSpaceEnum); team_policy_t dummy(1, 1, vectorLength); int teamSize = dummy.team_size_max(countF, Kokkos::ParallelForTag()); // count edges per vertex - Kokkos::parallel_for( - "count edges per coarse vertex (also compute coarse vertex weights)", - team_policy_t((n + teamSize - 1) / teamSize, teamSize, vectorLength), - countF); + Kokkos::parallel_for("count edges per coarse vertex (also compute coarse vertex weights)", + team_policy_t((n + teamSize - 1) / teamSize, teamSize, vectorLength), countF); } // compute max row size and avg row size @@ -1842,10 +1643,7 @@ class coarse_builder { Kokkos::Max(max_unduped)); Kokkos::parallel_reduce( "find total", policy_t(0, nc), - KOKKOS_LAMBDA(const ordinal_t i, edge_offset_t& sum) { - sum += degree_initial(i); - }, - total_unduped); + KOKKOS_LAMBDA(const ordinal_t i, edge_offset_t& sum) { sum += degree_initial(i); }, total_unduped); ordinal_t avg_unduped = total_unduped / nc; coarse_level_triple next_level; @@ -1853,14 +1651,11 @@ class coarse_builder { // adjacency rows don't do optimizations if running on CPU (the default host // space) if (avg_unduped > (nc / 4) && !is_host_space) { - next_level = - build_high_duplicity(handle, g, vcmap, mapped_edges, degree_initial); - } else if (avg_unduped > 50 && (max_unduped / 10) > avg_unduped && - !is_host_space) { + next_level = build_high_duplicity(handle, g, vcmap, mapped_edges, degree_initial); + } else if (avg_unduped > 50 && (max_unduped / 10) > avg_unduped && !is_host_space) { next_level = build_skew(handle, g, vcmap, mapped_edges, degree_initial); } else { - 
next_level = - build_nonskew(handle, g, vcmap, mapped_edges, degree_initial); + next_level = build_nonskew(handle, g, vcmap, mapped_edges, degree_initial); } next_level.vtx_wgts = c_vtx_w; @@ -1870,9 +1665,7 @@ class coarse_builder { return next_level; } - static matrix_t generate_coarse_mapping(coarsen_handle& handle, - const matrix_t g, - bool uniform_weights) { + static matrix_t generate_coarse_mapping(coarsen_handle& handle, const matrix_t g, bool uniform_weights) { matrix_t interpolation_graph; int choice = 0; @@ -1883,14 +1676,9 @@ class coarse_builder { } switch (handle.h) { - case HECv1: - interpolation_graph = mapper_t::coarsen_HEC(g, uniform_weights); - break; + case HECv1: interpolation_graph = mapper_t::coarsen_HEC(g, uniform_weights); break; case Match: - case MtMetis: - interpolation_graph = - mapper_t::coarsen_match(g, uniform_weights, choice); - break; + case MtMetis: interpolation_graph = mapper_t::coarsen_match(g, uniform_weights, choice); break; case MIS2: interpolation_graph = mapper_t::coarsen_mis_2(g); break; case GOSHv2: interpolation_graph = mapper_t::coarsen_GOSH_v2(g); break; case GOSHv1: interpolation_graph = mapper_t::coarsen_GOSH(g); break; @@ -1902,9 +1690,7 @@ class coarse_builder { // this function can't return the generated list directly because of an NVCC // compiler bug caller must use the get_levels() method after calling this // function - static void generate_coarse_graphs(coarsen_handle& handle, - const matrix_t fine_g, - bool uniform_weights = false) { + static void generate_coarse_graphs(coarsen_handle& handle, const matrix_t fine_g, bool uniform_weights = false) { ordinal_t fine_n = fine_g.numRows(); std::list& levels = handle.results; levels.clear(); @@ -1920,15 +1706,13 @@ class coarse_builder { while (levels.rbegin()->mtx.numRows() > handle.coarse_vtx_cutoff) { coarse_level_triple current_level = *levels.rbegin(); - matrix_t interp_graph = generate_coarse_mapping( - handle, current_level.mtx, 
current_level.uniform_weights); + matrix_t interp_graph = generate_coarse_mapping(handle, current_level.mtx, current_level.uniform_weights); if (interp_graph.numCols() < handle.min_allowed_vtx) { break; } - coarse_level_triple next_level = - build_coarse_graph(handle, current_level, interp_graph); + coarse_level_triple next_level = build_coarse_graph(handle, current_level, interp_graph); levels.push_back(next_level); diff --git a/graph/src/KokkosGraph_CoarsenHeuristics.hpp b/graph/src/KokkosGraph_CoarsenHeuristics.hpp index 1694905167..f136882d89 100644 --- a/graph/src/KokkosGraph_CoarsenHeuristics.hpp +++ b/graph/src/KokkosGraph_CoarsenHeuristics.hpp @@ -74,8 +74,7 @@ class coarsen_heuristics { int t_buckets = 2 * n; vtx_view_t buckets("buckets", t_buckets); Kokkos::parallel_for( - "init buckets", policy_t(0, t_buckets), - KOKKOS_LAMBDA(ordinal_t i) { buckets(i) = ORD_MAX; }); + "init buckets", policy_t(0, t_buckets), KOKKOS_LAMBDA(ordinal_t i) { buckets(i) = ORD_MAX; }); uint64_t max = std::numeric_limits::max(); uint64_t bucket_size = max / t_buckets; @@ -87,8 +86,7 @@ class coarsen_heuristics { if (bucket >= t_buckets) bucket -= t_buckets; if (buckets(bucket) == ORD_MAX) { // attempt to insert into bucket - if (Kokkos::atomic_compare_exchange_strong(&buckets(bucket), - ORD_MAX, i)) { + if (Kokkos::atomic_compare_exchange_strong(&buckets(bucket), ORD_MAX, i)) { break; } } @@ -113,9 +111,9 @@ class coarsen_heuristics { // create a mapping when some vertices are already mapped // hn is a list of vertices such that vertex i wants to aggregate with vertex // hn(i) - static ordinal_t parallel_map_construct_prefilled( - vtx_view_t vcmap, const ordinal_t n, const vtx_view_t vperm, - const vtx_view_t hn, Kokkos::View nvertices_coarse) { + static ordinal_t parallel_map_construct_prefilled(vtx_view_t vcmap, const ordinal_t n, const vtx_view_t vperm, + const vtx_view_t hn, + Kokkos::View nvertices_coarse) { vtx_view_t match("match", n); Kokkos::parallel_for( policy_t(0, 
n), KOKKOS_LAMBDA(ordinal_t i) { @@ -142,14 +140,11 @@ class coarsen_heuristics { // need to enforce an ordering condition to allow hard-stall // conditions to be broken if (condition ^ swap) { - if (Kokkos::atomic_compare_exchange_strong(&match(u), ORD_MAX, - v)) { - if (u == v || Kokkos::atomic_compare_exchange_strong( - &match(v), ORD_MAX, u)) { - ordinal_t cv = - Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); - vcmap(u) = cv; - vcmap(v) = cv; + if (Kokkos::atomic_compare_exchange_strong(&match(u), ORD_MAX, v)) { + if (u == v || Kokkos::atomic_compare_exchange_strong(&match(v), ORD_MAX, u)) { + ordinal_t cv = Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); + vcmap(u) = cv; + vcmap(v) = cv; } else { if (vcmap(v) != ORD_MAX) { vcmap(u) = vcmap(v); @@ -183,10 +178,8 @@ class coarsen_heuristics { // hn is a list of vertices such that vertex i wants to aggregate with vertex // hn(i) - static ordinal_t parallel_map_construct(vtx_view_t vcmap, const ordinal_t n, - const vtx_view_t vperm, - const vtx_view_t hn, - const vtx_view_t ordering) { + static ordinal_t parallel_map_construct(vtx_view_t vcmap, const ordinal_t n, const vtx_view_t vperm, + const vtx_view_t hn, const vtx_view_t ordering) { vtx_view_t match("match", n); Kokkos::parallel_for( policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { match(i) = ORD_MAX; }); @@ -208,10 +201,8 @@ class coarsen_heuristics { // need to enforce an ordering condition to allow hard-stall // conditions to be broken if (condition ^ swap) { - if (Kokkos::atomic_compare_exchange_strong(&match(u), ORD_MAX, - v)) { - if (u == v || Kokkos::atomic_compare_exchange_strong( - &match(v), ORD_MAX, u)) { + if (Kokkos::atomic_compare_exchange_strong(&match(u), ORD_MAX, v)) { + if (u == v || Kokkos::atomic_compare_exchange_strong(&match(v), ORD_MAX, u)) { ordinal_t cv = u; if (v < u) { cv = v; @@ -232,9 +223,7 @@ class coarsen_heuristics { // add the ones that failed to be reprocessed next round // maybe count these then create next_perm to save 
memory? Kokkos::parallel_scan( - policy_t(0, perm_length), - KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, - const bool final) { + policy_t(0, perm_length), KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, const bool final) { ordinal_t u = curr_perm(i); if (vcmap(u) == ORD_MAX) { if (final) { @@ -252,8 +241,7 @@ class coarsen_heuristics { curr_perm = next_perm; } Kokkos::parallel_scan( - "assign aggregates", policy_t(0, n), - KOKKOS_LAMBDA(const ordinal_t u, ordinal_t& update, const bool final) { + "assign aggregates", policy_t(0, n), KOKKOS_LAMBDA(const ordinal_t u, ordinal_t& update, const bool final) { if (vcmap(u) == u) { if (final) { vcmap(u) = update; @@ -325,8 +313,7 @@ class coarsen_heuristics { edge_offset_t max_degree = tuple_degree(u); ordinal_t max_idx = tuple_idx(u); - for (edge_offset_t j = g.graph.row_map(u); - j < g.graph.row_map(u + 1); j++) { + for (edge_offset_t j = g.graph.row_map(u); j < g.graph.row_map(u + 1); j++) { ordinal_t v = g.graph.entries(j); bool is_max = false; if (tuple_state(v) > max_state) { @@ -375,8 +362,7 @@ class coarsen_heuristics { } // check if at least one of neighbors are in the IS or will be // placed into the IS - else if (tuple_state(u) == 1 || - tuple_idx(tuple_idx(u)) == tuple_idx(u)) { + else if (tuple_state(u) == 1 || tuple_idx(tuple_idx(u)) == tuple_idx(u)) { state(u) = -1; } } @@ -389,8 +375,7 @@ class coarsen_heuristics { vtx_view_t next_unassigned("next unassigned", next_unassigned_total); Kokkos::parallel_scan( "create next unassigned", policy_t(0, unassigned_total), - KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, - const bool final) { + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, const bool final) { ordinal_t u = unassigned(i); if (state(u) == 0) { if (final) { @@ -408,12 +393,11 @@ class coarsen_heuristics { static matrix_t coarsen_mis_2(const matrix_t& g) { ordinal_t n = g.numRows(); - typename matrix_t::staticcrsgraph_type::entries_type::non_const_value_type - nc = 0; - vtx_view_t 
vcmap = KokkosGraph::graph_mis2_aggregate< - Device, typename matrix_t::staticcrsgraph_type::row_map_type, - typename matrix_t::staticcrsgraph_type::entries_type, vtx_view_t>( - g.graph.row_map, g.graph.entries, nc); + typename matrix_t::staticcrsgraph_type::entries_type::non_const_value_type nc = 0; + vtx_view_t vcmap = + KokkosGraph::graph_mis2_aggregate( + g.graph.row_map, g.graph.entries, nc); edge_view_t row_map("interpolate row map", n + 1); @@ -461,11 +445,9 @@ class coarsen_heuristics { if (colors(i) != first_color) { // could use a thread team here edge_offset_t max_degree = 0; - for (edge_offset_t j = g.graph.row_map(i); - j < g.graph.row_map(i + 1); j++) { - ordinal_t v = g.graph.entries(j); - edge_offset_t degree = - g.graph.row_map(v + 1) - g.graph.row_map(v); + for (edge_offset_t j = g.graph.row_map(i); j < g.graph.row_map(i + 1); j++) { + ordinal_t v = g.graph.entries(j); + edge_offset_t degree = g.graph.row_map(v + 1) - g.graph.row_map(v); if (colors(v) == first_color && degree > max_degree) { max_degree = degree; vcmap(i) = vcmap(v); @@ -524,8 +506,7 @@ class coarsen_heuristics { if (vcmap(i) == ORD_MAX) { ordinal_t argmax = ORD_MAX; scalar_t max_w = 0; - for (edge_offset_t j = g.graph.row_map(i); - j < g.graph.row_map(i + 1); j++) { + for (edge_offset_t j = g.graph.row_map(i); j < g.graph.row_map(i + 1); j++) { ordinal_t v = g.graph.entries(j); ordinal_t wgt = g.values(j); if (vcmap(v) != ORD_MAX) { @@ -547,11 +528,9 @@ class coarsen_heuristics { if (vcmap(i) == ORD_MAX) { ordinal_t argmax = ORD_MAX; edge_offset_t max_d = 0; - for (edge_offset_t j = g.graph.row_map(i); - j < g.graph.row_map(i + 1); j++) { - ordinal_t v = g.graph.entries(j); - edge_offset_t degree = - g.graph.row_map(v + 1) - g.graph.row_map(v); + for (edge_offset_t j = g.graph.row_map(i); j < g.graph.row_map(i + 1); j++) { + ordinal_t v = g.graph.entries(j); + edge_offset_t degree = g.graph.row_map(v + 1) - g.graph.row_map(v); if (vcmap(v) != ORD_MAX) { if (degree >= max_d) { 
max_d = degree; @@ -569,8 +548,7 @@ class coarsen_heuristics { Kokkos::parallel_for( policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { if (vcmap(i) != ORD_MAX) { - for (edge_offset_t j = g.graph.row_map(i); - j < g.graph.row_map(i + 1); j++) { + for (edge_offset_t j = g.graph.row_map(i); j < g.graph.row_map(i + 1); j++) { ordinal_t v = g.graph.entries(j); if (vcmap(v) == ORD_MAX) { vcmap(v) = vcmap(i); @@ -593,8 +571,7 @@ class coarsen_heuristics { vtx_view_t remaining("remaining vtx", remaining_total); Kokkos::parallel_scan( - "count remaining", policy_t(0, n), - KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, const bool final) { + "count remaining", policy_t(0, n), KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, const bool final) { if (vcmap(i) == ORD_MAX) { if (final) { remaining(update) = i; @@ -608,8 +585,7 @@ class coarsen_heuristics { pool_t rand_pool(std::time(nullptr)); Kokkos::parallel_for( - "fill hn", policy_t(0, remaining_total), - KOKKOS_LAMBDA(ordinal_t r_idx) { + "fill hn", policy_t(0, remaining_total), KOKKOS_LAMBDA(ordinal_t r_idx) { // select heaviest neighbor with ties randomly broken ordinal_t i = remaining(r_idx); ordinal_t hn_i = ORD_MAX; @@ -639,8 +615,7 @@ class coarsen_heuristics { hn(i) = hn_i; }); - ordinal_t nc = - parallel_map_construct_prefilled(vcmap, n, remaining, hn, nvc); + ordinal_t nc = parallel_map_construct_prefilled(vcmap, n, remaining, hn, nvc); Kokkos::deep_copy(nc, nvc); edge_view_t row_map("interpolate row map", n + 1); @@ -671,8 +646,7 @@ class coarsen_heuristics { vtx_view_t vcmap("vcmap", n); Kokkos::parallel_for( - "initialize vcmap", policy_t(0, n), - KOKKOS_LAMBDA(ordinal_t i) { vcmap(i) = ORD_MAX; }); + "initialize vcmap", policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { vcmap(i) = ORD_MAX; }); pool_t rand_pool(std::time(nullptr)); @@ -680,8 +654,7 @@ class coarsen_heuristics { vtx_view_t reverse_map("reversed", n); Kokkos::parallel_for( - "construct reverse map", policy_t(0, n), - KOKKOS_LAMBDA(ordinal_t i) { 
reverse_map(vperm(i)) = i; }); + "construct reverse map", policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { reverse_map(vperm(i)) = i; }); if (uniform_weights) { // all weights equal at this level so choose heaviest edge randomly @@ -690,9 +663,8 @@ class coarsen_heuristics { gen_t generator = rand_pool.get_state(); ordinal_t adj_size = g.graph.row_map(i + 1) - g.graph.row_map(i); if (adj_size > 0) { - ordinal_t offset = - g.graph.row_map(i) + (generator.urand64() % adj_size); - hn(i) = g.graph.entries(offset); + ordinal_t offset = g.graph.row_map(i) + (generator.urand64() % adj_size); + hn(i) = g.graph.entries(offset); } else { hn(i) = generator.urand64() % n; } @@ -700,18 +672,15 @@ class coarsen_heuristics { }); } else { Kokkos::parallel_for( - "Heaviest HN", team_policy_t(n, Kokkos::AUTO), - KOKKOS_LAMBDA(const member& thread) { + "Heaviest HN", team_policy_t(n, Kokkos::AUTO), KOKKOS_LAMBDA(const member& thread) { ordinal_t i = thread.league_rank(); ordinal_t adj_size = g.graph.row_map(i + 1) - g.graph.row_map(i); if (adj_size > 0) { edge_offset_t end = g.graph.row_map(i + 1); - typename Kokkos::MaxLoc::value_type argmax{}; + typename Kokkos::MaxLoc::value_type argmax{}; Kokkos::parallel_reduce( Kokkos::TeamThreadRange(thread, g.graph.row_map(i), end), - [=](const edge_offset_t idx, - Kokkos::ValLocScalar& local) { + [=](const edge_offset_t idx, Kokkos::ValLocScalar& local) { scalar_t wgt = g.values(idx); if (wgt >= local.val) { local.val = wgt; @@ -773,10 +742,8 @@ class coarsen_heuristics { Kokkos::View hashes; ordinal_t unmapped_total; Kokkos::View nvertices_coarse; - MatchByHashSorted(vtx_view_t _vcmap, vtx_view_t _unmapped, - Kokkos::View _hashes, - ordinal_t _unmapped_total, - Kokkos::View _nvertices_coarse) + MatchByHashSorted(vtx_view_t _vcmap, vtx_view_t _unmapped, Kokkos::View _hashes, + ordinal_t _unmapped_total, Kokkos::View _nvertices_coarse) : vcmap(_vcmap), unmapped(_unmapped), hashes(_hashes), @@ -784,8 +751,7 @@ class coarsen_heuristics { 
nvertices_coarse(_nvertices_coarse) {} KOKKOS_INLINE_FUNCTION - void operator()(const ordinal_t i, ordinal_t& update, - const bool final) const { + void operator()(const ordinal_t i, ordinal_t& update, const bool final) const { ordinal_t u = unmapped(i); ordinal_t tentative = 0; if (i == 0) { @@ -823,8 +789,7 @@ class coarsen_heuristics { } }; - static matrix_t coarsen_match(const matrix_t& g, bool uniform_weights, - int match_choice) { + static matrix_t coarsen_match(const matrix_t& g, bool uniform_weights, int match_choice) { ordinal_t n = g.numRows(); vtx_view_t hn("heavies", n); @@ -832,8 +797,7 @@ class coarsen_heuristics { vtx_view_t vcmap("vcmap", n); Kokkos::parallel_for( - "initialize vcmap", policy_t(0, n), - KOKKOS_LAMBDA(ordinal_t i) { vcmap(i) = ORD_MAX; }); + "initialize vcmap", policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { vcmap(i) = ORD_MAX; }); rand_view_t randoms("randoms", n); @@ -843,8 +807,7 @@ class coarsen_heuristics { vtx_view_t reverse_map("reversed", n); Kokkos::parallel_for( - "construct reverse map", policy_t(0, n), - KOKKOS_LAMBDA(ordinal_t i) { reverse_map(vperm(i)) = i; }); + "construct reverse map", policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { reverse_map(vperm(i)) = i; }); if (uniform_weights) { // all weights equal at this level so choose heaviest edge randomly @@ -852,9 +815,8 @@ class coarsen_heuristics { "Random HN", policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { gen_t generator = rand_pool.get_state(); ordinal_t adj_size = g.graph.row_map(i + 1) - g.graph.row_map(i); - ordinal_t offset = - g.graph.row_map(i) + (generator.urand64() % adj_size); - hn(i) = g.graph.entries(offset); + ordinal_t offset = g.graph.row_map(i) + (generator.urand64() % adj_size); + hn(i) = g.graph.entries(offset); rand_pool.free_state(generator); }); } else { @@ -863,11 +825,9 @@ class coarsen_heuristics { ordinal_t hn_i = g.graph.entries(g.graph.row_map(i)); scalar_t max_ewt = g.values(g.graph.row_map(i)); - edge_offset_t end_offset = - g.graph.row_map(i + 
1); // +g.edges_per_source[i]; + edge_offset_t end_offset = g.graph.row_map(i + 1); // +g.edges_per_source[i]; - for (edge_offset_t j = g.graph.row_map(i) + 1; j < end_offset; - j++) { + for (edge_offset_t j = g.graph.row_map(i) + 1; j < end_offset; j++) { if (max_ewt < g.values(j)) { max_ewt = g.values(j); hn_i = g.graph.entries(j); @@ -899,15 +859,12 @@ class coarsen_heuristics { // need to enforce an ordering condition to allow hard-stall // conditions to be broken if (condition ^ swap) { - if (Kokkos::atomic_compare_exchange_strong(&match(u), ORD_MAX, - v)) { - if (u == v || Kokkos::atomic_compare_exchange_strong( - &match(v), ORD_MAX, u)) { + if (Kokkos::atomic_compare_exchange_strong(&match(u), ORD_MAX, v)) { + if (u == v || Kokkos::atomic_compare_exchange_strong(&match(v), ORD_MAX, u)) { // u == v avoids problems if there is a self-loop edge - ordinal_t cv = - Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); - vcmap(u) = cv; - vcmap(v) = cv; + ordinal_t cv = Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); + vcmap(u) = cv; + vcmap(v) = cv; } else { match(u) = ORD_MAX; } @@ -930,8 +887,7 @@ class coarsen_heuristics { // check if any are unmatched! 
so instead of randomly choosing a // heaviest edge, we instead use the reverse permutation order // as the weight - for (edge_offset_t j = g.graph.row_map(u); - j < g.graph.row_map(u + 1); j++) { + for (edge_offset_t j = g.graph.row_map(u); j < g.graph.row_map(u + 1); j++) { ordinal_t v = g.graph.entries(j); // v must be unmatched to be considered if (vcmap(v) == ORD_MAX) { @@ -944,8 +900,7 @@ class coarsen_heuristics { } } else { scalar_t max_ewt = 0; - for (edge_offset_t j = g.graph.row_map(u); - j < g.graph.row_map(u + 1); j++) { + for (edge_offset_t j = g.graph.row_map(u); j < g.graph.row_map(u + 1); j++) { ordinal_t v = g.graph.entries(j); // v must be unmatched to be considered if (vcmap(v) == ORD_MAX) { @@ -959,8 +914,7 @@ class coarsen_heuristics { } if (h != ORD_MAX) { - ordinal_t add_next = - Kokkos::atomic_fetch_add(&next_length(), 1); + ordinal_t add_next = Kokkos::atomic_fetch_add(&next_length(), 1); next_perm(add_next) = u; hn(u) = h; } @@ -973,9 +927,8 @@ class coarsen_heuristics { } if (match_choice == 1) { - ordinal_t unmapped = countInf(vcmap); - double unmappedRatio = - static_cast(unmapped) / static_cast(n); + ordinal_t unmapped = countInf(vcmap); + double unmappedRatio = static_cast(unmapped) / static_cast(n); // leaf matches if (unmappedRatio > 0.25) { @@ -983,8 +936,7 @@ class coarsen_heuristics { policy_t(0, n), KOKKOS_LAMBDA(ordinal_t u) { if (vcmap(u) != ORD_MAX) { ordinal_t lastLeaf = ORD_MAX; - for (edge_offset_t j = g.graph.row_map(u); - j < g.graph.row_map(u + 1); j++) { + for (edge_offset_t j = g.graph.row_map(u); j < g.graph.row_map(u + 1); j++) { ordinal_t v = g.graph.entries(j); // v must be unmatched to be considered if (vcmap(v) == ORD_MAX) { @@ -993,10 +945,9 @@ class coarsen_heuristics { if (lastLeaf == ORD_MAX) { lastLeaf = v; } else { - vcmap(lastLeaf) = - Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); - vcmap(v) = vcmap(lastLeaf); - lastLeaf = ORD_MAX; + vcmap(lastLeaf) = Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); 
+ vcmap(v) = vcmap(lastLeaf); + lastLeaf = ORD_MAX; } } } @@ -1017,20 +968,16 @@ class coarsen_heuristics { hasher_t hasher; // compute digests of adjacency lists Kokkos::parallel_for( - "create digests", team_policy_t(n, Kokkos::AUTO), - KOKKOS_LAMBDA(const member& thread) { + "create digests", team_policy_t(n, Kokkos::AUTO), KOKKOS_LAMBDA(const member& thread) { ordinal_t u = thread.league_rank(); if (vcmap(u) == ORD_MAX) { uint32_t hash = 0; Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(thread, g.graph.row_map(u), - g.graph.row_map(u + 1)), - [=](const edge_offset_t j, uint32_t& thread_sum) { - thread_sum += hasher(g.graph.entries(j)); - }, + Kokkos::TeamThreadRange(thread, g.graph.row_map(u), g.graph.row_map(u + 1)), + [=](const edge_offset_t j, uint32_t& thread_sum) { thread_sum += hasher(g.graph.entries(j)); }, hash); Kokkos::single(Kokkos::PerTeam(thread), [=]() { - ordinal_t idx = Kokkos::atomic_fetch_add(&unmappedIdx(), 1); + ordinal_t idx = Kokkos::atomic_fetch_add(&unmappedIdx(), 1); unmappedVtx(idx) = u; hashes(idx) = hash; }); @@ -1040,17 +987,13 @@ class coarsen_heuristics { typedef Kokkos::BinOp1D > BinOp; BinOp bin_op(unmapped, 0, max); // VERY important that final parameter is true - Kokkos::BinSort, BinOp, exec_space, - ordinal_t> - sorter(hashes, bin_op, true); + Kokkos::BinSort, BinOp, exec_space, ordinal_t> sorter(hashes, bin_op, true); sorter.create_permute_vector(); sorter.template sort >(hashes); sorter.template sort(unmappedVtx); - MatchByHashSorted matchTwinFunctor(vcmap, unmappedVtx, hashes, unmapped, - nvertices_coarse); - Kokkos::parallel_scan("match twins", policy_t(0, unmapped), - matchTwinFunctor); + MatchByHashSorted matchTwinFunctor(vcmap, unmappedVtx, hashes, unmapped, nvertices_coarse); + Kokkos::parallel_scan("match twins", policy_t(0, unmapped), matchTwinFunctor); } unmapped = countInf(vcmap); @@ -1061,9 +1004,7 @@ class coarsen_heuristics { // get possibly mappable vertices of unmapped vtx_view_t mappableVtx("mappable 
vertices", unmapped); Kokkos::parallel_scan( - "get unmapped", policy_t(0, n), - KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, - const bool final) { + "get unmapped", policy_t(0, n), KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, const bool final) { if (vcmap(i) == ORD_MAX) { if (final) { mappableVtx(update) = i; @@ -1076,8 +1017,7 @@ class coarsen_heuristics { ordinal_t mappable_count = unmapped; do { Kokkos::parallel_for( - "reset hn", policy_t(0, mappable_count), - KOKKOS_LAMBDA(ordinal_t i) { + "reset hn", policy_t(0, mappable_count), KOKKOS_LAMBDA(ordinal_t i) { ordinal_t u = mappableVtx(i); hn(u) = ORD_MAX; }); @@ -1087,8 +1027,7 @@ class coarsen_heuristics { "assign relatives", policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { if (vcmap(i) != ORD_MAX) { ordinal_t last_free = ORD_MAX; - for (edge_offset_t j = g.graph.row_map(i); - j < g.graph.row_map(i + 1); j++) { + for (edge_offset_t j = g.graph.row_map(i); j < g.graph.row_map(i + 1); j++) { ordinal_t v = g.graph.entries(j); if (vcmap(v) == ORD_MAX) { if (last_free != ORD_MAX) { @@ -1123,8 +1062,7 @@ class coarsen_heuristics { Kokkos::parallel_scan( "get next mappable", policy_t(0, old_mappable), - KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, - const bool final) { + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, const bool final) { ordinal_t u = mappableVtx(i); if (hn(u) != ORD_MAX) { if (final) { @@ -1146,14 +1084,11 @@ class coarsen_heuristics { // need to enforce an ordering condition to allow hard-stall // conditions to be broken if (condition ^ swap) { - if (Kokkos::atomic_compare_exchange_strong(&match(u), - ORD_MAX, v)) { - if (Kokkos::atomic_compare_exchange_strong(&match(v), - ORD_MAX, u)) { - ordinal_t cv = - Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); - vcmap(u) = cv; - vcmap(v) = cv; + if (Kokkos::atomic_compare_exchange_strong(&match(u), ORD_MAX, v)) { + if (Kokkos::atomic_compare_exchange_strong(&match(v), ORD_MAX, u)) { + ordinal_t cv = 
Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); + vcmap(u) = cv; + vcmap(v) = cv; } else { match(u) = ORD_MAX; } diff --git a/graph/src/KokkosGraph_Distance1Color.hpp b/graph/src/KokkosGraph_Distance1Color.hpp index 784b687957..86bb28bab0 100644 --- a/graph/src/KokkosGraph_Distance1Color.hpp +++ b/graph/src/KokkosGraph_Distance1Color.hpp @@ -24,13 +24,10 @@ namespace KokkosGraph { namespace Experimental { -template -void graph_color_symbolic(KernelHandle *handle, - typename KernelHandle::nnz_lno_t num_rows, - typename KernelHandle::nnz_lno_t /* num_cols */, - lno_row_view_t_ row_map, lno_nnz_view_t_ entries, - bool /* is_symmetric */ = true) { +template +void graph_color_symbolic(KernelHandle *handle, typename KernelHandle::nnz_lno_t num_rows, + typename KernelHandle::nnz_lno_t /* num_cols */, lno_row_view_t_ row_map, + lno_nnz_view_t_ entries, bool /* is_symmetric */ = true) { typedef typename KernelHandle::HandleExecSpace ExecSpace; typedef typename KernelHandle::HandleTempMemorySpace MemSpace; typedef typename KernelHandle::HandlePersistentMemorySpace PersistentMemSpace; @@ -40,37 +37,29 @@ void graph_color_symbolic(KernelHandle *handle, typedef typename KernelHandle::const_nnz_lno_t c_lno_t; typedef typename KernelHandle::const_nnz_scalar_t c_scalar_t; - typedef typename KokkosKernels::Experimental::KokkosKernelsHandle< - c_size_t, c_lno_t, c_scalar_t, ExecSpace, MemSpace, PersistentMemSpace> + typedef typename KokkosKernels::Experimental::KokkosKernelsHandle ConstKernelHandle; ConstKernelHandle tmp_handle(*handle); typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, DeviceType, + Kokkos::MemoryTraits > Internal_rowmap; typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, DeviceType, + Kokkos::MemoryTraits > Internal_entries; - KokkosGraph::Impl:: - COLOR_D1::color_d1( - &tmp_handle, num_rows, 
- Internal_rowmap(row_map.data(), row_map.extent(0)), - Internal_entries(entries.data(), entries.extent(0))); + KokkosGraph::Impl::COLOR_D1::color_d1( + &tmp_handle, num_rows, Internal_rowmap(row_map.data(), row_map.extent(0)), + Internal_entries(entries.data(), entries.extent(0))); } -template -void graph_color(KernelHandle *handle, - typename KernelHandle::nnz_lno_t num_rows, - typename KernelHandle::nnz_lno_t num_cols, - lno_row_view_t_ row_map, lno_nnz_view_t_ entries, +template +void graph_color(KernelHandle *handle, typename KernelHandle::nnz_lno_t num_rows, + typename KernelHandle::nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_symmetric = true) { - graph_color_symbolic(handle, num_rows, num_cols, row_map, entries, - is_symmetric); + graph_color_symbolic(handle, num_rows, num_cols, row_map, entries, is_symmetric); } } // end namespace Experimental diff --git a/graph/src/KokkosGraph_Distance1ColorHandle.hpp b/graph/src/KokkosGraph_Distance1ColorHandle.hpp index 1b2f981945..1eefd07c4d 100644 --- a/graph/src/KokkosGraph_Distance1ColorHandle.hpp +++ b/graph/src/KokkosGraph_Distance1ColorHandle.hpp @@ -22,7 +22,7 @@ #ifndef _GRAPHCOLORHANDLE_HPP #define _GRAPHCOLORHANDLE_HPP -//#define VERBOSE +// #define VERBOSE namespace KokkosGraph { enum ColoringAlgorithm { @@ -45,8 +45,7 @@ enum ColoringType { Distance1, Distance2 }; template + class ExecutionSpace, class TemporaryMemorySpace, class PersistentMemorySpace> class GraphColoringHandle { public: typedef ExecutionSpace HandleExecSpace; @@ -62,8 +61,7 @@ class GraphColoringHandle { typedef typename std::remove_const::type color_t; typedef const color_t const_color_t; - typedef typename Kokkos::View - color_view_t; + typedef typename Kokkos::View color_view_t; typedef typename color_view_t::array_layout color_view_array_layout; typedef typename color_view_t::device_type color_view_device_t; @@ -71,20 +69,15 @@ class GraphColoringHandle { typedef typename color_view_t::HostMirror 
color_host_view_t; // Host view // type - typedef typename Kokkos::View - size_type_temp_work_view_t; - typedef typename Kokkos::View - size_type_persistent_work_view_t; + typedef typename Kokkos::View size_type_temp_work_view_t; + typedef typename Kokkos::View size_type_persistent_work_view_t; - typedef typename size_type_persistent_work_view_t::HostMirror - size_type_persistent_work_host_view_t; // Host view type + typedef + typename size_type_persistent_work_view_t::HostMirror size_type_persistent_work_host_view_t; // Host view type - typedef typename Kokkos::View - nnz_lno_temp_work_view_t; - typedef typename Kokkos::View - nnz_lno_persistent_work_view_t; - typedef typename nnz_lno_persistent_work_view_t::HostMirror - nnz_lno_persistent_work_host_view_t; // Host view type + typedef typename Kokkos::View nnz_lno_temp_work_view_t; + typedef typename Kokkos::View nnz_lno_persistent_work_view_t; + typedef typename nnz_lno_persistent_work_view_t::HostMirror nnz_lno_persistent_work_host_view_t; // Host view type typedef Kokkos::TeamPolicy team_policy_t; typedef typename team_policy_t::member_type team_member_t; @@ -95,9 +88,9 @@ class GraphColoringHandle { ColoringType GraphColoringType; // Parameters ColoringAlgorithm coloring_algorithm_type; // VB, VBBIT, VBCS, VBD or EB. - ConflictList conflict_list_type; // whether to use a conflict list or not, - // and if using it wheter to create it with - // atomic or parallel prefix sum. + ConflictList conflict_list_type; // whether to use a conflict list or not, + // and if using it wheter to create it with + // atomic or parallel prefix sum. double min_reduction_for_conflictlist; // if used pps is selected to create conflict list, what min percantage should @@ -116,23 +109,23 @@ class GraphColoringHandle { bool vb_edge_filtering; // whether to do edge filtering or not in vertex // based algorithms. Swaps on the ad error. 
- int vb_chunk_size; // the (minimum) size of the consecutive works that a - // thread will be assigned to. + int vb_chunk_size; // the (minimum) size of the consecutive works that a + // thread will be assigned to. int max_number_of_iterations; // maximum allowed number of phases int eb_num_initial_colors; // the number of colors to assign at the beginning // of the edge-based algorithm // STATISTICS - double overall_coloring_time; // the overall time that it took to color the - // graph. In the case of the iterative calls. + double overall_coloring_time; // the overall time that it took to color the + // graph. In the case of the iterative calls. double overall_coloring_time_phase1; // double overall_coloring_time_phase2; // double overall_coloring_time_phase3; // Some timer accumulators for internal // phases. double overall_coloring_time_phase4; // double overall_coloring_time_phase5; // - double coloring_time; // the time that it took to color the graph + double coloring_time; // the time that it took to color the graph int num_phases; // @@ -189,9 +182,7 @@ class GraphColoringHandle { * KokkosKernels::Experimental::Graph::Distance1 or * KokkosKernels::Experimental::Graph::Distance2 */ - void set_coloring_type(const ColoringType &col_type) { - this->GraphColoringType = col_type; - } + void set_coloring_type(const ColoringType &col_type) { this->GraphColoringType = col_type; } /** \brief Gets the graph coloring type. Whether it is distance-1 or * distance-2 coloring. returns Coloring Type: @@ -206,8 +197,7 @@ class GraphColoringHandle { * COLORING_VBCS, COLORING_EB \param set_default_parameters: whether or not to * reset the default parameters for the given algorithm. 
*/ - void set_algorithm(const ColoringAlgorithm &col_algo, - bool set_default_parameters = true) { + void set_algorithm(const ColoringAlgorithm &col_algo, bool set_default_parameters = true) { if (col_algo == COLORING_DEFAULT) { this->choose_default_algorithm(); } else { @@ -228,27 +218,23 @@ class GraphColoringHandle { if (exec == KokkosKernels::Impl::Exec_SERIAL) { this->coloring_algorithm_type = COLORING_SERIAL; #ifdef VERBOSE - std::cout - << "Serial Execution Space, Default Algorithm: COLORING_SERIAL\n"; + std::cout << "Serial Execution Space, Default Algorithm: COLORING_SERIAL\n"; #endif } else if (exec == KokkosKernels::Impl::Exec_SYCL) { // FIXME SYCL: Do not use EB this->coloring_algorithm_type = COLORING_VBBIT; #ifdef VERBOSE - std::cout << ExecutionSpace::name() - << " Execution Space, Default Algorithm: COLORING_VBBIT\n"; + std::cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_VBBIT\n"; #endif } else if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { this->coloring_algorithm_type = COLORING_EB; #ifdef VERBOSE - std::cout << ExecutionSpace::name() - << " Execution Space, Default Algorithm: COLORING_EB\n"; + std::cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_EB\n"; #endif } else { this->coloring_algorithm_type = COLORING_VBBIT; #ifdef VERBOSE - std::cout << ExecutionSpace::name() - << " Execution Space, Default Algorithm: COLORING_VBBIT\n"; + std::cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_VBBIT\n"; #endif } } @@ -261,10 +247,7 @@ class GraphColoringHandle { v3 lower_xadj_counts; CountLowerTriangle(nnz_lno_t nv_, v1 xadj_, v2 adj_, v3 lower_xadj_counts_) - : nv(nv_), - xadj(xadj_), - adj(adj_), - lower_xadj_counts(lower_xadj_counts_) {} + : nv(nv_), xadj(xadj_), adj(adj_), lower_xadj_counts(lower_xadj_counts_) {} KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t &i, size_type &new_num_edge) const { @@ -290,18 +273,12 @@ class GraphColoringHandle { 
v2 adj; v3 lower_xadj_counts; - CountLowerTriangleTeam(nnz_lno_t nv_, v1 xadj_, v2 adj_, - v3 lower_xadj_counts_) - : nv(nv_), - xadj(xadj_), - adj(adj_), - lower_xadj_counts(lower_xadj_counts_) {} + CountLowerTriangleTeam(nnz_lno_t nv_, v1 xadj_, v2 adj_, v3 lower_xadj_counts_) + : nv(nv_), xadj(xadj_), adj(adj_), lower_xadj_counts(lower_xadj_counts_) {} KOKKOS_INLINE_FUNCTION - void operator()( - const team_member_t &teamMember /*, row_lno_t &new_num_edge*/) const { - nnz_lno_t ii = teamMember.league_rank() * teamMember.team_size() + - teamMember.team_rank(); + void operator()(const team_member_t &teamMember /*, row_lno_t &new_num_edge*/) const { + nnz_lno_t ii = teamMember.league_rank() * teamMember.team_size() + teamMember.team_rank(); if (ii >= nv) { return; } @@ -322,8 +299,7 @@ class GraphColoringHandle { }, new_edge_count); - Kokkos::single(Kokkos::PerThread(teamMember), - [&]() { lower_xadj_counts(ii + 1) = new_edge_count; }); + Kokkos::single(Kokkos::PerThread(teamMember), [&]() { lower_xadj_counts(ii + 1) = new_edge_count; }); } }; @@ -336,8 +312,7 @@ class GraphColoringHandle { v4 lower_srcs; v4 lower_dsts; - FillLowerTriangleTeam(nnz_lno_t nv_, v1 xadj_, v2 adj_, - v3 lower_xadj_counts_, v4 lower_srcs_, v4 lower_dsts_) + FillLowerTriangleTeam(nnz_lno_t nv_, v1 xadj_, v2 adj_, v3 lower_xadj_counts_, v4 lower_srcs_, v4 lower_dsts_) : nv(nv_), xadj(xadj_), adj(adj_), @@ -347,12 +322,9 @@ class GraphColoringHandle { KOKKOS_INLINE_FUNCTION void operator()(const team_member_t &teamMember) const { - typedef - typename std::remove_reference::type - atomic_incr_type; + typedef typename std::remove_reference::type atomic_incr_type; - nnz_lno_t ii = teamMember.league_rank() * teamMember.team_size() + - teamMember.team_rank(); + nnz_lno_t ii = teamMember.league_rank() * teamMember.team_size() + teamMember.team_rank(); if (ii >= nv) { return; } @@ -360,18 +332,15 @@ class GraphColoringHandle { size_type xadj_begin = xadj(ii); size_type xadj_end = xadj(ii + 1); - 
Kokkos::parallel_for( - Kokkos::ThreadVectorRange(teamMember, xadj_end - xadj_begin), - [&](size_type i) { - size_type adjind = i + xadj_begin; - nnz_lno_t n = adj[adjind]; - if (ii < n && n < nv) { - size_type position = Kokkos::atomic_fetch_add( - &(lower_xadj_counts(ii)), atomic_incr_type(1)); - lower_srcs(position) = ii; - lower_dsts(position) = n; - } - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(teamMember, xadj_end - xadj_begin), [&](size_type i) { + size_type adjind = i + xadj_begin; + nnz_lno_t n = adj[adjind]; + if (ii < n && n < nv) { + size_type position = Kokkos::atomic_fetch_add(&(lower_xadj_counts(ii)), atomic_incr_type(1)); + lower_srcs(position) = ii; + lower_dsts(position) = n; + } + }); } }; @@ -384,8 +353,7 @@ class GraphColoringHandle { v4 lower_srcs; v4 lower_dsts; - FillLowerTriangle(nnz_lno_t nv_, v1 xadj_, v2 adj_, v3 lower_xadj_counts_, - v4 lower_srcs_, v4 lower_dsts_) + FillLowerTriangle(nnz_lno_t nv_, v1 xadj_, v2 adj_, v3 lower_xadj_counts_, v4 lower_srcs_, v4 lower_dsts_) : nv(nv_), xadj(xadj_), adj(adj_), @@ -410,21 +378,18 @@ class GraphColoringHandle { }; template - void symmetrize_and_calculate_lower_diagonal_edge_list( - nnz_lno_t nv, row_index_view_type xadj, nonzero_view_type adj) { - KokkosKernels::Impl::symmetrize_and_get_lower_diagonal_edge_list< - row_index_view_type, nonzero_view_type, nnz_lno_persistent_work_view_t, - ExecutionSpace>(nv, xadj, adj, lower_triangle_src, lower_triangle_dst); + void symmetrize_and_calculate_lower_diagonal_edge_list(nnz_lno_t nv, row_index_view_type xadj, + nonzero_view_type adj) { + KokkosKernels::Impl::symmetrize_and_get_lower_diagonal_edge_list( + nv, xadj, adj, lower_triangle_src, lower_triangle_dst); size_of_edge_list = lower_triangle_src.extent(0); } template - void get_lower_diagonal_edge_list(nnz_lno_t nv, size_type ne, - row_index_view_type xadj, - nonzero_view_type adj, - size_type &num_out_edges, - nnz_lno_persistent_work_view_t &src, + void 
get_lower_diagonal_edge_list(nnz_lno_t nv, size_type ne, row_index_view_type xadj, nonzero_view_type adj, + size_type &num_out_edges, nnz_lno_persistent_work_view_t &src, nnz_lno_persistent_work_view_t &dst) { if (size_of_edge_list > 0) { num_out_edges = size_of_edge_list; @@ -441,26 +406,20 @@ class GraphColoringHandle { int teamSizeMax = 0; int vector_size = 0; - CountLowerTriangleTeam - clt(nv, xadj, adj, lower_count); + CountLowerTriangleTeam clt(nv, xadj, adj, + lower_count); - KokkosKernels::Impl::get_suggested_vector_size( - vector_size, nv, ne); + KokkosKernels::Impl::get_suggested_vector_size(vector_size, nv, ne); - teamSizeMax = - KokkosKernels::Impl::get_suggested_team_size( - clt, vector_size); + teamSizeMax = KokkosKernels::Impl::get_suggested_team_size(clt, vector_size); Kokkos::parallel_for("KokkosGraph::CountLowerTriangleTeam", - team_policy_t((nv + teamSizeMax - 1) / teamSizeMax, - teamSizeMax, vector_size), + team_policy_t((nv + teamSizeMax - 1) / teamSizeMax, teamSizeMax, vector_size), clt //, new_num_edge ); - KokkosKernels::Impl::inclusive_parallel_prefix_sum< - size_type_temp_work_view_t, ExecutionSpace>(nv + 1, lower_count); + KokkosKernels::Impl::inclusive_parallel_prefix_sum(nv + 1, + lower_count); // Kokkos::parallel_scan (my_exec_space(0, nv + 1), // PPS(lower_count)); ExecutionSpace().fence(); @@ -469,20 +428,15 @@ class GraphColoringHandle { Kokkos::deep_copy(hlower, lower_total_count); new_num_edge = hlower(); - nnz_lno_persistent_work_view_t half_src( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "HALF SRC"), - new_num_edge); - nnz_lno_persistent_work_view_t half_dst( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "HALF DST"), - new_num_edge); + nnz_lno_persistent_work_view_t half_src(Kokkos::view_alloc(Kokkos::WithoutInitializing, "HALF SRC"), + new_num_edge); + nnz_lno_persistent_work_view_t half_dst(Kokkos::view_alloc(Kokkos::WithoutInitializing, "HALF DST"), + new_num_edge); Kokkos::parallel_for( 
"KokkosGraph::FillLowerTriangleTeam", - team_policy_t((nv + teamSizeMax - 1) / teamSizeMax, teamSizeMax, - vector_size), - FillLowerTriangleTeam( - nv, xadj, adj, lower_count, half_src, half_dst)); + team_policy_t((nv + teamSizeMax - 1) / teamSizeMax, teamSizeMax, vector_size), + FillLowerTriangleTeam(nv, xadj, adj, lower_count, half_src, half_dst)); src = lower_triangle_src = half_src; dst = lower_triangle_dst = half_dst; @@ -491,30 +445,25 @@ class GraphColoringHandle { if (nv > 0) { Kokkos::parallel_reduce( "KokkosGraph::CountLowerTriangleTeam", my_exec_space(0, nv), - CountLowerTriangle(nv, xadj, adj, - lower_count), + CountLowerTriangle(nv, xadj, adj, + lower_count), new_num_edge); } // Kokkos::parallel_scan (my_exec_space(0, nv + 1), // PPS(lower_count)); - KokkosKernels::Impl::inclusive_parallel_prefix_sum< - size_type_temp_work_view_t, ExecutionSpace>(nv + 1, lower_count); - nnz_lno_persistent_work_view_t half_src( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "HALF SRC"), - new_num_edge); - nnz_lno_persistent_work_view_t half_dst( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "HALF DST"), - new_num_edge); + KokkosKernels::Impl::inclusive_parallel_prefix_sum(nv + 1, + lower_count); + nnz_lno_persistent_work_view_t half_src(Kokkos::view_alloc(Kokkos::WithoutInitializing, "HALF SRC"), + new_num_edge); + nnz_lno_persistent_work_view_t half_dst(Kokkos::view_alloc(Kokkos::WithoutInitializing, "HALF DST"), + new_num_edge); Kokkos::parallel_for( "KokkosGraph::FillLowerTriangleTeam", my_exec_space(0, nv), - FillLowerTriangle( - nv, xadj, adj, lower_count, half_src, half_dst)); + FillLowerTriangle(nv, xadj, adj, lower_count, half_src, half_dst)); src = lower_triangle_src = half_src; dst = lower_triangle_dst = half_dst; @@ -547,8 +496,7 @@ class GraphColoringHandle { nnz_lno_t get_num_colors() { if (num_colors == 0) { typedef typename Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_reduce("KokkosKernels::FindMax", - my_exec_space(0, 
vertex_colors.extent(0)), + Kokkos::parallel_reduce("KokkosKernels::FindMax", my_exec_space(0, vertex_colors.extent(0)), ReduceMaxFunctor(vertex_colors), num_colors); } return num_colors; @@ -594,47 +542,23 @@ class GraphColoringHandle { virtual ~GraphColoringHandle(){}; // getters - ColoringAlgorithm get_coloring_algo_type() const { - return this->coloring_algorithm_type; - } - ConflictList get_conflict_list_type() const { - return this->conflict_list_type; - } - double get_min_reduction_for_conflictlist() const { - return this->min_reduction_for_conflictlist; - } - int get_min_elements_for_conflictlist() const { - return this->min_elements_for_conflictlist; - } - bool get_serial_conflict_resolution() const { - return this->serial_conflict_resolution; - } + ColoringAlgorithm get_coloring_algo_type() const { return this->coloring_algorithm_type; } + ConflictList get_conflict_list_type() const { return this->conflict_list_type; } + double get_min_reduction_for_conflictlist() const { return this->min_reduction_for_conflictlist; } + int get_min_elements_for_conflictlist() const { return this->min_elements_for_conflictlist; } + bool get_serial_conflict_resolution() const { return this->serial_conflict_resolution; } bool get_tictoc() const { return this->tictoc; } bool get_vb_edge_filtering() const { return this->vb_edge_filtering; } int get_vb_chunk_size() const { return this->vb_chunk_size; } - int get_max_number_of_iterations() const { - return this->max_number_of_iterations; - } + int get_max_number_of_iterations() const { return this->max_number_of_iterations; } int get_eb_num_initial_colors() const { return this->eb_num_initial_colors; } - double get_overall_coloring_time() const { - return this->overall_coloring_time; - } - double get_overall_coloring_time_phase1() const { - return this->overall_coloring_time_phase1; - } - double get_overall_coloring_time_phase2() const { - return this->overall_coloring_time_phase2; - } - double get_overall_coloring_time_phase3() 
const { - return this->overall_coloring_time_phase3; - } - double get_overall_coloring_time_phase4() const { - return this->overall_coloring_time_phase4; - } - double get_overall_coloring_time_phase5() const { - return this->overall_coloring_time_phase5; - } + double get_overall_coloring_time() const { return this->overall_coloring_time; } + double get_overall_coloring_time_phase1() const { return this->overall_coloring_time_phase1; } + double get_overall_coloring_time_phase2() const { return this->overall_coloring_time_phase2; } + double get_overall_coloring_time_phase3() const { return this->overall_coloring_time_phase3; } + double get_overall_coloring_time_phase4() const { return this->overall_coloring_time_phase4; } + double get_overall_coloring_time_phase5() const { return this->overall_coloring_time_phase5; } double get_coloring_time() const { return this->coloring_time; } int get_num_phases() const { return this->num_phases; } color_view_t get_vertex_colors() const { return this->vertex_colors; } @@ -643,44 +567,28 @@ class GraphColoringHandle { nnz_lno_temp_work_view_t get_vertex_list() const { return this->vertex_list; } size_type get_vertex_list_size() const { return this->vertex_list_size; } // setters - void set_vertex_list(nnz_lno_temp_work_view_t vertex_list_, - size_type vertex_list_size_) { + void set_vertex_list(nnz_lno_temp_work_view_t vertex_list_, size_type vertex_list_size_) { this->vertex_list = vertex_list_; this->vertex_list_size = vertex_list_size_; this->use_vtx_list = true; } - void set_coloring_algo_type(const ColoringAlgorithm &col_algo) { - this->coloring_algorithm_type = col_algo; - } - void set_conflict_list_type(const ConflictList &cl) { - this->conflict_list_type = cl; - } + void set_coloring_algo_type(const ColoringAlgorithm &col_algo) { this->coloring_algorithm_type = col_algo; } + void set_conflict_list_type(const ConflictList &cl) { this->conflict_list_type = cl; } void set_min_reduction_for_conflictlist(const double 
&min_reduction) { this->min_reduction_for_conflictlist = min_reduction; } void set_min_elements_for_conflictlist(const int &min_elements) { this->min_elements_for_conflictlist = min_elements; } - void set_serial_conflict_resolution( - const bool &use_serial_conflist_resolution) { + void set_serial_conflict_resolution(const bool &use_serial_conflist_resolution) { this->serial_conflict_resolution = use_serial_conflist_resolution; } void set_tictoc(const bool use_tictoc) { this->tictoc = use_tictoc; } - void set_vb_edge_filtering(const bool &use_vb_edge_filtering) { - this->vb_edge_filtering = use_vb_edge_filtering; - } - void set_vb_chunk_size(const int &chunksize) { - this->vb_chunk_size = chunksize; - } - void set_max_number_of_iterations(const int &max_phases) { - this->max_number_of_iterations = max_phases; - } - void set_eb_num_initial_colors(const int &num_initial_colors) { - this->eb_num_initial_colors = num_initial_colors; - } - void add_to_overall_coloring_time(const double &coloring_time_) { - this->overall_coloring_time += coloring_time_; - } + void set_vb_edge_filtering(const bool &use_vb_edge_filtering) { this->vb_edge_filtering = use_vb_edge_filtering; } + void set_vb_chunk_size(const int &chunksize) { this->vb_chunk_size = chunksize; } + void set_max_number_of_iterations(const int &max_phases) { this->max_number_of_iterations = max_phases; } + void set_eb_num_initial_colors(const int &num_initial_colors) { this->eb_num_initial_colors = num_initial_colors; } + void add_to_overall_coloring_time(const double &coloring_time_) { this->overall_coloring_time += coloring_time_; } void add_to_overall_coloring_time_phase1(const double &coloring_time_) { this->overall_coloring_time_phase1 += coloring_time_; } @@ -696,12 +604,8 @@ class GraphColoringHandle { void add_to_overall_coloring_time_phase5(const double &coloring_time_) { this->overall_coloring_time_phase5 += coloring_time_; } - void set_coloring_time(const double &coloring_time_) { - this->coloring_time = 
coloring_time_; - } - void set_num_phases(const double &num_phases_) { - this->num_phases = num_phases_; - } + void set_coloring_time(const double &coloring_time_) { this->coloring_time = coloring_time_; } + void set_num_phases(const double &num_phases_) { this->num_phases = num_phases_; } void set_vertex_colors(const color_view_t vertex_colors_) { this->vertex_colors = vertex_colors_; this->is_coloring_called_before = true; diff --git a/graph/src/KokkosGraph_Distance2Color.hpp b/graph/src/KokkosGraph_Distance2Color.hpp index c40ec72ece..a6555915bb 100644 --- a/graph/src/KokkosGraph_Distance2Color.hpp +++ b/graph/src/KokkosGraph_Distance2Color.hpp @@ -44,16 +44,13 @@ namespace Experimental { */ template -void graph_color_distance2(KernelHandle *handle, - typename KernelHandle::nnz_lno_t num_verts, - InRowmap row_map, InEntries row_entries) { +void graph_color_distance2(KernelHandle *handle, typename KernelHandle::nnz_lno_t num_verts, InRowmap row_map, + InEntries row_entries) { using size_type = typename KernelHandle::size_type; using lno_t = typename KernelHandle::nnz_lno_t; - using InternalRowmap = Kokkos::View>; - using InternalEntries = Kokkos::View>; Kokkos::Timer timer; size_type nnz = row_entries.extent(0); @@ -61,11 +58,9 @@ void graph_color_distance2(KernelHandle *handle, InternalEntries rowentries_internal(row_entries.data(), nnz); auto gch_d2 = handle->get_distance2_graph_coloring_handle(); // note: last template argument 'false' means do distance-2, not bipartite - KokkosGraph::Impl::GraphColorDistance2< - typename KernelHandle::GraphColorDistance2HandleType, InternalRowmap, - InternalEntries, false> - gc(num_verts, num_verts, rowmap_internal, rowentries_internal, - rowmap_internal, rowentries_internal, gch_d2); + KokkosGraph::Impl::GraphColorDistance2 + gc(num_verts, num_verts, rowmap_internal, rowentries_internal, rowmap_internal, rowentries_internal, gch_d2); gc.compute_distance2_color(); gch_d2->add_to_overall_coloring_time(timer.seconds()); 
gch_d2->set_coloring_time(timer.seconds()); @@ -104,24 +99,18 @@ void graph_color_distance2(KernelHandle *handle, */ template -void bipartite_color_rows(KernelHandle *handle, - typename KernelHandle::nnz_lno_t num_rows, - typename KernelHandle::nnz_lno_t num_columns, - InRowmap row_map, InEntries row_entries, +void bipartite_color_rows(KernelHandle *handle, typename KernelHandle::nnz_lno_t num_rows, + typename KernelHandle::nnz_lno_t num_columns, InRowmap row_map, InEntries row_entries, bool is_symmetric = false) { using execution_space = typename KernelHandle::HandleExecSpace; using size_type = typename KernelHandle::size_type; using lno_t = typename KernelHandle::nnz_lno_t; - using InternalRowmap = Kokkos::View>; - using InternalEntries = Kokkos::View>; - using TRowmap = Kokkos::View; - using TEntries = Kokkos::View; + using TRowmap = Kokkos::View; + using TEntries = Kokkos::View; Kokkos::Timer timer; size_type nnz = row_entries.extent(0); TRowmap col_map; @@ -130,8 +119,7 @@ void bipartite_color_rows(KernelHandle *handle, // Compute the transpose col_map = TRowmap("Col map", num_columns + 1); col_entries = TEntries("Col entries", nnz); - KokkosSparse::Impl::transpose_graph( + KokkosSparse::Impl::transpose_graph( num_rows, num_columns, row_map, row_entries, col_map, col_entries); } InternalRowmap rowmap_internal(row_map.data(), row_map.extent(0)); @@ -147,11 +135,9 @@ void bipartite_color_rows(KernelHandle *handle, } auto gch_d2 = handle->get_distance2_graph_coloring_handle(); // note: last template argument 'true' means do bipartite one-sided - KokkosGraph::Impl::GraphColorDistance2< - typename KernelHandle::GraphColorDistance2HandleType, InternalRowmap, - InternalEntries, true> - gc(num_rows, num_columns, rowmap_internal, rowentries_internal, - colmap_internal, colentries_internal, gch_d2); + KokkosGraph::Impl::GraphColorDistance2 + gc(num_rows, num_columns, rowmap_internal, rowentries_internal, colmap_internal, colentries_internal, gch_d2); 
gc.compute_distance2_color(); gch_d2->add_to_overall_coloring_time(timer.seconds()); gch_d2->set_coloring_time(timer.seconds()); @@ -185,31 +171,23 @@ void bipartite_color_rows(KernelHandle *handle, * return a view of length num_columns, containing the colors. */ template -void bipartite_color_columns(KernelHandle *handle, - typename KernelHandle::nnz_lno_t num_rows, - typename KernelHandle::nnz_lno_t num_columns, - InRowmap row_map, InEntries row_entries) { +void bipartite_color_columns(KernelHandle *handle, typename KernelHandle::nnz_lno_t num_rows, + typename KernelHandle::nnz_lno_t num_columns, InRowmap row_map, InEntries row_entries) { using execution_space = typename KernelHandle::HandleExecSpace; using size_type = typename KernelHandle::size_type; using lno_t = typename KernelHandle::nnz_lno_t; - using InternalRowmap = Kokkos::View>; - using InternalEntries = Kokkos::View>; - using TRowmap = Kokkos::View; - using TEntries = Kokkos::View; + using TRowmap = Kokkos::View; + using TEntries = Kokkos::View; Kokkos::Timer timer; size_type nnz = row_entries.extent(0); // Compute the transpose TRowmap col_map("Col map", num_columns + 1); - TEntries col_entries( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Col entries"), nnz); - KokkosSparse::Impl::transpose_graph( + TEntries col_entries(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Col entries"), nnz); + KokkosSparse::Impl::transpose_graph( num_rows, num_columns, row_map, row_entries, col_map, col_entries); // Get unmanaged views for both graph and its transpose InternalRowmap colmap_internal(col_map.data(), col_map.extent(0)); @@ -218,11 +196,9 @@ void bipartite_color_columns(KernelHandle *handle, InternalEntries rowentries_internal(row_entries.data(), nnz); auto gch_d2 = handle->get_distance2_graph_coloring_handle(); // note: last template argument 'true' means do bipartite one-sided - KokkosGraph::Impl::GraphColorDistance2< - typename KernelHandle::GraphColorDistance2HandleType, InternalRowmap, - 
InternalEntries, true> - gc(num_columns, num_rows, colmap_internal, colentries_internal, - rowmap_internal, rowentries_internal, gch_d2); + KokkosGraph::Impl::GraphColorDistance2 + gc(num_columns, num_rows, colmap_internal, colentries_internal, rowmap_internal, rowentries_internal, gch_d2); gc.compute_distance2_color(); gch_d2->add_to_overall_coloring_time(timer.seconds()); gch_d2->set_coloring_time(timer.seconds()); diff --git a/graph/src/KokkosGraph_Distance2ColorHandle.hpp b/graph/src/KokkosGraph_Distance2ColorHandle.hpp index c6508e0ba8..f50ce08fef 100644 --- a/graph/src/KokkosGraph_Distance2ColorHandle.hpp +++ b/graph/src/KokkosGraph_Distance2ColorHandle.hpp @@ -36,45 +36,37 @@ enum GraphColoringAlgorithmDistance2 { COLORING_D2_NB_BIT // Distance-2 Graph Coloring Net Based BIT }; -template +template class GraphColorDistance2Handle { public: - using HandleExecSpace = ExecutionSpace; - using HandleTempMemorySpace = TemporaryMemorySpace; - using HandlePersistentMemorySpace = PersistentMemorySpace; - using size_type = typename std::remove_const::type; - using const_size_type = const size_type; - using nnz_lno_type = typename std::remove_const::type; - using const_nnz_lno_type = const nnz_lno_type; - using color_type = typename std::remove_const::type; - using const_color_type = const color_type; - using color_view_type = - typename Kokkos::View; - using color_view_array_layout = typename color_view_type::array_layout; - using color_view_device_type = typename color_view_type::device_type; - using color_view_memory_traits = typename color_view_type::memory_traits; - using color_host_view_type = typename color_view_type::HostMirror; - using size_type_temp_work_view_type = - typename Kokkos::View; - using size_type_persistent_work_view_type = - typename Kokkos::View; - using size_type_persistent_work_host_view_type = - typename size_type_persistent_work_view_type::HostMirror; - using nnz_lno_temp_work_view_type = - typename Kokkos::View; - using 
nnz_lno_persistent_work_view_type = - typename Kokkos::View; - using nnz_lno_persistent_work_host_view_type = - typename nnz_lno_persistent_work_view_type::HostMirror; - using team_policy_type = Kokkos::TeamPolicy; - using team_member_type = typename team_policy_type::member_type; - using non_const_1d_size_type_view_type = typename Kokkos::View; + using HandleExecSpace = ExecutionSpace; + using HandleTempMemorySpace = TemporaryMemorySpace; + using HandlePersistentMemorySpace = PersistentMemorySpace; + using size_type = typename std::remove_const::type; + using const_size_type = const size_type; + using nnz_lno_type = typename std::remove_const::type; + using const_nnz_lno_type = const nnz_lno_type; + using color_type = typename std::remove_const::type; + using const_color_type = const color_type; + using color_view_type = typename Kokkos::View; + using color_view_array_layout = typename color_view_type::array_layout; + using color_view_device_type = typename color_view_type::device_type; + using color_view_memory_traits = typename color_view_type::memory_traits; + using color_host_view_type = typename color_view_type::HostMirror; + using size_type_temp_work_view_type = typename Kokkos::View; + using size_type_persistent_work_view_type = typename Kokkos::View; + using size_type_persistent_work_host_view_type = typename size_type_persistent_work_view_type::HostMirror; + using nnz_lno_temp_work_view_type = typename Kokkos::View; + using nnz_lno_persistent_work_view_type = typename Kokkos::View; + using nnz_lno_persistent_work_host_view_type = typename nnz_lno_persistent_work_view_type::HostMirror; + using team_policy_type = Kokkos::TeamPolicy; + using team_member_type = typename team_policy_type::member_type; + using non_const_1d_size_type_view_type = typename Kokkos::View; private: // Parameters - GraphColoringAlgorithmDistance2 - coloring_algorithm_type; // Which algorithm type to use. 
+ GraphColoringAlgorithmDistance2 coloring_algorithm_type; // Which algorithm type to use. bool verbose; // verbosity flag bool tictoc; // print time at every step @@ -82,20 +74,20 @@ class GraphColorDistance2Handle { bool vb_edge_filtering; // whether to do edge filtering or not in vertex // based algorithms. - int vb_chunk_size; // the (minimum) size of the consecutive works that a - // thread will be assigned to. + int vb_chunk_size; // the (minimum) size of the consecutive works that a + // thread will be assigned to. int max_number_of_iterations; // maximum allowed number of phases that // STATISTICS - double overall_coloring_time; // The overall time taken to color the graph. - // In the case of the iterative calls. + double overall_coloring_time; // The overall time taken to color the graph. + // In the case of the iterative calls. double overall_coloring_time_phase1; // double overall_coloring_time_phase2; // double overall_coloring_time_phase3; // Some timer accumulators for internal // phases. 
double overall_coloring_time_phase4; // double overall_coloring_time_phase5; // - double coloring_time; // the time that it took to color the graph + double coloring_time; // the time that it took to color the graph bool use_vtx_list; nnz_lno_temp_work_view_type vertex_list; @@ -159,8 +151,7 @@ class GraphColorDistance2Handle { * * @return None */ - void set_algorithm(const GraphColoringAlgorithmDistance2& col_algo, - bool set_default_parameters = true) { + void set_algorithm(const GraphColoringAlgorithmDistance2& col_algo, bool set_default_parameters = true) { if (col_algo == COLORING_D2_DEFAULT) { this->choose_default_algorithm(); } else { @@ -182,26 +173,23 @@ class GraphColorDistance2Handle { */ void choose_default_algorithm() { - if (KokkosKernels::Impl::kk_get_exec_space_type() == - KokkosKernels::Impl::Exec_SERIAL) { + if (KokkosKernels::Impl::kk_get_exec_space_type() == KokkosKernels::Impl::Exec_SERIAL) { this->coloring_algorithm_type = COLORING_D2_SERIAL; #ifdef VERBOSE - std::cout - << "Serial Execution Space, Default Algorithm: COLORING_D2_SERIAL\n"; + std::cout << "Serial Execution Space, Default Algorithm: COLORING_D2_SERIAL\n"; #endif } else { this->coloring_algorithm_type = COLORING_D2_NB_BIT; #ifdef VERBOSE - std::cout << ExecutionSpace::name() - << " Execution Space, Default Algorithm: COLORING_D2_NB_BIT\n"; + std::cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_D2_NB_BIT\n"; #endif } } nnz_lno_type get_num_colors() { if (num_colors == 0) - KokkosKernels::Impl::view_reduce_max( - vertex_colors.extent(0), vertex_colors, num_colors); + KokkosKernels::Impl::view_reduce_max(vertex_colors.extent(0), vertex_colors, + num_colors); return num_colors; } @@ -219,9 +207,7 @@ class GraphColorDistance2Handle { this->vb_chunk_size = 8; this->max_number_of_iterations = 200; break; - default: - throw std::runtime_error( - "Unknown Distance-2 Graph Coloring Algorithm\n"); + default: throw std::runtime_error("Unknown Distance-2 Graph 
Coloring Algorithm\n"); } } @@ -231,35 +217,19 @@ class GraphColorDistance2Handle { virtual ~GraphColorDistance2Handle(){}; // getters and setters - GraphColoringAlgorithmDistance2 get_coloring_algo_type() const { - return this->coloring_algorithm_type; - } + GraphColoringAlgorithmDistance2 get_coloring_algo_type() const { return this->coloring_algorithm_type; } bool get_verbose() const { return this->verbose; } double get_coloring_time() const { return this->coloring_time; } - int get_max_number_of_iterations() const { - return this->max_number_of_iterations; - } + int get_max_number_of_iterations() const { return this->max_number_of_iterations; } int get_num_phases() const { return this->num_phases; } - double get_overall_coloring_time() const { - return this->overall_coloring_time; - } - double get_overall_coloring_time_phase1() const { - return this->overall_coloring_time_phase1; - } - double get_overall_coloring_time_phase2() const { - return this->overall_coloring_time_phase2; - } - double get_overall_coloring_time_phase3() const { - return this->overall_coloring_time_phase3; - } - double get_overall_coloring_time_phase4() const { - return this->overall_coloring_time_phase4; - } - double get_overall_coloring_time_phase5() const { - return this->overall_coloring_time_phase5; - } + double get_overall_coloring_time() const { return this->overall_coloring_time; } + double get_overall_coloring_time_phase1() const { return this->overall_coloring_time_phase1; } + double get_overall_coloring_time_phase2() const { return this->overall_coloring_time_phase2; } + double get_overall_coloring_time_phase3() const { return this->overall_coloring_time_phase3; } + double get_overall_coloring_time_phase4() const { return this->overall_coloring_time_phase4; } + double get_overall_coloring_time_phase5() const { return this->overall_coloring_time_phase5; } bool get_tictoc() const { return this->tictoc; } @@ -272,14 +242,11 @@ class GraphColorDistance2Handle { bool 
is_coloring_called() const { return this->is_coloring_called_before; } bool get_use_vtx_list() const { return this->use_vtx_list; } - nnz_lno_temp_work_view_type get_vertex_list() const { - return this->vertex_list; - } + nnz_lno_temp_work_view_type get_vertex_list() const { return this->vertex_list; } size_type get_vertex_list_size() const { return this->vertex_list_size; } // setters - void set_vertex_list(nnz_lno_temp_work_view_type vertex_list_, - size_type vertex_list_size_) { + void set_vertex_list(nnz_lno_temp_work_view_type vertex_list_, size_type vertex_list_size_) { this->vertex_list = vertex_list_; this->vertex_list_size = vertex_list_size_; this->use_vtx_list = true; @@ -291,19 +258,11 @@ class GraphColorDistance2Handle { } void set_verbose(const bool verbose_) { this->verbose = verbose_; } - void set_coloring_time(const double& coloring_time_) { - this->coloring_time = coloring_time_; - } - void set_max_number_of_iterations(const int& max_phases) { - this->max_number_of_iterations = max_phases; - } - void set_num_phases(const int& num_phases_) { - this->num_phases = num_phases_; - } + void set_coloring_time(const double& coloring_time_) { this->coloring_time = coloring_time_; } + void set_max_number_of_iterations(const int& max_phases) { this->max_number_of_iterations = max_phases; } + void set_num_phases(const int& num_phases_) { this->num_phases = num_phases_; } - void add_to_overall_coloring_time(const double& coloring_time_) { - this->overall_coloring_time += coloring_time_; - } + void add_to_overall_coloring_time(const double& coloring_time_) { this->overall_coloring_time += coloring_time_; } void add_to_overall_coloring_time_phase1(const double& coloring_time_) { this->overall_coloring_time_phase1 += coloring_time_; } @@ -322,13 +281,9 @@ class GraphColorDistance2Handle { void set_tictoc(const bool use_tictoc) { this->tictoc = use_tictoc; } - void set_vb_chunk_size(const int& chunksize) { - this->vb_chunk_size = chunksize; - } + void 
set_vb_chunk_size(const int& chunksize) { this->vb_chunk_size = chunksize; } - void set_vb_edge_filtering(const bool& use_vb_edge_filtering) { - this->vb_edge_filtering = use_vb_edge_filtering; - } + void set_vb_edge_filtering(const bool& use_vb_edge_filtering) { this->vb_edge_filtering = use_vb_edge_filtering; } void set_vertex_colors(const color_view_type vertex_colors_) { this->vertex_colors = vertex_colors_; @@ -349,10 +304,8 @@ class GraphColorDistance2Handle { * object (i.e., `std::ofstream os("G.dot", std::ofstream::out);`) to write to * a file. */ - template - void dump_graphviz(std::ostream& os, const size_t num_verts, - rowmap_type& rowmap, entries_type& entries, + template + void dump_graphviz(std::ostream& os, const size_t num_verts, rowmap_type& rowmap, entries_type& entries, kokkos_view_type& colors) const { using h_colors_type = typename kokkos_view_type::HostMirror; using h_rowmap_type = typename rowmap_type::HostMirror; @@ -407,13 +360,11 @@ class GraphColorDistance2Handle { penwidth = ", penwidth=\"2.0\""; } - os << " " << vid << " [ label=\"" << vid << "|" << h_colors(vid) - << "\"" << style << fontcolor << color << fillcolor << penwidth << "];" - << std::endl; + os << " " << vid << " [ label=\"" << vid << "|" << h_colors(vid) << "\"" << style << fontcolor << color + << fillcolor << penwidth << "];" << std::endl; // Add the node's edges - for (size_t iadj = h_rowmap(vid); iadj < (size_t)h_rowmap(vid + 1); - iadj++) { + for (size_t iadj = h_rowmap(vid); iadj < (size_t)h_rowmap(vid + 1); iadj++) { size_t vadj = h_entries(iadj); if (vadj >= vid) { os << " " << vid << " -- " << vadj << ";" << std::endl; diff --git a/graph/src/KokkosGraph_ExplicitCoarsening.hpp b/graph/src/KokkosGraph_ExplicitCoarsening.hpp index 3c655026f5..67c4fbd453 100644 --- a/graph/src/KokkosGraph_ExplicitCoarsening.hpp +++ b/graph/src/KokkosGraph_ExplicitCoarsening.hpp @@ -32,35 +32,27 @@ namespace Experimental { // An uncompressed graph will still work as input to some things 
like D1 graph // coloring. -template -void graph_explicit_coarsen( - const fine_rowmap_t& fineRowmap, const fine_entries_t& fineEntries, - const labels_t& labels, - typename fine_entries_t::non_const_value_type numCoarseVerts, - coarse_rowmap_t& coarseRowmap, coarse_entries_t& coarseEntries, - bool compress = true) { +template +void graph_explicit_coarsen(const fine_rowmap_t& fineRowmap, const fine_entries_t& fineEntries, const labels_t& labels, + typename fine_entries_t::non_const_value_type numCoarseVerts, coarse_rowmap_t& coarseRowmap, + coarse_entries_t& coarseEntries, bool compress = true) { using size_type = typename fine_rowmap_t::non_const_value_type; using lno_t = typename fine_entries_t::non_const_value_type; using exec_space = typename device_t::execution_space; - static_assert( - std::is_same::value, - "graph_explicit_coarsen: The coarse and fine entry Views have different " - "value types."); - KokkosGraph::Impl::ExplicitGraphCoarsening< - lno_t, size_type, device_t, fine_rowmap_t, fine_entries_t, labels_t, - coarse_rowmap_t, coarse_entries_t, coarse_entries_t> + static_assert(std::is_same::value, + "graph_explicit_coarsen: The coarse and fine entry Views have different " + "value types."); + KokkosGraph::Impl::ExplicitGraphCoarsening egc(fineRowmap, fineEntries, labels, numCoarseVerts); coarseRowmap = egc.coarseRowmap; coarseEntries = egc.coarseEntries; if (compress) { coarse_rowmap_t mergedRowmap; coarse_entries_t mergedEntries; - KokkosSparse::sort_and_merge_graph( - coarseRowmap, coarseEntries, mergedRowmap, mergedEntries); + KokkosSparse::sort_and_merge_graph(coarseRowmap, coarseEntries, + mergedRowmap, mergedEntries); coarseRowmap = mergedRowmap; coarseEntries = mergedEntries; } @@ -68,27 +60,22 @@ void graph_explicit_coarsen( // Same as above, but also produce the map from coarse vertices to fine vertices // (inverse map of labels) -template -void graph_explicit_coarsen_with_inverse_map( - const fine_rowmap_t& fineRowmap, const fine_entries_t& 
fineEntries, - const labels_t& labels, - typename fine_entries_t::non_const_value_type numCoarseVerts, - coarse_rowmap_t& coarseRowmap, coarse_entries_t& coarseEntries, - ordinal_view_t& inverseOffsets, ordinal_view_t& inverseLabels, - bool compress = true) { +template +void graph_explicit_coarsen_with_inverse_map(const fine_rowmap_t& fineRowmap, const fine_entries_t& fineEntries, + const labels_t& labels, + typename fine_entries_t::non_const_value_type numCoarseVerts, + coarse_rowmap_t& coarseRowmap, coarse_entries_t& coarseEntries, + ordinal_view_t& inverseOffsets, ordinal_view_t& inverseLabels, + bool compress = true) { using size_type = typename fine_rowmap_t::non_const_value_type; using lno_t = typename fine_entries_t::non_const_value_type; using exec_space = typename device_t::execution_space; - static_assert( - std::is_same::value, - "graph_explicit_coarsen: The coarse and fine entry Views have different " - "value types."); - KokkosGraph::Impl::ExplicitGraphCoarsening< - lno_t, size_type, device_t, fine_rowmap_t, fine_entries_t, labels_t, - coarse_rowmap_t, coarse_entries_t, ordinal_view_t> + static_assert(std::is_same::value, + "graph_explicit_coarsen: The coarse and fine entry Views have different " + "value types."); + KokkosGraph::Impl::ExplicitGraphCoarsening egc(fineRowmap, fineEntries, labels, numCoarseVerts); coarseRowmap = egc.coarseRowmap; coarseEntries = egc.coarseEntries; @@ -97,9 +84,8 @@ void graph_explicit_coarsen_with_inverse_map( if (compress) { coarse_rowmap_t mergedRowmap; coarse_entries_t mergedEntries; - KokkosSparse::sort_and_merge_graph( - coarseRowmap, coarseEntries, mergedRowmap, mergedEntries); + KokkosSparse::sort_and_merge_graph(coarseRowmap, coarseEntries, + mergedRowmap, mergedEntries); coarseRowmap = mergedRowmap; coarseEntries = mergedEntries; } diff --git a/graph/src/KokkosGraph_MIS2.hpp b/graph/src/KokkosGraph_MIS2.hpp index fb38d05456..4af491a406 100644 --- a/graph/src/KokkosGraph_MIS2.hpp +++ 
b/graph/src/KokkosGraph_MIS2.hpp @@ -30,21 +30,18 @@ enum MIS2_Algorithm { MIS2_QUALITY, MIS2_FAST }; template -lno_view_t graph_d2_mis(const rowmap_t& rowmap, const colinds_t& colinds, - MIS2_Algorithm algo = MIS2_FAST) { +lno_view_t graph_d2_mis(const rowmap_t& rowmap, const colinds_t& colinds, MIS2_Algorithm algo = MIS2_FAST) { if (rowmap.extent(0) <= 1) { // zero vertices means the MIS is empty. return lno_view_t(); } switch (algo) { case MIS2_QUALITY: { - Impl::D2_MIS_FixedPriority mis( - rowmap, colinds); + Impl::D2_MIS_FixedPriority mis(rowmap, colinds); return mis.compute(); } case MIS2_FAST: { - Impl::D2_MIS_RandomPriority - mis(rowmap, colinds); + Impl::D2_MIS_RandomPriority mis(rowmap, colinds); return mis.compute(); } } @@ -53,16 +50,14 @@ lno_view_t graph_d2_mis(const rowmap_t& rowmap, const colinds_t& colinds, template -labels_t graph_mis2_coarsen( - const rowmap_t& rowmap, const colinds_t& colinds, - typename colinds_t::non_const_value_type& numClusters) { +labels_t graph_mis2_coarsen(const rowmap_t& rowmap, const colinds_t& colinds, + typename colinds_t::non_const_value_type& numClusters) { if (rowmap.extent(0) <= 1) { // there are no vertices to label numClusters = 0; return labels_t(); } - Impl::D2_MIS_Aggregation aggregation( - rowmap, colinds); + Impl::D2_MIS_Aggregation aggregation(rowmap, colinds); aggregation.compute(false); numClusters = aggregation.numAggs; return aggregation.labels; @@ -70,16 +65,14 @@ labels_t graph_mis2_coarsen( template -labels_t graph_mis2_aggregate( - const rowmap_t& rowmap, const colinds_t& colinds, - typename colinds_t::non_const_value_type& numAggregates) { +labels_t graph_mis2_aggregate(const rowmap_t& rowmap, const colinds_t& colinds, + typename colinds_t::non_const_value_type& numAggregates) { if (rowmap.extent(0) <= 1) { // there are no vertices to label numAggregates = 0; return labels_t(); } - Impl::D2_MIS_Aggregation aggregation( - rowmap, colinds); + Impl::D2_MIS_Aggregation aggregation(rowmap, colinds); 
aggregation.compute(true); numAggregates = aggregation.numAggs; return aggregation.labels; @@ -101,31 +94,23 @@ namespace Experimental { template -[[deprecated]] lno_view_t graph_d2_mis(const rowmap_t& rowmap, - const colinds_t& colinds, +[[deprecated]] lno_view_t graph_d2_mis(const rowmap_t& rowmap, const colinds_t& colinds, MIS2_Algorithm algo = MIS2_FAST) { - return KokkosGraph::graph_d2_mis( - rowmap, colinds, algo); + return KokkosGraph::graph_d2_mis(rowmap, colinds, algo); } template -[[deprecated]] labels_t graph_mis2_coarsen( - const rowmap_t& rowmap, const colinds_t& colinds, - typename colinds_t::non_const_value_type& numClusters) { - return KokkosGraph::graph_mis2_coarsen(rowmap, colinds, - numClusters); +[[deprecated]] labels_t graph_mis2_coarsen(const rowmap_t& rowmap, const colinds_t& colinds, + typename colinds_t::non_const_value_type& numClusters) { + return KokkosGraph::graph_mis2_coarsen(rowmap, colinds, numClusters); } template -[[deprecated]] labels_t graph_mis2_aggregate( - const rowmap_t& rowmap, const colinds_t& colinds, - typename colinds_t::non_const_value_type& numAggregates) { - return KokkosGraph::graph_mis2_aggregate(rowmap, colinds, - numAggregates); +[[deprecated]] labels_t graph_mis2_aggregate(const rowmap_t& rowmap, const colinds_t& colinds, + typename colinds_t::non_const_value_type& numAggregates) { + return KokkosGraph::graph_mis2_aggregate(rowmap, colinds, numAggregates); } [[deprecated]] inline const char* mis2_algorithm_name(MIS2_Algorithm algo) { diff --git a/graph/src/KokkosGraph_Triangle.hpp b/graph/src/KokkosGraph_Triangle.hpp index 0a878891ce..6ab6dd7b9a 100644 --- a/graph/src/KokkosGraph_Triangle.hpp +++ b/graph/src/KokkosGraph_Triangle.hpp @@ -148,15 +148,11 @@ transposeA, row_mapB, entriesB, transposeB); } */ -template -void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, - typename KernelHandle::nnz_lno_t n, - typename KernelHandle::nnz_lno_t k, - alno_row_view_t_ row_mapA, alno_nnz_view_t_ 
entriesA, - bool transposeA, blno_row_view_t_ row_mapB, - blno_nnz_view_t_ entriesB, bool transposeB, +void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, typename KernelHandle::nnz_lno_t n, + typename KernelHandle::nnz_lno_t k, alno_row_view_t_ row_mapA, alno_nnz_view_t_ entriesA, + bool transposeA, blno_row_view_t_ row_mapB, blno_nnz_view_t_ entriesB, bool transposeB, visit_struct_t visit_struct) { using namespace KokkosSparse; @@ -168,30 +164,24 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, case SPGEMM_KK_TRIANGLE_IA: case SPGEMM_KK_TRIANGLE_IA_UNION: default: { - KokkosSparse::Impl::KokkosSPGEMM< - KernelHandle, alno_row_view_t_, alno_nnz_view_t_, - typename KernelHandle::in_scalar_nnz_view_t, blno_row_view_t_, - blno_nnz_view_t_, typename KernelHandle::in_scalar_nnz_view_t> - kspgemm(handle, m, n, k, row_mapA, entriesA, transposeA, row_mapB, - entriesB, transposeB); + KokkosSparse::Impl::KokkosSPGEMM + kspgemm(handle, m, n, k, row_mapA, entriesA, transposeA, row_mapB, entriesB, transposeB); kspgemm.KokkosSPGEMM_generic_triangle(visit_struct); } break; } } -template -void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, - alno_row_view_t_ row_mapA, alno_nnz_view_t_ entriesA, - visit_struct_t visit_struct) { +template +void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, alno_row_view_t_ row_mapA, + alno_nnz_view_t_ entriesA, visit_struct_t visit_struct) { typedef typename KernelHandle::nnz_lno_t nnz_lno_t; typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::SPGEMMHandleType spgemmHandleType; - typedef typename KernelHandle::nnz_lno_persistent_work_view_t - nnz_lno_persistent_work_view_t; - typedef typename KernelHandle::row_lno_persistent_work_view_t - row_lno_persistent_work_view_t; + typedef typename KernelHandle::nnz_lno_persistent_work_view_t nnz_lno_persistent_work_view_t; + typedef typename 
KernelHandle::row_lno_persistent_work_view_t row_lno_persistent_work_view_t; typedef typename KernelHandle::HandleExecSpace ExecutionSpace; @@ -207,8 +197,8 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, should_i_sort = true; else if (sort_lower_triangle == 2) { size_type max_row_size = 0; - KokkosKernels::Impl::kk_view_reduce_max_row_size( - m, row_mapA.data(), row_mapA.data() + 1, max_row_size); + KokkosKernels::Impl::kk_view_reduce_max_row_size(m, row_mapA.data(), row_mapA.data() + 1, + max_row_size); if (max_row_size > 1000) { should_i_sort = true; @@ -217,13 +207,11 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, if (should_i_sort) { if (sh->get_lower_triangular_permutation().data() == NULL) { - nnz_lno_persistent_work_view_t new_indices( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "new_indices"), m); + nnz_lno_persistent_work_view_t new_indices(Kokkos::view_alloc(Kokkos::WithoutInitializing, "new_indices"), m); int sort_decreasing_order = 1; ////If true we place the largest row to top, so that largest row size will /// be minimized in lower triangle. - if (sh->get_algorithm_type() == SPGEMM_KK_TRIANGLE_AI || - sh->get_algorithm_type() == SPGEMM_KK_TRIANGLE_LU) { + if (sh->get_algorithm_type() == SPGEMM_KK_TRIANGLE_AI || sh->get_algorithm_type() == SPGEMM_KK_TRIANGLE_LU) { sort_decreasing_order = 0; // if false we place the largest row to bottom, so that largest column // is minimizedin lower triangle. @@ -232,10 +220,8 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, // if 2, we do an interleaved sort. 
} { - KokkosSparse::Impl::kk_sort_by_row_size( - m, row_mapA.data(), new_indices.data(), sort_decreasing_order, - ExecutionSpace().concurrency()); + KokkosSparse::Impl::kk_sort_by_row_size( + m, row_mapA.data(), new_indices.data(), sort_decreasing_order, ExecutionSpace().concurrency()); } sh->set_lower_triangular_permutation(new_indices); } @@ -250,56 +236,43 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, row_lno_persistent_work_view_t lower_triangular_matrix_rowmap; nnz_lno_persistent_work_view_t lower_triangular_matrix_entries; timer1.reset(); - if (create_lower_triangular || - sh->get_algorithm_type() == SPGEMM_KK_TRIANGLE_LL || + if (create_lower_triangular || sh->get_algorithm_type() == SPGEMM_KK_TRIANGLE_LL || sh->get_algorithm_type() == SPGEMM_KK_TRIANGLE_LU) { - sh->get_lower_triangular_matrix(lower_triangular_matrix_rowmap, - lower_triangular_matrix_entries); - if (lower_triangular_matrix_rowmap.data() == NULL || - lower_triangular_matrix_entries.data() == NULL) { + sh->get_lower_triangular_matrix(lower_triangular_matrix_rowmap, lower_triangular_matrix_entries); + if (lower_triangular_matrix_rowmap.data() == NULL || lower_triangular_matrix_entries.data() == NULL) { alno_nnz_view_t_ null_values; - nnz_lno_persistent_work_view_t new_indices = - sh->get_lower_triangular_permutation(); - - KokkosSparse::Impl::kk_get_lower_triangle< - alno_row_view_t_, alno_nnz_view_t_, alno_nnz_view_t_, - row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, - alno_nnz_view_t_, nnz_lno_persistent_work_view_t, ExecutionSpace>( - m, row_mapA, entriesA, null_values, lower_triangular_matrix_rowmap, - lower_triangular_matrix_entries, null_values, new_indices, - handle->is_dynamic_scheduling(), + nnz_lno_persistent_work_view_t new_indices = sh->get_lower_triangular_permutation(); + + KokkosSparse::Impl::kk_get_lower_triangle( + m, row_mapA, entriesA, null_values, lower_triangular_matrix_rowmap, lower_triangular_matrix_entries, + 
null_values, new_indices, handle->is_dynamic_scheduling(), handle->get_team_work_size(1, ExecutionSpace().concurrency(), m)); - sh->set_lower_triangular_matrix(lower_triangular_matrix_rowmap, - lower_triangular_matrix_entries); + sh->set_lower_triangular_matrix(lower_triangular_matrix_rowmap, lower_triangular_matrix_entries); } } if (handle->get_verbose()) { - std::cout << "Preprocess Create Lower Triangular Time:" << timer1.seconds() - << std::endl; + std::cout << "Preprocess Create Lower Triangular Time:" << timer1.seconds() << std::endl; } timer1.reset(); row_lno_persistent_work_view_t upper_triangular_matrix_rowmap; nnz_lno_persistent_work_view_t upper_triangular_matrix_entries; if (sh->get_algorithm_type() == SPGEMM_KK_TRIANGLE_LU) { - sh->get_lower_triangular_matrix(lower_triangular_matrix_rowmap, - lower_triangular_matrix_entries); + sh->get_lower_triangular_matrix(lower_triangular_matrix_rowmap, lower_triangular_matrix_entries); alno_nnz_view_t_ null_values; - nnz_lno_persistent_work_view_t new_indices = - sh->get_lower_triangular_permutation(); - - KokkosSparse::Impl::kk_get_lower_triangle< - alno_row_view_t_, alno_nnz_view_t_, alno_nnz_view_t_, - row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, - alno_nnz_view_t_, nnz_lno_persistent_work_view_t, ExecutionSpace>( - m, row_mapA, entriesA, null_values, upper_triangular_matrix_rowmap, - upper_triangular_matrix_entries, null_values, new_indices, - handle->is_dynamic_scheduling(), 4, false); + nnz_lno_persistent_work_view_t new_indices = sh->get_lower_triangular_permutation(); + + KokkosSparse::Impl::kk_get_lower_triangle( + m, row_mapA, entriesA, null_values, upper_triangular_matrix_rowmap, upper_triangular_matrix_entries, + null_values, new_indices, handle->is_dynamic_scheduling(), 4, false); } if (handle->get_verbose()) { - std::cout << "Preprocess Create Upper Triangular Time:" << timer1.seconds() - << std::endl; + std::cout << "Preprocess Create Upper Triangular Time:" << timer1.seconds() << 
std::endl; } /////////CREATE LOWER TRIANGLE/////// @@ -320,33 +293,25 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, case SPGEMM_KK_TRIANGLE_IA: { // these are the algorithms that requires transpose of the incidence // matrix. - sh->get_lower_triangular_matrix(lower_triangular_matrix_rowmap, - lower_triangular_matrix_entries); + sh->get_lower_triangular_matrix(lower_triangular_matrix_rowmap, lower_triangular_matrix_entries); - if (lower_triangular_matrix_rowmap.data() == NULL || - lower_triangular_matrix_entries.data() == NULL) { + if (lower_triangular_matrix_rowmap.data() == NULL || lower_triangular_matrix_entries.data() == NULL) { std::cout << "Creating lower triangular A" << std::endl; alno_nnz_view_t_ null_values; - nnz_lno_persistent_work_view_t new_indices = - sh->get_lower_triangular_permutation(); - - KokkosSparse::Impl::kk_get_lower_triangle< - alno_row_view_t_, alno_nnz_view_t_, alno_nnz_view_t_, - row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, - alno_nnz_view_t_, nnz_lno_persistent_work_view_t, ExecutionSpace>( - m, row_mapA, entriesA, null_values, lower_triangular_matrix_rowmap, - lower_triangular_matrix_entries, null_values, new_indices, - handle->is_dynamic_scheduling()); + nnz_lno_persistent_work_view_t new_indices = sh->get_lower_triangular_permutation(); + + KokkosSparse::Impl::kk_get_lower_triangle( + m, row_mapA, entriesA, null_values, lower_triangular_matrix_rowmap, lower_triangular_matrix_entries, + null_values, new_indices, handle->is_dynamic_scheduling()); } - KokkosSparse::Impl:: - kk_create_incidence_tranpose_matrix_from_lower_triangle< - row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, - row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, - ExecutionSpace>( - m, lower_triangular_matrix_rowmap, - lower_triangular_matrix_entries, incidence_transpose_rowmap, - incidence_transpose_entries, handle->is_dynamic_scheduling()); + 
KokkosSparse::Impl::kk_create_incidence_tranpose_matrix_from_lower_triangle< + row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, row_lno_persistent_work_view_t, + nnz_lno_persistent_work_view_t, ExecutionSpace>(m, lower_triangular_matrix_rowmap, + lower_triangular_matrix_entries, incidence_transpose_rowmap, + incidence_transpose_entries, handle->is_dynamic_scheduling()); } break; // IF it is one of below, we perform (A) or (L) x I @@ -355,12 +320,10 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, // these are the algorithms that requires the incidence matrix. KokkosSparse::Impl::kk_create_incidence_matrix_from_original_matrix< - alno_row_view_t_, alno_nnz_view_t_, row_lno_persistent_work_view_t, - nnz_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, - ExecutionSpace>(m, row_mapA, entriesA, incidence_rowmap, - incidence_entries, - sh->get_lower_triangular_permutation(), - handle->is_dynamic_scheduling()); + alno_row_view_t_, alno_nnz_view_t_, row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, + nnz_lno_persistent_work_view_t, ExecutionSpace>(m, row_mapA, entriesA, incidence_rowmap, incidence_entries, + sh->get_lower_triangular_permutation(), + handle->is_dynamic_scheduling()); } break; case SPGEMM_KK_TRIANGLE_LU: case SPGEMM_KK_TRIANGLE_LL: @@ -370,8 +333,7 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, } if (handle->get_verbose()) { - std::cout << "Preprocess Incidence Matrix Create Time:" << timer1.seconds() - << std::endl; + std::cout << "Preprocess Incidence Matrix Create Time:" << timer1.seconds() << std::endl; } //// /// CREATE INCIDENCE MATRIX END @@ -380,49 +342,36 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, switch (sh->get_algorithm_type()) { default: case SPGEMM_KK_TRIANGLE_LL: { - KokkosSparse::Impl::KokkosSPGEMM< - KernelHandle, row_lno_persistent_work_view_t, - nnz_lno_persistent_work_view_t, 
nnz_lno_persistent_work_view_t, - row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, - nnz_lno_persistent_work_view_t> - kspgemm(handle, m, m, m, lower_triangular_matrix_rowmap, - lower_triangular_matrix_entries, false, - lower_triangular_matrix_rowmap, - lower_triangular_matrix_entries, false); + KokkosSparse::Impl::KokkosSPGEMM + kspgemm(handle, m, m, m, lower_triangular_matrix_rowmap, lower_triangular_matrix_entries, false, + lower_triangular_matrix_rowmap, lower_triangular_matrix_entries, false); kspgemm.KokkosSPGEMM_generic_triangle(visit_struct); } break; case SPGEMM_KK_TRIANGLE_LU: { - KokkosSparse::Impl::KokkosSPGEMM< - KernelHandle, row_lno_persistent_work_view_t, - nnz_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, - row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, - nnz_lno_persistent_work_view_t> - kspgemm(handle, m, m, m, lower_triangular_matrix_rowmap, - lower_triangular_matrix_entries, false, - upper_triangular_matrix_rowmap, - upper_triangular_matrix_entries, false); + KokkosSparse::Impl::KokkosSPGEMM + kspgemm(handle, m, m, m, lower_triangular_matrix_rowmap, lower_triangular_matrix_entries, false, + upper_triangular_matrix_rowmap, upper_triangular_matrix_entries, false); kspgemm.KokkosSPGEMM_generic_triangle(visit_struct); } break; case SPGEMM_KK_TRIANGLE_AI: { if (create_lower_triangular) { - KokkosSparse::Impl::KokkosSPGEMM< - KernelHandle, row_lno_persistent_work_view_t, - nnz_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, - row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, - nnz_lno_persistent_work_view_t> - kspgemm(handle, m, m, incidence_entries.extent(0) / 2, - lower_triangular_matrix_rowmap, + KokkosSparse::Impl::KokkosSPGEMM + kspgemm(handle, m, m, incidence_entries.extent(0) / 2, lower_triangular_matrix_rowmap, lower_triangular_matrix_entries, false, // transpose ignore. 
incidence_rowmap, incidence_entries, false); kspgemm.KokkosSPGEMM_generic_triangle(visit_struct); } else { - KokkosSparse::Impl::KokkosSPGEMM< - KernelHandle, alno_row_view_t_, alno_nnz_view_t_, - nnz_lno_persistent_work_view_t, row_lno_persistent_work_view_t, - nnz_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t> - kspgemm(handle, m, m, incidence_entries.extent(0) / 2, row_mapA, - entriesA, + KokkosSparse::Impl::KokkosSPGEMM + kspgemm(handle, m, m, incidence_entries.extent(0) / 2, row_mapA, entriesA, false, // transpose ignore. incidence_rowmap, incidence_entries, false); kspgemm.KokkosSPGEMM_generic_triangle(visit_struct); @@ -433,24 +382,20 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, case SPGEMM_KK_TRIANGLE_IA_UNION: case SPGEMM_KK_TRIANGLE_IA: { if (create_lower_triangular) { - KokkosSparse::Impl::KokkosSPGEMM< - KernelHandle, row_lno_persistent_work_view_t, - nnz_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, - row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, - nnz_lno_persistent_work_view_t> - kspgemm(handle, incidence_transpose_rowmap.extent(0) - 1, m, m, - incidence_transpose_rowmap, incidence_transpose_entries, + KokkosSparse::Impl::KokkosSPGEMM + kspgemm(handle, incidence_transpose_rowmap.extent(0) - 1, m, m, incidence_transpose_rowmap, + incidence_transpose_entries, false, // transpose ignore. 
- lower_triangular_matrix_rowmap, - lower_triangular_matrix_entries, false); + lower_triangular_matrix_rowmap, lower_triangular_matrix_entries, false); kspgemm.KokkosSPGEMM_generic_triangle(visit_struct); } else { - KokkosSparse::Impl::KokkosSPGEMM< - KernelHandle, row_lno_persistent_work_view_t, - nnz_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, - alno_row_view_t_, alno_nnz_view_t_, nnz_lno_persistent_work_view_t> - kspgemm(handle, incidence_transpose_rowmap.extent(0) - 1, m, m, - incidence_transpose_rowmap, incidence_transpose_entries, + KokkosSparse::Impl::KokkosSPGEMM + kspgemm(handle, incidence_transpose_rowmap.extent(0) - 1, m, m, incidence_transpose_rowmap, + incidence_transpose_entries, false, // transpose ignore. row_mapA, entriesA, false); kspgemm.KokkosSPGEMM_generic_triangle(visit_struct); diff --git a/graph/unit_test/Test_Graph_coarsen.hpp b/graph/unit_test/Test_Graph_coarsen.hpp index 95f1533c88..2fda527dfb 100644 --- a/graph/unit_test/Test_Graph_coarsen.hpp +++ b/graph/unit_test/Test_Graph_coarsen.hpp @@ -47,23 +47,16 @@ bool verify_coarsening(typename coarsener_t::coarse_level_triple fine_l, using ordinal_t = typename entries_t::value_type; using edge_t = typename rowmap_t::value_type; - crsMat A = fine_l.mtx; - crsMat coarse_A = coarse_l.mtx; - auto f_rowmap = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.row_map); - auto c_rowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), - coarse_A.graph.row_map); - auto f_entries = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.entries); - auto vcmap = Kokkos::create_mirror_view_and_copy( - Kokkos::HostSpace(), coarse_l.interp_mtx.graph.entries); - auto few = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.values); - auto cew = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarse_A.values); - auto fvw = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), fine_l.vtx_wgts); - auto cvw = 
Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), - coarse_l.vtx_wgts); + crsMat A = fine_l.mtx; + crsMat coarse_A = coarse_l.mtx; + auto f_rowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.row_map); + auto c_rowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarse_A.graph.row_map); + auto f_entries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.entries); + auto vcmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarse_l.interp_mtx.graph.entries); + auto few = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.values); + auto cew = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarse_A.values); + auto fvw = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), fine_l.vtx_wgts); + auto cvw = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarse_l.vtx_wgts); ordinal_t f_size = 0; ordinal_t c_size = 0; for (ordinal_t i = 0; i < static_cast(fvw.extent(0)); i++) { @@ -112,10 +105,8 @@ bool verify_is_graph(crsMat A) { using entries_t = typename c_entries_t::non_const_type; using ordinal_t = typename entries_t::value_type; using edge_t = typename rowmap_t::value_type; - auto rowmap = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.row_map); - auto entries = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.entries); + auto rowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.row_map); + auto entries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.entries); for (ordinal_t i = 0; i < A.numRows(); i++) { std::set adjset; @@ -158,8 +149,7 @@ bool verify_aggregator(crsMat A, crsMat agg) { if (A.numRows() < agg.numCols()) { return false; } - auto entries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), - agg.graph.entries); + auto entries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), agg.graph.entries); std::vector aggregateSizes(agg.numCols(), 0); for (ordinal_t i = 0; 
i < static_cast(agg.nnz()); i++) { @@ -244,8 +234,7 @@ crsMat gen_grid() { template void test_multilevel_coarsen_grid() { - using crsMat = - KokkosSparse::CrsMatrix; + using crsMat = KokkosSparse::CrsMatrix; crsMat A = gen_grid(); using coarsener_t = coarse_builder; typename coarsener_t::coarsen_handle handle; @@ -259,17 +248,12 @@ void test_multilevel_coarsen_grid() { coarse++; while (coarse != levels.end()) { bool correct_aggregator = verify_aggregator(fine->mtx, coarse->interp_mtx); - EXPECT_TRUE(correct_aggregator) - << "Multilevel coarsening produced invalid aggregator on level " - << coarse->level - 1; + EXPECT_TRUE(correct_aggregator) << "Multilevel coarsening produced invalid aggregator on level " + << coarse->level - 1; bool correct_graph = verify_is_graph(coarse->mtx); bool correct_coarsening = verify_coarsening(*fine, *coarse); - EXPECT_TRUE(correct_graph) - << "Multilevel coarsening produced invalid graph on level " - << coarse->level; - EXPECT_TRUE(correct_coarsening) - << "Multilevel coarsening produced invalid coarsening on level " - << coarse->level; + EXPECT_TRUE(correct_graph) << "Multilevel coarsening produced invalid graph on level " << coarse->level; + EXPECT_TRUE(correct_coarsening) << "Multilevel coarsening produced invalid coarsening on level " << coarse->level; fine++; coarse++; } @@ -277,8 +261,7 @@ void test_multilevel_coarsen_grid() { template void test_coarsen_grid() { - using crsMat = - KokkosSparse::CrsMatrix; + using crsMat = KokkosSparse::CrsMatrix; using graph_type = typename crsMat::StaticCrsGraphType; using c_entries_t = typename graph_type::entries_type; using entries_t = typename c_entries_t::non_const_type; @@ -293,60 +276,49 @@ void test_coarsen_grid() { fine_A.vtx_wgts = vWgts; fine_A.level = 0; fine_A.uniform_weights = true; - std::vector heuristics = { - coarsener_t::HECv1, coarsener_t::Match, coarsener_t::MtMetis, - coarsener_t::MIS2, coarsener_t::GOSHv1, coarsener_t::GOSHv2}; - std::vector builders = { - 
coarsener_t::Sort, coarsener_t::Hashmap, coarsener_t::Hybrid, - coarsener_t::Spgemm, coarsener_t::Spgemm_transpose_first}; + std::vector heuristics = {coarsener_t::HECv1, coarsener_t::Match, + coarsener_t::MtMetis, coarsener_t::MIS2, + coarsener_t::GOSHv1, coarsener_t::GOSHv2}; + std::vector builders = {coarsener_t::Sort, coarsener_t::Hashmap, coarsener_t::Hybrid, + coarsener_t::Spgemm, coarsener_t::Spgemm_transpose_first}; for (auto h : heuristics) { - handle.h = h; - crsMat aggregator = - coarsener_t::generate_coarse_mapping(handle, fine_A.mtx, true); + handle.h = h; + crsMat aggregator = coarsener_t::generate_coarse_mapping(handle, fine_A.mtx, true); bool correct_aggregator = verify_aggregator(fine_A.mtx, aggregator); - EXPECT_TRUE(correct_aggregator) - << "Aggregation heuristic " << static_cast(h) - << " produced invalid aggregator."; + EXPECT_TRUE(correct_aggregator) << "Aggregation heuristic " << static_cast(h) + << " produced invalid aggregator."; for (auto b : builders) { - handle.b = b; - clt coarse_A = - coarsener_t::build_coarse_graph(handle, fine_A, aggregator); - bool correct_graph = verify_is_graph(coarse_A.mtx); - bool correct_coarsening = - verify_coarsening(fine_A, coarse_A); - EXPECT_TRUE(correct_graph) - << "Coarsening with dedupe method " << static_cast(b) - << " produced invalid graph with aggregation heuristic " - << static_cast(h) << "."; - EXPECT_TRUE(correct_coarsening) - << "Coarsening with dedupe method " << static_cast(b) - << " produced invalid coarsening with aggregation heuristic " - << static_cast(h) << "."; + handle.b = b; + clt coarse_A = coarsener_t::build_coarse_graph(handle, fine_A, aggregator); + bool correct_graph = verify_is_graph(coarse_A.mtx); + bool correct_coarsening = verify_coarsening(fine_A, coarse_A); + EXPECT_TRUE(correct_graph) << "Coarsening with dedupe method " << static_cast(b) + << " produced invalid graph with aggregation heuristic " << static_cast(h) << "."; + EXPECT_TRUE(correct_coarsening) << "Coarsening 
with dedupe method " << static_cast(b) + << " produced invalid coarsening with aggregation heuristic " + << static_cast(h) << "."; } } } template -void test_coarsen_random(lno_t numVerts, size_type nnz, lno_t bandwidth, - lno_t row_size_variance) { +void test_coarsen_random(lno_t numVerts, size_type nnz, lno_t bandwidth, lno_t row_size_variance) { using execution_space = typename device::execution_space; - using crsMat = - KokkosSparse::CrsMatrix; - using graph_type = typename crsMat::StaticCrsGraphType; - using c_rowmap_t = typename graph_type::row_map_type; - using c_entries_t = typename graph_type::entries_type; - using rowmap_t = typename c_rowmap_t::non_const_type; - using entries_t = typename c_entries_t::non_const_type; - using svt = typename crsMat::values_type; + using crsMat = KokkosSparse::CrsMatrix; + using graph_type = typename crsMat::StaticCrsGraphType; + using c_rowmap_t = typename graph_type::row_map_type; + using c_entries_t = typename graph_type::entries_type; + using rowmap_t = typename c_rowmap_t::non_const_type; + using entries_t = typename c_entries_t::non_const_type; + using svt = typename crsMat::values_type; // Generate graph - crsMat A = KokkosSparse::Impl::kk_generate_sparse_matrix( - numVerts, numVerts, nnz, row_size_variance, bandwidth); + crsMat A = + KokkosSparse::Impl::kk_generate_sparse_matrix(numVerts, numVerts, nnz, row_size_variance, bandwidth); auto G = A.graph; // Symmetrize the graph rowmap_t symRowmap; entries_t symEntries; - KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap< - c_rowmap_t, c_entries_t, rowmap_t, entries_t, execution_space>( + KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap( numVerts, G.row_map, G.entries, symRowmap, symEntries); graph_type GS(symEntries, symRowmap); svt symValues("sym values", symEntries.extent(0)); @@ -362,88 +334,65 @@ void test_coarsen_random(lno_t numVerts, size_type nnz, lno_t bandwidth, fine_A.vtx_wgts = vWgts; fine_A.level = 0; fine_A.uniform_weights = true; - std::vector 
heuristics = { - coarsener_t::HECv1, coarsener_t::Match, coarsener_t::MtMetis, - coarsener_t::MIS2, coarsener_t::GOSHv1, coarsener_t::GOSHv2}; - std::vector builders = { - coarsener_t::Sort, coarsener_t::Hashmap, coarsener_t::Hybrid, - coarsener_t::Spgemm, coarsener_t::Spgemm_transpose_first}; + std::vector heuristics = {coarsener_t::HECv1, coarsener_t::Match, + coarsener_t::MtMetis, coarsener_t::MIS2, + coarsener_t::GOSHv1, coarsener_t::GOSHv2}; + std::vector builders = {coarsener_t::Sort, coarsener_t::Hashmap, coarsener_t::Hybrid, + coarsener_t::Spgemm, coarsener_t::Spgemm_transpose_first}; for (auto h : heuristics) { - handle.h = h; - crsMat aggregator = - coarsener_t::generate_coarse_mapping(handle, fine_A.mtx, true); + handle.h = h; + crsMat aggregator = coarsener_t::generate_coarse_mapping(handle, fine_A.mtx, true); bool correct_aggregator = verify_aggregator(fine_A.mtx, aggregator); - EXPECT_TRUE(correct_aggregator) - << "Aggregation heuristic " << static_cast(h) - << " produced invalid aggregator."; + EXPECT_TRUE(correct_aggregator) << "Aggregation heuristic " << static_cast(h) + << " produced invalid aggregator."; for (auto b : builders) { - handle.b = b; - clt coarse_A = - coarsener_t::build_coarse_graph(handle, fine_A, aggregator); - bool correct_graph = verify_is_graph(coarse_A.mtx); - bool correct_coarsening = - verify_coarsening(fine_A, coarse_A); - EXPECT_TRUE(correct_graph) - << "Coarsening with dedupe method " << static_cast(b) - << " produced invalid graph with aggregation heuristic " - << static_cast(h) << "."; - EXPECT_TRUE(correct_coarsening) - << "Coarsening with dedupe method " << static_cast(b) - << " produced invalid coarsening with aggregation heuristic " - << static_cast(h) << "."; + handle.b = b; + clt coarse_A = coarsener_t::build_coarse_graph(handle, fine_A, aggregator); + bool correct_graph = verify_is_graph(coarse_A.mtx); + bool correct_coarsening = verify_coarsening(fine_A, coarse_A); + EXPECT_TRUE(correct_graph) << "Coarsening with 
dedupe method " << static_cast(b) + << " produced invalid graph with aggregation heuristic " << static_cast(h) << "."; + EXPECT_TRUE(correct_coarsening) << "Coarsening with dedupe method " << static_cast(b) + << " produced invalid coarsening with aggregation heuristic " + << static_cast(h) << "."; } } } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F( \ - TestCategory, \ - graph##_##random_graph_coarsen##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_coarsen_random(5000, 5000 * 20, \ - 1000, 10); \ - test_coarsen_random(50, 50 * 10, 40, 10); \ - test_coarsen_random(5, 5 * 3, 5, 0); \ - } \ - TEST_F( \ - TestCategory, \ - graph##_##grid_graph_coarsen##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_coarsen_grid(); \ - } \ - TEST_F( \ - TestCategory, \ - graph##_##grid_graph_multilevel_coarsen##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_multilevel_coarsen_grid(); \ +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, graph##_##random_graph_coarsen##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_coarsen_random(5000, 5000 * 20, 1000, 10); \ + test_coarsen_random(50, 50 * 10, 40, 10); \ + test_coarsen_random(5, 5 * 3, 5, 0); \ + } \ + TEST_F(TestCategory, graph##_##grid_graph_coarsen##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_coarsen_grid(); \ + } \ + TEST_F(TestCategory, graph##_##grid_graph_multilevel_coarsen##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_multilevel_coarsen_grid(); \ } // FIXME_SYCL #ifndef KOKKOS_ENABLE_SYCL #if defined(KOKKOSKERNELS_INST_DOUBLE) -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, int, int, 
TestDevice) #endif #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, int64_t, int, TestDevice) #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, int, size_t, TestDevice) #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, int64_t, size_t, TestDevice) #endif #endif diff --git a/graph/unit_test/Test_Graph_graph_color.hpp b/graph/unit_test/Test_Graph_graph_color.hpp index 101c489bc0..3ddfa7c9b0 100644 --- a/graph/unit_test/Test_Graph_graph_color.hpp +++ b/graph/unit_test/Test_Graph_graph_color.hpp @@ -32,11 +32,8 @@ using namespace KokkosGraph::Experimental; namespace Test { template -int run_graphcolor( - crsMat_t input_mat, ColoringAlgorithm coloring_algorithm, - size_t &num_colors, - typename crsMat_t::StaticCrsGraphType::entries_type::non_const_type - &vertex_colors) { +int run_graphcolor(crsMat_t input_mat, ColoringAlgorithm coloring_algorithm, size_t &num_colors, + typename 
crsMat_t::StaticCrsGraphType::entries_type::non_const_type &vertex_colors) { typedef typename crsMat_t::StaticCrsGraphType graph_t; typedef typename graph_t::row_map_type lno_view_t; typedef typename graph_t::entries_type lno_nnz_view_t; @@ -46,9 +43,8 @@ int run_graphcolor( typedef typename lno_nnz_view_t::value_type lno_t; typedef typename scalar_view_t::value_type scalar_t; - typedef KokkosKernelsHandle< - size_type, lno_t, scalar_t, typename device::execution_space, - typename device::memory_space, typename device::memory_space> + typedef KokkosKernelsHandle KernelHandle; KernelHandle kh; @@ -60,9 +56,8 @@ int run_graphcolor( const size_t num_rows_1 = input_mat.numRows(); const size_t num_cols_1 = input_mat.numCols(); - graph_color( - &kh, num_rows_1, num_cols_1, input_mat.graph.row_map, - input_mat.graph.entries); + graph_color(&kh, num_rows_1, num_cols_1, input_mat.graph.row_map, + input_mat.graph.entries); num_colors = kh.get_graph_coloring_handle()->get_num_colors(); vertex_colors = kh.get_graph_coloring_handle()->get_vertex_colors(); @@ -72,14 +67,10 @@ int run_graphcolor( } // namespace Test -template -void test_coloring(lno_t numRows, size_type nnz, lno_t bandwidth, - lno_t row_size_variance) { +template +void test_coloring(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance) { using namespace Test; - typedef - typename KokkosSparse::CrsMatrix - crsMat_t; + typedef typename KokkosSparse::CrsMatrix crsMat_t; typedef typename crsMat_t::StaticCrsGraphType graph_t; typedef typename graph_t::row_map_type lno_view_t; typedef typename graph_t::entries_type lno_nnz_view_t; @@ -87,28 +78,24 @@ void test_coloring(lno_t numRows, size_type nnz, lno_t bandwidth, typedef typename crsMat_t::values_type::non_const_type scalar_view_t; // typedef typename lno_view_t::non_const_value_type size_type; - lno_t numCols = numRows; - crsMat_t input_mat = KokkosSparse::Impl::kk_generate_sparse_matrix( - numRows, numCols, nnz, row_size_variance, bandwidth); + 
lno_t numCols = numRows; + crsMat_t input_mat = + KokkosSparse::Impl::kk_generate_sparse_matrix(numRows, numCols, nnz, row_size_variance, bandwidth); typename lno_view_t::non_const_type sym_xadj; typename lno_nnz_view_t::non_const_type sym_adj; KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap< - lno_view_t, lno_nnz_view_t, typename lno_view_t::non_const_type, - typename lno_nnz_view_t::non_const_type, - typename device::execution_space>(numRows, input_mat.graph.row_map, - input_mat.graph.entries, sym_xadj, - sym_adj); + lno_view_t, lno_nnz_view_t, typename lno_view_t::non_const_type, typename lno_nnz_view_t::non_const_type, + typename device::execution_space>(numRows, input_mat.graph.row_map, input_mat.graph.entries, sym_xadj, sym_adj); size_type numentries = sym_adj.extent(0); scalar_view_t newValues("vals", numentries); graph_t static_graph(sym_adj, sym_xadj); input_mat = crsMat_t("CrsMatrix", numCols, newValues, static_graph); - std::vector coloring_algorithms = { - COLORING_DEFAULT, COLORING_SERIAL, COLORING_VB, COLORING_VBBIT, - COLORING_VBCS}; + std::vector coloring_algorithms = {COLORING_DEFAULT, COLORING_SERIAL, COLORING_VB, COLORING_VBBIT, + COLORING_VBCS}; // FIXME: VBD sometimes fails on CUDA and HIP #if defined(KOKKOS_ENABLE_CUDA) @@ -125,8 +112,7 @@ void test_coloring(lno_t numRows, size_type nnz, lno_t bandwidth, // FIXME SYCL: re-enable this when EB is working #ifdef KOKKOS_ENABLE_SYCL - if (!std::is_same::value) { + if (!std::is_same::value) { coloring_algorithms.push_back(COLORING_EB); } #else @@ -140,28 +126,22 @@ void test_coloring(lno_t numRows, size_type nnz, lno_t bandwidth, Kokkos::Timer timer1; crsMat_t output_mat; - int res = run_graphcolor(input_mat, coloring_algorithm, - num_colors, vector_colors); + int res = run_graphcolor(input_mat, coloring_algorithm, num_colors, vector_colors); // double coloring_time = timer1.seconds(); EXPECT_TRUE((res == 0)); const lno_t num_rows_1 = input_mat.numRows(); const lno_t num_cols_1 = 
input_mat.numCols(); - lno_t num_conflict = KokkosSparse::Impl::kk_is_d1_coloring_valid< - lno_view_t, lno_nnz_view_t, color_view_t, - typename device::execution_space>( - num_rows_1, num_cols_1, input_mat.graph.row_map, - input_mat.graph.entries, vector_colors); + lno_t num_conflict = KokkosSparse::Impl::kk_is_d1_coloring_valid( + num_rows_1, num_cols_1, input_mat.graph.row_map, input_mat.graph.entries, vector_colors); lno_t conf = 0; { // also check the correctness of the validation code :) - typename lno_view_t::HostMirror hrm = - Kokkos::create_mirror_view(input_mat.graph.row_map); - typename lno_nnz_view_t::HostMirror hentries = - Kokkos::create_mirror_view(input_mat.graph.entries); - typename color_view_t::HostMirror hcolor = - Kokkos::create_mirror_view(vector_colors); + typename lno_view_t::HostMirror hrm = Kokkos::create_mirror_view(input_mat.graph.row_map); + typename lno_nnz_view_t::HostMirror hentries = Kokkos::create_mirror_view(input_mat.graph.entries); + typename color_view_t::HostMirror hcolor = Kokkos::create_mirror_view(vector_colors); Kokkos::deep_copy(hrm, input_mat.graph.row_map); Kokkos::deep_copy(hentries, input_mat.graph.entries); Kokkos::deep_copy(hcolor, vector_colors); @@ -179,53 +159,39 @@ void test_coloring(lno_t numRows, size_type nnz, lno_t bandwidth, } } } - EXPECT_TRUE((num_conflict == conf)) - << "Coloring algo " << (int)coloring_algorithm - << ": kk_is_d1_coloring_valid returned incorrect number of conflicts (" - << num_conflict << ", should be " << conf << ")"; - - EXPECT_TRUE((num_conflict == 0)) - << "Coloring algo " << (int)coloring_algorithm - << ": D1 coloring produced invalid coloring (" << num_conflict - << " conflicts)"; + EXPECT_TRUE((num_conflict == conf)) << "Coloring algo " << (int)coloring_algorithm + << ": kk_is_d1_coloring_valid returned incorrect number of conflicts (" + << num_conflict << ", should be " << conf << ")"; + + EXPECT_TRUE((num_conflict == 0)) << "Coloring algo " << (int)coloring_algorithm + << ": D1 
coloring produced invalid coloring (" << num_conflict << " conflicts)"; } // device::execution_space::finalize(); } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F(TestCategory, \ - graph##_##graph_color##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_coloring(50000, 50000 * 30, 200, \ - 10); \ - test_coloring(50000, 50000 * 30, 100, \ - 10); \ +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, graph##_##graph_color##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_coloring(50000, 50000 * 30, 200, 10); \ + test_coloring(50000, 50000 * 30, 100, 10); \ } -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(default_scalar, int, int, TestDevice) #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(default_scalar, int64_t, int, TestDevice) #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(default_scalar, int, size_t, TestDevice) #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - 
defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(default_scalar, int64_t, size_t, TestDevice) #endif diff --git a/graph/unit_test/Test_Graph_graph_color_deterministic.hpp b/graph/unit_test/Test_Graph_graph_color_deterministic.hpp index 7bd3c4cd40..87771de84f 100644 --- a/graph/unit_test/Test_Graph_graph_color_deterministic.hpp +++ b/graph/unit_test/Test_Graph_graph_color_deterministic.hpp @@ -32,11 +32,8 @@ using namespace KokkosGraph::Experimental; namespace Test { template -int run_graphcolor_deter( - crsMat_t input_mat, ColoringAlgorithm coloring_algorithm, - size_t &num_colors, - typename crsMat_t::StaticCrsGraphType::entries_type::non_const_type - &vertex_colors) { +int run_graphcolor_deter(crsMat_t input_mat, ColoringAlgorithm coloring_algorithm, size_t &num_colors, + typename crsMat_t::StaticCrsGraphType::entries_type::non_const_type &vertex_colors) { typedef typename crsMat_t::StaticCrsGraphType graph_t; typedef typename graph_t::row_map_type lno_view_t; typedef typename graph_t::entries_type lno_nnz_view_t; @@ -46,9 +43,8 @@ int run_graphcolor_deter( typedef typename lno_nnz_view_t::value_type lno_t; typedef typename scalar_view_t::value_type scalar_t; - typedef KokkosKernelsHandle< - size_type, lno_t, scalar_t, typename device::execution_space, - typename device::memory_space, typename device::memory_space> + typedef KokkosKernelsHandle KernelHandle; KernelHandle kh; @@ -60,9 +56,8 @@ int run_graphcolor_deter( const size_t num_rows_1 = input_mat.numRows(); const size_t num_cols_1 = input_mat.numCols(); - graph_color( - &kh, num_rows_1, num_cols_1, input_mat.graph.row_map, - input_mat.graph.entries); + graph_color(&kh, num_rows_1, num_cols_1, input_mat.graph.row_map, + 
input_mat.graph.entries); num_colors = kh.get_graph_coloring_handle()->get_num_colors(); vertex_colors = kh.get_graph_coloring_handle()->get_vertex_colors(); @@ -72,13 +67,10 @@ int run_graphcolor_deter( } // namespace Test -template +template void test_coloring_deterministic(lno_t numRows, size_type nnz) { using namespace Test; - typedef - typename KokkosSparse::CrsMatrix - crsMat_t; + typedef typename KokkosSparse::CrsMatrix crsMat_t; typedef typename crsMat_t::StaticCrsGraphType graph_t; typedef typename graph_t::row_map_type lno_view_t; typedef typename graph_t::entries_type lno_nnz_view_t; @@ -89,11 +81,9 @@ void test_coloring_deterministic(lno_t numRows, size_type nnz) { lno_t numCols = numRows; typename lno_view_t::non_const_type xadj("xadj", numRows + 1); - typename lno_view_t::non_const_type::HostMirror h_xadj = - Kokkos::create_mirror_view(xadj); + typename lno_view_t::non_const_type::HostMirror h_xadj = Kokkos::create_mirror_view(xadj); typename lno_nnz_view_t::non_const_type adj("adj", nnz); - typename lno_nnz_view_t::non_const_type::HostMirror h_adj = - Kokkos::create_mirror_view(adj); + typename lno_nnz_view_t::non_const_type::HostMirror h_adj = Kokkos::create_mirror_view(adj); // Fill up the rowPtr array h_xadj(0) = 0; @@ -211,18 +201,15 @@ void test_coloring_deterministic(lno_t numRows, size_type nnz) { size_t num_colors; Kokkos::Timer timer1; - int res = run_graphcolor_deter( - input_mat, coloring_algorithm, num_colors, vector_colors); + int res = run_graphcolor_deter(input_mat, coloring_algorithm, num_colors, vector_colors); EXPECT_TRUE((res == 0)); EXPECT_TRUE((num_colors == 2)); - size_type num_conflict = 0; - typename color_view_t::HostMirror h_vector_colors = - Kokkos::create_mirror_view(vector_colors); + size_type num_conflict = 0; + typename color_view_t::HostMirror h_vector_colors = Kokkos::create_mirror_view(vector_colors); Kokkos::deep_copy(h_vector_colors, vector_colors); - int exact_colors[18] = {2, 1, 2, 1, 1, 2, 1, 2, 2, - 1, 2, 1, 2, 
1, 2, 1, 2, 1}; + int exact_colors[18] = {2, 1, 2, 1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1}; for (lno_t vertexIdx = 0; vertexIdx < numRows; ++vertexIdx) { if (h_vector_colors(vertexIdx) != exact_colors[vertexIdx]) { @@ -235,39 +222,29 @@ void test_coloring_deterministic(lno_t numRows, size_type nnz) { } } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F( \ - TestCategory, \ - graph##_##graph_color_deterministic##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_coloring_deterministic(18, 74); \ - test_coloring_deterministic(18, 74); \ +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, graph##_##graph_color_deterministic##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_coloring_deterministic(18, 74); \ + test_coloring_deterministic(18, 74); \ } -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(default_scalar, int, int, TestDevice) #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(default_scalar, int64_t, int, TestDevice) #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + 
(!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(default_scalar, int, size_t, TestDevice) #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(default_scalar, int64_t, size_t, TestDevice) #endif diff --git a/graph/unit_test/Test_Graph_graph_color_distance2.hpp b/graph/unit_test/Test_Graph_graph_color_distance2.hpp index 44ddaed0bf..ac3bbb7a18 100644 --- a/graph/unit_test/Test_Graph_graph_color_distance2.hpp +++ b/graph/unit_test/Test_Graph_graph_color_distance2.hpp @@ -35,10 +35,8 @@ using namespace KokkosGraph::Experimental; namespace Test { // Verify that a distance-2 coloring is correct (all views must be hostspace) -template -bool verifyD2Coloring(lno_t numVerts, const rowmap_t& rowmap, - const entries_t& entries, const colors_t& colors) { +template +bool verifyD2Coloring(lno_t numVerts, const rowmap_t& rowmap, const entries_t& entries, const colors_t& colors) { // Just do the simplest possible neighbors-of-neighbors loop to find conflicts for (lno_t v = 0; v < numVerts; v++) { if (colors(v) == 0) { @@ -52,8 +50,7 @@ bool verifyD2Coloring(lno_t numVerts, const rowmap_t& rowmap, if (nei1 < numVerts && nei1 != v) { // check for dist-1 conflict if (colors(v) == colors(nei1)) { - std::cout << "Dist-1 conflict between " << v << " and " << nei1 - << '\n'; + std::cout << "Dist-1 conflict between " << v << " and " << nei1 << '\n'; return false; } // iterate over dist-2 neighbors @@ -63,8 +60,7 @@ bool verifyD2Coloring(lno_t numVerts, const rowmap_t& rowmap, lno_t nei2 = entries(j); if (nei2 < numVerts && nei2 != v) { if (colors(v) == colors(nei2)) { - std::cout << "Dist-2 conflict 
between " << v << " and " << nei2 - << '\n'; + std::cout << "Dist-2 conflict between " << v << " and " << nei2 << '\n'; return false; } } @@ -75,14 +71,9 @@ bool verifyD2Coloring(lno_t numVerts, const rowmap_t& rowmap, return true; } -template -bool verifyBipartitePartialColoring(lno_t numRows, lno_t numCols, - const rowmap_t& rowmap, - const entries_t& entries, - const rowmap_t& t_rowmap, - const entries_t& t_entries, - const colors_t& colors) { +template +bool verifyBipartitePartialColoring(lno_t numRows, lno_t numCols, const rowmap_t& rowmap, const entries_t& entries, + const rowmap_t& t_rowmap, const entries_t& t_entries, const colors_t& colors) { // Just do the simplest possible neighbors-of-neighbors loop to find conflicts for (lno_t v = 0; v < numRows; v++) { if (colors(v) == 0) { @@ -101,8 +92,7 @@ bool verifyBipartitePartialColoring(lno_t numRows, lno_t numCols, lno_t nei2 = t_entries(j); if (nei2 < numRows && nei2 != v) { if (colors(v) == colors(nei2)) { - std::cout << "Hyperedge conflict between " << v << " and " << nei2 - << '\n'; + std::cout << "Hyperedge conflict between " << v << " and " << nei2 << '\n'; return false; } } @@ -114,256 +104,189 @@ bool verifyBipartitePartialColoring(lno_t numRows, lno_t numCols, } } // namespace Test -template -void test_dist2_coloring(lno_t numVerts, size_type nnz, lno_t bandwidth, - lno_t row_size_variance) { +template +void test_dist2_coloring(lno_t numVerts, size_type nnz, lno_t bandwidth, lno_t row_size_variance) { using execution_space = typename device::execution_space; using memory_space = typename device::memory_space; - using crsMat = - KokkosSparse::CrsMatrix; - using graph_type = typename crsMat::StaticCrsGraphType; - using c_rowmap_t = typename graph_type::row_map_type; - using c_entries_t = typename graph_type::entries_type; - using rowmap_t = typename c_rowmap_t::non_const_type; - using entries_t = typename c_entries_t::non_const_type; - using KernelHandle = - KokkosKernelsHandle; + using crsMat = 
KokkosSparse::CrsMatrix; + using graph_type = typename crsMat::StaticCrsGraphType; + using c_rowmap_t = typename graph_type::row_map_type; + using c_entries_t = typename graph_type::entries_type; + using rowmap_t = typename c_rowmap_t::non_const_type; + using entries_t = typename c_entries_t::non_const_type; + using KernelHandle = KokkosKernelsHandle; // Generate graph, and add some out-of-bounds columns - crsMat A = KokkosSparse::Impl::kk_generate_sparse_matrix( - numVerts, numVerts, nnz, row_size_variance, bandwidth); + crsMat A = + KokkosSparse::Impl::kk_generate_sparse_matrix(numVerts, numVerts, nnz, row_size_variance, bandwidth); auto G = A.graph; // Symmetrize the graph rowmap_t symRowmap; entries_t symEntries; - KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap< - c_rowmap_t, c_entries_t, rowmap_t, entries_t, execution_space>( + KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap( numVerts, G.row_map, G.entries, symRowmap, symEntries); - auto rowmapHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symRowmap); - auto entriesHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symEntries); - std::vector algos = { - COLORING_D2_DEFAULT, COLORING_D2_SERIAL, COLORING_D2_VB, - COLORING_D2_VB_BIT, COLORING_D2_VB_BIT_EF, COLORING_D2_NB_BIT}; + auto rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symRowmap); + auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symEntries); + std::vector algos = {COLORING_D2_DEFAULT, COLORING_D2_SERIAL, COLORING_D2_VB, + COLORING_D2_VB_BIT, COLORING_D2_VB_BIT_EF, COLORING_D2_NB_BIT}; for (auto algo : algos) { KernelHandle kh; kh.create_distance2_graph_coloring_handle(algo); // Compute the Distance-2 graph coloring. 
- graph_color_distance2( - &kh, numVerts, symRowmap, symEntries); + graph_color_distance2(&kh, numVerts, symRowmap, symEntries); execution_space().fence(); auto coloring_handle = kh.get_distance2_graph_coloring_handle(); auto colors = coloring_handle->get_vertex_colors(); - auto colorsHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), colors); - auto numColors = coloring_handle->get_num_colors(); + auto colorsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), colors); + auto numColors = coloring_handle->get_num_colors(); EXPECT_LE(numColors, numVerts); bool success = - Test::verifyD2Coloring( + Test::verifyD2Coloring( numVerts, rowmapHost, entriesHost, colorsHost); - EXPECT_TRUE(success) << "Dist-2: algorithm " - << coloring_handle->getD2AlgorithmName() + EXPECT_TRUE(success) << "Dist-2: algorithm " << coloring_handle->getD2AlgorithmName() << " produced invalid coloring"; kh.destroy_distance2_graph_coloring_handle(); } } -template -void test_bipartite_symmetric(lno_t numVerts, size_type nnz, lno_t bandwidth, - lno_t row_size_variance) { +template +void test_bipartite_symmetric(lno_t numVerts, size_type nnz, lno_t bandwidth, lno_t row_size_variance) { using execution_space = typename device::execution_space; using memory_space = typename device::memory_space; - using crsMat = - KokkosSparse::CrsMatrix; - using graph_type = typename crsMat::StaticCrsGraphType; - using c_rowmap_t = typename graph_type::row_map_type; - using c_entries_t = typename graph_type::entries_type; - using rowmap_t = typename c_rowmap_t::non_const_type; - using entries_t = typename c_entries_t::non_const_type; - using KernelHandle = - KokkosKernelsHandle; + using crsMat = KokkosSparse::CrsMatrix; + using graph_type = typename crsMat::StaticCrsGraphType; + using c_rowmap_t = typename graph_type::row_map_type; + using c_entries_t = typename graph_type::entries_type; + using rowmap_t = typename c_rowmap_t::non_const_type; + using entries_t = typename 
c_entries_t::non_const_type; + using KernelHandle = KokkosKernelsHandle; // Generate graph, and add some out-of-bounds columns - crsMat A = KokkosSparse::Impl::kk_generate_sparse_matrix( - numVerts, numVerts, nnz, row_size_variance, bandwidth); + crsMat A = + KokkosSparse::Impl::kk_generate_sparse_matrix(numVerts, numVerts, nnz, row_size_variance, bandwidth); auto G = A.graph; // Symmetrize the graph rowmap_t symRowmap; entries_t symEntries; - KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap< - c_rowmap_t, c_entries_t, rowmap_t, entries_t, execution_space>( + KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap( numVerts, G.row_map, G.entries, symRowmap, symEntries); - auto rowmapHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symRowmap); - auto entriesHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symEntries); - std::vector algos = { - COLORING_D2_DEFAULT, COLORING_D2_SERIAL, COLORING_D2_VB, - COLORING_D2_VB_BIT, COLORING_D2_VB_BIT_EF, COLORING_D2_NB_BIT}; + auto rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symRowmap); + auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symEntries); + std::vector algos = {COLORING_D2_DEFAULT, COLORING_D2_SERIAL, COLORING_D2_VB, + COLORING_D2_VB_BIT, COLORING_D2_VB_BIT_EF, COLORING_D2_NB_BIT}; for (auto algo : algos) { KernelHandle kh; kh.create_distance2_graph_coloring_handle(algo); // Compute the Distance-2 graph coloring. 
- bipartite_color_rows( - &kh, numVerts, numVerts, symRowmap, symEntries, true); + bipartite_color_rows(&kh, numVerts, numVerts, symRowmap, symEntries, true); execution_space().fence(); auto coloring_handle = kh.get_distance2_graph_coloring_handle(); auto colors = coloring_handle->get_vertex_colors(); - auto colorsHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), colors); - auto numColors = coloring_handle->get_num_colors(); + auto colorsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), colors); + auto numColors = coloring_handle->get_num_colors(); EXPECT_LE(numColors, numVerts); - bool success = Test::verifyBipartitePartialColoring< - lno_t, size_type, decltype(rowmapHost), decltype(entriesHost), - decltype(colorsHost)>(numVerts, numVerts, rowmapHost, entriesHost, - rowmapHost, entriesHost, colorsHost); - EXPECT_TRUE(success) << "Dist-2: algorithm " - << coloring_handle->getD2AlgorithmName() + bool success = Test::verifyBipartitePartialColoring( + numVerts, numVerts, rowmapHost, entriesHost, rowmapHost, entriesHost, colorsHost); + EXPECT_TRUE(success) << "Dist-2: algorithm " << coloring_handle->getD2AlgorithmName() << " produced invalid coloring"; kh.destroy_distance2_graph_coloring_handle(); } } -template -void test_bipartite(lno_t numRows, lno_t numCols, size_type nnz, - lno_t bandwidth, lno_t row_size_variance, bool colorRows) { +template +void test_bipartite(lno_t numRows, lno_t numCols, size_type nnz, lno_t bandwidth, lno_t row_size_variance, + bool colorRows) { using execution_space = typename device::execution_space; using memory_space = typename device::memory_space; - using crsMat = - KokkosSparse::CrsMatrix; - using graph_type = typename crsMat::StaticCrsGraphType; - using rowmap_t = typename graph_type::row_map_type::non_const_type; - using entries_t = typename graph_type::entries_type::non_const_type; - using c_rowmap_t = typename graph_type::row_map_type; - using c_entries_t = typename graph_type::entries_type; - using 
KernelHandle = - KokkosKernelsHandle; + using crsMat = KokkosSparse::CrsMatrix; + using graph_type = typename crsMat::StaticCrsGraphType; + using rowmap_t = typename graph_type::row_map_type::non_const_type; + using entries_t = typename graph_type::entries_type::non_const_type; + using c_rowmap_t = typename graph_type::row_map_type; + using c_entries_t = typename graph_type::entries_type; + using KernelHandle = KokkosKernelsHandle; // Generate graph - crsMat A = KokkosSparse::Impl::kk_generate_sparse_matrix( - numRows, numCols, nnz, row_size_variance, bandwidth); - auto G = A.graph; + crsMat A = KokkosSparse::Impl::kk_generate_sparse_matrix(numRows, numCols, nnz, row_size_variance, bandwidth); + auto G = A.graph; rowmap_t t_rowmap("rowmap^T", numCols + 1); entries_t t_entries("entries^T", G.entries.extent(0)); - KokkosSparse::Impl::transpose_graph( + KokkosSparse::Impl::transpose_graph( numRows, numCols, G.row_map, G.entries, t_rowmap, t_entries); // TODO: remove me, shouldn't be needed even with UVM execution_space().fence(); - auto rowmapHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), G.row_map); - auto entriesHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), G.entries); - auto t_rowmapHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), t_rowmap); - auto t_entriesHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), t_entries); - std::vector algos = { - COLORING_D2_DEFAULT, COLORING_D2_SERIAL, COLORING_D2_VB, - COLORING_D2_VB_BIT, COLORING_D2_VB_BIT_EF, COLORING_D2_NB_BIT}; + auto rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), G.row_map); + auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), G.entries); + auto t_rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), t_rowmap); + auto t_entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), t_entries); + std::vector algos = {COLORING_D2_DEFAULT, COLORING_D2_SERIAL, 
COLORING_D2_VB, + COLORING_D2_VB_BIT, COLORING_D2_VB_BIT_EF, COLORING_D2_NB_BIT}; for (auto algo : algos) { KernelHandle kh; kh.create_distance2_graph_coloring_handle(algo); // Compute the one-sided bipartite coloring. if (colorRows) { - bipartite_color_rows( - &kh, numRows, numCols, G.row_map, G.entries); + bipartite_color_rows(&kh, numRows, numCols, G.row_map, G.entries); } else { - bipartite_color_columns( - &kh, numRows, numCols, G.row_map, G.entries); + bipartite_color_columns(&kh, numRows, numCols, G.row_map, G.entries); } execution_space().fence(); auto coloring_handle = kh.get_distance2_graph_coloring_handle(); auto colors = coloring_handle->get_vertex_colors(); - auto colorsHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), colors); - auto numColors = coloring_handle->get_num_colors(); + auto colorsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), colors); + auto numColors = coloring_handle->get_num_colors(); bool success; if (colorRows) { EXPECT_LE(numColors, numRows); - success = Test::verifyBipartitePartialColoring< - lno_t, size_type, decltype(rowmapHost), decltype(entriesHost), - decltype(colorsHost)>(numRows, numCols, rowmapHost, entriesHost, - t_rowmapHost, t_entriesHost, colorsHost); + success = Test::verifyBipartitePartialColoring(numRows, numCols, rowmapHost, entriesHost, + t_rowmapHost, t_entriesHost, colorsHost); } else { EXPECT_LE(numColors, numCols); - success = Test::verifyBipartitePartialColoring< - lno_t, size_type, decltype(rowmapHost), decltype(entriesHost), - decltype(colorsHost)>(numCols, numRows, t_rowmapHost, t_entriesHost, - rowmapHost, entriesHost, colorsHost); + success = Test::verifyBipartitePartialColoring( + numCols, numRows, t_rowmapHost, t_entriesHost, rowmapHost, entriesHost, colorsHost); } - EXPECT_TRUE(success) << "Bipartite " << (colorRows ? 
"row" : "column") - << " coloring: algorithm " - << coloring_handle->getD2AlgorithmName() - << " produced invalid coloring"; + EXPECT_TRUE(success) << "Bipartite " << (colorRows ? "row" : "column") << " coloring: algorithm " + << coloring_handle->getD2AlgorithmName() << " produced invalid coloring"; kh.destroy_distance2_graph_coloring_handle(); } } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F( \ - TestCategory, \ - graph##_##graph_color_distance2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_dist2_coloring(5000, 5000 * 20, \ - 1000, 10); \ - test_dist2_coloring(50, 50 * 10, 40, 10); \ - } \ - TEST_F( \ - TestCategory, \ - graph##_##graph_color_bipartite_sym##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_bipartite_symmetric(50, 50 * 5, 30, \ - 1); \ - test_bipartite_symmetric(2000, 2000 * 20, \ - 800, 10); \ - } \ - TEST_F( \ - TestCategory, \ - graph##_##graph_color_bipartite_row##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_bipartite(2000, 4000, 3000 * 20, \ - 800, 10, true); \ - test_bipartite(4000, 2000, 3000 * 20, \ - 800, 10, true); \ - } \ - TEST_F( \ - TestCategory, \ - graph##_##graph_color_bipartite_col##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_bipartite(2000, 4000, 3000 * 20, \ - 800, 10, false); \ - test_bipartite(4000, 2000, 3000 * 20, \ - 800, 10, false); \ +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, graph##_##graph_color_distance2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_dist2_coloring(5000, 5000 * 20, 1000, 10); \ + test_dist2_coloring(50, 50 * 10, 40, 10); \ + } \ + TEST_F(TestCategory, graph##_##graph_color_bipartite_sym##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_bipartite_symmetric(50, 50 * 5, 30, 1); \ + test_bipartite_symmetric(2000, 2000 * 20, 800, 10); \ + } \ + TEST_F(TestCategory, graph##_##graph_color_bipartite_row##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_bipartite(2000, 4000, 3000 * 20, 800, 
10, true); \ + test_bipartite(4000, 2000, 3000 * 20, 800, 10, true); \ + } \ + TEST_F(TestCategory, graph##_##graph_color_bipartite_col##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_bipartite(2000, 4000, 3000 * 20, 800, 10, false); \ + test_bipartite(4000, 2000, 3000 * 20, 800, 10, false); \ } #if defined(KOKKOSKERNELS_INST_DOUBLE) -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, int, int, TestDevice) #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, int64_t, int, TestDevice) #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, int, size_t, TestDevice) #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) 
EXECUTE_TEST(double, int64_t, size_t, TestDevice) #endif #endif diff --git a/graph/unit_test/Test_Graph_mis2.hpp b/graph/unit_test/Test_Graph_mis2.hpp index c6fb7562e7..cd96badd44 100644 --- a/graph/unit_test/Test_Graph_mis2.hpp +++ b/graph/unit_test/Test_Graph_mis2.hpp @@ -34,10 +34,8 @@ enum CoarseningType { PHASE2, NO_PHASE2 }; namespace Test { -template -bool verifyD2MIS(lno_t numVerts, const rowmap_t& rowmap, - const entries_t& entries, const mis_t& misArray) { +template +bool verifyD2MIS(lno_t numVerts, const rowmap_t& rowmap, const entries_t& entries, const mis_t& misArray) { // set a std::set of the mis, for fast membership test std::set mis; for (size_t i = 0; i < misArray.extent(0); i++) mis.insert(misArray(i)); @@ -82,74 +80,58 @@ bool verifyD2MIS(lno_t numVerts, const rowmap_t& rowmap, } } // namespace Test -template -void test_mis2(lno_t numVerts, size_type nnz, lno_t bandwidth, - lno_t row_size_variance) { +template +void test_mis2(lno_t numVerts, size_type nnz, lno_t bandwidth, lno_t row_size_variance) { using execution_space = typename device::execution_space; - using crsMat = - KokkosSparse::CrsMatrix; - using graph_type = typename crsMat::StaticCrsGraphType; - using c_rowmap_t = typename graph_type::row_map_type; - using c_entries_t = typename graph_type::entries_type; - using rowmap_t = typename c_rowmap_t::non_const_type; - using entries_t = typename c_entries_t::non_const_type; + using crsMat = KokkosSparse::CrsMatrix; + using graph_type = typename crsMat::StaticCrsGraphType; + using c_rowmap_t = typename graph_type::row_map_type; + using c_entries_t = typename graph_type::entries_type; + using rowmap_t = typename c_rowmap_t::non_const_type; + using entries_t = typename c_entries_t::non_const_type; // Generate graph, and add some out-of-bounds columns - crsMat A = KokkosSparse::Impl::kk_generate_sparse_matrix( - numVerts, numVerts, nnz, row_size_variance, bandwidth); + crsMat A = + KokkosSparse::Impl::kk_generate_sparse_matrix(numVerts, 
numVerts, nnz, row_size_variance, bandwidth); auto G = A.graph; // Symmetrize the graph rowmap_t symRowmap; entries_t symEntries; - KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap< - c_rowmap_t, c_entries_t, rowmap_t, entries_t, execution_space>( + KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap( numVerts, G.row_map, G.entries, symRowmap, symEntries); - auto rowmapHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symRowmap); - auto entriesHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symEntries); + auto rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symRowmap); + auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symEntries); // For each algorithm, compute and verify the MIS std::vector algos = {MIS2_FAST, MIS2_QUALITY}; for (auto algo : algos) { - auto mis = KokkosGraph::graph_d2_mis( - symRowmap, symEntries, algo); - auto misHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), mis); - bool success = Test::verifyD2MIS( + auto mis = KokkosGraph::graph_d2_mis(symRowmap, symEntries, algo); + auto misHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), mis); + bool success = Test::verifyD2MIS( numVerts, rowmapHost, entriesHost, misHost); - EXPECT_TRUE(success) << "Dist-2 MIS (algo " << (int)algo - << ") produced invalid set."; + EXPECT_TRUE(success) << "Dist-2 MIS (algo " << (int)algo << ") produced invalid set."; } } -template -void test_mis2_coarsening(lno_t numVerts, size_type nnz, lno_t bandwidth, - lno_t row_size_variance) { +template +void test_mis2_coarsening(lno_t numVerts, size_type nnz, lno_t bandwidth, lno_t row_size_variance) { using execution_space = typename device::execution_space; - using crsMat = - KokkosSparse::CrsMatrix; - using graph_type = typename crsMat::StaticCrsGraphType; - using c_rowmap_t = typename graph_type::row_map_type; - using c_entries_t = typename graph_type::entries_type; - using rowmap_t = typename 
c_rowmap_t::non_const_type; - using entries_t = typename c_entries_t::non_const_type; - using labels_t = entries_t; + using crsMat = KokkosSparse::CrsMatrix; + using graph_type = typename crsMat::StaticCrsGraphType; + using c_rowmap_t = typename graph_type::row_map_type; + using c_entries_t = typename graph_type::entries_type; + using rowmap_t = typename c_rowmap_t::non_const_type; + using entries_t = typename c_entries_t::non_const_type; + using labels_t = entries_t; // Generate graph, and add some out-of-bounds columns - crsMat A = KokkosSparse::Impl::kk_generate_sparse_matrix( - numVerts, numVerts, nnz, row_size_variance, bandwidth); + crsMat A = + KokkosSparse::Impl::kk_generate_sparse_matrix(numVerts, numVerts, nnz, row_size_variance, bandwidth); auto G = A.graph; // Symmetrize the graph rowmap_t symRowmap; entries_t symEntries; - KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap< - c_rowmap_t, c_entries_t, rowmap_t, entries_t, execution_space>( + KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap( numVerts, G.row_map, G.entries, symRowmap, symEntries); - auto rowmapHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symRowmap); - auto entriesHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symEntries); + auto rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symRowmap); + auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symEntries); // For each algorithm, compute and verify the MIS std::vector algos = {PHASE2, NO_PHASE2}; for (auto algo : algos) { @@ -157,46 +139,34 @@ void test_mis2_coarsening(lno_t numVerts, size_type nnz, lno_t bandwidth, labels_t labels; switch (algo) { case NO_PHASE2: - labels = KokkosGraph::graph_mis2_coarsen( - symRowmap, symEntries, numClusters); + labels = KokkosGraph::graph_mis2_coarsen(symRowmap, symEntries, numClusters); break; case PHASE2: - labels = KokkosGraph::graph_mis2_aggregate( - symRowmap, symEntries, numClusters); + labels = 
KokkosGraph::graph_mis2_aggregate(symRowmap, symEntries, numClusters); } - auto labelsHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), labels); + auto labelsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), labels); // Not a strong test, but sanity check the number of clusters returned EXPECT_TRUE(numClusters >= 1 && numClusters <= numVerts); // Check that every label is in the range [0, numClusters) - for (lno_t i = 0; i < numVerts; i++) - EXPECT_TRUE(0 <= labelsHost(i) && labelsHost(i) < numClusters); + for (lno_t i = 0; i < numVerts; i++) EXPECT_TRUE(0 <= labelsHost(i) && labelsHost(i) < numClusters); // Test explicit coarsening given the labels, with and without compressing // the result rowmap_t coarseRowmapNC, coarseRowmapC; entries_t coarseEntriesNC, coarseEntriesC; - KokkosGraph::Experimental::graph_explicit_coarsen< - device, rowmap_t, entries_t, entries_t, rowmap_t, entries_t>( - symRowmap, symEntries, labels, numClusters, coarseRowmapNC, - coarseEntriesNC, false); - KokkosGraph::Experimental::graph_explicit_coarsen< - device, rowmap_t, entries_t, entries_t, rowmap_t, entries_t>( - symRowmap, symEntries, labels, numClusters, coarseRowmapC, - coarseEntriesC, true); + KokkosGraph::Experimental::graph_explicit_coarsen( + symRowmap, symEntries, labels, numClusters, coarseRowmapNC, coarseEntriesNC, false); + KokkosGraph::Experimental::graph_explicit_coarsen( + symRowmap, symEntries, labels, numClusters, coarseRowmapC, coarseEntriesC, true); EXPECT_EQ(coarseRowmapC.extent(0), numClusters + 1); EXPECT_EQ(coarseRowmapNC.extent(0), numClusters + 1); // Check that coarse graph doesn't have more edges than fine graph EXPECT_LE(coarseEntriesC.extent(0), symEntries.extent(0)); EXPECT_LE(coarseEntriesNC.extent(0), symEntries.extent(0)); // Verify compression is working. 
- auto hostRowmapNC = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), - coarseRowmapNC); - auto hostEntriesNC = Kokkos::create_mirror_view_and_copy( - Kokkos::HostSpace(), coarseEntriesNC); - auto hostRowmapC = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarseRowmapC); - auto hostEntriesC = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), - coarseEntriesC); + auto hostRowmapNC = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarseRowmapNC); + auto hostEntriesNC = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarseEntriesNC); + auto hostRowmapC = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarseRowmapC); + auto hostEntriesC = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarseEntriesC); for (lno_t i = 0; i < numClusters; i++) { // std::set maintains uniqueness as well as ascending order of elements. // So it should exactly match the entries in the compressed version. @@ -215,11 +185,9 @@ void test_mis2_coarsening(lno_t numVerts, size_type nnz, lno_t bandwidth, } } -template +template void test_mis2_coarsening_zero_rows() { - using crsMat = - KokkosSparse::CrsMatrix; + using crsMat = KokkosSparse::CrsMatrix; using graph_type = typename crsMat::StaticCrsGraphType; using c_rowmap_t = typename graph_type::row_map_type; using c_entries_t = typename graph_type::entries_type; @@ -230,72 +198,55 @@ void test_mis2_coarsening_zero_rows() { // note: MIS2 coarsening first calls MIS2 on the fine graph, so this covers // the zero-row case for MIS2 alone. 
lno_t numClusters; - auto labels = KokkosGraph::graph_mis2_coarsen( - fineRowmap, fineEntries, numClusters); + auto labels = KokkosGraph::graph_mis2_coarsen(fineRowmap, fineEntries, numClusters); EXPECT_EQ(numClusters, 0); EXPECT_EQ(labels.extent(0), 0); // coarsen, should also produce a graph with 0 rows/entries rowmap_t coarseRowmap; entries_t coarseEntries; - KokkosGraph::Experimental::graph_explicit_coarsen< - device, rowmap_t, entries_t, entries_t, rowmap_t, entries_t>( + KokkosGraph::Experimental::graph_explicit_coarsen( fineRowmap, fineEntries, labels, 0, coarseRowmap, coarseEntries, false); EXPECT_LE(coarseRowmap.extent(0), 1); EXPECT_EQ(coarseEntries.extent(0), 0); - KokkosGraph::Experimental::graph_explicit_coarsen< - device, rowmap_t, entries_t, entries_t, rowmap_t, entries_t>( + KokkosGraph::Experimental::graph_explicit_coarsen( fineRowmap, fineEntries, labels, 0, coarseRowmap, coarseEntries, true); EXPECT_LE(coarseRowmap.extent(0), 1); EXPECT_EQ(coarseEntries.extent(0), 0); } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F(TestCategory, \ - graph##_##graph_mis2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_mis2(5000, 5000 * 20, 1000, 10); \ - test_mis2(50, 50 * 10, 40, 10); \ - test_mis2(5, 5 * 3, 5, 0); \ - } \ - TEST_F( \ - TestCategory, \ - graph##_##graph_mis2_coarsening##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_mis2_coarsening(5000, 5000 * 200, \ - 2000, 10); \ - test_mis2_coarsening(5000, 5000 * 20, \ - 1000, 10); \ - test_mis2_coarsening(50, 50 * 10, 40, \ - 10); \ - test_mis2_coarsening(5, 5 * 3, 5, 0); \ - test_mis2_coarsening_zero_rows(); \ +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, graph##_##graph_mis2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_mis2(5000, 5000 * 20, 1000, 10); \ + test_mis2(50, 50 * 10, 40, 10); \ + test_mis2(5, 5 * 3, 5, 0); \ + } \ + TEST_F(TestCategory, 
graph##_##graph_mis2_coarsening##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_mis2_coarsening(5000, 5000 * 200, 2000, 10); \ + test_mis2_coarsening(5000, 5000 * 20, 1000, 10); \ + test_mis2_coarsening(50, 50 * 10, 40, 10); \ + test_mis2_coarsening(5, 5 * 3, 5, 0); \ + test_mis2_coarsening_zero_rows(); \ } #if defined(KOKKOSKERNELS_INST_DOUBLE) -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, int, int, TestDevice) #endif #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, int64_t, int, TestDevice) #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, int, size_t, TestDevice) #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, int64_t, size_t, TestDevice) #endif diff --git a/graph/unit_test/Test_Graph_rcm.hpp b/graph/unit_test/Test_Graph_rcm.hpp index a6d165d8c3..0a9543367a 100644 --- a/graph/unit_test/Test_Graph_rcm.hpp +++ b/graph/unit_test/Test_Graph_rcm.hpp @@ -26,13 +26,10 @@ // Generates a graph from 3D 7-pt stencil. Slices grid into 2 connected // components near the middle of X dimension. template -void generate7pt(rowmap_t& rowmapView, entries_t& entriesView, int gridX, - int gridY, int gridZ) { - using size_type = typename rowmap_t::non_const_value_type; - using lno_t = typename entries_t::non_const_value_type; - auto getVertexID = [=](lno_t x, lno_t y, lno_t z) -> lno_t { - return x + y * gridX + z * gridX * gridY; - }; +void generate7pt(rowmap_t& rowmapView, entries_t& entriesView, int gridX, int gridY, int gridZ) { + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + auto getVertexID = [=](lno_t x, lno_t y, lno_t z) -> lno_t { return x + y * gridX + z * gridX * gridY; }; lno_t numVertices = gridX * gridY * gridZ; // Generate the graph on host (use std::vector to not need to know // how many entries ahead of time) @@ -44,10 +41,8 @@ void generate7pt(rowmap_t& rowmapView, entries_t& entriesView, int gridX, for (lno_t j = 0; j < gridY; j++) { for (lno_t i = 0; i < gridX; i++) { lno_t v = getVertexID(i, j, k); - if (i != 0 && i != xslice + 1) - entries.push_back(getVertexID(i - 1, j, k)); - if (i != gridX - 1 && i != xslice) - entries.push_back(getVertexID(i + 1, j, k)); + if (i != 0 && i != xslice + 1) entries.push_back(getVertexID(i - 1, j, k)); + if (i != gridX - 1 && i != xslice) entries.push_back(getVertexID(i + 1, j, k)); if (j != 0) entries.push_back(getVertexID(i, j - 1, k)); if (j != gridY - 1) entries.push_back(getVertexID(i, j + 1, k)); if (k != 0) entries.push_back(getVertexID(i, j, k - 1)); @@ -59,26 +54,20 @@ void 
generate7pt(rowmap_t& rowmapView, entries_t& entriesView, int gridX, size_type numEdges = entries.size(); // Now that the graph is formed, copy rowmap and entries to Kokkos::Views in // device memory The nonowning host views just alias the std::vectors. - Kokkos::View> - rowmapHost(rowmap.data(), numVertices + 1); - Kokkos::View> - entriesHost(entries.data(), numEdges); + Kokkos::View> rowmapHost(rowmap.data(), + numVertices + 1); + Kokkos::View> entriesHost(entries.data(), + numEdges); // Allocate owning views on device with the correct size. - rowmapView = - rowmap_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Rowmap"), - numVertices + 1); - entriesView = entries_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Colinds"), numEdges); + rowmapView = rowmap_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Rowmap"), numVertices + 1); + entriesView = entries_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Colinds"), numEdges); // Copy the graph from host to device Kokkos::deep_copy(rowmapView, rowmapHost); Kokkos::deep_copy(entriesView, entriesHost); } template -int maxBandwidth(const rowmap_t& rowmap, const entries_t& entries, - const labels_t& invPerm, const labels_t& perm) { +int maxBandwidth(const rowmap_t& rowmap, const entries_t& entries, const labels_t& invPerm, const labels_t& perm) { using size_type = typename rowmap_t::non_const_value_type; using lno_t = typename entries_t::non_const_value_type; lno_t numVerts = std::max(1, rowmap.extent_int(0)) - 1; @@ -98,19 +87,14 @@ int maxBandwidth(const rowmap_t& rowmap, const entries_t& entries, } template -void test_rcm(const rowmap_t& rowmap, const entries_t& entries, - bool expectBandwidthReduced) { - using lno_t = typename entries_t::non_const_value_type; - auto rcm = KokkosGraph::Experimental::graph_rcm( - rowmap, entries); - auto rowmapHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rowmap); - auto entriesHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entries); 
- auto rcmHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rcm); - lno_t numVerts = std::max(rowmap.extent_int(0), 1) - 1; - decltype(rcmHost) rcmPermHost( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "RCMPerm"), numVerts); +void test_rcm(const rowmap_t& rowmap, const entries_t& entries, bool expectBandwidthReduced) { + using lno_t = typename entries_t::non_const_value_type; + auto rcm = KokkosGraph::Experimental::graph_rcm(rowmap, entries); + auto rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rowmap); + auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entries); + auto rcmHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rcm); + lno_t numVerts = std::max(rowmap.extent_int(0), 1) - 1; + decltype(rcmHost) rcmPermHost(Kokkos::view_alloc(Kokkos::WithoutInitializing, "RCMPerm"), numVerts); for (lno_t i = 0; i < numVerts; i++) rcmPermHost(rcmHost(i)) = i; // make sure each row index shows up exactly once { @@ -124,20 +108,18 @@ void test_rcm(const rowmap_t& rowmap, const entries_t& entries, for (lno_t i = 0; i < numVerts; i++) ASSERT_EQ(counts[i], 1); } if (expectBandwidthReduced) { - Kokkos::View identityOrder( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Identity"), numVerts); + Kokkos::View identityOrder(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Identity"), + numVerts); for (lno_t i = 0; i < numVerts; i++) identityOrder(i) = i; - size_t origBW = - maxBandwidth(rowmapHost, entriesHost, identityOrder, identityOrder); - size_t rcmBW = maxBandwidth(rowmapHost, entriesHost, rcmHost, rcmPermHost); + size_t origBW = maxBandwidth(rowmapHost, entriesHost, identityOrder, identityOrder); + size_t rcmBW = maxBandwidth(rowmapHost, entriesHost, rcmHost, rcmPermHost); EXPECT_LE(rcmBW, origBW); } } template void test_rcm_zerorows() { - using graph_t = - Kokkos::StaticCrsGraph; + using graph_t = Kokkos::StaticCrsGraph; using rowmap_t = typename graph_t::row_map_type::non_const_type; using 
entries_t = typename graph_t::entries_type::non_const_type; rowmap_t rowmap; @@ -146,10 +128,8 @@ void test_rcm_zerorows() { } template -void test_rcm_7pt(lno_t gridX, lno_t gridY, lno_t gridZ, - bool expectBandwidthReduced) { - using graph_t = - Kokkos::StaticCrsGraph; +void test_rcm_7pt(lno_t gridX, lno_t gridY, lno_t gridZ, bool expectBandwidthReduced) { + using graph_t = Kokkos::StaticCrsGraph; using rowmap_t = typename graph_t::row_map_type::non_const_type; using entries_t = typename graph_t::entries_type::non_const_type; rowmap_t rowmap; @@ -160,8 +140,7 @@ void test_rcm_7pt(lno_t gridX, lno_t gridY, lno_t gridZ, template void test_rcm_4clique() { - using graph_t = - Kokkos::StaticCrsGraph; + using graph_t = Kokkos::StaticCrsGraph; using rowmap_t = typename graph_t::row_map_type::non_const_type; using entries_t = typename graph_t::entries_type::non_const_type; rowmap_t rowmap("rowmap", 5); @@ -177,20 +156,17 @@ void test_rcm_4clique() { template void test_rcm_multiple_components() { - using graph_t = - Kokkos::StaticCrsGraph; + using graph_t = Kokkos::StaticCrsGraph; using rowmap_t = typename graph_t::row_map_type::non_const_type; using entries_t = typename graph_t::entries_type::non_const_type; // Generate a single 3D grid first rowmap_t rowmap_cube; entries_t entries_cube; generate7pt(rowmap_cube, entries_cube, 7, 7, 7); - auto rowmap_cube_host = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rowmap_cube); - auto entries_cube_host = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entries_cube); - lno_t nv_cube = 7 * 7 * 7; - lno_t ne_cube = entries_cube.extent(0); + auto rowmap_cube_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rowmap_cube); + auto entries_cube_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entries_cube); + lno_t nv_cube = 7 * 7 * 7; + lno_t ne_cube = entries_cube.extent(0); // Now replicate the graph twice, so there are 2 disconnected copies of the // cube rowmap_t rowmap("rowmap", 
nv_cube * 2 + 1); @@ -214,55 +190,41 @@ void test_rcm_multiple_components() { test_rcm(rowmap, entries, true); } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F( \ - TestCategory, \ - graph##_##rcm_zerorows##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_rcm_zerorows(); \ - } \ - TEST_F(TestCategory, \ - graph##_##rcm_7pt##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_rcm_7pt(1, 1, 1, false); \ - test_rcm_7pt(2, 1, 1, false); \ - test_rcm_7pt(6, 3, 3, true); \ - test_rcm_7pt(20, 20, 20, true); \ - test_rcm_7pt(100, 100, 1, true); \ - } \ - TEST_F(TestCategory, \ - graph##_##rcm_4clique##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_rcm_4clique(); \ - } \ - TEST_F( \ - TestCategory, \ - graph##_##rcm_multiple_components##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_rcm_multiple_components(); \ +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, graph##_##rcm_zerorows##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_rcm_zerorows(); \ + } \ + TEST_F(TestCategory, graph##_##rcm_7pt##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_rcm_7pt(1, 1, 1, false); \ + test_rcm_7pt(2, 1, 1, false); \ + test_rcm_7pt(6, 3, 3, true); \ + test_rcm_7pt(20, 20, 20, true); \ + test_rcm_7pt(100, 100, 1, true); \ + } \ + TEST_F(TestCategory, graph##_##rcm_4clique##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_rcm_4clique(); \ + } \ + TEST_F(TestCategory, graph##_##rcm_multiple_components##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_rcm_multiple_components(); \ } -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, int, int, TestDevice) 
#endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, int64_t, int, TestDevice) #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, int, size_t, TestDevice) #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, int64_t, size_t, TestDevice) #endif diff --git a/lapack/impl/KokkosLapack_gesv_spec.hpp b/lapack/impl/KokkosLapack_gesv_spec.hpp index 97d74280ff..60a69e72b3 100644 --- a/lapack/impl/KokkosLapack_gesv_spec.hpp +++ b/lapack/impl/KokkosLapack_gesv_spec.hpp @@ -42,21 +42,17 @@ struct gesv_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. 
// -#define KOKKOSLAPACK_GESV_ETI_SPEC_AVAIL(SCALAR_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - template <> \ - struct gesv_eti_spec_avail< \ - EXEC_SPACE_TYPE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSLAPACK_GESV_ETI_SPEC_AVAIL(SCALAR_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + template <> \ + struct gesv_eti_spec_avail< \ + EXEC_SPACE_TYPE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -70,23 +66,19 @@ namespace Impl { /// \brief Implementation of KokkosLapack::gesv. template ::value, - bool eti_spec_avail = - gesv_eti_spec_avail::value> + bool tpl_spec_avail = gesv_tpl_spec_avail::value, + bool eti_spec_avail = gesv_eti_spec_avail::value> struct GESV { - static void gesv(const ExecutionSpace &space, const AMatrix &A, const BXMV &B, - const IPIVV &IPIV); + static void gesv(const ExecutionSpace &space, const AMatrix &A, const BXMV &B, const IPIVV &IPIV); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of gesv for multi vectors. 
// Unification layer template -struct GESV { - static void gesv(const ExecutionSpace & /* space */, const AMatrix & /* A */, - const BXMV & /* B */, const IPIVV & /* IPIV */) { +struct GESV { + static void gesv(const ExecutionSpace & /* space */, const AMatrix & /* A */, const BXMV & /* B */, + const IPIVV & /* IPIV */) { // NOTE: Might add the implementation of KokkosLapack::gesv later throw std::runtime_error( "No fallback implementation of GESV (general LU factorization & solve) " @@ -105,36 +97,26 @@ struct GESV, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSLAPACK_GESV_ETI_SPEC_DECL(SCALAR_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + extern template struct GESV< \ + EXEC_SPACE_TYPE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ false, true>; -#define KOKKOSLAPACK_GESV_ETI_SPEC_INST(SCALAR_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - template struct GESV< \ - EXEC_SPACE_TYPE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSLAPACK_GESV_ETI_SPEC_INST(SCALAR_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + template struct GESV< \ + EXEC_SPACE_TYPE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ false, true>; #include diff --git a/lapack/impl/KokkosLapack_svd_spec.hpp b/lapack/impl/KokkosLapack_svd_spec.hpp index fc0a34f790..b0dfe3d091 100644 --- a/lapack/impl/KokkosLapack_svd_spec.hpp +++ b/lapack/impl/KokkosLapack_svd_spec.hpp @@ -28,8 +28,7 @@ namespace KokkosLapack { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct svd_eti_spec_avail { enum : bool { value = false }; }; @@ -43,24 +42,19 @@ struct 
svd_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSLAPACK_SVD_ETI_SPEC_AVAIL(SCALAR_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - template <> \ - struct svd_eti_spec_avail< \ - EXEC_SPACE_TYPE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type *, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSLAPACK_SVD_ETI_SPEC_AVAIL(SCALAR_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + template <> \ + struct svd_eti_spec_avail< \ + EXEC_SPACE_TYPE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type *, LAYOUT_TYPE, \ + Kokkos::Device, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -73,29 +67,21 @@ namespace Impl { // Unification layer /// \brief Implementation of KokkosLapack::svd. -template ::value, - bool eti_spec_avail = svd_eti_spec_avail< - ExecutionSpace, AMatrix, SVector, UMatrix, VMatrix>::value> +template ::value, + bool eti_spec_avail = svd_eti_spec_avail::value> struct SVD { - static void svd(const ExecutionSpace &space, const char jobu[], - const char jobvt[], const AMatrix &A, const SVector &S, - const UMatrix &U, const VMatrix &Vt); + static void svd(const ExecutionSpace &space, const char jobu[], const char jobvt[], const AMatrix &A, + const SVector &S, const UMatrix &U, const VMatrix &Vt); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! 
Full specialization of svd // Unification layer -template -struct SVD { - static void svd(const ExecutionSpace & /* space */, const char * /* jobu */, - const char * /* jobvt */, const AMatrix & /* A */, - const SVector & /* S */, const UMatrix & /* U */, - const VMatrix & /* Vt */) { +template +struct SVD { + static void svd(const ExecutionSpace & /* space */, const char * /* jobu */, const char * /* jobvt */, + const AMatrix & /* A */, const SVector & /* S */, const UMatrix & /* U */, const VMatrix & /* Vt */) { // NOTE: Might add the implementation of KokkosLapack::svd later throw std::runtime_error( "No fallback implementation of SVD (singular value decomposition) " @@ -115,40 +101,30 @@ struct SVD, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type *, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSLAPACK_SVD_ETI_SPEC_DECL(SCALAR_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + extern template struct SVD< \ + EXEC_SPACE_TYPE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type *, LAYOUT_TYPE, \ + Kokkos::Device, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ false, true>; -#define KOKKOSLAPACK_SVD_ETI_SPEC_INST(SCALAR_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - template struct SVD< \ - EXEC_SPACE_TYPE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type *, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSLAPACK_SVD_ETI_SPEC_INST(SCALAR_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + template struct SVD< \ + EXEC_SPACE_TYPE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type *, LAYOUT_TYPE, \ + Kokkos::Device, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + 
Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ false, true>; #include diff --git a/lapack/impl/KokkosLapack_trtri_impl.hpp b/lapack/impl/KokkosLapack_trtri_impl.hpp index 9f52c2d412..5ba6f80eec 100644 --- a/lapack/impl/KokkosLapack_trtri_impl.hpp +++ b/lapack/impl/KokkosLapack_trtri_impl.hpp @@ -31,8 +31,7 @@ namespace KokkosLapack { namespace Impl { template -void SerialTrtri_Invoke(const RViewType &R, const char uplo[], - const char diag[], const AViewType &A) { +void SerialTrtri_Invoke(const RViewType &R, const char uplo[], const char diag[], const AViewType &A) { using KokkosBatched::Algo; using KokkosBatched::Diag; using KokkosBatched::SerialTrtriInternalLower; @@ -43,24 +42,20 @@ void SerialTrtri_Invoke(const RViewType &R, const char uplo[], //// Lower //// if (__uplo == 'l') { if (__diag == 'u') { - R() = SerialTrtriInternalLower::invoke( - Diag::Unit::use_unit_diag, A.extent(0), A.extent(1), A.data(), - A.stride(0), A.stride(1)); + R() = SerialTrtriInternalLower::invoke(Diag::Unit::use_unit_diag, A.extent(0), + A.extent(1), A.data(), A.stride(0), A.stride(1)); } else { - R() = SerialTrtriInternalLower::invoke( - Diag::NonUnit::use_unit_diag, A.extent(0), A.extent(1), A.data(), - A.stride(0), A.stride(1)); + R() = SerialTrtriInternalLower::invoke(Diag::NonUnit::use_unit_diag, A.extent(0), + A.extent(1), A.data(), A.stride(0), A.stride(1)); } } else { //// Upper //// if (__diag == 'u') { - R() = SerialTrtriInternalUpper::invoke( - Diag::Unit::use_unit_diag, A.extent(0), A.extent(1), A.data(), - A.stride(0), A.stride(1)); + R() = SerialTrtriInternalUpper::invoke(Diag::Unit::use_unit_diag, A.extent(0), + A.extent(1), A.data(), A.stride(0), A.stride(1)); } else { - R() = SerialTrtriInternalUpper::invoke( - Diag::NonUnit::use_unit_diag, A.extent(0), A.extent(1), A.data(), - A.stride(0), A.stride(1)); + R() = SerialTrtriInternalUpper::invoke(Diag::NonUnit::use_unit_diag, A.extent(0), + A.extent(1), A.data(), A.stride(0), A.stride(1)); } } } 
diff --git a/lapack/impl/KokkosLapack_trtri_spec.hpp b/lapack/impl/KokkosLapack_trtri_spec.hpp index a17184dc41..ef458f7e57 100644 --- a/lapack/impl/KokkosLapack_trtri_spec.hpp +++ b/lapack/impl/KokkosLapack_trtri_spec.hpp @@ -37,15 +37,13 @@ struct trtri_eti_spec_avail { // This Macros provides the ETI specialization of trtri, currently not // available. // -#define KOKKOSLAPACK_TRTRI_ETI_SPEC_AVAIL(SCALAR, LAYOUTA, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct trtri_eti_spec_avail< \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSLAPACK_TRTRI_ETI_SPEC_AVAIL(SCALAR, LAYOUTA, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct trtri_eti_spec_avail< \ + Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -60,33 +58,28 @@ namespace Impl { // // Unification layer -template ::value, +template ::value, bool eti_spec_avail = trtri_eti_spec_avail::value> struct TRTRI { - static void trtri(const RVIT& R, const char uplo[], const char diag[], - const AVIT& A); + static void trtri(const RVIT& R, const char uplo[], const char diag[], const AVIT& A); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY template struct TRTRI { - static void trtri(const RVIT& R, const char uplo[], const char diag[], - const AVIT& A) { + static void trtri(const RVIT& R, const char uplo[], const char diag[], const AVIT& A) { static_assert(Kokkos::is_view::value, "AVIT must be a Kokkos::View."); static_assert(static_cast(AVIT::rank) == 2, "AVIT must have rank 2."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosLapack::trtri[ETI]" - : "KokkosLapack::trtri[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? 
"KokkosLapack::trtri[ETI]" + : "KokkosLapack::trtri[noETI]"); typename AVIT::HostMirror host_A = Kokkos::create_mirror_view(A); typename RVIT::HostMirror host_R = Kokkos::create_mirror_view(R); Kokkos::deep_copy(host_A, A); - SerialTrtri_Invoke( - R, uplo, diag, host_A); + SerialTrtri_Invoke(R, uplo, diag, host_A); Kokkos::deep_copy(A, host_A); @@ -106,22 +99,18 @@ struct TRTRI { // "extern template" skips the implicit instatiation step ensuring that the // callers code uses this explicit instantiation definition of TRTRI. // -#define KOKKOSLAPACK_TRTRI_ETI_SPEC_DECL(SCALAR, LAYOUTA, EXEC_SPACE, \ - MEM_SPACE) \ - extern template struct TRTRI< \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSLAPACK_TRTRI_ETI_SPEC_DECL(SCALAR, LAYOUTA, EXEC_SPACE, MEM_SPACE) \ + extern template struct TRTRI< \ + Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ false, true>; -#define KOKKOSLAPACK_TRTRI_ETI_SPEC_INST(SCALAR, LAYOUTA, EXEC_SPACE, \ - MEM_SPACE) \ - template struct TRTRI< \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSLAPACK_TRTRI_ETI_SPEC_INST(SCALAR, LAYOUTA, EXEC_SPACE, MEM_SPACE) \ + template struct TRTRI< \ + Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ false, true>; #include diff --git a/lapack/src/KokkosLapack_gesv.hpp b/lapack/src/KokkosLapack_gesv.hpp index b66583bbdf..281d6a5651 100644 --- a/lapack/src/KokkosLapack_gesv.hpp +++ b/lapack/src/KokkosLapack_gesv.hpp @@ -53,44 +53,29 @@ namespace KokkosLapack { /// used. /// template -void gesv(const ExecutionSpace& space, const AMatrix& A, const BXMV& B, - const IPIVV& IPIV) { +void gesv(const ExecutionSpace& space, const AMatrix& A, const BXMV& B, const IPIVV& IPIV) { // NOTE: Currently, KokkosLapack::gesv only supports LAPACK, MAGMA and // rocSOLVER TPLs. 
// MAGMA/rocSOLVER TPL should be enabled to call the MAGMA/rocSOLVER GPU // interface for device views LAPACK TPL should be enabled to call the // LAPACK interface for host views - static_assert( - Kokkos::SpaceAccessibility::accessible); - static_assert( - Kokkos::SpaceAccessibility::accessible); + static_assert(Kokkos::SpaceAccessibility::accessible); + static_assert(Kokkos::SpaceAccessibility::accessible); #if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) if constexpr (!std::is_same_v) { - static_assert( - Kokkos::SpaceAccessibility::accessible); + static_assert(Kokkos::SpaceAccessibility::accessible); } #else - static_assert( - Kokkos::SpaceAccessibility::accessible); + static_assert(Kokkos::SpaceAccessibility::accessible); #endif - static_assert(Kokkos::is_view::value, - "KokkosLapack::gesv: A must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosLapack::gesv: B must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosLapack::gesv: IPIV must be a Kokkos::View."); - static_assert(static_cast(AMatrix::rank) == 2, - "KokkosLapack::gesv: A must have rank 2."); - static_assert( - static_cast(BXMV::rank) == 1 || static_cast(BXMV::rank) == 2, - "KokkosLapack::gesv: B must have either rank 1 or rank 2."); - static_assert(static_cast(IPIVV::rank) == 1, - "KokkosLapack::gesv: IPIV must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosLapack::gesv: A must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosLapack::gesv: B must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosLapack::gesv: IPIV must be a Kokkos::View."); + static_assert(static_cast(AMatrix::rank) == 2, "KokkosLapack::gesv: A must have rank 2."); + static_assert(static_cast(BXMV::rank) == 1 || static_cast(BXMV::rank) == 2, + "KokkosLapack::gesv: B must have either rank 1 or rank 2."); + static_assert(static_cast(IPIVV::rank) == 1, "KokkosLapack::gesv: IPIV must have rank 1."); int64_t IPIV0 = IPIV.extent(0); int64_t A0 = 
A.extent(0); @@ -98,8 +83,7 @@ void gesv(const ExecutionSpace& space, const AMatrix& A, const BXMV& B, int64_t B0 = B.extent(0); // Check validity of pivot argument - bool valid_pivot = - (IPIV0 == A1) || ((IPIV0 == 0) && (IPIV.data() == nullptr)); + bool valid_pivot = (IPIV0 == A1) || ((IPIV0 == 0) && (IPIV.data() == nullptr)); if (!(valid_pivot)) { std::ostringstream os; os << "KokkosLapack::gesv: IPIV: " << IPIV0 << ". " @@ -112,9 +96,8 @@ void gesv(const ExecutionSpace& space, const AMatrix& A, const BXMV& B, // Check for no pivoting case. Only MAGMA supports no pivoting interface #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // and have LAPACK TPL - if ((!std::is_same::value) && - (IPIV0 == 0) && (IPIV.data() == nullptr)) { + if ((!std::is_same::value) && (IPIV0 == 0) && + (IPIV.data() == nullptr)) { std::ostringstream os; os << "KokkosLapack::gesv: IPIV: " << IPIV0 << ". " << "LAPACK TPL does not support no pivoting."; @@ -136,22 +119,18 @@ void gesv(const ExecutionSpace& space, const AMatrix& A, const BXMV& B, if ((A0 < A1) || (A0 != B0)) { std::ostringstream os; os << "KokkosLapack::gesv: Dimensions of A, and B do not match: " - << " A: " << A.extent(0) << " x " << A.extent(1) << " B: " << B.extent(0) - << " x " << B.extent(1); + << " A: " << A.extent(0) << " x " << A.extent(1) << " B: " << B.extent(0) << " x " << B.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } - typedef Kokkos::View< - typename AMatrix::non_const_value_type**, typename AMatrix::array_layout, - typename AMatrix::device_type, Kokkos::MemoryTraits > + typedef Kokkos::View > AMatrix_Internal; - typedef Kokkos::View > BXMV_Internal; - typedef Kokkos::View< - typename IPIVV::non_const_value_type*, typename IPIVV::array_layout, - typename IPIVV::device_type, Kokkos::MemoryTraits > + typedef Kokkos::View > IPIVV_Internal; AMatrix_Internal A_i = A; // BXMV_Internal B_i = B; @@ -159,12 +138,12 @@ void gesv(const ExecutionSpace& 
space, const AMatrix& A, const BXMV& B, if (BXMV::rank == 1) { auto B_i = BXMV_Internal(B.data(), B.extent(0), 1); - KokkosLapack::Impl::GESV::gesv(space, A_i, B_i, IPIV_i); + KokkosLapack::Impl::GESV::gesv(space, A_i, B_i, + IPIV_i); } else { // BXMV::rank == 2 auto B_i = BXMV_Internal(B.data(), B.extent(0), B.extent(1)); - KokkosLapack::Impl::GESV::gesv(space, A_i, B_i, IPIV_i); + KokkosLapack::Impl::GESV::gesv(space, A_i, B_i, + IPIV_i); } } diff --git a/lapack/src/KokkosLapack_svd.hpp b/lapack/src/KokkosLapack_svd.hpp index 71ea7cc30f..c0c962fb19 100644 --- a/lapack/src/KokkosLapack_svd.hpp +++ b/lapack/src/KokkosLapack_svd.hpp @@ -58,36 +58,21 @@ namespace KokkosLapack { /// vectors of A. /// // clang-format on -template -void svd(const ExecutionSpace& space, const char jobu[], const char jobvt[], - const AMatrix& A, const SVector& S, const UMatrix& U, - const VMatrix& Vt) { - static_assert( - Kokkos::SpaceAccessibility::accessible); - static_assert( - Kokkos::SpaceAccessibility::accessible); - static_assert( - Kokkos::SpaceAccessibility::accessible); - static_assert( - Kokkos::SpaceAccessibility::accessible); - static_assert(Kokkos::is_view::value, - "KokkosLapack::svd: A must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosLapack::svd: S must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosLapack::svd: U must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosLapack::svd: Vt must be a Kokkos::View."); +template +void svd(const ExecutionSpace& space, const char jobu[], const char jobvt[], const AMatrix& A, const SVector& S, + const UMatrix& U, const VMatrix& Vt) { + static_assert(Kokkos::SpaceAccessibility::accessible); + static_assert(Kokkos::SpaceAccessibility::accessible); + static_assert(Kokkos::SpaceAccessibility::accessible); + static_assert(Kokkos::SpaceAccessibility::accessible); + static_assert(Kokkos::is_view::value, "KokkosLapack::svd: A must be a Kokkos::View."); + 
static_assert(Kokkos::is_view::value, "KokkosLapack::svd: S must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosLapack::svd: U must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosLapack::svd: Vt must be a Kokkos::View."); static_assert(AMatrix::rank() == 2, "KokkosLapack::svd: A must have rank 2."); static_assert(SVector::rank() == 1, "KokkosLapack::svd: S must have rank 1."); static_assert(UMatrix::rank() == 2, "KokkosLapack::svd: U must have rank 2."); - static_assert(VMatrix::rank() == 2, - "KokkosLapack::svd: Vt must have rank 2."); + static_assert(VMatrix::rank() == 2, "KokkosLapack::svd: Vt must have rank 2."); int64_t m = A.extent(0); int64_t n = A.extent(1); @@ -102,40 +87,32 @@ void svd(const ExecutionSpace& space, const char jobu[], const char jobvt[], // Check the jobu and jobvt control flags // The only valid options there are 'A', 'S', 'O' and 'N' - const bool is_jobu_invalid = - !((jobu[0] == 'A') || (jobu[0] == 'a') || (jobu[0] == 'S') || - (jobu[0] == 's') || (jobu[0] == 'O') || (jobu[0] == 'o') || - (jobu[0] == 'N') || (jobu[0] == 'n')); + const bool is_jobu_invalid = !((jobu[0] == 'A') || (jobu[0] == 'a') || (jobu[0] == 'S') || (jobu[0] == 's') || + (jobu[0] == 'O') || (jobu[0] == 'o') || (jobu[0] == 'N') || (jobu[0] == 'n')); - const bool is_jobvt_invalid = - !((jobvt[0] == 'A') || (jobvt[0] == 'a') || (jobvt[0] == 'S') || - (jobvt[0] == 's') || (jobvt[0] == 'O') || (jobvt[0] == 'o') || - (jobvt[0] == 'N') || (jobvt[0] == 'n')); + const bool is_jobvt_invalid = !((jobvt[0] == 'A') || (jobvt[0] == 'a') || (jobvt[0] == 'S') || (jobvt[0] == 's') || + (jobvt[0] == 'O') || (jobvt[0] == 'o') || (jobvt[0] == 'N') || (jobvt[0] == 'n')); if (is_jobu_invalid && is_jobvt_invalid) { std::ostringstream oss; oss << "KokkosLapack::svd: both jobu and jobvt are invalid!\n" - << "Possible values are A, S, O or N, submitted values are " << jobu[0] - << " and " << jobvt[0] << "\n"; + << "Possible values are A, S, O or N, 
submitted values are " << jobu[0] << " and " << jobvt[0] << "\n"; KokkosKernels::Impl::throw_runtime_exception(oss.str()); } if (is_jobu_invalid) { std::ostringstream oss; oss << "KokkosLapack::svd: jobu is invalid!\n" - << "Possible values are A, S, O or N, submitted value is " << jobu[0] - << "\n"; + << "Possible values are A, S, O or N, submitted value is " << jobu[0] << "\n"; KokkosKernels::Impl::throw_runtime_exception(oss.str()); } if (is_jobvt_invalid) { std::ostringstream oss; oss << "KokkosLapack::svd: jobvt is invalid!\n" - << "Possible values are A, S, O or N, submitted value is " << jobvt[0] - << "\n"; + << "Possible values are A, S, O or N, submitted value is " << jobvt[0] << "\n"; KokkosKernels::Impl::throw_runtime_exception(oss.str()); } - if (((jobu[0] == 'O') || (jobu[0] == 'o')) && - ((jobvt[0] == 'O') || (jobvt[0] == 'o'))) { + if (((jobu[0] == 'O') || (jobu[0] == 'o')) && ((jobvt[0] == 'O') || (jobvt[0] == 'o'))) { std::ostringstream oss; oss << "KokkosLapack::svd: jobu and jobvt cannot be O at the same time!\n"; KokkosKernels::Impl::throw_runtime_exception(oss.str()); @@ -148,23 +125,20 @@ void svd(const ExecutionSpace& space, const char jobu[], const char jobvt[], std::ostringstream os; if (S.extent_int(0) != rankA) { is_extent_invalid = true; - os << "KokkosLapack::svd: S has extent " << S.extent(0) << ", instead of " - << rankA << ".\n"; + os << "KokkosLapack::svd: S has extent " << S.extent(0) << ", instead of " << rankA << ".\n"; } - if ((jobu[0] == 'A') || (jobu[0] == 'a') || (jobu[0] == 'S') || - (jobu[0] == 's')) { + if ((jobu[0] == 'A') || (jobu[0] == 'a') || (jobu[0] == 'S') || (jobu[0] == 's')) { if (U.extent_int(0) != m || U.extent_int(1) != m) { is_extent_invalid = true; - os << "KokkosLapack::svd: U has extents (" << U.extent(0) << ", " - << U.extent(1) << ") instead of (" << m << ", " << m << ").\n"; + os << "KokkosLapack::svd: U has extents (" << U.extent(0) << ", " << U.extent(1) << ") instead of (" << m << ", " + << m << 
").\n"; } } - if ((jobvt[0] == 'A') || (jobvt[0] == 'a') || (jobvt[0] == 'S') || - (jobvt[0] == 's')) { + if ((jobvt[0] == 'A') || (jobvt[0] == 'a') || (jobvt[0] == 'S') || (jobvt[0] == 's')) { if (Vt.extent_int(0) != n || Vt.extent_int(1) != n) { is_extent_invalid = true; - os << "KokkosLapack::svd: V has extents (" << Vt.extent(0) << ", " - << Vt.extent(1) << ") instead of (" << n << ", " << n << ").\n"; + os << "KokkosLapack::svd: V has extents (" << Vt.extent(0) << ", " << Vt.extent(1) << ") instead of (" << n + << ", " << n << ").\n"; } } if (is_extent_invalid) { @@ -172,8 +146,7 @@ void svd(const ExecutionSpace& space, const char jobu[], const char jobvt[], } #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) - if (std::is_same_v && - (A.extent(0) < A.extent(1))) { + if (std::is_same_v && (A.extent(0) < A.extent(1))) { throw std::runtime_error( "CUSOLVER does not support SVD for matrices with more columns " "than rows, you can transpose you matrix first then compute " @@ -182,32 +155,25 @@ void svd(const ExecutionSpace& space, const char jobu[], const char jobvt[], } #endif - using AMatrix_Internal = Kokkos::View< - typename AMatrix::non_const_value_type**, typename AMatrix::array_layout, - typename AMatrix::device_type, Kokkos::MemoryTraits>; + using AMatrix_Internal = Kokkos::View>; - using SVector_Internal = Kokkos::View< - typename SVector::non_const_value_type*, typename SVector::array_layout, - typename SVector::device_type, Kokkos::MemoryTraits>; + using SVector_Internal = Kokkos::View>; - using UMatrix_Internal = Kokkos::View< - typename UMatrix::non_const_value_type**, typename UMatrix::array_layout, - typename UMatrix::device_type, Kokkos::MemoryTraits>; + using UMatrix_Internal = Kokkos::View>; - using VMatrix_Internal = Kokkos::View< - typename VMatrix::non_const_value_type**, typename VMatrix::array_layout, - typename VMatrix::device_type, Kokkos::MemoryTraits>; + using VMatrix_Internal = Kokkos::View>; AMatrix_Internal A_i = A; SVector_Internal S_i 
= S; UMatrix_Internal U_i = U; VMatrix_Internal Vt_i = Vt; - KokkosLapack::Impl::SVD::svd(space, jobu, - jobvt, A_i, - S_i, U_i, - Vt_i); + KokkosLapack::Impl::SVD::svd( + space, jobu, jobvt, A_i, S_i, U_i, Vt_i); } // clang-format off @@ -235,8 +201,8 @@ void svd(const ExecutionSpace& space, const char jobu[], const char jobvt[], /// // clang-format on template -void svd(const char jobu[], const char jobvt[], const AMatrix& A, - const SVector& S, const UMatrix& U, const VMatrix& Vt) { +void svd(const char jobu[], const char jobvt[], const AMatrix& A, const SVector& S, const UMatrix& U, + const VMatrix& Vt) { typename AMatrix::execution_space space{}; svd(space, jobu, jobvt, A, S, U, Vt); } diff --git a/lapack/src/KokkosLapack_trtri.hpp b/lapack/src/KokkosLapack_trtri.hpp index 9a884f2303..cfe311f476 100644 --- a/lapack/src/KokkosLapack_trtri.hpp +++ b/lapack/src/KokkosLapack_trtri.hpp @@ -49,16 +49,12 @@ namespace KokkosLapack { // source: https://software.intel.com/en-us/mkl-developer-reference-c-trtri template int trtri(const char uplo[], const char diag[], const AViewType& A) { - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); // Check validity of indicator argument - bool valid_uplo = (uplo[0] == 'U') || (uplo[0] == 'u') || (uplo[0] == 'L') || - (uplo[0] == 'l'); - bool valid_diag = (diag[0] == 'U') || (diag[0] == 'u') || (diag[0] == 'N') || - (diag[0] == 'n'); + bool valid_uplo = (uplo[0] == 'U') || (uplo[0] == 'u') || (uplo[0] == 'L') || (uplo[0] == 'l'); + bool valid_diag = (diag[0] == 'U') || (diag[0] == 'u') || (diag[0] == 'N') || (diag[0] == 'n'); if (!valid_uplo) { std::ostringstream os; @@ -94,22 +90,17 @@ int trtri(const char uplo[], const char diag[], const AViewType& A) { } // 
Create A matrix view type alias - using AViewInternalType = - Kokkos::View >; + using AViewInternalType = Kokkos::View >; // This is the return value type and should always reside on host using RViewInternalType = - Kokkos::View >; + Kokkos::View >; int result; RViewInternalType R = RViewInternalType(&result); - KokkosLapack::Impl::TRTRI::trtri( - R, uplo, diag, A); + KokkosLapack::Impl::TRTRI::trtri(R, uplo, diag, A); return result; } diff --git a/lapack/tpls/KokkosLapack_Cuda_tpl.hpp b/lapack/tpls/KokkosLapack_Cuda_tpl.hpp index 943d10d111..3ead12d5f4 100644 --- a/lapack/tpls/KokkosLapack_Cuda_tpl.hpp +++ b/lapack/tpls/KokkosLapack_Cuda_tpl.hpp @@ -24,8 +24,7 @@ namespace Impl { CudaLapackSingleton::CudaLapackSingleton() { cusolverStatus_t stat = cusolverDnCreate(&handle); - if (stat != CUSOLVER_STATUS_SUCCESS) - Kokkos::abort("CUSOLVER initialization failed\n"); + if (stat != CUSOLVER_STATUS_SUCCESS) Kokkos::abort("CUSOLVER initialization failed\n"); Kokkos::push_finalize_hook([&]() { cusolverDnDestroy(handle); }); } diff --git a/lapack/tpls/KokkosLapack_Host_tpl.cpp b/lapack/tpls/KokkosLapack_Host_tpl.cpp index add0a802bd..3b60a0578b 100644 --- a/lapack/tpls/KokkosLapack_Host_tpl.cpp +++ b/lapack/tpls/KokkosLapack_Host_tpl.cpp @@ -29,39 +29,25 @@ extern "C" { /// Gesv /// -void F77_BLAS_MANGLE(sgesv, SGESV)(int*, int*, float*, int*, int*, float*, int*, - int*); -void F77_BLAS_MANGLE(dgesv, DGESV)(int*, int*, double*, int*, int*, double*, - int*, int*); -void F77_BLAS_MANGLE(cgesv, CGESV)(int*, int*, std::complex*, int*, int*, - std::complex*, int*, int*); -void F77_BLAS_MANGLE(zgesv, ZGESV)(int*, int*, std::complex*, int*, - int*, std::complex*, int*, int*); +void F77_BLAS_MANGLE(sgesv, SGESV)(int*, int*, float*, int*, int*, float*, int*, int*); +void F77_BLAS_MANGLE(dgesv, DGESV)(int*, int*, double*, int*, int*, double*, int*, int*); +void F77_BLAS_MANGLE(cgesv, CGESV)(int*, int*, std::complex*, int*, int*, std::complex*, int*, int*); +void 
F77_BLAS_MANGLE(zgesv, ZGESV)(int*, int*, std::complex*, int*, int*, std::complex*, int*, int*); /// /// Gesvd /// -void F77_BLAS_MANGLE(sgesvd, SGESVD)(const char*, const char*, const int*, - const int*, float*, const int*, float*, - float*, const int*, float*, const int*, - float*, int*, int*); -void F77_BLAS_MANGLE(dgesvd, DGESVD)(const char*, const char*, const int*, - const int*, double*, const int*, double*, - double*, const int*, double*, const int*, - double*, int*, int*); -void F77_BLAS_MANGLE(cgesvd, CGESVD)(const char*, const char*, const int*, - const int*, std::complex*, - const int*, float*, std::complex*, - const int*, std::complex*, - const int*, std::complex*, int*, - float*, int*); -void F77_BLAS_MANGLE(zgesvd, ZGESVD)(const char*, const char*, const int*, - const int*, std::complex*, - const int*, double*, std::complex*, - const int*, std::complex*, - const int*, std::complex*, int*, - double*, int*); +void F77_BLAS_MANGLE(sgesvd, SGESVD)(const char*, const char*, const int*, const int*, float*, const int*, float*, + float*, const int*, float*, const int*, float*, int*, int*); +void F77_BLAS_MANGLE(dgesvd, DGESVD)(const char*, const char*, const int*, const int*, double*, const int*, double*, + double*, const int*, double*, const int*, double*, int*, int*); +void F77_BLAS_MANGLE(cgesvd, CGESVD)(const char*, const char*, const int*, const int*, std::complex*, const int*, + float*, std::complex*, const int*, std::complex*, const int*, + std::complex*, int*, float*, int*); +void F77_BLAS_MANGLE(zgesvd, ZGESVD)(const char*, const char*, const int*, const int*, std::complex*, + const int*, double*, std::complex*, const int*, std::complex*, + const int*, std::complex*, int*, double*, int*); /// /// Trtri @@ -74,14 +60,10 @@ void F77_BLAS_MANGLE(zgesvd, ZGESVD)(const char*, const char*, const int*, &diag, &n, a, &lda, &info); */ -void F77_BLAS_MANGLE(strtri, STRTRI)(const char*, const char*, int*, - const float*, int*, int*); -void 
F77_BLAS_MANGLE(dtrtri, DTRTRI)(const char*, const char*, int*, - const double*, int*, int*); -void F77_BLAS_MANGLE(ctrtri, CTRTRI)(const char*, const char*, int*, - const std::complex*, int*, int*); -void F77_BLAS_MANGLE(ztrtri, ZTRTRI)(const char*, const char*, int*, - const std::complex*, int*, int*); +void F77_BLAS_MANGLE(strtri, STRTRI)(const char*, const char*, int*, const float*, int*, int*); +void F77_BLAS_MANGLE(dtrtri, DTRTRI)(const char*, const char*, int*, const double*, int*, int*); +void F77_BLAS_MANGLE(ctrtri, CTRTRI)(const char*, const char*, int*, const std::complex*, int*, int*); +void F77_BLAS_MANGLE(ztrtri, ZTRTRI)(const char*, const char*, int*, const std::complex*, int*, int*); } #define F77_FUNC_SGESV F77_BLAS_MANGLE(sgesv, SGESV) @@ -107,22 +89,17 @@ namespace Impl { /// template <> -void HostLapack::gesv(int n, int rhs, float* a, int lda, int* ipiv, - float* b, int ldb, int info) { +void HostLapack::gesv(int n, int rhs, float* a, int lda, int* ipiv, float* b, int ldb, int info) { F77_FUNC_SGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); } template <> -void HostLapack::gesvd(const char jobu, const char jobvt, const int m, - const int n, float* a, const int lda, float* s, - float* u, const int ldu, float* vt, - const int ldvt, float* work, int lwork, +void HostLapack::gesvd(const char jobu, const char jobvt, const int m, const int n, float* a, const int lda, + float* s, float* u, const int ldu, float* vt, const int ldvt, float* work, int lwork, float* /*rwork*/, int info) { - F77_FUNC_SGESVD(&jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, - &lwork, &info); + F77_FUNC_SGESVD(&jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork, &info); } template <> -int HostLapack::trtri(const char uplo, const char diag, int n, - const float* a, int lda) { +int HostLapack::trtri(const char uplo, const char diag, int n, const float* a, int lda) { int info = 0; F77_FUNC_STRTRI(&uplo, &diag, &n, a, &lda, &info); return info; @@ -133,22 
+110,17 @@ int HostLapack::trtri(const char uplo, const char diag, int n, /// template <> -void HostLapack::gesv(int n, int rhs, double* a, int lda, int* ipiv, - double* b, int ldb, int info) { +void HostLapack::gesv(int n, int rhs, double* a, int lda, int* ipiv, double* b, int ldb, int info) { F77_FUNC_DGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); } template <> -void HostLapack::gesvd(const char jobu, const char jobvt, const int m, - const int n, double* a, const int lda, double* s, - double* u, const int ldu, double* vt, - const int ldvt, double* work, int lwork, +void HostLapack::gesvd(const char jobu, const char jobvt, const int m, const int n, double* a, const int lda, + double* s, double* u, const int ldu, double* vt, const int ldvt, double* work, int lwork, double* /*rwork*/, int info) { - F77_FUNC_DGESVD(&jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, - &lwork, &info); + F77_FUNC_DGESVD(&jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork, &info); } template <> -int HostLapack::trtri(const char uplo, const char diag, int n, - const double* a, int lda) { +int HostLapack::trtri(const char uplo, const char diag, int n, const double* a, int lda) { int info = 0; F77_FUNC_DTRTRI(&uplo, &diag, &n, a, &lda, &info); return info; @@ -159,24 +131,19 @@ int HostLapack::trtri(const char uplo, const char diag, int n, /// template <> -void HostLapack >::gesv(int n, int rhs, - std::complex* a, int lda, - int* ipiv, std::complex* b, - int ldb, int info) { +void HostLapack >::gesv(int n, int rhs, std::complex* a, int lda, int* ipiv, + std::complex* b, int ldb, int info) { F77_FUNC_CGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); } template <> -void HostLapack >::gesvd( - const char jobu, const char jobvt, const int m, const int n, - std::complex* a, const int lda, float* s, std::complex* u, - const int ldu, std::complex* vt, const int ldvt, - std::complex* work, int lwork, float* rwork, int info) { - F77_FUNC_CGESVD(&jobu, &jobvt, &m, &n, a, 
&lda, s, u, &ldu, vt, &ldvt, work, - &lwork, rwork, &info); +void HostLapack >::gesvd(const char jobu, const char jobvt, const int m, const int n, + std::complex* a, const int lda, float* s, std::complex* u, + const int ldu, std::complex* vt, const int ldvt, + std::complex* work, int lwork, float* rwork, int info) { + F77_FUNC_CGESVD(&jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork, rwork, &info); } template <> -int HostLapack >::trtri(const char uplo, const char diag, - int n, const std::complex* a, +int HostLapack >::trtri(const char uplo, const char diag, int n, const std::complex* a, int lda) { int info = 0; F77_FUNC_CTRTRI(&uplo, &diag, &n, a, &lda, &info); @@ -188,25 +155,20 @@ int HostLapack >::trtri(const char uplo, const char diag, /// template <> -void HostLapack >::gesv(int n, int rhs, - std::complex* a, int lda, - int* ipiv, std::complex* b, - int ldb, int info) { +void HostLapack >::gesv(int n, int rhs, std::complex* a, int lda, int* ipiv, + std::complex* b, int ldb, int info) { F77_FUNC_ZGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); } template <> -void HostLapack >::gesvd( - const char jobu, const char jobvt, const int m, const int n, - std::complex* a, const int lda, double* s, std::complex* u, - const int ldu, std::complex* vt, const int ldvt, - std::complex* work, int lwork, double* rwork, int info) { - F77_FUNC_ZGESVD(&jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, - &lwork, rwork, &info); +void HostLapack >::gesvd(const char jobu, const char jobvt, const int m, const int n, + std::complex* a, const int lda, double* s, + std::complex* u, const int ldu, std::complex* vt, + const int ldvt, std::complex* work, int lwork, double* rwork, + int info) { + F77_FUNC_ZGESVD(&jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork, rwork, &info); } template <> -int HostLapack >::trtri(const char uplo, const char diag, - int n, - const std::complex* a, +int HostLapack >::trtri(const char uplo, const char diag, int 
n, const std::complex* a, int lda) { int info = 0; F77_FUNC_ZTRTRI(&uplo, &diag, &n, a, &lda, &info); diff --git a/lapack/tpls/KokkosLapack_Host_tpl.hpp b/lapack/tpls/KokkosLapack_Host_tpl.hpp index 9eca83afea..092f9ac9f0 100644 --- a/lapack/tpls/KokkosLapack_Host_tpl.hpp +++ b/lapack/tpls/KokkosLapack_Host_tpl.hpp @@ -30,17 +30,13 @@ namespace Impl { template struct HostLapack { - static void gesv(int n, int rhs, T *a, int lda, int *ipiv, T *b, int ldb, - int info); + static void gesv(int n, int rhs, T *a, int lda, int *ipiv, T *b, int ldb, int info); - static void gesvd(const char jobu, const char jobvt, const int m, const int n, - T *A, const int lda, - typename Kokkos::ArithTraits::mag_type *S, T *U, - const int ldu, T *Vt, const int ldvt, T *work, int lwork, - typename Kokkos::ArithTraits::mag_type *rwork, int info); + static void gesvd(const char jobu, const char jobvt, const int m, const int n, T *A, const int lda, + typename Kokkos::ArithTraits::mag_type *S, T *U, const int ldu, T *Vt, const int ldvt, T *work, + int lwork, typename Kokkos::ArithTraits::mag_type *rwork, int info); - static int trtri(const char uplo, const char diag, int n, const T *a, - int lda); + static int trtri(const char uplo, const char diag, int n, const T *a, int lda); }; } // namespace Impl } // namespace KokkosLapack diff --git a/lapack/tpls/KokkosLapack_cusolver.hpp b/lapack/tpls/KokkosLapack_cusolver.hpp index 006fd68b6f..272fb8b3b8 100644 --- a/lapack/tpls/KokkosLapack_cusolver.hpp +++ b/lapack/tpls/KokkosLapack_cusolver.hpp @@ -34,8 +34,7 @@ struct CudaLapackSingleton { static CudaLapackSingleton& singleton(); }; -inline void cusolver_internal_error_throw(cusolverStatus_t cusolverStatus, - const char* name, const char* file, +inline void cusolver_internal_error_throw(cusolverStatus_t cusolverStatus, const char* name, const char* file, const int line) { std::ostringstream out; out << name << " error( "; @@ -48,21 +47,11 @@ inline void 
cusolver_internal_error_throw(cusolverStatus_t cusolverStatus, out << "CUSOLVER_STATUS_ALLOC_FAILED): you might tried to allocate too " "much memory"; break; - case CUSOLVER_STATUS_INVALID_VALUE: - out << "CUSOLVER_STATUS_INVALID_VALUE)"; - break; - case CUSOLVER_STATUS_ARCH_MISMATCH: - out << "CUSOLVER_STATUS_ARCH_MISMATCH)"; - break; - case CUSOLVER_STATUS_EXECUTION_FAILED: - out << "CUSOLVER_STATUS_EXECUTION_FAILED)"; - break; - case CUSOLVER_STATUS_INTERNAL_ERROR: - out << "CUSOLVER_STATUS_INTERNAL_ERROR)"; - break; - case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: - out << "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED)"; - break; + case CUSOLVER_STATUS_INVALID_VALUE: out << "CUSOLVER_STATUS_INVALID_VALUE)"; break; + case CUSOLVER_STATUS_ARCH_MISMATCH: out << "CUSOLVER_STATUS_ARCH_MISMATCH)"; break; + case CUSOLVER_STATUS_EXECUTION_FAILED: out << "CUSOLVER_STATUS_EXECUTION_FAILED)"; break; + case CUSOLVER_STATUS_INTERNAL_ERROR: out << "CUSOLVER_STATUS_INTERNAL_ERROR)"; break; + case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: out << "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED)"; break; default: out << "unrecognized error code): this is bad!"; break; } if (file) { @@ -71,10 +60,8 @@ inline void cusolver_internal_error_throw(cusolverStatus_t cusolverStatus, throw std::runtime_error(out.str()); } -inline void cusolver_internal_safe_call(cusolverStatus_t cusolverStatus, - const char* name, - const char* file = nullptr, - const int line = 0) { +inline void cusolver_internal_safe_call(cusolverStatus_t cusolverStatus, const char* name, const char* file = nullptr, + const int line = 0) { if (CUSOLVER_STATUS_SUCCESS != cusolverStatus) { cusolver_internal_error_throw(cusolverStatus, name, file, line); } @@ -82,9 +69,8 @@ inline void cusolver_internal_safe_call(cusolverStatus_t cusolverStatus, // The macro below defines is the public interface for the safe cusolver calls. // The functions themselves are protected by impl namespace. 
-#define KOKKOS_CUSOLVER_SAFE_CALL_IMPL(call) \ - KokkosLapack::Impl::cusolver_internal_safe_call(call, #call, __FILE__, \ - __LINE__) +#define KOKKOS_CUSOLVER_SAFE_CALL_IMPL(call) \ + KokkosLapack::Impl::cusolver_internal_safe_call(call, #call, __FILE__, __LINE__) } // namespace Impl } // namespace KokkosLapack diff --git a/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp b/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp index 9fbd299ca5..472b79ce85 100644 --- a/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp +++ b/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp @@ -28,27 +28,20 @@ struct gesv_tpl_spec_avail { // Generic Host side LAPACK (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK -#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(SCALAR, LAYOUT, MEMSPACE) \ - template \ - struct gesv_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(SCALAR, LAYOUT, MEMSPACE) \ + template \ + struct gesv_tpl_spec_avail< \ + ExecSpace, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) 
+KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) #endif } // namespace Impl } // namespace KokkosLapack @@ -59,29 +52,23 @@ KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, namespace KokkosLapack { namespace Impl { -#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(SCALAR, LAYOUT, MEMSPACE) \ - template <> \ - struct gesv_tpl_spec_avail< \ - Kokkos::Cuda, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(SCALAR, LAYOUT, MEMSPACE) \ + template <> \ + struct gesv_tpl_spec_avail< \ + Kokkos::Cuda, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) } // namespace Impl } // namespace KokkosLapack #endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA @@ -91,39 +78,28 @@ KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, namespace KokkosLapack { namespace Impl { -#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(SCALAR, LAYOUT, MEMSPACE) \ - template <> \ - struct gesv_tpl_spec_avail< \ - Kokkos::Cuda, \ - 
Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(SCALAR, LAYOUT, MEMSPACE) \ + template <> \ + struct gesv_tpl_spec_avail< \ + Kokkos::Cuda, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(double, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(float, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) #if defined(KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(double, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(float, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) 
+KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) #endif } // namespace Impl @@ -136,28 +112,21 @@ KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, namespace KokkosLapack { namespace Impl { -#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(SCALAR, LAYOUT, MEMSPACE) \ - template <> \ - struct gesv_tpl_spec_avail< \ - Kokkos::HIP, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(SCALAR, LAYOUT, MEMSPACE) \ + template <> \ + struct gesv_tpl_spec_avail< \ + Kokkos::HIP, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(double, Kokkos::LayoutLeft, - Kokkos::HIPSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(float, Kokkos::LayoutLeft, - Kokkos::HIPSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(double, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(float, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) } // namespace Impl } // namespace KokkosLapack diff --git a/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp index ca4b9e7abc..559f5d0509 100644 --- a/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp +++ b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp @@ -23,14 +23,12 @@ template inline void 
gesv_print_specialization() { #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA - printf("KokkosLapack::gesv<> TPL MAGMA specialization for < %s , %s, %s >\n", - typeid(AViewType).name(), typeid(BViewType).name(), - typeid(PViewType).name()); + printf("KokkosLapack::gesv<> TPL MAGMA specialization for < %s , %s, %s >\n", typeid(AViewType).name(), + typeid(BViewType).name(), typeid(PViewType).name()); #else #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK - printf("KokkosLapack::gesv<> TPL Lapack specialization for < %s , %s, %s >\n", - typeid(AViewType).name(), typeid(BViewType).name(), - typeid(PViewType).name()); + printf("KokkosLapack::gesv<> TPL Lapack specialization for < %s , %s, %s >\n", typeid(AViewType).name(), + typeid(BViewType).name(), typeid(PViewType).name()); #endif #endif #endif @@ -46,8 +44,7 @@ namespace KokkosLapack { namespace Impl { template -void lapackGesvWrapper(const AViewType& A, const BViewType& B, - const IPIVViewType& IPIV) { +void lapackGesvWrapper(const AViewType& A, const BViewType& B, const IPIVViewType& IPIV) { using Scalar = typename AViewType::non_const_value_type; const bool with_pivot = !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); @@ -65,88 +62,65 @@ void lapackGesvWrapper(const AViewType& A, const BViewType& B, if constexpr (Kokkos::ArithTraits::is_complex) { using MagType = typename Kokkos::ArithTraits::mag_type; - HostLapack>::gesv( - N, NRHS, reinterpret_cast*>(A.data()), LDA, - IPIV.data(), reinterpret_cast*>(B.data()), LDB, - info); + HostLapack>::gesv(N, NRHS, reinterpret_cast*>(A.data()), LDA, + IPIV.data(), reinterpret_cast*>(B.data()), LDB, + info); } else { - HostLapack::gesv(N, NRHS, A.data(), LDA, IPIV.data(), B.data(), - LDB, info); + HostLapack::gesv(N, NRHS, A.data(), LDA, IPIV.data(), B.data(), LDB, info); } } } -#define KOKKOSLAPACK_GESV_LAPACK(SCALAR, LAYOUT, EXECSPACE, MEM_SPACE) \ - template <> \ - struct GESV< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ 
- Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, \ - gesv_eti_spec_avail< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>>::value> { \ - using AViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using BViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using PViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void gesv(const EXECSPACE& /* space */, const AViewType& A, \ - const BViewType& B, const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_LAPACK," #SCALAR \ - "]"); \ - gesv_print_specialization(); \ - lapackGesvWrapper(A, B, IPIV); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSLAPACK_GESV_LAPACK(SCALAR, LAYOUT, EXECSPACE, MEM_SPACE) \ + template <> \ + struct GESV< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, \ + gesv_eti_spec_avail, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using AViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using BViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using PViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + \ + static void gesv(const EXECSPACE& /* space */, const AViewType& A, const BViewType& B, const PViewType& IPIV) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_LAPACK," #SCALAR "]"); \ + gesv_print_specialization(); \ + lapackGesvWrapper(A, B, IPIV); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #if defined(KOKKOS_ENABLE_SERIAL) -KOKKOSLAPACK_GESV_LAPACK(float, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSLAPACK_GESV_LAPACK(double, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, - 
Kokkos::Serial, Kokkos::HostSpace) -KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Serial, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(float, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(double, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) #endif #if defined(KOKKOS_ENABLE_OPENMP) -KOKKOSLAPACK_GESV_LAPACK(float, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSLAPACK_GESV_LAPACK(double, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP, Kokkos::HostSpace) -KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(float, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(double, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) #endif #if defined(KOKKOS_ENABLE_THREADS) -KOKKOSLAPACK_GESV_LAPACK(float, Kokkos::LayoutLeft, Kokkos::Threads, - Kokkos::HostSpace) -KOKKOSLAPACK_GESV_LAPACK(double, Kokkos::LayoutLeft, Kokkos::Threads, - Kokkos::HostSpace) -KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Threads, Kokkos::HostSpace) -KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Threads, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(float, Kokkos::LayoutLeft, Kokkos::Threads, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(double, Kokkos::LayoutLeft, Kokkos::Threads, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Threads, 
Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Threads, Kokkos::HostSpace) #endif } // namespace Impl @@ -161,12 +135,10 @@ namespace KokkosLapack { namespace Impl { template -void magmaGesvWrapper(const ExecSpace& space, const AViewType& A, - const BViewType& B, const IPIVViewType& IPIV) { +void magmaGesvWrapper(const ExecSpace& space, const AViewType& A, const BViewType& B, const IPIVViewType& IPIV) { using scalar_type = typename AViewType::non_const_value_type; - Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_MAGMA," + - Kokkos::ArithTraits::name() + "]"); + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_MAGMA," + Kokkos::ArithTraits::name() + "]"); gesv_print_specialization(); const bool with_pivot = !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); @@ -178,112 +150,88 @@ void magmaGesvWrapper(const ExecSpace& space, const AViewType& A, magma_int_t LDB = (BST == 0) ? 1 : BST; magma_int_t NRHS = static_cast(B.extent(1)); - KokkosLapack::Impl::MagmaSingleton& s = - KokkosLapack::Impl::MagmaSingleton::singleton(); - magma_int_t info = 0; + KokkosLapack::Impl::MagmaSingleton& s = KokkosLapack::Impl::MagmaSingleton::singleton(); + magma_int_t info = 0; space.fence(); if constexpr (std::is_same_v) { if (with_pivot) { - magma_sgesv_gpu(N, NRHS, reinterpret_cast(A.data()), LDA, - IPIV.data(), reinterpret_cast(B.data()), - LDB, &info); + magma_sgesv_gpu(N, NRHS, reinterpret_cast(A.data()), LDA, IPIV.data(), + reinterpret_cast(B.data()), LDB, &info); } else { - magma_sgesv_nopiv_gpu(N, NRHS, reinterpret_cast(A.data()), - LDA, reinterpret_cast(B.data()), - LDB, &info); + magma_sgesv_nopiv_gpu(N, NRHS, reinterpret_cast(A.data()), LDA, + reinterpret_cast(B.data()), LDB, &info); } } if constexpr (std::is_same_v) { if (with_pivot) { - magma_dgesv_gpu(N, NRHS, reinterpret_cast(A.data()), LDA, - IPIV.data(), reinterpret_cast(B.data()), - LDB, &info); + magma_dgesv_gpu(N, NRHS, reinterpret_cast(A.data()), LDA, 
IPIV.data(), + reinterpret_cast(B.data()), LDB, &info); } else { - magma_dgesv_nopiv_gpu( - N, NRHS, reinterpret_cast(A.data()), LDA, - reinterpret_cast(B.data()), LDB, &info); + magma_dgesv_nopiv_gpu(N, NRHS, reinterpret_cast(A.data()), LDA, + reinterpret_cast(B.data()), LDB, &info); } } if constexpr (std::is_same_v>) { if (with_pivot) { - magma_cgesv_gpu( - N, NRHS, reinterpret_cast(A.data()), LDA, - IPIV.data(), reinterpret_cast(B.data()), LDB, - &info); + magma_cgesv_gpu(N, NRHS, reinterpret_cast(A.data()), LDA, IPIV.data(), + reinterpret_cast(B.data()), LDB, &info); } else { - magma_cgesv_nopiv_gpu( - N, NRHS, reinterpret_cast(A.data()), LDA, - reinterpret_cast(B.data()), LDB, &info); + magma_cgesv_nopiv_gpu(N, NRHS, reinterpret_cast(A.data()), LDA, + reinterpret_cast(B.data()), LDB, &info); } } if constexpr (std::is_same_v>) { if (with_pivot) { - magma_zgesv_gpu( - N, NRHS, reinterpret_cast(A.data()), LDA, - IPIV.data(), reinterpret_cast(B.data()), LDB, - &info); + magma_zgesv_gpu(N, NRHS, reinterpret_cast(A.data()), LDA, IPIV.data(), + reinterpret_cast(B.data()), LDB, &info); } else { - magma_zgesv_nopiv_gpu( - N, NRHS, reinterpret_cast(A.data()), LDA, - reinterpret_cast(B.data()), LDB, &info); + magma_zgesv_nopiv_gpu(N, NRHS, reinterpret_cast(A.data()), LDA, + reinterpret_cast(B.data()), LDB, &info); } } ExecSpace().fence(); Kokkos::Profiling::popRegion(); } -#define KOKKOSLAPACK_GESV_MAGMA(SCALAR, LAYOUT, MEM_SPACE) \ - template <> \ - struct GESV< \ - Kokkos::Cuda, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, \ - gesv_eti_spec_avail< \ - Kokkos::Cuda, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>>::value> { \ - using AViewType = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using BViewType = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using PViewType = Kokkos::View< \ - 
magma_int_t*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - \ - static void gesv(const Kokkos::Cuda& space, const AViewType& A, \ - const BViewType& B, const PViewType& IPIV) { \ - magmaGesvWrapper(space, A, B, IPIV); \ - } \ +#define KOKKOSLAPACK_GESV_MAGMA(SCALAR, LAYOUT, MEM_SPACE) \ + template <> \ + struct GESV, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, \ + gesv_eti_spec_avail, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using AViewType = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using BViewType = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using PViewType = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void gesv(const Kokkos::Cuda& space, const AViewType& A, const BViewType& B, const PViewType& IPIV) { \ + magmaGesvWrapper(space, A, B, IPIV); \ + } \ }; KOKKOSLAPACK_GESV_MAGMA(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSLAPACK_GESV_MAGMA(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) } // namespace Impl } // namespace KokkosLapack @@ -296,10 +244,9 @@ KOKKOSLAPACK_GESV_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, namespace KokkosLapack { namespace Impl { -template -void cusolverGesvWrapper(const ExecutionSpace& space, const IPIVViewType& IPIV, - const AViewType& A, const BViewType& B) { +template +void cusolverGesvWrapper(const ExecutionSpace& space, const IPIVViewType& IPIV, const AViewType& A, + const BViewType& B) { using memory_space = typename AViewType::memory_space; using Scalar = typename BViewType::non_const_value_type; 
using ALayout_t = typename AViewType::array_layout; @@ -307,137 +254,109 @@ void cusolverGesvWrapper(const ExecutionSpace& space, const IPIVViewType& IPIV, const int m = A.extent_int(0); const int n = A.extent_int(1); - const int lda = std::is_same_v ? A.stride(0) - : A.stride(1); + const int lda = std::is_same_v ? A.stride(0) : A.stride(1); (void)B; const int nrhs = B.extent_int(1); - const int ldb = std::is_same_v ? B.stride(0) - : B.stride(1); - int lwork = 0; + const int ldb = std::is_same_v ? B.stride(0) : B.stride(1); + int lwork = 0; Kokkos::View info("getrf info"); CudaLapackSingleton& s = CudaLapackSingleton::singleton(); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( - cusolverDnSetStream(s.handle, space.cuda_stream())); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSetStream(s.handle, space.cuda_stream())); if constexpr (std::is_same_v) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( - cusolverDnSgetrf_bufferSize(s.handle, m, n, A.data(), lda, &lwork)); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSgetrf_bufferSize(s.handle, m, n, A.data(), lda, &lwork)); Kokkos::View Workspace("getrf workspace", lwork); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSgetrf(s.handle, m, n, A.data(), - lda, Workspace.data(), - IPIV.data(), info.data())); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnSgetrf(s.handle, m, n, A.data(), lda, Workspace.data(), IPIV.data(), info.data())); KOKKOS_CUSOLVER_SAFE_CALL_IMPL( - cusolverDnSgetrs(s.handle, CUBLAS_OP_N, m, nrhs, A.data(), lda, - IPIV.data(), B.data(), ldb, info.data())); + cusolverDnSgetrs(s.handle, CUBLAS_OP_N, m, nrhs, A.data(), lda, IPIV.data(), B.data(), ldb, info.data())); } if constexpr (std::is_same_v) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( - cusolverDnDgetrf_bufferSize(s.handle, m, n, A.data(), lda, &lwork)); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnDgetrf_bufferSize(s.handle, m, n, A.data(), lda, &lwork)); Kokkos::View Workspace("getrf workspace", lwork); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnDgetrf(s.handle, m, n, A.data(), - lda, 
Workspace.data(), - IPIV.data(), info.data())); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnDgetrf(s.handle, m, n, A.data(), lda, Workspace.data(), IPIV.data(), info.data())); KOKKOS_CUSOLVER_SAFE_CALL_IMPL( - cusolverDnDgetrs(s.handle, CUBLAS_OP_N, m, nrhs, A.data(), lda, - IPIV.data(), B.data(), ldb, info.data())); + cusolverDnDgetrs(s.handle, CUBLAS_OP_N, m, nrhs, A.data(), lda, IPIV.data(), B.data(), ldb, info.data())); } if constexpr (std::is_same_v>) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnCgetrf_bufferSize( - s.handle, m, n, reinterpret_cast(A.data()), lda, &lwork)); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnCgetrf_bufferSize(s.handle, m, n, reinterpret_cast(A.data()), lda, &lwork)); Kokkos::View Workspace("getrf workspace", lwork); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( - cusolverDnCgetrf(s.handle, m, n, reinterpret_cast(A.data()), - lda, reinterpret_cast(Workspace.data()), - IPIV.data(), info.data())); - - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnCgetrs( - s.handle, CUBLAS_OP_N, m, nrhs, reinterpret_cast(A.data()), - lda, IPIV.data(), reinterpret_cast(B.data()), ldb, - info.data())); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnCgetrf(s.handle, m, n, reinterpret_cast(A.data()), lda, + reinterpret_cast(Workspace.data()), IPIV.data(), + info.data())); + + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnCgetrs(s.handle, CUBLAS_OP_N, m, nrhs, + reinterpret_cast(A.data()), lda, IPIV.data(), + reinterpret_cast(B.data()), ldb, info.data())); } if constexpr (std::is_same_v>) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnZgetrf_bufferSize( - s.handle, m, n, reinterpret_cast(A.data()), lda, - &lwork)); - Kokkos::View Workspace("getrf workspace", - lwork); - - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnZgetrf( - s.handle, m, n, reinterpret_cast(A.data()), lda, - reinterpret_cast(Workspace.data()), IPIV.data(), - info.data())); - - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnZgetrs( - s.handle, CUBLAS_OP_N, m, nrhs, - reinterpret_cast(A.data()), lda, IPIV.data(), - 
reinterpret_cast(B.data()), ldb, info.data())); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnZgetrf_bufferSize(s.handle, m, n, reinterpret_cast(A.data()), lda, &lwork)); + Kokkos::View Workspace("getrf workspace", lwork); + + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnZgetrf(s.handle, m, n, reinterpret_cast(A.data()), lda, + reinterpret_cast(Workspace.data()), IPIV.data(), + info.data())); + + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnZgetrs(s.handle, CUBLAS_OP_N, m, nrhs, + reinterpret_cast(A.data()), lda, IPIV.data(), + reinterpret_cast(B.data()), ldb, info.data())); } KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSetStream(s.handle, NULL)); } -#define KOKKOSLAPACK_GESV_CUSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ - template <> \ - struct GESV< \ - Kokkos::Cuda, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, \ - gesv_eti_spec_avail< \ - Kokkos::Cuda, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>>::value> { \ - using AViewType = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using BViewType = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using PViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void gesv(const Kokkos::Cuda& space, const AViewType& A, \ - const BViewType& B, const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_CUSOLVER," #SCALAR \ - "]"); \ - gesv_print_specialization(); \ - \ - cusolverGesvWrapper(space, IPIV, A, B); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSLAPACK_GESV_CUSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ + template <> \ + struct GESV< \ + Kokkos::Cuda, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, \ + gesv_eti_spec_avail, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + 
Kokkos::MemoryTraits>>::value> { \ + using AViewType = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using BViewType = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using PViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + \ + static void gesv(const Kokkos::Cuda& space, const AViewType& A, const BViewType& B, const PViewType& IPIV) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_CUSOLVER," #SCALAR "]"); \ + gesv_print_specialization(); \ + \ + cusolverGesvWrapper(space, IPIV, A, B); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSLAPACK_GESV_CUSOLVER(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSLAPACK_GESV_CUSOLVER(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) #if defined(KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE) KOKKOSLAPACK_GESV_CUSOLVER(float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSLAPACK_GESV_CUSOLVER(double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSLAPACK_GESV_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSLAPACK_GESV_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) +KOKKOSLAPACK_GESV_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSLAPACK_GESV_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) #endif } // namespace Impl @@ -452,103 +371,78 @@ KOKKOSLAPACK_GESV_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, namespace KokkosLapack { namespace Impl { -template -void rocsolverGesvWrapper(const ExecutionSpace& space, const IPIVViewType& IPIV, - const AViewType& A, const BViewType& B) { +template +void rocsolverGesvWrapper(const ExecutionSpace& space, const IPIVViewType& IPIV, const AViewType& A, 
+ const BViewType& B) { using Scalar = typename BViewType::non_const_value_type; using ALayout_t = typename AViewType::array_layout; using BLayout_t = typename BViewType::array_layout; const rocblas_int N = static_cast(A.extent(0)); const rocblas_int nrhs = static_cast(B.extent(1)); - const rocblas_int lda = std::is_same_v - ? A.stride(0) - : A.stride(1); - const rocblas_int ldb = std::is_same_v - ? B.stride(0) - : B.stride(1); + const rocblas_int lda = std::is_same_v ? A.stride(0) : A.stride(1); + const rocblas_int ldb = std::is_same_v ? B.stride(0) : B.stride(1); Kokkos::View info("rocsolver info"); - KokkosBlas::Impl::RocBlasSingleton& s = - KokkosBlas::Impl::RocBlasSingleton::singleton(); - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( - rocblas_set_stream(s.handle, space.hip_stream())); + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); if constexpr (std::is_same_v) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_sgesv(s.handle, N, nrhs, A.data(), - lda, IPIV.data(), B.data(), - ldb, info.data())); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + rocsolver_sgesv(s.handle, N, nrhs, A.data(), lda, IPIV.data(), B.data(), ldb, info.data())); } if constexpr (std::is_same_v) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_dgesv(s.handle, N, nrhs, A.data(), - lda, IPIV.data(), B.data(), - ldb, info.data())); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + rocsolver_dgesv(s.handle, N, nrhs, A.data(), lda, IPIV.data(), B.data(), ldb, info.data())); } if constexpr (std::is_same_v>) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_cgesv( - s.handle, N, nrhs, reinterpret_cast(A.data()), - lda, IPIV.data(), reinterpret_cast(B.data()), - ldb, info.data())); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_cgesv(s.handle, N, nrhs, reinterpret_cast(A.data()), + lda, IPIV.data(), reinterpret_cast(B.data()), + ldb, info.data())); } if constexpr (std::is_same_v>) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_zgesv( - 
s.handle, N, nrhs, reinterpret_cast(A.data()), - lda, IPIV.data(), reinterpret_cast(B.data()), - ldb, info.data())); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + rocsolver_zgesv(s.handle, N, nrhs, reinterpret_cast(A.data()), lda, IPIV.data(), + reinterpret_cast(B.data()), ldb, info.data())); } KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); } -#define KOKKOSLAPACK_GESV_ROCSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ - template <> \ - struct GESV< \ - Kokkos::HIP, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, \ - gesv_eti_spec_avail< \ - Kokkos::HIP, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>>::value> { \ - using AViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using BViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using PViewType = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void gesv(const Kokkos::HIP& space, const AViewType& A, \ - const BViewType& B, const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosLapack::gesv[TPL_ROCSOLVER," #SCALAR "]"); \ - gesv_print_specialization(); \ - \ - rocsolverGesvWrapper(space, IPIV, A, B); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSLAPACK_GESV_ROCSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ + template <> \ + struct GESV< \ + Kokkos::HIP, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, \ + gesv_eti_spec_avail, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using AViewType = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using BViewType = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using PViewType = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void gesv(const Kokkos::HIP& space, const AViewType& A, 
const BViewType& B, const PViewType& IPIV) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_ROCSOLVER," #SCALAR "]"); \ + gesv_print_specialization(); \ + \ + rocsolverGesvWrapper(space, IPIV, A, B); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSLAPACK_GESV_ROCSOLVER(float, Kokkos::LayoutLeft, Kokkos::HIPSpace) KOKKOSLAPACK_GESV_ROCSOLVER(double, Kokkos::LayoutLeft, Kokkos::HIPSpace) -KOKKOSLAPACK_GESV_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HIPSpace) -KOKKOSLAPACK_GESV_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) } // namespace Impl } // namespace KokkosLapack diff --git a/lapack/tpls/KokkosLapack_svd_tpl_spec_avail.hpp b/lapack/tpls/KokkosLapack_svd_tpl_spec_avail.hpp index 7a7403209f..cc1ad12b96 100644 --- a/lapack/tpls/KokkosLapack_svd_tpl_spec_avail.hpp +++ b/lapack/tpls/KokkosLapack_svd_tpl_spec_avail.hpp @@ -20,148 +20,104 @@ namespace KokkosLapack { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct svd_tpl_spec_avail { enum : bool { value = false }; }; // LAPACK -#if defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) || \ - defined(KOKKOSKERNELS_ENABLE_TPL_MKL) -#define KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(SCALAR, LAYOUT, EXECSPACE) \ - template <> \ - struct svd_tpl_spec_avail< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#if defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) || defined(KOKKOSKERNELS_ENABLE_TPL_MKL) +#define KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(SCALAR, LAYOUT, EXECSPACE) \ + template <> \ + struct svd_tpl_spec_avail< \ + EXECSPACE, \ + 
Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; #if defined(KOKKOS_ENABLE_SERIAL) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, - Kokkos::Serial) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, - Kokkos::Serial) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Serial) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Serial) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, Kokkos::Serial) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, Kokkos::Serial) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial) #endif #if defined(KOKKOS_ENABLE_OPENMP) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, - Kokkos::OpenMP) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, - Kokkos::OpenMP) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::OpenMP) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::OpenMP) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, Kokkos::OpenMP) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, Kokkos::OpenMP) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP) #endif #if defined(KOKKOS_ENABLE_THREADS) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, - Kokkos::Threads) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, - Kokkos::Threads) 
-KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Threads) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Threads) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, Kokkos::Threads) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, Kokkos::Threads) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Threads) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Threads) #endif #endif // KOKKOSKERNELS_ENABLE_TPL_LAPACK || KOKKOSKERNELS_ENABLE_TPL_MKL // CUSOLVER #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER -#define KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(SCALAR, LAYOUT, MEMSPACE) \ - template <> \ - struct svd_tpl_spec_avail< \ - Kokkos::Cuda, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(SCALAR, LAYOUT, MEMSPACE) \ + template <> \ + struct svd_tpl_spec_avail< \ + Kokkos::Cuda, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(float, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(double, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(double, 
Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) #if defined(KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(float, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(double, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) #endif // CUDAUVMSPACE #endif // CUSOLVER // ROCSOLVER #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER -#define KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(SCALAR, LAYOUT, MEMSPACE) \ - template <> \ - struct svd_tpl_spec_avail< \ - Kokkos::HIP, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(SCALAR, LAYOUT, MEMSPACE) \ + template <> \ + struct svd_tpl_spec_avail< \ + Kokkos::HIP, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; 
-KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(float, Kokkos::LayoutLeft, - Kokkos::HIPSpace) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(double, Kokkos::LayoutLeft, - Kokkos::HIPSpace) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(float, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(double, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) #if defined(KOKKOSKERNELS_INST_MEMSPACE_HIPMANAGEDSPACE) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(float, Kokkos::LayoutLeft, - Kokkos::HIPManagedSpace) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(double, Kokkos::LayoutLeft, - Kokkos::HIPManagedSpace) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::HIPManagedSpace) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::HIPManagedSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(float, Kokkos::LayoutLeft, Kokkos::HIPManagedSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(double, Kokkos::LayoutLeft, Kokkos::HIPManagedSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPManagedSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPManagedSpace) #endif // HIPMANAGEDSPACE #endif // ROCSOLVER diff --git a/lapack/tpls/KokkosLapack_svd_tpl_spec_decl.hpp b/lapack/tpls/KokkosLapack_svd_tpl_spec_decl.hpp index 4385fa40d6..01255bf427 100644 --- a/lapack/tpls/KokkosLapack_svd_tpl_spec_decl.hpp +++ b/lapack/tpls/KokkosLapack_svd_tpl_spec_decl.hpp @@ -22,8 +22,7 @@ namespace KokkosLapack { namespace Impl { -template 
+template inline void svd_print_specialization() { #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER @@ -31,8 +30,7 @@ inline void svd_print_specialization() { printf( "KokkosLapack::svd<> TPL Cusolver specialization for < %s , %s, %s, %s " ">\n", - typeid(AMatrix).name(), typeid(SVector).name(), typeid(UMatrix).name(), - typeid(VMatrix).name()); + typeid(AMatrix).name(), typeid(SVector).name(), typeid(UMatrix).name(), typeid(VMatrix).name()); } #endif #endif @@ -41,18 +39,15 @@ inline void svd_print_specialization() { } // namespace KokkosLapack // LAPACK -#if defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ - !defined(KOKKOSKERNELS_ENABLE_TPL_MKL) +#if defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && !defined(KOKKOSKERNELS_ENABLE_TPL_MKL) #include "KokkosLapack_Host_tpl.hpp" namespace KokkosLapack { namespace Impl { -template -void lapackSvdWrapper(const ExecutionSpace& /* space */, const char jobu[], - const char jobvt[], const AMatrix& A, const SVector& S, - const UMatrix& U, const VMatrix& Vt) { +template +void lapackSvdWrapper(const ExecutionSpace& /* space */, const char jobu[], const char jobvt[], const AMatrix& A, + const SVector& S, const UMatrix& U, const VMatrix& Vt) { using memory_space = typename AMatrix::memory_space; using Scalar = typename AMatrix::non_const_value_type; using Magnitude = typename SVector::non_const_value_type; @@ -74,128 +69,96 @@ void lapackSvdWrapper(const ExecutionSpace& /* space */, const char jobu[], const int ldvt = Vt.stride(1); int lwork = -1, info = 0; - Kokkos::View rwork("svd rwork buffer", - 5 * Kokkos::min(m, n)); + Kokkos::View rwork("svd rwork buffer", 5 * Kokkos::min(m, n)); Kokkos::View work("svd work buffer", 1); if constexpr (Kokkos::ArithTraits::is_complex) { HostLapack>::gesvd( - jobu[0], jobvt[0], m, n, - reinterpret_cast*>(A.data()), lda, S.data(), + jobu[0], jobvt[0], m, n, reinterpret_cast*>(A.data()), lda, S.data(), reinterpret_cast*>(U.data()), ldu, 
reinterpret_cast*>(Vt.data()), ldvt, - reinterpret_cast*>(work.data()), lwork, - rwork.data(), info); + reinterpret_cast*>(work.data()), lwork, rwork.data(), info); lwork = static_cast(work(0).real()); work = Kokkos::View("svd work buffer", lwork); HostLapack>::gesvd( - jobu[0], jobvt[0], m, n, - reinterpret_cast*>(A.data()), lda, S.data(), + jobu[0], jobvt[0], m, n, reinterpret_cast*>(A.data()), lda, S.data(), reinterpret_cast*>(U.data()), ldu, reinterpret_cast*>(Vt.data()), ldvt, - reinterpret_cast*>(work.data()), lwork, - rwork.data(), info); + reinterpret_cast*>(work.data()), lwork, rwork.data(), info); } else { - HostLapack::gesvd(jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), - U.data(), ldu, Vt.data(), ldvt, work.data(), - lwork, rwork.data(), info); + HostLapack::gesvd(jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), U.data(), ldu, Vt.data(), ldvt, + work.data(), lwork, rwork.data(), info); lwork = static_cast(work(0)); work = Kokkos::View("svd work buffer", lwork); - HostLapack::gesvd(jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), - U.data(), ldu, Vt.data(), ldvt, work.data(), - lwork, rwork.data(), info); + HostLapack::gesvd(jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), U.data(), ldu, Vt.data(), ldvt, + work.data(), lwork, rwork.data(), info); } } -#define KOKKOSLAPACK_SVD_LAPACK(SCALAR, LAYOUT, EXEC_SPACE) \ - template <> \ - struct SVD< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, \ - svd_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>>::value> { \ - using AMatrix = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using SVector = \ - 
Kokkos::View::mag_type*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using UMatrix = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using VMatrix = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void svd(const EXEC_SPACE& space, const char jobu[], \ - const char jobvt[], const AMatrix& A, const SVector& S, \ - const UMatrix& U, const VMatrix& Vt) { \ - Kokkos::Profiling::pushRegion("KokkosLapack::svd[TPL_LAPACK," #SCALAR \ - "]"); \ - svd_print_specialization(); \ - \ - lapackSvdWrapper(space, jobu, jobvt, A, S, U, Vt); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSLAPACK_SVD_LAPACK(SCALAR, LAYOUT, EXEC_SPACE) \ + template <> \ + struct SVD, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, \ + svd_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using AMatrix = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using SVector = \ + Kokkos::View::mag_type*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using UMatrix = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using VMatrix = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void svd(const EXEC_SPACE& space, const char jobu[], const char jobvt[], const AMatrix& A, \ + const SVector& S, const UMatrix& U, const VMatrix& Vt) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::svd[TPL_LAPACK," #SCALAR "]"); \ + svd_print_specialization(); \ + \ + lapackSvdWrapper(space, jobu, jobvt, A, S, U, Vt); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #if defined(KOKKOS_ENABLE_SERIAL) KOKKOSLAPACK_SVD_LAPACK(float, Kokkos::LayoutLeft, Kokkos::Serial) KOKKOSLAPACK_SVD_LAPACK(double, Kokkos::LayoutLeft, 
Kokkos::Serial) -KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Serial) -KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Serial) +KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial) +KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial) #endif #if defined(KOKKOS_ENABLE_OPENMP) KOKKOSLAPACK_SVD_LAPACK(float, Kokkos::LayoutLeft, Kokkos::OpenMP) KOKKOSLAPACK_SVD_LAPACK(double, Kokkos::LayoutLeft, Kokkos::OpenMP) -KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP) -KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP) +KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP) +KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP) #endif #if defined(KOKKOS_ENABLE_THREADS) KOKKOSLAPACK_SVD_LAPACK(float, Kokkos::LayoutLeft, Kokkos::Threads) KOKKOSLAPACK_SVD_LAPACK(double, Kokkos::LayoutLeft, Kokkos::Threads) -KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Threads) -KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Threads) +KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Threads) +KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Threads) #endif } // namespace Impl @@ -208,11 +171,9 @@ KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, namespace KokkosLapack { namespace Impl { -template -void mklSvdWrapper(const ExecutionSpace& /* space */, const char jobu[], - const char jobvt[], const AMatrix& A, const SVector& S, - const UMatrix& U, const VMatrix& Vt) { +template +void mklSvdWrapper(const ExecutionSpace& /* space */, const char jobu[], const char jobvt[], const AMatrix& A, + const SVector& S, const UMatrix& U, const VMatrix& Vt) { using memory_space = typename AMatrix::memory_space; using Scalar = typename AMatrix::non_const_value_type; using Magnitude = typename 
SVector::non_const_value_type; @@ -233,33 +194,25 @@ void mklSvdWrapper(const ExecutionSpace& /* space */, const char jobu[], const lapack_int ldu = U.stride(1); const lapack_int ldvt = Vt.stride(1); - Kokkos::View rwork("svd rwork buffer", - Kokkos::min(m, n) - 1); + Kokkos::View rwork("svd rwork buffer", Kokkos::min(m, n) - 1); lapack_int ret = 0; if constexpr (std::is_same_v) { - ret = - LAPACKE_sgesvd(LAPACK_COL_MAJOR, jobu[0], jobvt[0], m, n, A.data(), lda, - S.data(), U.data(), ldu, Vt.data(), ldvt, rwork.data()); + ret = LAPACKE_sgesvd(LAPACK_COL_MAJOR, jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), U.data(), ldu, Vt.data(), + ldvt, rwork.data()); } if constexpr (std::is_same_v) { - ret = - LAPACKE_dgesvd(LAPACK_COL_MAJOR, jobu[0], jobvt[0], m, n, A.data(), lda, - S.data(), U.data(), ldu, Vt.data(), ldvt, rwork.data()); + ret = LAPACKE_dgesvd(LAPACK_COL_MAJOR, jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), U.data(), ldu, Vt.data(), + ldvt, rwork.data()); } if constexpr (std::is_same_v>) { - ret = LAPACKE_cgesvd( - LAPACK_COL_MAJOR, jobu[0], jobvt[0], m, n, - reinterpret_cast(A.data()), lda, S.data(), - reinterpret_cast(U.data()), ldu, - reinterpret_cast(Vt.data()), ldvt, rwork.data()); + ret = LAPACKE_cgesvd(LAPACK_COL_MAJOR, jobu[0], jobvt[0], m, n, reinterpret_cast(A.data()), + lda, S.data(), reinterpret_cast(U.data()), ldu, + reinterpret_cast(Vt.data()), ldvt, rwork.data()); } if constexpr (std::is_same_v>) { - ret = LAPACKE_zgesvd( - LAPACK_COL_MAJOR, jobu[0], jobvt[0], m, n, - reinterpret_cast(A.data()), lda, S.data(), - reinterpret_cast(U.data()), ldu, - reinterpret_cast(Vt.data()), ldvt, - rwork.data()); + ret = LAPACKE_zgesvd(LAPACK_COL_MAJOR, jobu[0], jobvt[0], m, n, reinterpret_cast(A.data()), + lda, S.data(), reinterpret_cast(U.data()), ldu, + reinterpret_cast(Vt.data()), ldvt, rwork.data()); } if (ret != 0) { @@ -269,90 +222,67 @@ void mklSvdWrapper(const ExecutionSpace& /* space */, const char jobu[], } } -#define 
KOKKOSLAPACK_SVD_MKL(SCALAR, LAYOUT, EXEC_SPACE) \ - template <> \ - struct SVD< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, \ - svd_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>>::value> { \ - using AMatrix = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using SVector = \ - Kokkos::View::mag_type*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using UMatrix = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using VMatrix = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void svd(const EXEC_SPACE& space, const char jobu[], \ - const char jobvt[], const AMatrix& A, const SVector& S, \ - const UMatrix& U, const VMatrix& Vt) { \ - Kokkos::Profiling::pushRegion("KokkosLapack::svd[TPL_LAPACK," #SCALAR \ - "]"); \ - svd_print_specialization(); \ - \ - mklSvdWrapper(space, jobu, jobvt, A, S, U, Vt); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSLAPACK_SVD_MKL(SCALAR, LAYOUT, EXEC_SPACE) \ + template <> \ + struct SVD, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, \ + svd_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using AMatrix = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using SVector = \ + Kokkos::View::mag_type*, LAYOUT, Kokkos::Device, \ + 
Kokkos::MemoryTraits>; \ + using UMatrix = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using VMatrix = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void svd(const EXEC_SPACE& space, const char jobu[], const char jobvt[], const AMatrix& A, \ + const SVector& S, const UMatrix& U, const VMatrix& Vt) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::svd[TPL_LAPACK," #SCALAR "]"); \ + svd_print_specialization(); \ + \ + mklSvdWrapper(space, jobu, jobvt, A, S, U, Vt); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #if defined(KOKKOS_ENABLE_SERIAL) KOKKOSLAPACK_SVD_MKL(float, Kokkos::LayoutLeft, Kokkos::Serial) KOKKOSLAPACK_SVD_MKL(double, Kokkos::LayoutLeft, Kokkos::Serial) KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial) -KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Serial) +KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial) #endif #if defined(KOKKOS_ENABLE_OPENMP) KOKKOSLAPACK_SVD_MKL(float, Kokkos::LayoutLeft, Kokkos::OpenMP) KOKKOSLAPACK_SVD_MKL(double, Kokkos::LayoutLeft, Kokkos::OpenMP) KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP) -KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP) +KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP) #endif #if defined(KOKKOS_ENABLE_THREADS) KOKKOSLAPACK_SVD_MKL(float, Kokkos::LayoutLeft, Kokkos::Threads) KOKKOSLAPACK_SVD_MKL(double, Kokkos::LayoutLeft, Kokkos::Threads) -KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Threads) -KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Threads) +KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Threads) +KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Threads) #endif } // namespace Impl @@ -366,11 +296,9 @@ KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, namespace KokkosLapack { namespace Impl { -template -void cusolverSvdWrapper(const ExecutionSpace& 
space, const char jobu[], - const char jobvt[], const AMatrix& A, const SVector& S, - const UMatrix& U, const VMatrix& Vt) { +template +void cusolverSvdWrapper(const ExecutionSpace& space, const char jobu[], const char jobvt[], const AMatrix& A, + const SVector& S, const UMatrix& U, const VMatrix& Vt) { using memory_space = typename AMatrix::memory_space; using Scalar = typename AMatrix::non_const_value_type; using Magnitude = typename SVector::non_const_value_type; @@ -393,128 +321,98 @@ void cusolverSvdWrapper(const ExecutionSpace& space, const char jobu[], int lwork = 0; Kokkos::View info("svd info"); - Kokkos::View rwork("svd rwork buffer", - Kokkos::min(m, n) - 1); + Kokkos::View rwork("svd rwork buffer", Kokkos::min(m, n) - 1); CudaLapackSingleton& s = CudaLapackSingleton::singleton(); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( - cusolverDnSetStream(s.handle, space.cuda_stream())); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSetStream(s.handle, space.cuda_stream())); if constexpr (std::is_same_v) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( - cusolverDnSgesvd_bufferSize(s.handle, m, n, &lwork)); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSgesvd_bufferSize(s.handle, m, n, &lwork)); Kokkos::View work("svd work buffer", lwork); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSgesvd( - s.handle, jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), U.data(), - ldu, Vt.data(), ldvt, work.data(), lwork, rwork.data(), info.data())); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSgesvd(s.handle, jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), + U.data(), ldu, Vt.data(), ldvt, work.data(), lwork, rwork.data(), + info.data())); } if constexpr (std::is_same_v) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( - cusolverDnDgesvd_bufferSize(s.handle, m, n, &lwork)); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnDgesvd_bufferSize(s.handle, m, n, &lwork)); Kokkos::View work("svd work buffer", lwork); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnDgesvd( - s.handle, jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), 
U.data(), - ldu, Vt.data(), ldvt, work.data(), lwork, rwork.data(), info.data())); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnDgesvd(s.handle, jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), + U.data(), ldu, Vt.data(), ldvt, work.data(), lwork, rwork.data(), + info.data())); } if constexpr (std::is_same_v>) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( - cusolverDnCgesvd_bufferSize(s.handle, m, n, &lwork)); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnCgesvd_bufferSize(s.handle, m, n, &lwork)); Kokkos::View work("svd work buffer", lwork); KOKKOS_CUSOLVER_SAFE_CALL_IMPL( - cusolverDnCgesvd(s.handle, jobu[0], jobvt[0], m, n, - reinterpret_cast(A.data()), lda, S.data(), - reinterpret_cast(U.data()), ldu, - reinterpret_cast(Vt.data()), ldvt, - reinterpret_cast(work.data()), lwork, - rwork.data(), info.data())); + cusolverDnCgesvd(s.handle, jobu[0], jobvt[0], m, n, reinterpret_cast(A.data()), lda, S.data(), + reinterpret_cast(U.data()), ldu, reinterpret_cast(Vt.data()), ldvt, + reinterpret_cast(work.data()), lwork, rwork.data(), info.data())); } if constexpr (std::is_same_v>) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( - cusolverDnZgesvd_bufferSize(s.handle, m, n, &lwork)); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnZgesvd_bufferSize(s.handle, m, n, &lwork)); Kokkos::View work("svd work buffer", lwork); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( - cusolverDnZgesvd(s.handle, jobu[0], jobvt[0], m, n, - reinterpret_cast(A.data()), lda, - S.data(), reinterpret_cast(U.data()), - ldu, reinterpret_cast(Vt.data()), - ldvt, reinterpret_cast(work.data()), - lwork, rwork.data(), info.data())); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnZgesvd( + s.handle, jobu[0], jobvt[0], m, n, reinterpret_cast(A.data()), lda, S.data(), + reinterpret_cast(U.data()), ldu, reinterpret_cast(Vt.data()), ldvt, + reinterpret_cast(work.data()), lwork, rwork.data(), info.data())); } KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSetStream(s.handle, NULL)); } -#define KOKKOSLAPACK_SVD_CUSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ - template <> 
\ - struct SVD< \ - Kokkos::Cuda, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, \ - svd_eti_spec_avail< \ - Kokkos::Cuda, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>>::value> { \ - using AMatrix = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using SVector = \ - Kokkos::View::mag_type*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using UMatrix = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using VMatrix = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void svd(const Kokkos::Cuda& space, const char jobu[], \ - const char jobvt[], const AMatrix& A, const SVector& S, \ - const UMatrix& U, const VMatrix& Vt) { \ - Kokkos::Profiling::pushRegion("KokkosLapack::svd[TPL_CUSOLVER," #SCALAR \ - "]"); \ - svd_print_specialization(); \ - \ - cusolverSvdWrapper(space, jobu, jobvt, A, S, U, Vt); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSLAPACK_SVD_CUSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ + template <> \ + struct SVD, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, \ + svd_eti_spec_avail< \ + Kokkos::Cuda, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using AMatrix = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using SVector = Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, Kokkos::MemoryTraits>; \ + using UMatrix = Kokkos::View, \ + 
Kokkos::MemoryTraits>; \ + using VMatrix = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void svd(const Kokkos::Cuda& space, const char jobu[], const char jobvt[], const AMatrix& A, \ + const SVector& S, const UMatrix& U, const VMatrix& Vt) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::svd[TPL_CUSOLVER," #SCALAR "]"); \ + svd_print_specialization(); \ + \ + cusolverSvdWrapper(space, jobu, jobvt, A, S, U, Vt); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSLAPACK_SVD_CUSOLVER(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSLAPACK_SVD_CUSOLVER(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSLAPACK_SVD_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSLAPACK_SVD_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::CudaSpace) +KOKKOSLAPACK_SVD_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_SVD_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) #if defined(KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE) KOKKOSLAPACK_SVD_CUSOLVER(float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSLAPACK_SVD_CUSOLVER(double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSLAPACK_SVD_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSLAPACK_SVD_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) +KOKKOSLAPACK_SVD_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSLAPACK_SVD_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) #endif } // namespace Impl @@ -529,11 +427,9 @@ KOKKOSLAPACK_SVD_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, namespace KokkosLapack { namespace Impl { -template -void rocsolverSvdWrapper(const ExecutionSpace& space, const char jobu[], - const char jobvt[], const AMatrix& A, const SVector& S, - const UMatrix& U, const VMatrix& Vt) { +template +void rocsolverSvdWrapper(const ExecutionSpace& space, const char jobu[], const char jobvt[], const AMatrix& A, + const SVector& S, const 
UMatrix& U, const VMatrix& Vt) { using memory_space = typename AMatrix::memory_space; using Scalar = typename AMatrix::non_const_value_type; using Magnitude = typename SVector::non_const_value_type; @@ -574,111 +470,84 @@ void rocsolverSvdWrapper(const ExecutionSpace& space, const char jobu[], const rocblas_workmode WorkMode = rocblas_outofplace; Kokkos::View info("svd info"); - Kokkos::View rwork("svd rwork buffer", - Kokkos::min(m, n) - 1); + Kokkos::View rwork("svd rwork buffer", Kokkos::min(m, n) - 1); - KokkosBlas::Impl::RocBlasSingleton& s = - KokkosBlas::Impl::RocBlasSingleton::singleton(); - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( - rocblas_set_stream(s.handle, space.hip_stream())); + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); if constexpr (std::is_same_v) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_sgesvd( - s.handle, UVecMode, VVecMode, m, n, A.data(), lda, S.data(), U.data(), - ldu, Vt.data(), ldvt, rwork.data(), WorkMode, info.data())); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_sgesvd(s.handle, UVecMode, VVecMode, m, n, A.data(), lda, S.data(), + U.data(), ldu, Vt.data(), ldvt, rwork.data(), WorkMode, + info.data())); } if constexpr (std::is_same_v) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_dgesvd( - s.handle, UVecMode, VVecMode, m, n, A.data(), lda, S.data(), U.data(), - ldu, Vt.data(), ldvt, rwork.data(), WorkMode, info.data())); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_dgesvd(s.handle, UVecMode, VVecMode, m, n, A.data(), lda, S.data(), + U.data(), ldu, Vt.data(), ldvt, rwork.data(), WorkMode, + info.data())); } if constexpr (std::is_same_v>) { KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_cgesvd( - s.handle, UVecMode, VVecMode, m, n, - reinterpret_cast(A.data()), lda, S.data(), - reinterpret_cast(U.data()), ldu, - reinterpret_cast(Vt.data()), ldvt, rwork.data(), - WorkMode, info.data())); + s.handle, UVecMode, VVecMode, m, n, 
reinterpret_cast(A.data()), lda, S.data(), + reinterpret_cast(U.data()), ldu, reinterpret_cast(Vt.data()), + ldvt, rwork.data(), WorkMode, info.data())); } if constexpr (std::is_same_v>) { KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_zgesvd( - s.handle, UVecMode, VVecMode, m, n, - reinterpret_cast(A.data()), lda, S.data(), - reinterpret_cast(U.data()), ldu, - reinterpret_cast(Vt.data()), ldvt, - rwork.data(), WorkMode, info.data())); + s.handle, UVecMode, VVecMode, m, n, reinterpret_cast(A.data()), lda, S.data(), + reinterpret_cast(U.data()), ldu, reinterpret_cast(Vt.data()), + ldvt, rwork.data(), WorkMode, info.data())); } KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); } -#define KOKKOSLAPACK_SVD_ROCSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ - template <> \ - struct SVD< \ - Kokkos::HIP, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, \ - svd_eti_spec_avail< \ - Kokkos::HIP, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>>::value> { \ - using AMatrix = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using SVector = \ - Kokkos::View::mag_type*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using UMatrix = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using VMatrix = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void svd(const Kokkos::HIP& space, const char jobu[], \ - const char jobvt[], const AMatrix& A, const SVector& S, \ - const UMatrix& U, const VMatrix& Vt) { \ - Kokkos::Profiling::pushRegion("KokkosLapack::svd[TPL_ROCSOLVER," #SCALAR \ - "]"); \ - svd_print_specialization(); \ - \ - rocsolverSvdWrapper(space, jobu, jobvt, A, S, U, Vt); \ - Kokkos::Profiling::popRegion(); \ - 
} \ +#define KOKKOSLAPACK_SVD_ROCSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ + template <> \ + struct SVD< \ + Kokkos::HIP, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, \ + svd_eti_spec_avail< \ + Kokkos::HIP, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using AMatrix = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using SVector = Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, Kokkos::MemoryTraits>; \ + using UMatrix = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using VMatrix = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void svd(const Kokkos::HIP& space, const char jobu[], const char jobvt[], const AMatrix& A, \ + const SVector& S, const UMatrix& U, const VMatrix& Vt) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::svd[TPL_ROCSOLVER," #SCALAR "]"); \ + svd_print_specialization(); \ + \ + rocsolverSvdWrapper(space, jobu, jobvt, A, S, U, Vt); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSLAPACK_SVD_ROCSOLVER(float, Kokkos::LayoutLeft, Kokkos::HIPSpace) KOKKOSLAPACK_SVD_ROCSOLVER(double, Kokkos::LayoutLeft, Kokkos::HIPSpace) -KOKKOSLAPACK_SVD_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HIPSpace) -KOKKOSLAPACK_SVD_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HIPSpace) +KOKKOSLAPACK_SVD_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSLAPACK_SVD_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) #if defined(KOKKOSKERNELS_INST_MEMSPACE_HIPMANAGEDSPACE) KOKKOSLAPACK_SVD_ROCSOLVER(float, Kokkos::LayoutLeft, Kokkos::HIPManagedSpace) KOKKOSLAPACK_SVD_ROCSOLVER(double, Kokkos::LayoutLeft, Kokkos::HIPManagedSpace) 
-KOKKOSLAPACK_SVD_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HIPManagedSpace) -KOKKOSLAPACK_SVD_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HIPManagedSpace) +KOKKOSLAPACK_SVD_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPManagedSpace) +KOKKOSLAPACK_SVD_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPManagedSpace) #endif } // namespace Impl diff --git a/lapack/tpls/KokkosLapack_trtri_tpl_spec_avail.hpp b/lapack/tpls/KokkosLapack_trtri_tpl_spec_avail.hpp index 7251d97086..6ec8d26a98 100644 --- a/lapack/tpls/KokkosLapack_trtri_tpl_spec_avail.hpp +++ b/lapack/tpls/KokkosLapack_trtri_tpl_spec_avail.hpp @@ -27,14 +27,13 @@ struct trtri_tpl_spec_avail { }; // Generic Host side LAPACK (could be MKL or whatever) -#define KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL(SCALAR, LAYOUTA, MEMSPACE) \ - template \ - struct trtri_tpl_spec_avail< \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL(SCALAR, LAYOUTA, MEMSPACE) \ + template \ + struct trtri_tpl_spec_avail< \ + Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK @@ -51,80 +50,52 @@ struct trtri_tpl_spec_avail { #define KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(SCALAR, LAYOUTA, MEMSPACE) #endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, - Kokkos::HostSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, Kokkos::HostSpace) #ifdef KOKKOS_ENABLE_CUDA -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) #endif 
-KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, - Kokkos::HostSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, Kokkos::HostSpace) #ifdef KOKKOS_ENABLE_CUDA -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) #endif -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) #ifdef KOKKOS_ENABLE_CUDA -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) #endif -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) #ifdef KOKKOS_ENABLE_CUDA -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) #endif -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutRight, - Kokkos::HostSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutRight, 
Kokkos::HostSpace) #ifdef KOKKOS_ENABLE_CUDA -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutRight, - Kokkos::CudaSpace) -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) #endif -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutRight, - Kokkos::HostSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutRight, Kokkos::HostSpace) #ifdef KOKKOS_ENABLE_CUDA -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutRight, - Kokkos::CudaSpace) -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) #endif -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HostSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HostSpace) #ifdef KOKKOS_ENABLE_CUDA -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) #endif -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HostSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HostSpace) #ifdef KOKKOS_ENABLE_CUDA -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::CudaSpace) 
-KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) #endif } // namespace Impl diff --git a/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp b/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp index b7e9c6e341..b326e722a0 100644 --- a/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp +++ b/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp @@ -27,112 +27,86 @@ namespace KokkosLapack { namespace Impl { #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK -#define KOKKOSLAPACK_TRTRI_LAPACK_HOST(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, \ - MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct TRTRI >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef SCALAR_TYPE SCALAR; \ - typedef Kokkos::View > \ - RViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void trtri(const RViewType& R, const char uplo[], \ - const char diag[], const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosLapack::trtri[TPL_LAPACK," #SCALAR_TYPE "]"); \ - const int M = static_cast(A.extent(0)); \ - \ - bool A_is_layout_left = \ - std::is_same::value; \ - \ - const int AST = A_is_layout_left ? A.stride(1) : A.stride(0), \ - LDA = (AST == 0) ? 1 : AST; \ - \ - char uplo_; \ - \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = A_is_layout_left ? 'L' : 'U'; \ - else \ - uplo_ = A_is_layout_left ? 
'U' : 'L'; \ - \ - R() = HostLapack::trtri( \ - uplo_, diag[0], M, \ - reinterpret_cast(A.data()), LDA); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSLAPACK_TRTRI_LAPACK_HOST(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct TRTRI >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef SCALAR_TYPE SCALAR; \ + typedef Kokkos::View > \ + RViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void trtri(const RViewType& R, const char uplo[], const char diag[], const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::trtri[TPL_LAPACK," #SCALAR_TYPE "]"); \ + const int M = static_cast(A.extent(0)); \ + \ + bool A_is_layout_left = std::is_same::value; \ + \ + const int AST = A_is_layout_left ? A.stride(1) : A.stride(0), LDA = (AST == 0) ? 1 : AST; \ + \ + char uplo_; \ + \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = A_is_layout_left ? 'L' : 'U'; \ + else \ + uplo_ = A_is_layout_left ? 
'U' : 'L'; \ + \ + R() = HostLapack::trtri(uplo_, diag[0], M, \ + reinterpret_cast(A.data()), LDA); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #else -#define KOKKOSLAPACK_TRTRI_LAPACK_HOST(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, \ - MEM_SPACE, ETI_SPEC_AVAIL) +#define KOKKOSLAPACK_TRTRI_LAPACK_HOST(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) #endif // KOKKOSKERNELS_ENABLE_TPL_LAPACK #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA -#define KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(SCALAR_TYPE, BASE_SCALAR_TYPE, \ - MAGMA_FN, LAYOUTA, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct TRTRI >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef SCALAR_TYPE SCALAR; \ - typedef Kokkos::View > \ - RViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void trtri(const RViewType& R, const char uplo[], \ - const char diag[], const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosLapack::trtri[TPL_LAPACK," #SCALAR_TYPE "]"); \ - magma_int_t M = static_cast(A.extent(0)); \ - \ - bool A_is_layout_left = \ - std::is_same::value; \ - \ - magma_int_t AST = A_is_layout_left ? A.stride(1) : A.stride(0), \ - LDA = (AST == 0) ? 1 : AST; \ - magma_int_t info = 0; \ - magma_uplo_t uplo_; \ - magma_diag_t diag_; \ - \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = A_is_layout_left ? MagmaLower : MagmaUpper; \ - else \ - uplo_ = A_is_layout_left ? 
MagmaUpper : MagmaLower; \ - \ - if (diag[0] == 'U' || diag[0] == 'u') \ - diag_ = MagmaUnit; \ - else \ - diag_ = MagmaNonUnit; \ - \ - KokkosLapack::Impl::MagmaSingleton& s = \ - KokkosLapack::Impl::MagmaSingleton::singleton(); \ - R() = MAGMA_FN(uplo_, diag_, M, \ - reinterpret_cast( \ - const_cast(A.data())), \ - LDA, &info); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(SCALAR_TYPE, BASE_SCALAR_TYPE, MAGMA_FN, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct TRTRI >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef SCALAR_TYPE SCALAR; \ + typedef Kokkos::View > \ + RViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void trtri(const RViewType& R, const char uplo[], const char diag[], const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::trtri[TPL_LAPACK," #SCALAR_TYPE "]"); \ + magma_int_t M = static_cast(A.extent(0)); \ + \ + bool A_is_layout_left = std::is_same::value; \ + \ + magma_int_t AST = A_is_layout_left ? A.stride(1) : A.stride(0), LDA = (AST == 0) ? 1 : AST; \ + magma_int_t info = 0; \ + magma_uplo_t uplo_; \ + magma_diag_t diag_; \ + \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = A_is_layout_left ? MagmaLower : MagmaUpper; \ + else \ + uplo_ = A_is_layout_left ? 
MagmaUpper : MagmaLower; \ + \ + if (diag[0] == 'U' || diag[0] == 'u') \ + diag_ = MagmaUnit; \ + else \ + diag_ = MagmaNonUnit; \ + \ + KokkosLapack::Impl::MagmaSingleton& s = KokkosLapack::Impl::MagmaSingleton::singleton(); \ + R() = MAGMA_FN(uplo_, diag_, M, reinterpret_cast(const_cast(A.data())), LDA, \ + &info); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #else -#define KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(SCALAR_TYPE, BASE_SCALAR_TYPE, \ - MAGMA_FN, LAYOUTA, MEM_SPACE, \ - ETI_SPEC_AVAIL) +#define KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(SCALAR_TYPE, BASE_SCALAR_TYPE, MAGMA_FN, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) #endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA // Explicitly define the TRTRI class for all permutations listed below @@ -140,63 +114,50 @@ namespace Impl { // Handle type and space permutations #ifdef KOKKOS_ENABLE_CUDA -#define KOKKOSLAPACK_DTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_HOST(double, double, LAYOUTA, Kokkos::HostSpace, \ - ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(double, magmaDouble_ptr, magma_dtrtri_gpu, \ - LAYOUTA, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(double, magmaDouble_ptr, magma_dtrtri_gpu, \ - LAYOUTA, Kokkos::CudaUVMSpace, \ +#define KOKKOSLAPACK_DTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(double, double, LAYOUTA, Kokkos::HostSpace, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(double, magmaDouble_ptr, magma_dtrtri_gpu, LAYOUTA, Kokkos::CudaSpace, \ + ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(double, magmaDouble_ptr, magma_dtrtri_gpu, LAYOUTA, Kokkos::CudaUVMSpace, \ ETI_SPEC_AVAIL) -#define KOKKOSLAPACK_STRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_HOST(float, float, LAYOUTA, Kokkos::HostSpace, \ - ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(float, magmaFloat_ptr, magma_strtri_gpu, \ - LAYOUTA, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(float, magmaFloat_ptr, 
magma_strtri_gpu, \ - LAYOUTA, Kokkos::CudaUVMSpace, \ +#define KOKKOSLAPACK_STRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(float, float, LAYOUTA, Kokkos::HostSpace, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(float, magmaFloat_ptr, magma_strtri_gpu, LAYOUTA, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(float, magmaFloat_ptr, magma_strtri_gpu, LAYOUTA, Kokkos::CudaUVMSpace, \ ETI_SPEC_AVAIL) -#define KOKKOSLAPACK_ZTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, \ - std::complex, LAYOUTA, \ - Kokkos::HostSpace, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(Kokkos::complex, \ - magmaDoubleComplex_ptr, magma_ztrtri_gpu, \ - LAYOUTA, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_MAGMA( \ - Kokkos::complex, magmaDoubleComplex_ptr, magma_ztrtri_gpu, \ - LAYOUTA, Kokkos::CudaUVMSpace, ETI_SPEC_AVAIL) - -#define KOKKOSLAPACK_CTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, std::complex, \ - LAYOUTA, Kokkos::HostSpace, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(Kokkos::complex, \ - magmaFloatComplex_ptr, magma_ctrtri_gpu, \ - LAYOUTA, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_MAGMA( \ - Kokkos::complex, magmaFloatComplex_ptr, magma_ctrtri_gpu, \ - LAYOUTA, Kokkos::CudaUVMSpace, ETI_SPEC_AVAIL) +#define KOKKOSLAPACK_ZTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, std::complex, LAYOUTA, Kokkos::HostSpace, \ + ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(Kokkos::complex, magmaDoubleComplex_ptr, magma_ztrtri_gpu, LAYOUTA, \ + Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(Kokkos::complex, magmaDoubleComplex_ptr, magma_ztrtri_gpu, LAYOUTA, \ + Kokkos::CudaUVMSpace, ETI_SPEC_AVAIL) + +#define KOKKOSLAPACK_CTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, std::complex, 
LAYOUTA, Kokkos::HostSpace, \ + ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(Kokkos::complex, magmaFloatComplex_ptr, magma_ctrtri_gpu, LAYOUTA, \ + Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(Kokkos::complex, magmaFloatComplex_ptr, magma_ctrtri_gpu, LAYOUTA, \ + Kokkos::CudaUVMSpace, ETI_SPEC_AVAIL) #else -#define KOKKOSLAPACK_DTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_HOST(double, double, LAYOUTA, Kokkos::HostSpace, \ - ETI_SPEC_AVAIL) +#define KOKKOSLAPACK_DTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(double, double, LAYOUTA, Kokkos::HostSpace, ETI_SPEC_AVAIL) -#define KOKKOSLAPACK_STRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_HOST(float, float, LAYOUTA, Kokkos::HostSpace, \ - ETI_SPEC_AVAIL) +#define KOKKOSLAPACK_STRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(float, float, LAYOUTA, Kokkos::HostSpace, ETI_SPEC_AVAIL) -#define KOKKOSLAPACK_ZTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, \ - std::complex, LAYOUTA, \ - Kokkos::HostSpace, ETI_SPEC_AVAIL) +#define KOKKOSLAPACK_ZTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, std::complex, LAYOUTA, Kokkos::HostSpace, \ + ETI_SPEC_AVAIL) -#define KOKKOSLAPACK_CTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, std::complex, \ - LAYOUTA, Kokkos::HostSpace, ETI_SPEC_AVAIL) +#define KOKKOSLAPACK_CTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, std::complex, LAYOUTA, Kokkos::HostSpace, \ + ETI_SPEC_AVAIL) #endif diff --git a/lapack/unit_test/Test_Lapack_gesv.hpp b/lapack/unit_test/Test_Lapack_gesv.hpp index 77774d1d3f..653ed2cbf2 100644 --- a/lapack/unit_test/Test_Lapack_gesv.hpp +++ b/lapack/unit_test/Test_Lapack_gesv.hpp @@ -16,14 +16,11 @@ // only enable this test where KokkosLapack supports gesv: // CUDA+(MAGMA or 
CUSOLVER), HIP+ROCSOLVER and HOST+LAPACK -#if (defined(TEST_CUDA_LAPACK_CPP) && \ - (defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) || \ - defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER))) || \ - (defined(TEST_HIP_LAPACK_CPP) && \ - defined(KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER)) || \ - (defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ - (defined(TEST_OPENMP_LAPACK_CPP) || defined(TEST_SERIAL_LAPACK_CPP) || \ - defined(TEST_THREADS_LAPACK_CPP))) +#if (defined(TEST_CUDA_LAPACK_CPP) && \ + (defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) || defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER))) || \ + (defined(TEST_HIP_LAPACK_CPP) && defined(KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER)) || \ + (defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ + (defined(TEST_OPENMP_LAPACK_CPP) || defined(TEST_SERIAL_LAPACK_CPP) || defined(TEST_THREADS_LAPACK_CPP))) #include #include @@ -66,12 +63,8 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { typename ViewTypeB::HostMirror h_B = Kokkos::create_mirror(B); // Initialize data. - Kokkos::fill_random( - A, rand_pool, - Kokkos::rand, ScalarA>::max()); - Kokkos::fill_random( - X0, rand_pool, - Kokkos::rand, ScalarA>::max()); + Kokkos::fill_random(A, rand_pool, Kokkos::rand, ScalarA>::max()); + Kokkos::fill_random(X0, rand_pool, Kokkos::rand, ScalarA>::max()); // Generate RHS B = A*X0. 
ScalarA alpha = 1.0; @@ -84,9 +77,8 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { Kokkos::deep_copy(h_X0, X0); // Allocate IPIV view on host - using ViewTypeP = typename std::conditional< - MAGMA, Kokkos::View, - Kokkos::View>::type; + using ViewTypeP = typename std::conditional, + Kokkos::View>::type; ViewTypeP ipiv; int Nt = 0; if (mode[0] == 'Y') { @@ -105,8 +97,7 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { bool notpl_runtime_err = false; #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // and have LAPACK TPL - nopivot_runtime_err = (!std::is_same::value) && + nopivot_runtime_err = (!std::is_same::value) && (ipiv.extent(0) == 0) && (ipiv.data() == nullptr); notpl_runtime_err = false; #else @@ -138,8 +129,7 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { printf( " Error %d, pivot %c, padding %c: result( %.15lf ) !=" "solution( %.15lf ) at (%d), error=%.15e, eps=%.15e\n", - N, mode[0], padding[0], ats::abs(h_B(i)), ats::abs(h_X0(i)), int(i), - ats::abs(h_B(i) - h_X0(i)), eps); + N, mode[0], padding[0], ats::abs(h_B(i)), ats::abs(h_X0(i)), int(i), ats::abs(h_B(i) - h_X0(i)), eps); break; } } @@ -147,8 +137,7 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { } template -void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, - int nrhs) { +void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, int nrhs) { using execution_space = typename Device::execution_space; using ScalarA = typename ViewTypeA::value_type; using ats = Kokkos::ArithTraits; @@ -177,12 +166,8 @@ void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, typename ViewTypeB::HostMirror h_B = Kokkos::create_mirror(B); // Initialize data. 
- Kokkos::fill_random( - A, rand_pool, - Kokkos::rand, ScalarA>::max()); - Kokkos::fill_random( - X0, rand_pool, - Kokkos::rand, ScalarA>::max()); + Kokkos::fill_random(A, rand_pool, Kokkos::rand, ScalarA>::max()); + Kokkos::fill_random(X0, rand_pool, Kokkos::rand, ScalarA>::max()); // Generate RHS B = A*X0. ScalarA alpha = 1.0; @@ -195,9 +180,8 @@ void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, Kokkos::deep_copy(h_X0, X0); // Allocate IPIV view on host - using ViewTypeP = typename std::conditional< - MAGMA, Kokkos::View, - Kokkos::View>::type; + using ViewTypeP = typename std::conditional, + Kokkos::View>::type; ViewTypeP ipiv; int Nt = 0; if (mode[0] == 'Y') { @@ -216,8 +200,7 @@ void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, bool notpl_runtime_err = false; #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // and have LAPACK TPL - nopivot_runtime_err = (!std::is_same::value) && + nopivot_runtime_err = (!std::is_same::value) && (ipiv.extent(0) == 0) && (ipiv.data() == nullptr); notpl_runtime_err = false; #else @@ -263,49 +246,32 @@ void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, template int test_gesv(const char* mode) { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) using view_type_a_ll = Kokkos::View; using view_type_b_ll = Kokkos::View; -#if (defined(TEST_CUDA_LAPACK_CPP) && \ - defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER)) || \ - (defined(TEST_HIP_LAPACK_CPP) && \ - defined(KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER)) || \ - (defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ - (defined(TEST_OPENMP_LAPACK_CPP) || defined(TEST_SERIAL_LAPACK_CPP) || \ - defined(TEST_THREADS_LAPACK_CPP))) - Test::impl_test_gesv( - &mode[0], "N", 2); // no padding - Test::impl_test_gesv( - &mode[0], "N", 13); 
// no padding - Test::impl_test_gesv( - &mode[0], "N", 179); // no padding - Test::impl_test_gesv( - &mode[0], "N", 64); // no padding - Test::impl_test_gesv( - &mode[0], "N", 1024); // no padding +#if (defined(TEST_CUDA_LAPACK_CPP) && defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER)) || \ + (defined(TEST_HIP_LAPACK_CPP) && defined(KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER)) || \ + (defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ + (defined(TEST_OPENMP_LAPACK_CPP) || defined(TEST_SERIAL_LAPACK_CPP) || defined(TEST_THREADS_LAPACK_CPP))) + Test::impl_test_gesv(&mode[0], "N", 2); // no padding + Test::impl_test_gesv(&mode[0], "N", 13); // no padding + Test::impl_test_gesv(&mode[0], "N", 179); // no padding + Test::impl_test_gesv(&mode[0], "N", 64); // no padding + Test::impl_test_gesv(&mode[0], "N", 1024); // no padding #elif defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) && defined(KOKKOS_ENABLE_CUDA) - if constexpr (std::is_same_v) { - Test::impl_test_gesv( - &mode[0], "N", 2); // no padding - Test::impl_test_gesv( - &mode[0], "N", 13); // no padding - Test::impl_test_gesv( - &mode[0], "N", 179); // no padding - Test::impl_test_gesv( - &mode[0], "N", 64); // no padding - Test::impl_test_gesv( - &mode[0], "N", 1024); // no padding - - Test::impl_test_gesv( - &mode[0], "Y", - 13); // padding - Test::impl_test_gesv( - &mode[0], "Y", - 179); // padding + if constexpr (std::is_same_v) { + Test::impl_test_gesv(&mode[0], "N", 2); // no padding + Test::impl_test_gesv(&mode[0], "N", 13); // no padding + Test::impl_test_gesv(&mode[0], "N", 179); // no padding + Test::impl_test_gesv(&mode[0], "N", 64); // no padding + Test::impl_test_gesv(&mode[0], "N", 1024); // no padding + + Test::impl_test_gesv(&mode[0], "Y", + 13); // padding + Test::impl_test_gesv(&mode[0], "Y", + 179); // padding } #endif #endif @@ -318,48 +284,31 @@ int test_gesv(const char* mode) { template int test_gesv_mrhs(const char* mode) { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) using view_type_a_ll = Kokkos::View; using view_type_b_ll = Kokkos::View; -#if (defined(TEST_CUDA_LAPACK_CPP) && \ - defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER)) || \ - (defined(TEST_HIP_LAPACK_CPP) && \ - defined(KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER)) || \ - (defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ - (defined(TEST_OPENMP_LAPACK_CPP) || defined(TEST_SERIAL_LAPACK_CPP) || \ - defined(TEST_THREADS_LAPACK_CPP))) - Test::impl_test_gesv_mrhs( - &mode[0], "N", 2, 5); // no padding - Test::impl_test_gesv_mrhs( - &mode[0], "N", 13, 5); // no padding - Test::impl_test_gesv_mrhs( - &mode[0], "N", 179, 5); // no padding - Test::impl_test_gesv_mrhs( - &mode[0], "N", 64, 5); // no padding - Test::impl_test_gesv_mrhs( - &mode[0], "N", 1024, 5); // no padding +#if (defined(TEST_CUDA_LAPACK_CPP) && defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER)) || \ + (defined(TEST_HIP_LAPACK_CPP) && defined(KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER)) || \ + (defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ + (defined(TEST_OPENMP_LAPACK_CPP) || defined(TEST_SERIAL_LAPACK_CPP) || defined(TEST_THREADS_LAPACK_CPP))) + Test::impl_test_gesv_mrhs(&mode[0], "N", 2, 5); // no padding + Test::impl_test_gesv_mrhs(&mode[0], "N", 13, 5); // no padding + Test::impl_test_gesv_mrhs(&mode[0], "N", 179, 5); // no padding + Test::impl_test_gesv_mrhs(&mode[0], "N", 64, 5); // no padding + Test::impl_test_gesv_mrhs(&mode[0], "N", 1024, 5); // no padding // When appropriate run MAGMA specific tests #elif defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) && defined(KOKKOS_ENABLE_CUDA) - if constexpr (std::is_same_v) { - Test::impl_test_gesv_mrhs( - &mode[0], "N", 2, 5); // no padding - Test::impl_test_gesv_mrhs( - &mode[0], "N", 13, 5); // no padding - Test::impl_test_gesv_mrhs( - &mode[0], "N", 179, 5); // no padding - Test::impl_test_gesv_mrhs( - &mode[0], "N", 64, 5); // no padding - 
Test::impl_test_gesv_mrhs( - &mode[0], "N", 1024, 5); // no padding - - Test::impl_test_gesv_mrhs( - &mode[0], "Y", 13, 5); // padding - Test::impl_test_gesv_mrhs( - &mode[0], "Y", 179, 5); // padding + if constexpr (std::is_same_v) { + Test::impl_test_gesv_mrhs(&mode[0], "N", 2, 5); // no padding + Test::impl_test_gesv_mrhs(&mode[0], "N", 13, 5); // no padding + Test::impl_test_gesv_mrhs(&mode[0], "N", 179, 5); // no padding + Test::impl_test_gesv_mrhs(&mode[0], "N", 64, 5); // no padding + Test::impl_test_gesv_mrhs(&mode[0], "N", 1024, 5); // no padding + + Test::impl_test_gesv_mrhs(&mode[0], "Y", 13, 5); // padding + Test::impl_test_gesv_mrhs(&mode[0], "Y", 179, 5); // padding } #endif #endif @@ -370,8 +319,7 @@ int test_gesv_mrhs(const char* mode) { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gesv_float) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_float"); test_gesv("N"); // No pivoting @@ -388,8 +336,7 @@ TEST_F(TestCategory, gesv_mrhs_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gesv_double) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_double"); test_gesv("N"); // No pivoting @@ -406,8 +353,7 @@ TEST_F(TestCategory, gesv_mrhs_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gesv_complex_double) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_complex_double"); test_gesv, TestDevice>("N"); // No pivoting @@ -424,8 
+370,7 @@ TEST_F(TestCategory, gesv_mrhs_complex_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gesv_complex_float) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_complex_float"); test_gesv, TestDevice>("N"); // No pivoting diff --git a/lapack/unit_test/Test_Lapack_svd.hpp b/lapack/unit_test/Test_Lapack_svd.hpp index da9f9ba480..a47dbbe9b9 100644 --- a/lapack/unit_test/Test_Lapack_svd.hpp +++ b/lapack/unit_test/Test_Lapack_svd.hpp @@ -26,10 +26,8 @@ namespace Test { template -void check_triple_product( - const AMatrix& A, const SVector& S, const UMatrix& U, const VMatrix& Vt, - typename Kokkos::ArithTraits< - typename AMatrix::non_const_value_type>::mag_type tol) { +void check_triple_product(const AMatrix& A, const SVector& S, const UMatrix& U, const VMatrix& Vt, + typename Kokkos::ArithTraits::mag_type tol) { // After a successful SVD decomposition we have A=U*S*V // So using gemm we should be able to compare the above // triple product to the original matrix A. @@ -40,8 +38,7 @@ void check_triple_product( // First compute the left side of the product: temp = U*S Kokkos::parallel_for( - Kokkos::RangePolicy(0, U.extent_int(0)), - KOKKOS_LAMBDA(const int& rowIdx) { + Kokkos::RangePolicy(0, U.extent_int(0)), KOKKOS_LAMBDA(const int& rowIdx) { for (int colIdx = 0; colIdx < U.extent_int(1); ++colIdx) { if (colIdx < S.extent_int(0)) { temp(rowIdx, colIdx) = U(rowIdx, colIdx) * S(colIdx); @@ -69,8 +66,7 @@ void check_triple_product( template void check_unitary_orthogonal_matrix( - const Matrix& M, typename Kokkos::ArithTraits< - typename Matrix::non_const_value_type>::mag_type tol) { + const Matrix& M, typename Kokkos::ArithTraits::mag_type tol) { // After a successful SVD decomposition the matrices // U and V are unitary matrices. 
Thus we can check // the property UUt=UtU=I and VVt=VtV=I using gemm. @@ -83,11 +79,9 @@ void check_unitary_orthogonal_matrix( for (int rowIdx = 0; rowIdx < M.extent_int(0); ++rowIdx) { for (int colIdx = 0; colIdx < M.extent_int(0); ++colIdx) { if (rowIdx == colIdx) { - EXPECT_NEAR_KK_REL(I0_h(rowIdx, colIdx), - Kokkos::ArithTraits::one(), tol); + EXPECT_NEAR_KK_REL(I0_h(rowIdx, colIdx), Kokkos::ArithTraits::one(), tol); } else { - EXPECT_NEAR_KK(I0_h(rowIdx, colIdx), - Kokkos::ArithTraits::zero(), tol); + EXPECT_NEAR_KK(I0_h(rowIdx, colIdx), Kokkos::ArithTraits::zero(), tol); } } } @@ -99,11 +93,9 @@ void check_unitary_orthogonal_matrix( for (int rowIdx = 0; rowIdx < M.extent_int(1); ++rowIdx) { for (int colIdx = 0; colIdx < M.extent_int(1); ++colIdx) { if (rowIdx == colIdx) { - EXPECT_NEAR_KK_REL(I1_h(rowIdx, colIdx), - Kokkos::ArithTraits::one(), tol); + EXPECT_NEAR_KK_REL(I1_h(rowIdx, colIdx), Kokkos::ArithTraits::one(), tol); } else { - EXPECT_NEAR_KK(I1_h(rowIdx, colIdx), - Kokkos::ArithTraits::zero(), tol); + EXPECT_NEAR_KK(I1_h(rowIdx, colIdx), Kokkos::ArithTraits::zero(), tol); } } } @@ -113,9 +105,8 @@ template int impl_analytic_2x2_svd() { using scalar_type = typename AMatrix::value_type; using mag_type = typename Kokkos::ArithTraits::mag_type; - using vector_type = - Kokkos::View; - using KAT_S = Kokkos::ArithTraits; + using vector_type = Kokkos::View; + using KAT_S = Kokkos::ArithTraits; const mag_type eps = KAT_S::eps(); @@ -147,8 +138,7 @@ int impl_analytic_2x2_svd() { // The singular values for this problem // are known: sqrt(45) and sqrt(5) - EXPECT_NEAR_KK_REL(S_h(0), static_cast(Kokkos::sqrt(45)), - 100 * eps); + EXPECT_NEAR_KK_REL(S_h(0), static_cast(Kokkos::sqrt(45)), 100 * eps); EXPECT_NEAR_KK_REL(S_h(1), static_cast(Kokkos::sqrt(5)), 100 * eps); // The singular vectors should be identical @@ -156,21 +146,16 @@ int impl_analytic_2x2_svd() { // component of the vectors to determine // the proper signed comparison. 
std::vector Uref = { - static_cast(1 / Kokkos::sqrt(10)), - static_cast(3 / Kokkos::sqrt(10)), - static_cast(-3 / Kokkos::sqrt(10)), - static_cast(1 / Kokkos::sqrt(10))}; + static_cast(1 / Kokkos::sqrt(10)), static_cast(3 / Kokkos::sqrt(10)), + static_cast(-3 / Kokkos::sqrt(10)), static_cast(1 / Kokkos::sqrt(10))}; std::vector Vtref = { - static_cast(1 / Kokkos::sqrt(2)), - static_cast(-1 / Kokkos::sqrt(2)), - static_cast(1 / Kokkos::sqrt(2)), - static_cast(1 / Kokkos::sqrt(2))}; + static_cast(1 / Kokkos::sqrt(2)), static_cast(-1 / Kokkos::sqrt(2)), + static_cast(1 / Kokkos::sqrt(2)), static_cast(1 / Kokkos::sqrt(2))}; // Both rotations and reflections are valid // vector basis so we need to check both signs // to confirm proper SVD was achieved. - Kokkos::View U_real("U real", 2, 2), - Vt_real("Vt real", 2, 2); + Kokkos::View U_real("U real", 2, 2), Vt_real("Vt real", 2, 2); if constexpr (KAT_S::is_complex) { U_real(0, 0) = U_h(0, 0).real(); U_real(0, 1) = U_h(0, 1).real(); @@ -219,9 +204,8 @@ template int impl_analytic_2x3_svd() { using scalar_type = typename AMatrix::value_type; using mag_type = typename Kokkos::ArithTraits::mag_type; - using vector_type = - Kokkos::View; - using KAT_S = Kokkos::ArithTraits; + using vector_type = Kokkos::View; + using KAT_S = Kokkos::ArithTraits; const mag_type tol = 100 * KAT_S::eps(); @@ -277,8 +261,7 @@ int impl_analytic_2x3_svd() { // Both rotations and reflections are valid // vector basis so we need to check both signs // to confirm proper SVD was achieved. 
- Kokkos::View U_real("U real", 2, 2), - Vt_real("Vt real", 3, 3); + Kokkos::View U_real("U real", 2, 2), Vt_real("Vt real", 3, 3); if constexpr (KAT_S::is_complex) { U_real(0, 0) = U_h(0, 0).real(); U_real(0, 1) = U_h(0, 1).real(); @@ -350,9 +333,8 @@ template int impl_analytic_3x2_svd() { using scalar_type = typename AMatrix::value_type; using mag_type = typename Kokkos::ArithTraits::mag_type; - using vector_type = - Kokkos::View; - using KAT_S = Kokkos::ArithTraits; + using vector_type = Kokkos::View; + using KAT_S = Kokkos::ArithTraits; const mag_type tol = 100 * KAT_S::eps(); @@ -396,8 +378,7 @@ int impl_analytic_3x2_svd() { // Both rotations and reflections are valid // vector basis so we need to check both signs // to confirm proper SVD was achieved. - Kokkos::View U_real("U real", 3, 3), - Vt_real("Vt real", 2, 2); + Kokkos::View U_real("U real", 3, 3), Vt_real("Vt real", 2, 2); if constexpr (KAT_S::is_complex) { U_real(0, 0) = U_h(0, 0).real(); U_real(0, 1) = U_h(0, 1).real(); @@ -471,8 +452,7 @@ int impl_test_svd(const int m, const int n) { using scalar_type = typename AMatrix::value_type; using KAT_S = Kokkos::ArithTraits; using mag_type = typename KAT_S::mag_type; - using vector_type = - Kokkos::View; + using vector_type = Kokkos::View; const mag_type max_val = 10; const mag_type tol = 2000 * max_val * KAT_S::eps(); @@ -480,8 +460,7 @@ int impl_test_svd(const int m, const int n) { AMatrix A("A", m, n), U("U", m, m), Vt("Vt", n, n), Aref("A ref", m, n); vector_type S("S", Kokkos::min(m, n)); - const uint64_t seed = - std::chrono::high_resolution_clock::now().time_since_epoch().count(); + const uint64_t seed = std::chrono::high_resolution_clock::now().time_since_epoch().count(); Kokkos::Random_XorShift64_Pool rand_pool(seed); // Initialize A with random numbers @@ -492,8 +471,7 @@ int impl_test_svd(const int m, const int n) { // Working around CUSOLVER constraint for m >= n #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) - if constexpr (std::is_same_v) { + 
if constexpr (std::is_same_v) { if (m >= n) { KokkosLapack::svd("A", "A", A, S, U, Vt); } else { @@ -523,10 +501,8 @@ int test_svd() { int ret; #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - using view_type_a_layout_left = - Kokkos::View; + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + using view_type_a_layout_left = Kokkos::View; ret = Test::impl_analytic_2x2_svd(); EXPECT_EQ(ret, 0); @@ -554,10 +530,8 @@ int test_svd() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - using view_type_a_layout_right = - Kokkos::View; + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + using view_type_a_layout_right = Kokkos::View; ret = Test::impl_analytic_2x2_svd(); EXPECT_EQ(ret, 0); @@ -589,18 +563,15 @@ int test_svd() { template int test_svd_wrapper() { -#if defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) || \ - defined(KOKKOSKERNELS_ENABLE_TPL_MKL) - if constexpr (std::is_same_v) { +#if defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) || defined(KOKKOSKERNELS_ENABLE_TPL_MKL) + if constexpr (std::is_same_v) { // Using a device side space with LAPACK/MKL return test_svd(); } #endif #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { // Using a Cuda device with CUSOLVER return test_svd(); } @@ -618,8 +589,7 @@ int test_svd_wrapper() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, svd_float) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::svd_float"); test_svd_wrapper(); @@ -628,8 +598,7 @@ TEST_F(TestCategory, svd_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - 
(!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, svd_double) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::svd_double"); test_svd_wrapper(); @@ -638,8 +607,7 @@ TEST_F(TestCategory, svd_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, svd_complex_float) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::svd_complex_float"); test_svd_wrapper, TestDevice>(); @@ -648,8 +616,7 @@ TEST_F(TestCategory, svd_complex_float) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, svd_complex_double) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::svd_complex_double"); test_svd_wrapper, TestDevice>(); diff --git a/lapack/unit_test/Test_Lapack_trtri.hpp b/lapack/unit_test/Test_Lapack_trtri.hpp index a19e575d89..b555ea8aaf 100644 --- a/lapack/unit_test/Test_Lapack_trtri.hpp +++ b/lapack/unit_test/Test_Lapack_trtri.hpp @@ -43,8 +43,7 @@ struct NonUnitDiagTRTRI { KOKKOS_INLINE_FUNCTION void operator()(const int& i) const { A_(i, i) = A_(i, i) + 10; } }; -template +template struct VanillaGEMM { bool A_t, B_t, A_c, B_c; int N, K; @@ -61,12 +60,9 @@ struct VanillaGEMM { ScalarC beta; KOKKOS_INLINE_FUNCTION - void operator()( - const typename Kokkos::TeamPolicy::member_type& team) - const { + void operator()(const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && \ - !defined(__HIP_DEVICE_COMPILE__) +#if 
defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int i = team.league_rank(); #else const int i = team.league_rank(); @@ -97,8 +93,7 @@ struct VanillaGEMM { }; template -int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, - const int M, const int N) { +int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, const int M, const int N) { using execution_space = typename ViewTypeA::device_type::execution_space; using ScalarA = typename ViewTypeA::value_type; using APT = Kokkos::ArithTraits; @@ -111,9 +106,8 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, ViewTypeA A("A", M, N); ViewTypeA A_original("A_original", M, N); ViewTypeA A_I("A_I", M, N); // is I taken...? - uint64_t seed = - std::chrono::high_resolution_clock::now().time_since_epoch().count(); - ScalarA beta = ScalarA(0); + uint64_t seed = std::chrono::high_resolution_clock::now().time_since_epoch().count(); + ScalarA beta = ScalarA(0); ScalarA cur_check_val; // Either 1 or 0, to check A_I // const int As0 = A.stride(0), As1 = A.stride(1); @@ -137,8 +131,7 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, } } // Set just 1 value in the diagonal to 0. 
- if (M > 0 && N > 0) - host_A(bad_diag_idx - 1, bad_diag_idx - 1) = ScalarA(0); + if (M > 0 && N > 0) host_A(bad_diag_idx - 1, bad_diag_idx - 1) = ScalarA(0); Kokkos::deep_copy(A, host_A); } return KokkosLapack::trtri(uplo, diag, A); @@ -151,21 +144,17 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, Kokkos::Random_XorShift64_Pool rand_pool(seed); // Initialize A with deterministic random numbers - Kokkos::fill_random( - A, rand_pool, - Kokkos::rand, ScalarA>::max()); + Kokkos::fill_random(A, rand_pool, Kokkos::rand, ScalarA>::max()); if ((diag[0] == 'U') || (diag[0] == 'u')) { using functor_type = UnitDiagTRTRI; functor_type udtrtri(A); // Initialize As diag with 1s - Kokkos::parallel_for("KokkosLapack::Test::UnitDiagTRTRI", - Kokkos::RangePolicy(0, M), udtrtri); + Kokkos::parallel_for("KokkosLapack::Test::UnitDiagTRTRI", Kokkos::RangePolicy(0, M), udtrtri); } else { //(diag[0]=='N')||(diag[0]=='n') using functor_type = NonUnitDiagTRTRI; functor_type nudtrtri(A); // Initialize As diag with A(i,i)+10 - Kokkos::parallel_for("KokkosLapack::Test::NonUnitDiagTRTRI", - Kokkos::RangePolicy(0, M), nudtrtri); + Kokkos::parallel_for("KokkosLapack::Test::NonUnitDiagTRTRI", Kokkos::RangePolicy(0, M), nudtrtri); } Kokkos::fence(); Kokkos::deep_copy(host_A, A); @@ -199,8 +188,7 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, Kokkos::fence(); if (ret) { - printf("KokkosLapack::trtri(%c, %c, %s) returned %d\n", uplo[0], diag[0], - typeid(ViewTypeA).name(), ret); + printf("KokkosLapack::trtri(%c, %c, %s) returned %d\n", uplo[0], diag[0], typeid(ViewTypeA).name(), ret); return ret; } @@ -228,12 +216,10 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, vgemm.C = A_I; // out vgemm.alpha = ScalarA(1); vgemm.beta = beta; - Kokkos::parallel_for( - "KokkosLapack::Test::VanillaGEMM", - Kokkos::TeamPolicy( - M, Kokkos::AUTO, - KokkosKernels::Impl::kk_get_max_vector_size()), - vgemm); + 
Kokkos::parallel_for("KokkosLapack::Test::VanillaGEMM", + Kokkos::TeamPolicy( + M, Kokkos::AUTO, KokkosKernels::Impl::kk_get_max_vector_size()), + vgemm); Kokkos::fence(); Kokkos::deep_copy(host_I, A_I); @@ -251,8 +237,7 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, for (int i = 0; i < M; i++) { for (int j = 0; j < N; j++) { // Set check value - cur_check_val = - (i == j) ? ScalarA(1) : ScalarA(0); // APT::abs(host_A(i,j)); + cur_check_val = (i == j) ? ScalarA(1) : ScalarA(0); // APT::abs(host_A(i,j)); // Check how close |A_I - cur_check_val| is to 0. if (APT::abs(APT::abs(host_I(i, j)) - cur_check_val) > eps) { @@ -276,38 +261,30 @@ int test_trtri(const char* mode) { int ret; int bad_diag_idx = -1; #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - using view_type_a_layout_left = - Kokkos::View; + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + using view_type_a_layout_left = Kokkos::View; - ret = Test::impl_test_trtri( - bad_diag_idx, &mode[0], &mode[1], 0, 0); + ret = Test::impl_test_trtri(bad_diag_idx, &mode[0], &mode[1], 0, 0); EXPECT_EQ(ret, 0); - ret = Test::impl_test_trtri( - bad_diag_idx, &mode[0], &mode[1], 1, 1); + ret = Test::impl_test_trtri(bad_diag_idx, &mode[0], &mode[1], 1, 1); EXPECT_EQ(ret, 0); - ret = Test::impl_test_trtri( - bad_diag_idx, &mode[0], &mode[1], 15, 15); + ret = Test::impl_test_trtri(bad_diag_idx, &mode[0], &mode[1], 15, 15); EXPECT_EQ(ret, 0); - ret = Test::impl_test_trtri( - bad_diag_idx, &mode[0], &mode[1], 100, 100); + ret = Test::impl_test_trtri(bad_diag_idx, &mode[0], &mode[1], 100, 100); EXPECT_EQ(ret, 0); // Rounding errors with randomly generated matrices begin here where M>100, so // we pass in A=I - ret = Test::impl_test_trtri( - bad_diag_idx, &mode[0], &mode[1], 273, 273); + ret = Test::impl_test_trtri(bad_diag_idx, &mode[0], &mode[1], 273, 273); EXPECT_EQ(ret, 0); // 
Only non-unit matrices could be singular. if (mode[1] == 'N' || mode[1] == 'n') { bad_diag_idx = 2; // 1-index based - ret = Test::impl_test_trtri( - bad_diag_idx, &mode[0], &mode[1], 2, 2); + ret = Test::impl_test_trtri(bad_diag_idx, &mode[0], &mode[1], 2, 2); EXPECT_EQ(ret, bad_diag_idx); bad_diag_idx = -1; } @@ -318,38 +295,30 @@ int test_trtri(const char* mode) { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - using view_type_a_layout_right = - Kokkos::View; + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + using view_type_a_layout_right = Kokkos::View; - ret = Test::impl_test_trtri( - bad_diag_idx, &mode[0], &mode[1], 0, 0); + ret = Test::impl_test_trtri(bad_diag_idx, &mode[0], &mode[1], 0, 0); EXPECT_EQ(ret, 0); - ret = Test::impl_test_trtri( - bad_diag_idx, &mode[0], &mode[1], 1, 1); + ret = Test::impl_test_trtri(bad_diag_idx, &mode[0], &mode[1], 1, 1); EXPECT_EQ(ret, 0); - ret = Test::impl_test_trtri( - bad_diag_idx, &mode[0], &mode[1], 15, 15); + ret = Test::impl_test_trtri(bad_diag_idx, &mode[0], &mode[1], 15, 15); EXPECT_EQ(ret, 0); - ret = Test::impl_test_trtri( - bad_diag_idx, &mode[0], &mode[1], 100, 100); + ret = Test::impl_test_trtri(bad_diag_idx, &mode[0], &mode[1], 100, 100); EXPECT_EQ(ret, 0); // Rounding errors with randomly generated matrices begin here where M>100, so // we pass in A=I - ret = Test::impl_test_trtri( - bad_diag_idx, &mode[0], &mode[1], 273, 273); + ret = Test::impl_test_trtri(bad_diag_idx, &mode[0], &mode[1], 273, 273); EXPECT_EQ(ret, 0); // Only non-unit matrices could be singular. 
if (mode[1] == 'N' || mode[1] == 'n') { bad_diag_idx = 2; // 1-index based - ret = Test::impl_test_trtri( - bad_diag_idx, &mode[0], &mode[1], 2, 2); + ret = Test::impl_test_trtri(bad_diag_idx, &mode[0], &mode[1], 2, 2); EXPECT_EQ(ret, bad_diag_idx); bad_diag_idx = -1; } @@ -359,8 +328,7 @@ int test_trtri(const char* mode) { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trtri_float) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::trtri_float"); test_trtri("UN"); @@ -372,8 +340,7 @@ TEST_F(TestCategory, trtri_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trtri_double) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::trtri_double"); test_trtri("UN"); @@ -385,8 +352,7 @@ TEST_F(TestCategory, trtri_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trtri_complex_double) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::trtri_complex_double"); test_trtri, TestDevice>("UN"); @@ -398,8 +364,7 @@ TEST_F(TestCategory, trtri_complex_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trtri_complex_float) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::trtri_complex_float"); test_trtri, TestDevice>("UN"); diff --git a/ode/impl/KokkosODE_BDF_impl.hpp 
b/ode/impl/KokkosODE_BDF_impl.hpp index cf89731f1b..3119ff0e3a 100644 --- a/ode/impl/KokkosODE_BDF_impl.hpp +++ b/ode/impl/KokkosODE_BDF_impl.hpp @@ -44,31 +44,27 @@ struct BDF_table<2> { template <> struct BDF_table<3> { static constexpr int order = 3; - Kokkos::Array coefficients{ - {-18.0 / 11.0, 9.0 / 11.0, -2.0 / 11.0, 6.0 / 11.0}}; + Kokkos::Array coefficients{{-18.0 / 11.0, 9.0 / 11.0, -2.0 / 11.0, 6.0 / 11.0}}; }; template <> struct BDF_table<4> { static constexpr int order = 4; - Kokkos::Array coefficients{ - {-48.0 / 25.0, 36.0 / 25.0, -16.0 / 25.0, 3.0 / 25.0, 12.0 / 25.0}}; + Kokkos::Array coefficients{{-48.0 / 25.0, 36.0 / 25.0, -16.0 / 25.0, 3.0 / 25.0, 12.0 / 25.0}}; }; template <> struct BDF_table<5> { static constexpr int order = 5; - Kokkos::Array coefficients{{-300.0 / 137.0, 300.0 / 137.0, - -200.0 / 137.0, 75.0 / 137.0, - -12.0 / 137.0, 60.0 / 137.0}}; + Kokkos::Array coefficients{ + {-300.0 / 137.0, 300.0 / 137.0, -200.0 / 137.0, 75.0 / 137.0, -12.0 / 137.0, 60.0 / 137.0}}; }; template <> struct BDF_table<6> { static constexpr int order = 6; Kokkos::Array coefficients{ - {-360.0 / 147.0, 450.0 / 147.0, -400.0 / 147.0, 225.0 / 147.0, - -72.0 / 147.0, 10.0 / 147.0, 60.0 / 147.0}}; + {-360.0 / 147.0, 450.0 / 147.0, -400.0 / 147.0, 225.0 / 147.0, -72.0 / 147.0, 10.0 / 147.0, 60.0 / 147.0}}; }; template @@ -82,14 +78,9 @@ struct BDF_system_wrapper { mv_type yn; KOKKOS_FUNCTION - BDF_system_wrapper(const system_type& mySys_, const table_type& table_, - const double t_, const double dt_, const mv_type& yn_) - : mySys(mySys_), - neqs(mySys_.neqs), - table(table_), - t(t_), - dt(dt_), - yn(yn_) {} + BDF_system_wrapper(const system_type& mySys_, const table_type& table_, const double t_, const double dt_, + const mv_type& yn_) + : mySys(mySys_), neqs(mySys_.neqs), table(table_), t(t_), dt(dt_), yn(yn_) {} template KOKKOS_FUNCTION void residual(const vec_type& y, const vec_type& f) const { @@ -99,8 +90,7 @@ struct BDF_system_wrapper { for (int eqIdx = 0; 
eqIdx < neqs; ++eqIdx) { f(eqIdx) = y(eqIdx) - table.coefficients[order] * dt * f(eqIdx); for (int orderIdx = 0; orderIdx < order; ++orderIdx) { - f(eqIdx) += - table.coefficients[order - 1 - orderIdx] * yn(eqIdx, orderIdx); + f(eqIdx) += table.coefficients[order - 1 - orderIdx] * yn(eqIdx, orderIdx); } } } @@ -111,8 +101,7 @@ struct BDF_system_wrapper { for (int rowIdx = 0; rowIdx < neqs; ++rowIdx) { for (int colIdx = 0; colIdx < neqs; ++colIdx) { - jac(rowIdx, colIdx) = - -table.coefficients[order] * dt * jac(rowIdx, colIdx); + jac(rowIdx, colIdx) = -table.coefficients[order] * dt * jac(rowIdx, colIdx); } jac(rowIdx, rowIdx) += 1.0; } @@ -130,13 +119,12 @@ struct BDF_system_wrapper2 { double t, dt, c = 0; KOKKOS_FUNCTION - BDF_system_wrapper2(const system_type& mySys_, const subview_type& psi_, - const d_vec_type& d_, const double t_, const double dt_) + BDF_system_wrapper2(const system_type& mySys_, const subview_type& psi_, const d_vec_type& d_, const double t_, + const double dt_) : mySys(mySys_), neqs(mySys_.neqs), psi(psi_), d(d_), t(t_), dt(dt_) {} template - KOKKOS_FUNCTION void residual(const YVectorType& y, - const FVectorType& f) const { + KOKKOS_FUNCTION void residual(const YVectorType& y, const FVectorType& f) const { // f = f(t+dt, y) mySys.evaluate_function(t, dt, y, f); @@ -165,14 +153,10 @@ struct BDF_system_wrapper2 { } }; -template -KOKKOS_FUNCTION void BDFStep(ode_type& ode, const table_type& table, - scalar_type t, scalar_type dt, - const vec_type& y_old, const vec_type& y_new, - const vec_type& rhs, const vec_type& update, - const vec_type& scale, const mv_type& y_vecs, - const mat_type& temp, const mat_type& jac) { +template +KOKKOS_FUNCTION void BDFStep(ode_type& ode, const table_type& table, scalar_type t, scalar_type dt, + const vec_type& y_old, const vec_type& y_new, const vec_type& rhs, const vec_type& update, + const vec_type& scale, const mv_type& y_vecs, const mat_type& temp, const mat_type& jac) { using newton_params = 
KokkosODE::Experimental::Newton_params; BDF_system_wrapper sys(ode, table, t, dt, y_vecs); @@ -184,57 +168,43 @@ KOKKOS_FUNCTION void BDFStep(ode_type& ode, const table_type& table, } // solver the nonlinear problem - { - KokkosODE::Experimental::Newton::Solve(sys, param, jac, temp, y_new, rhs, - update, scale); - } + { KokkosODE::Experimental::Newton::Solve(sys, param, jac, temp, y_new, rhs, update, scale); } } // BDFStep template -KOKKOS_FUNCTION void compute_coeffs(const int order, const scalar_type factor, - const mat_type& coeffs) { +KOKKOS_FUNCTION void compute_coeffs(const int order, const scalar_type factor, const mat_type& coeffs) { coeffs(0, 0) = 1.0; for (int colIdx = 0; colIdx < order; ++colIdx) { coeffs(0, colIdx + 1) = 1.0; for (int rowIdx = 0; rowIdx < order; ++rowIdx) { coeffs(rowIdx + 1, colIdx + 1) = - ((rowIdx - factor * (colIdx + 1.0)) / (rowIdx + 1.0)) * - coeffs(rowIdx, colIdx + 1); + ((rowIdx - factor * (colIdx + 1.0)) / (rowIdx + 1.0)) * coeffs(rowIdx, colIdx + 1); } } } template -KOKKOS_FUNCTION void update_D(const int order, const scalar_type factor, - const mat_type& coeffs, const mat_type& tempD, +KOKKOS_FUNCTION void update_D(const int order, const scalar_type factor, const mat_type& coeffs, const mat_type& tempD, const mat_type& D) { - auto subD = - Kokkos::subview(D, Kokkos::ALL(), Kokkos::pair(0, order + 1)); - auto subTempD = Kokkos::subview(tempD, Kokkos::ALL(), - Kokkos::pair(0, order + 1)); + auto subD = Kokkos::subview(D, Kokkos::ALL(), Kokkos::pair(0, order + 1)); + auto subTempD = Kokkos::subview(tempD, Kokkos::ALL(), Kokkos::pair(0, order + 1)); compute_coeffs(order, factor, coeffs); - auto R = Kokkos::subview(coeffs, Kokkos::pair(0, order + 1), - Kokkos::pair(0, order + 1)); - KokkosBatched::SerialGemm< - KokkosBatched::Trans::NoTranspose, KokkosBatched::Trans::NoTranspose, - KokkosBatched::Algo::Gemm::Blocked>::invoke(1.0, subD, R, 0.0, subTempD); + auto R = Kokkos::subview(coeffs, Kokkos::pair(0, order + 1), 
Kokkos::pair(0, order + 1)); + KokkosBatched::SerialGemm::invoke(1.0, subD, R, 0.0, subTempD); compute_coeffs(order, 1.0, coeffs); - auto U = Kokkos::subview(coeffs, Kokkos::pair(0, order + 1), - Kokkos::pair(0, order + 1)); - KokkosBatched::SerialGemm< - KokkosBatched::Trans::NoTranspose, KokkosBatched::Trans::NoTranspose, - KokkosBatched::Algo::Gemm::Blocked>::invoke(1.0, subTempD, U, 0.0, subD); + auto U = Kokkos::subview(coeffs, Kokkos::pair(0, order + 1), Kokkos::pair(0, order + 1)); + KokkosBatched::SerialGemm::invoke(1.0, subTempD, U, 0.0, subD); } -template -KOKKOS_FUNCTION void initial_step_size( - const ode_type ode, const int order, const scalar_type t0, - const scalar_type atol, const scalar_type rtol, const vec_type& y0, - const res_type& f0, const mat_type& temp, scalar_type& dt_ini) { +template +KOKKOS_FUNCTION void initial_step_size(const ode_type ode, const int order, const scalar_type t0, + const scalar_type atol, const scalar_type rtol, const vec_type& y0, + const res_type& f0, const mat_type& temp, scalar_type& dt_ini) { using KAT = Kokkos::ArithTraits; // Extract subviews to store intermediate data @@ -290,16 +260,12 @@ KOKKOS_FUNCTION void initial_step_size( } } // initial_step_size -template -KOKKOS_FUNCTION void BDFStep(ode_type& ode, scalar_type& t, scalar_type& dt, - scalar_type t_end, int& order, - int& num_equal_steps, const int max_newton_iters, - const scalar_type atol, const scalar_type rtol, - const scalar_type min_factor, - const vec_type& y_old, const vec_type& y_new, - const res_type& rhs, const res_type& update, - const mat_type& temp, const mat_type& temp2) { +template +KOKKOS_FUNCTION void BDFStep(ode_type& ode, scalar_type& t, scalar_type& dt, scalar_type t_end, int& order, + int& num_equal_steps, const int max_newton_iters, const scalar_type atol, + const scalar_type rtol, const scalar_type min_factor, const vec_type& y_old, + const vec_type& y_new, const res_type& rhs, const res_type& update, const mat_type& temp, + const 
mat_type& temp2) { using newton_params = KokkosODE::Experimental::Newton_params; constexpr int max_order = 5; @@ -310,10 +276,8 @@ KOKKOS_FUNCTION void BDFStep(ode_type& ode, scalar_type& t, scalar_type& dt, // kappa gamma(i) = sum_{k=1}^i(1.0 / k); gamma(0) = 0; // NDF coefficients // gamma_k alpha(i) = (1 - kappa(i)) * gamma(i) error_const(i) = kappa(i) * // gamma(i) + 1 / (i + 1) - const Kokkos::Array alpha{ - {0., 1.185, 1.66666667, 1.98421667, 2.16979167, 2.28333333}}; - const Kokkos::Array error_const{ - {1., 0.315, 0.16666667, 0.09911667, 0.11354167, 0.16666667}}; + const Kokkos::Array alpha{{0., 1.185, 1.66666667, 1.98421667, 2.16979167, 2.28333333}}; + const Kokkos::Array error_const{{1., 0.315, 0.16666667, 0.09911667, 0.11354167, 0.16666667}}; // Extract columns of temp to form temporary // subviews to operate on. @@ -322,12 +286,9 @@ KOKKOS_FUNCTION void BDFStep(ode_type& ode, scalar_type& t, scalar_type& dt, // numCols << std::endl; std::cout << "Extract subview from temp" << // std::endl; int offset = 2; - auto D = Kokkos::subview( - temp, Kokkos::ALL(), - Kokkos::pair(offset, offset + 8)); // y and its derivatives + auto D = Kokkos::subview(temp, Kokkos::ALL(), Kokkos::pair(offset, offset + 8)); // y and its derivatives offset += 8; - auto tempD = Kokkos::subview(temp, Kokkos::ALL(), - Kokkos::pair(offset, offset + 8)); + auto tempD = Kokkos::subview(temp, Kokkos::ALL(), Kokkos::pair(offset, offset + 8)); offset += 8; auto scale = Kokkos::subview(temp, Kokkos::ALL(), offset + 1); ++offset; // Scaling coefficients for error calculation @@ -337,31 +298,26 @@ KOKKOS_FUNCTION void BDFStep(ode_type& ode, scalar_type& t, scalar_type& dt, ++offset; // Higher order terms contribution to rhs auto error = Kokkos::subview(temp, Kokkos::ALL(), offset + 1); ++offset; // Error estimate - auto jac = Kokkos::subview( - temp, Kokkos::ALL(), - Kokkos::pair(offset, offset + ode.neqs)); // Jacobian matrix + auto jac = + Kokkos::subview(temp, Kokkos::ALL(), 
Kokkos::pair(offset, offset + ode.neqs)); // Jacobian matrix offset += ode.neqs; auto tmp_gesv = Kokkos::subview( - temp, Kokkos::ALL(), - Kokkos::pair( - offset, offset + ode.neqs + 4)); // Buffer space for gesv calculation + temp, Kokkos::ALL(), Kokkos::pair(offset, offset + ode.neqs + 4)); // Buffer space for gesv calculation offset += ode.neqs + 4; - auto coeffs = - Kokkos::subview(temp2, Kokkos::ALL(), Kokkos::pair(0, 6)); - auto gamma = Kokkos::subview(temp2, Kokkos::ALL(), 6); - gamma(0) = 0.0; - gamma(1) = 1.0; - gamma(2) = 1.5; - gamma(3) = 1.83333333; - gamma(4) = 2.08333333; - gamma(5) = 2.28333333; + auto coeffs = Kokkos::subview(temp2, Kokkos::ALL(), Kokkos::pair(0, 6)); + auto gamma = Kokkos::subview(temp2, Kokkos::ALL(), 6); + gamma(0) = 0.0; + gamma(1) = 1.0; + gamma(2) = 1.5; + gamma(3) = 1.83333333; + gamma(4) = 2.08333333; + gamma(5) = 2.28333333; BDF_system_wrapper2 sys(ode, psi, update, t, dt); const newton_params param( max_newton_iters, atol, - Kokkos::max(10 * Kokkos::ArithTraits::eps() / rtol, - Kokkos::min(0.03, Kokkos::sqrt(rtol)))); + Kokkos::max(10 * Kokkos::ArithTraits::eps() / rtol, Kokkos::min(0.03, Kokkos::sqrt(rtol)))); scalar_type max_step = Kokkos::ArithTraits::max(); scalar_type min_step = Kokkos::ArithTraits::min(); @@ -406,12 +362,9 @@ KOKKOS_FUNCTION void BDFStep(ode_type& ode, scalar_type& t, scalar_type& dt, // Compute psi, the sum of the higher order // contribution to the residual - auto subD = - Kokkos::subview(D, Kokkos::ALL(), Kokkos::pair(1, order + 1)); - auto subGamma = - Kokkos::subview(gamma, Kokkos::pair(1, order + 1)); - KokkosBlas::Experimental::serial_gemv('N', 1.0 / alpha[order], subD, - subGamma, 0.0, psi); + auto subD = Kokkos::subview(D, Kokkos::ALL(), Kokkos::pair(1, order + 1)); + auto subGamma = Kokkos::subview(gamma, Kokkos::pair(1, order + 1)); + KokkosBlas::Experimental::serial_gemv('N', 1.0 / alpha[order], subD, subGamma, 0.0, psi); sys.compute_jac = true; sys.c = dt / alpha[order]; @@ -420,23 
+373,20 @@ KOKKOS_FUNCTION void BDFStep(ode_type& ode, scalar_type& t, scalar_type& dt, Kokkos::Experimental::local_deep_copy(y_new, y_predict); Kokkos::Experimental::local_deep_copy(update, 0); KokkosODE::Experimental::newton_solver_status newton_status = - KokkosODE::Experimental::Newton::Solve(sys, param, jac, tmp_gesv, y_new, - rhs, update, scale); + KokkosODE::Experimental::Newton::Solve(sys, param, jac, tmp_gesv, y_new, rhs, update, scale); for (int eqIdx = 0; eqIdx < sys.neqs; ++eqIdx) { update(eqIdx) = y_new(eqIdx) - y_predict(eqIdx); } - if (newton_status == - KokkosODE::Experimental::newton_solver_status::MAX_ITER) { + if (newton_status == KokkosODE::Experimental::newton_solver_status::MAX_ITER) { dt = 0.5 * dt; update_D(order, 0.5, coeffs, tempD, D); num_equal_steps = 0; } else { // Estimate the solution error - safety = 0.9 * (2 * max_newton_iters + 1) / - (2 * max_newton_iters + param.iters); + safety = 0.9 * (2 * max_newton_iters + 1) / (2 * max_newton_iters + param.iters); error_norm = 0; for (int eqIdx = 0; eqIdx < sys.neqs; ++eqIdx) { scale(eqIdx) = atol + rtol * Kokkos::abs(y_new(eqIdx)); @@ -447,9 +397,8 @@ KOKKOS_FUNCTION void BDFStep(ode_type& ode, scalar_type& t, scalar_type& dt, // Check error norm and adapt step size or accept step if (error_norm > 1) { - scalar_type factor = Kokkos::max( - min_factor, safety * Kokkos::pow(error_norm, -1.0 / (order + 1))); - dt = factor * dt; + scalar_type factor = Kokkos::max(min_factor, safety * Kokkos::pow(error_norm, -1.0 / (order + 1))); + dt = factor * dt; update_D(order, factor, coeffs, tempD, D); num_equal_steps = 0; } else { @@ -483,8 +432,7 @@ KOKKOS_FUNCTION void BDFStep(ode_type& ode, scalar_type& t, scalar_type& dt, if (1 < order) { for (int eqIdx = 0; eqIdx < sys.neqs; ++eqIdx) { - error_low += Kokkos::pow( - error_const[order - 1] * D(eqIdx, order) / scale(eqIdx), 2); + error_low += Kokkos::pow(error_const[order - 1] * D(eqIdx, order) / scale(eqIdx), 2); } error_low = Kokkos::sqrt(error_low) / 
Kokkos::sqrt(sys.neqs); } else { @@ -493,8 +441,7 @@ KOKKOS_FUNCTION void BDFStep(ode_type& ode, scalar_type& t, scalar_type& dt, if (order < max_order) { for (int eqIdx = 0; eqIdx < sys.neqs; ++eqIdx) { - error_high += Kokkos::pow( - error_const[order + 1] * D(eqIdx, order + 2) / scale(eqIdx), 2); + error_high += Kokkos::pow(error_const[order + 1] * D(eqIdx, order + 2) / scale(eqIdx), 2); } error_high = Kokkos::sqrt(error_high) / Kokkos::sqrt(sys.neqs); } else { diff --git a/ode/impl/KokkosODE_Newton_impl.hpp b/ode/impl/KokkosODE_Newton_impl.hpp index ae573801ac..1ca545689a 100644 --- a/ode/impl/KokkosODE_Newton_impl.hpp +++ b/ode/impl/KokkosODE_Newton_impl.hpp @@ -30,19 +30,18 @@ namespace KokkosODE { namespace Impl { -template +template KOKKOS_FUNCTION KokkosODE::Experimental::newton_solver_status NewtonSolve( - system_type& sys, const KokkosODE::Experimental::Newton_params& params, - mat_type& J, mat_type& tmp, ini_vec_type& y0, rhs_vec_type& rhs, - update_type& update, const scale_type& scale) { + system_type& sys, const KokkosODE::Experimental::Newton_params& params, mat_type& J, mat_type& tmp, + ini_vec_type& y0, rhs_vec_type& rhs, update_type& update, const scale_type& scale) { using newton_solver_status = KokkosODE::Experimental::newton_solver_status; using value_type = typename ini_vec_type::non_const_value_type; // Define the type returned by nrm2 to store // the norm of the residual. 
- using norm_type = typename Kokkos::Details::InnerProductSpaceTraits< - typename ini_vec_type::non_const_value_type>::mag_type; + using norm_type = + typename Kokkos::Details::InnerProductSpaceTraits::mag_type; sys.residual(y0, rhs); const norm_type norm0 = KokkosBlas::serial_nrm2(rhs); norm_type norm = Kokkos::ArithTraits::zero(); @@ -50,9 +49,8 @@ KOKKOS_FUNCTION KokkosODE::Experimental::newton_solver_status NewtonSolve( norm_type norm_new = Kokkos::ArithTraits::zero(); norm_type rate = Kokkos::ArithTraits::zero(); - const norm_type tol = - Kokkos::max(10 * Kokkos::ArithTraits::eps() / params.rel_tol, - Kokkos::min(0.03, Kokkos::sqrt(params.rel_tol))); + const norm_type tol = Kokkos::max(10 * Kokkos::ArithTraits::eps() / params.rel_tol, + Kokkos::min(0.03, Kokkos::sqrt(params.rel_tol))); // LBV - 07/24/2023: for now assume that we take // a full Newton step. Eventually this value can @@ -73,9 +71,7 @@ KOKKOS_FUNCTION KokkosODE::Experimental::newton_solver_status NewtonSolve( sys.jacobian(y0, J); // solve linear problem - int linSolverStat = - KokkosBatched::SerialGesv::invoke( - J, update, rhs, tmp); + int linSolverStat = KokkosBatched::SerialGesv::invoke(J, update, rhs, tmp); KokkosBlas::SerialScale::invoke(-1, update); // update solution // x = x + alpha*update @@ -89,9 +85,7 @@ KOKKOS_FUNCTION KokkosODE::Experimental::newton_solver_status NewtonSolve( norm_new = Kokkos::sqrt(norm_new / sys.neqs); if ((it > 0) && norm_old > Kokkos::ArithTraits::zero()) { rate = norm_new / norm_old; - if ((rate >= 1) || - Kokkos::pow(rate, params.max_iters - it) / (1 - rate) * norm_new > - tol) { + if ((rate >= 1) || Kokkos::pow(rate, params.max_iters - it) / (1 - rate) * norm_new > tol) { return newton_solver_status::NLS_DIVERGENCE; } else if ((norm_new == 0) || ((rate / (1 - rate)) * norm_new < tol)) { return newton_solver_status::NLS_SUCCESS; @@ -103,8 +97,7 @@ KOKKOS_FUNCTION KokkosODE::Experimental::newton_solver_status NewtonSolve( return 
newton_solver_status::LIN_SOLVE_FAIL; } - if ((norm < (params.rel_tol * norm0)) || - (it > 0 ? KokkosBlas::serial_nrm2(update) < params.abs_tol : false)) { + if ((norm < (params.rel_tol * norm0)) || (it > 0 ? KokkosBlas::serial_nrm2(update) < params.abs_tol : false)) { return newton_solver_status::NLS_SUCCESS; } diff --git a/ode/impl/KokkosODE_RungeKuttaTables_impl.hpp b/ode/impl/KokkosODE_RungeKuttaTables_impl.hpp index 85a8ec0b45..6a0770d1a7 100644 --- a/ode/impl/KokkosODE_RungeKuttaTables_impl.hpp +++ b/ode/impl/KokkosODE_RungeKuttaTables_impl.hpp @@ -83,8 +83,7 @@ struct ButcherTableau<1, 1> // Euler-Heun Method static constexpr int order = 2; static constexpr int nstages = 2; // total dimensions, nstagesxnstages system Kokkos::Array a{ - {0.0, 1.0, - 0.0}}; //(nstages*nstages+nstages)/2 size of lower triangular matrix + {0.0, 1.0, 0.0}}; //(nstages*nstages+nstages)/2 size of lower triangular matrix Kokkos::Array b{{0.5, 0.5}}; Kokkos::Array c{{0.0, 1.0}}; Kokkos::Array e{{-0.5, 0.5}}; @@ -100,12 +99,10 @@ struct ButcherTableau<1, 2> // Known as Fehlberg 1-2 method { static constexpr int order = 2; static constexpr int nstages = 3; - Kokkos::Array a{ - {0.0, 0.5, 0.0, 1.0 / 256.0, 255.0 / 256.0, 0.0}}; + Kokkos::Array a{{0.0, 0.5, 0.0, 1.0 / 256.0, 255.0 / 256.0, 0.0}}; Kokkos::Array b{{1.0 / 512.0, 255.0 / 256.0, 1. 
/ 512}}; Kokkos::Array c{{0.0, 1.0 / 2.0, 1.0}}; - Kokkos::Array e{ - {1.0 / 256.0 - 1.0 / 512.0, 0.0, -1.0 / 512.0}}; + Kokkos::Array e{{1.0 / 256.0 - 1.0 / 512.0, 0.0, -1.0 / 512.0}}; }; // Coefficients obtained from: @@ -119,12 +116,10 @@ struct ButcherTableau<2, 3> // Bogacki-Shampine method static constexpr int order = 3; static constexpr int nstages = 4; Kokkos::Array a{ - {0.0, 0.5, 0.0, 0.0, 3.0 / 4.0, 0.0, 2.0 / 9.0, 1.0 / 3.0, 4.0 / 9.0, - 0.0}}; + {0.0, 0.5, 0.0, 0.0, 3.0 / 4.0, 0.0, 2.0 / 9.0, 1.0 / 3.0, 4.0 / 9.0, 0.0}}; Kokkos::Array b{{2.0 / 9.0, 1.0 / 3.0, 4.0 / 9.0, 0.0}}; Kokkos::Array c{{0.0, 0.5, 0.75, 1.0}}; - Kokkos::Array e{{2.0 / 9.0 - 7.0 / 24.0, 1.0 / 3.0 - 0.25, - 4.0 / 9.0 - 1.0 / 3.0, -1.0 / 8.0}}; + Kokkos::Array e{{2.0 / 9.0 - 7.0 / 24.0, 1.0 / 3.0 - 0.25, 4.0 / 9.0 - 1.0 / 3.0, -1.0 / 8.0}}; }; // Coefficients obtained from: @@ -136,10 +131,8 @@ struct ButcherTableau<3, 3> // RK4 { static constexpr int order = 4; static constexpr int nstages = 4; - Kokkos::Array a{ - {0.0, 0.5, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 1.0, 0.0}}; - Kokkos::Array b{ - {1.0 / 6.0, 1.0 / 3.0, 1.0 / 3.0, 1.0 / 6.0}}; + Kokkos::Array a{{0.0, 0.5, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 1.0, 0.0}}; + Kokkos::Array b{{1.0 / 6.0, 1.0 / 3.0, 1.0 / 3.0, 1.0 / 6.0}}; Kokkos::Array c{{0.0, 0.5, 0.5, 1.0}}; Kokkos::Array e{{1.0 / 6.0, 0.0, -1.0 / 3.0, 1.0 / 6.0}}; }; @@ -175,14 +168,10 @@ struct ButcherTableau<4, 5> // Fehlberg Method 1859.0 / 4104.0, -11.0 / 40.0, 0.0}}; - Kokkos::Array b{{16.0 / 135.0, 0.0, 6656.0 / 12825.0, - 28561.0 / 56430.0, -9.0 / 50.0, - 2.0 / 55.0}}; - Kokkos::Array c{ - {0.0, 0.25, 3.0 / 8.0, 12.0 / 13.0, 1.0, 0.5}}; - Kokkos::Array e{ - {16.0 / 135.0 - 25.0 / 216.0, 0.0, 6656.0 / 12825.0 - 1408.0 / 2565.0, - 28561.0 / 56430.0 - 2197.0 / 4104.0, -9.0 / 50.0 + 0.2, 2.0 / 55.0}}; + Kokkos::Array b{{16.0 / 135.0, 0.0, 6656.0 / 12825.0, 28561.0 / 56430.0, -9.0 / 50.0, 2.0 / 55.0}}; + Kokkos::Array c{{0.0, 0.25, 3.0 / 8.0, 12.0 / 13.0, 1.0, 0.5}}; + 
Kokkos::Array e{{16.0 / 135.0 - 25.0 / 216.0, 0.0, 6656.0 / 12825.0 - 1408.0 / 2565.0, + 28561.0 / 56430.0 - 2197.0 / 4104.0, -9.0 / 50.0 + 0.2, 2.0 / 55.0}}; }; // Coefficients obtained from: @@ -195,35 +184,31 @@ struct ButcherTableau<4, 5, 1> // Cash-Karp { static constexpr int order = 5; static constexpr int nstages = 6; - Kokkos::Array a{ - {0.0, - 0.2, - 0.0, - 3.0 / 40.0, - 9.0 / 40.0, - 0.0, - 0.3, - -0.9, - 1.2, - 0.0, - -11.0 / 54.0, - 2.5, - -70.0 / 27.0, - 35.0 / 27.0, - 0.0, - 1631.0 / 55296.0, - 175.0 / 512.0, - 575.0 / 13824.0, - 44275.0 / 110592.0, - 253.0 / 4096.0, - 0.0}}; - Kokkos::Array b{ - {37.0 / 378.0, 0.0, 250.0 / 621.0, 125.0 / 594.0, 0.0, 512.0 / 1771.0}}; + Kokkos::Array a{{0.0, + 0.2, + 0.0, + 3.0 / 40.0, + 9.0 / 40.0, + 0.0, + 0.3, + -0.9, + 1.2, + 0.0, + -11.0 / 54.0, + 2.5, + -70.0 / 27.0, + 35.0 / 27.0, + 0.0, + 1631.0 / 55296.0, + 175.0 / 512.0, + 575.0 / 13824.0, + 44275.0 / 110592.0, + 253.0 / 4096.0, + 0.0}}; + Kokkos::Array b{{37.0 / 378.0, 0.0, 250.0 / 621.0, 125.0 / 594.0, 0.0, 512.0 / 1771.0}}; Kokkos::Array c{{0.0, 0.2, 0.3, 0.6, 1.0, 7.0 / 8.0}}; - Kokkos::Array e{{37.0 / 378.0 - 2825.0 / 27648.0, 0.0, - 250.0 / 621.0 - 18575.0 / 48384.0, - 125.0 / 594.0 - 13525.0 / 55296.0, - -277.0 / 14336.0, 512.0 / 1771.0 - 0.25}}; + Kokkos::Array e{{37.0 / 378.0 - 2825.0 / 27648.0, 0.0, 250.0 / 621.0 - 18575.0 / 48384.0, + 125.0 / 594.0 - 13525.0 / 55296.0, -277.0 / 14336.0, 512.0 / 1771.0 - 0.25}}; }; // Coefficients obtained from: @@ -264,14 +249,12 @@ struct ButcherTableau<4, 6> // Referred to as DOPRI5 or RKDP -2187.0 / 6784.0, 11.0 / 84.0, 0.0}}; - Kokkos::Array b{{35.0 / 384.0, 0.0, 500.0 / 1113.0, - 125.0 / 192.0, -2187.0 / 6784.0, - 11.0 / 84.0, 0.0}}; + Kokkos::Array b{ + {35.0 / 384.0, 0.0, 500.0 / 1113.0, 125.0 / 192.0, -2187.0 / 6784.0, 11.0 / 84.0, 0.0}}; Kokkos::Array c{{0.0, 0.2, 0.3, 0.8, 8.0 / 9.0, 1.0, 1.0}}; - Kokkos::Array e{ - {35.0 / 384.0 - 5179.0 / 57600.0, 0.0, 500.0 / 1113.0 - 7571.0 / 16695.0, - 125.0 / 
192.0 - 393.0 / 640.0, -2187.0 / 6784.0 + 92097.0 / 339200.0, - 11.0 / 84.0 - 187.0 / 2100.0, -1.0 / 40.0}}; + Kokkos::Array e{{35.0 / 384.0 - 5179.0 / 57600.0, 0.0, 500.0 / 1113.0 - 7571.0 / 16695.0, + 125.0 / 192.0 - 393.0 / 640.0, -2187.0 / 6784.0 + 92097.0 / 339200.0, + 11.0 / 84.0 - 187.0 / 2100.0, -1.0 / 40.0}}; }; } // namespace Impl diff --git a/ode/impl/KokkosODE_RungeKutta_impl.hpp b/ode/impl/KokkosODE_RungeKutta_impl.hpp index f5fe39d65d..83ab76758f 100644 --- a/ode/impl/KokkosODE_RungeKutta_impl.hpp +++ b/ode/impl/KokkosODE_RungeKutta_impl.hpp @@ -30,12 +30,9 @@ namespace Impl { // k_i = f(t+c_i*dt, y_old+sum(a_{ij}*k_i)) j in [1, i-1] // we need to compute the k_i and store them as we go // to use them for k_{i+1} computation. -template -KOKKOS_FUNCTION void RKStep(ode_type& ode, const table_type& table, - const bool adaptivity, scalar_type t, - scalar_type dt, const vec_type& y_old, - const vec_type& y_new, const vec_type& temp, +template +KOKKOS_FUNCTION void RKStep(ode_type& ode, const table_type& table, const bool adaptivity, scalar_type t, + scalar_type dt, const vec_type& y_old, const vec_type& y_new, const vec_type& temp, const mv_type& k_vecs) { const int neqs = ode.neqs; const int nstages = table.nstages; @@ -64,8 +61,7 @@ KOKKOS_FUNCTION void RKStep(ode_type& ode, const table_type& table, for (int idx = 0; idx < stageIdx; ++idx) { for (int eqIdx = 0; eqIdx < neqs; ++eqIdx) { - temp(eqIdx) += - table.a[stageIdx * (stageIdx + 1) / 2 + idx] * k_vecs(idx, eqIdx); + temp(eqIdx) += table.a[stageIdx * (stageIdx + 1) / 2 + idx] * k_vecs(idx, eqIdx); } } KokkosBlas::SerialScale::invoke(dt, temp); @@ -88,13 +84,12 @@ KOKKOS_FUNCTION void RKStep(ode_type& ode, const table_type& table, } } // RKStep -template -KOKKOS_FUNCTION Experimental::ode_solver_status RKSolve( - const ode_type& ode, const table_type& table, - const KokkosODE::Experimental::ODE_params& params, - const scalar_type t_start, const scalar_type t_end, const vec_type& y0, - const 
vec_type& y, const vec_type& temp, const mv_type& k_vecs) { +template +KOKKOS_FUNCTION Experimental::ode_solver_status RKSolve(const ode_type& ode, const table_type& table, + const KokkosODE::Experimental::ODE_params& params, + const scalar_type t_start, const scalar_type t_end, + const vec_type& y0, const vec_type& y, const vec_type& temp, + const mv_type& k_vecs) { constexpr scalar_type error_threshold = 1; bool adapt = params.adaptivity; bool dt_was_reduced; @@ -107,8 +102,7 @@ KOKKOS_FUNCTION Experimental::ode_solver_status RKSolve( scalar_type dt = (t_end - t_start) / params.max_steps; // Loop over time steps to integrate ODE - for (int stepIdx = 0; (stepIdx < params.max_steps) && (t_now <= t_end); - ++stepIdx) { + for (int stepIdx = 0; (stepIdx < params.max_steps) && (t_now <= t_end); ++stepIdx) { // Check that the step attempted is not putting // the solution past t_end, otherwise shrink dt if (t_end < t_now + dt) { @@ -138,9 +132,7 @@ KOKKOS_FUNCTION Experimental::ode_solver_status RKSolve( for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { error = Kokkos::max(error, Kokkos::abs(temp(eqIdx))); tol = Kokkos::max( - tol, params.abs_tol + - params.rel_tol * Kokkos::max(Kokkos::abs(y(eqIdx)), - Kokkos::abs(y0(eqIdx)))); + tol, params.abs_tol + params.rel_tol * Kokkos::max(Kokkos::abs(y(eqIdx)), Kokkos::abs(y0(eqIdx)))); } error = error / tol; @@ -148,12 +140,11 @@ KOKKOS_FUNCTION Experimental::ode_solver_status RKSolve( // is too large and current step // is rejected. 
if (error > 1) { - dt = dt * Kokkos::max(0.2, 0.8 / Kokkos::pow(error, 1 / table.order)); + dt = dt * Kokkos::max(0.2, 0.8 / Kokkos::pow(error, 1 / table.order)); dt_was_reduced = true; } - if (dt < params.min_step_size) - return Experimental::ode_solver_status::MIN_SIZE; + if (dt < params.min_step_size) return Experimental::ode_solver_status::MIN_SIZE; } } @@ -166,10 +157,7 @@ KOKKOS_FUNCTION Experimental::ode_solver_status RKSolve( if (t_now < t_end) { if (adapt && !dt_was_reduced && error < 0.5) { // Compute new time increment - dt = dt * - Kokkos::min( - 10.0, - Kokkos::max(2.0, 0.9 * Kokkos::pow(error, 1 / table.order))); + dt = dt * Kokkos::min(10.0, Kokkos::max(2.0, 0.9 * Kokkos::pow(error, 1 / table.order))); } } else { return Experimental::ode_solver_status::SUCCESS; diff --git a/ode/src/KokkosODE_BDF.hpp b/ode/src/KokkosODE_BDF.hpp index 71a450a1c6..419316ba45 100644 --- a/ode/src/KokkosODE_BDF.hpp +++ b/ode/src/KokkosODE_BDF.hpp @@ -29,14 +29,7 @@ namespace KokkosODE { namespace Experimental { -enum BDF_type : int { - BDF1 = 0, - BDF2 = 1, - BDF3 = 2, - BDF4 = 3, - BDF5 = 4, - BDF6 = 5 -}; +enum BDF_type : int { BDF1 = 0, BDF2 = 1, BDF3 = 2, BDF4 = 3, BDF5 = 4, BDF6 = 5 }; template struct BDF_coeff_helper { @@ -91,14 +84,11 @@ template struct BDF { using table_type = typename BDF_coeff_helper::table_type; - template - KOKKOS_FUNCTION static void Solve( - const ode_type& ode, const scalar_type t_start, const scalar_type t_end, - const int num_steps, const vec_type& y0, const vec_type& y, - const vec_type& rhs, const vec_type& update, const vec_type& scale, - const mv_type& y_vecs, const mv_type& kstack, const mat_type& temp, - const mat_type& jac) { + template + KOKKOS_FUNCTION static void Solve(const ode_type& ode, const scalar_type t_start, const scalar_type t_end, + const int num_steps, const vec_type& y0, const vec_type& y, const vec_type& rhs, + const vec_type& update, const vec_type& scale, const mv_type& y_vecs, + const mv_type& kstack, const 
mat_type& temp, const mat_type& jac) { const table_type table{}; const double dt = (t_end - t_start) / num_steps; @@ -117,8 +107,7 @@ struct BDF { } KokkosODE::Experimental::ODE_params params(table.order - 1); for (int stepIdx = 0; stepIdx < init_steps; ++stepIdx) { - KokkosODE::Experimental::RungeKutta::Solve( - ode, params, t, t + dt, y0, y, update, kstack); + KokkosODE::Experimental::RungeKutta::Solve(ode, params, t, t + dt, y0, y, update, kstack); for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { y_vecs(eqIdx, stepIdx + 1) = y(eqIdx); @@ -128,8 +117,7 @@ struct BDF { } for (int stepIdx = init_steps; stepIdx < num_steps; ++stepIdx) { - KokkosODE::Impl::BDFStep(ode, table, t, dt, y0, y, rhs, update, scale, - y_vecs, temp, jac); + KokkosODE::Impl::BDFStep(ode, table, t, dt, y0, y, rhs, update, scale, y_vecs, temp, jac); // Update history for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { @@ -167,12 +155,9 @@ struct BDF { /// \param temp [in]: vectors for temporary storage /// \param temp2 [in]: vectors for temporary storage template -KOKKOS_FUNCTION void BDFSolve(const ode_type& ode, const scalar_type t_start, - const scalar_type t_end, - const scalar_type initial_step, - const scalar_type max_step, const vec_type& y0, - const vec_type& y_new, mat_type& temp, - mat_type& temp2) { +KOKKOS_FUNCTION void BDFSolve(const ode_type& ode, const scalar_type t_start, const scalar_type t_end, + const scalar_type initial_step, const scalar_type max_step, const vec_type& y0, + const vec_type& y_new, mat_type& temp, mat_type& temp2) { using KAT = Kokkos::ArithTraits; // This needs to go away and be pulled out of temp instead... @@ -195,8 +180,7 @@ KOKKOS_FUNCTION void BDFSolve(const ode_type& ode, const scalar_type t_start, // Check if we need to compute the initial // time step size. 
if (initial_step == KAT::zero()) { - KokkosODE::Impl::initial_step_size(ode, order, t_start, atol, rtol, y0, rhs, - temp, dt); + KokkosODE::Impl::initial_step_size(ode, order, t_start, atol, rtol, y0, rhs, temp, dt); } // Initialize D(:, 0) = y0 and D(:, 1) = dt*rhs @@ -210,8 +194,7 @@ KOKKOS_FUNCTION void BDFSolve(const ode_type& ode, const scalar_type t_start, // Now we loop over the time interval [t_start, t_end] // and solve our ODE. while (t < t_end) { - KokkosODE::Impl::BDFStep(ode, t, dt, t_end, order, num_equal_steps, - max_newton_iters, atol, rtol, min_factor, y0, + KokkosODE::Impl::BDFStep(ode, t, dt, t_end, order, num_equal_steps, max_newton_iters, atol, rtol, min_factor, y0, y_new, rhs, update, temp, temp2); for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { diff --git a/ode/src/KokkosODE_Newton.hpp b/ode/src/KokkosODE_Newton.hpp index ffccba5cd3..5686423e9e 100644 --- a/ode/src/KokkosODE_Newton.hpp +++ b/ode/src/KokkosODE_Newton.hpp @@ -30,14 +30,13 @@ namespace Experimental { /// \brief Newton solver for non-linear system of equations struct Newton { - template - KOKKOS_FUNCTION static newton_solver_status Solve( - const system_type& sys, const Newton_params& params, const mat_type& J, - const mat_type& tmp, const ini_vec_type& y0, const rhs_vec_type& rhs, - const update_type& update, const scale_type& scale) { - return KokkosODE::Impl::NewtonSolve(sys, params, J, tmp, y0, rhs, update, - scale); + template + KOKKOS_FUNCTION static newton_solver_status Solve(const system_type& sys, const Newton_params& params, + const mat_type& J, const mat_type& tmp, const ini_vec_type& y0, + const rhs_vec_type& rhs, const update_type& update, + const scale_type& scale) { + return KokkosODE::Impl::NewtonSolve(sys, params, J, tmp, y0, rhs, update, scale); } }; diff --git a/ode/src/KokkosODE_RungeKutta.hpp b/ode/src/KokkosODE_RungeKutta.hpp index b4711de81c..2d298a6568 100644 --- a/ode/src/KokkosODE_RungeKutta.hpp +++ b/ode/src/KokkosODE_RungeKutta.hpp @@ -31,8 +31,8 @@ 
namespace Experimental { /// \brief RK_type is an enum tye that conveniently /// describes the Runge-Kutta methods implemented. enum RK_type : int { - RKFE = 0, ///< Forward Euler method (no adaptivity available for this method) - RKEH = 1, ///< Euler-Heun method + RKFE = 0, ///< Forward Euler method (no adaptivity available for this method) + RKEH = 1, ///< Euler-Heun method RKF12 = 2, ///< Fehlberg order 2 method RKBS = 3, ///< Bogacki-Shampine method RK4 = 4, ///< Runge-Kutta classic order 4 method @@ -126,13 +126,11 @@ struct RungeKutta { /// \return ode_solver_status an enum that describes success of failure /// of the integration method once it at terminated. template - KOKKOS_FUNCTION static ode_solver_status Solve( - const ode_type& ode, const KokkosODE::Experimental::ODE_params& params, - const scalar_type t_start, const scalar_type t_end, const vec_type& y0, - const vec_type& y, const vec_type& temp, const mv_type& k_vecs) { + KOKKOS_FUNCTION static ode_solver_status Solve(const ode_type& ode, const KokkosODE::Experimental::ODE_params& params, + const scalar_type t_start, const scalar_type t_end, const vec_type& y0, + const vec_type& y, const vec_type& temp, const mv_type& k_vecs) { table_type table; - return KokkosODE::Impl::RKSolve(ode, table, params, t_start, t_end, y0, y, - temp, k_vecs); + return KokkosODE::Impl::RKSolve(ode, table, params, t_start, t_end, y0, y, temp, k_vecs); } }; diff --git a/ode/src/KokkosODE_Types.hpp b/ode/src/KokkosODE_Types.hpp index 5fb2c44846..2145afb718 100644 --- a/ode/src/KokkosODE_Types.hpp +++ b/ode/src/KokkosODE_Types.hpp @@ -32,17 +32,12 @@ struct ODE_params { // be constant such that dt = (tend - tstart) / num_steps; KOKKOS_FUNCTION ODE_params(const int num_steps_) - : adaptivity(false), - num_steps(num_steps_), - max_steps(num_steps_), - abs_tol(0), - rel_tol(0), - min_step_size(0) {} + : adaptivity(false), num_steps(num_steps_), max_steps(num_steps_), abs_tol(0), rel_tol(0), min_step_size(0) {} /// ODE_parms 
construtor for adaptive time stepping. KOKKOS_FUNCTION - ODE_params(const int num_steps_, const int max_steps_, const double abs_tol_, - const double rel_tol_, const double min_step_size_) + ODE_params(const int num_steps_, const int max_steps_, const double abs_tol_, const double rel_tol_, + const double min_step_size_) : adaptivity(true), num_steps(num_steps_), max_steps(max_steps_), @@ -68,8 +63,7 @@ struct Newton_params { // double abs_tol_ [in]: absolute tolerance to reach for successful solve // double rel_tol_ [in]: relative tolerance to reach for successful solve KOKKOS_FUNCTION - Newton_params(const int max_iters_, const double abs_tol_, - const double rel_tol_) + Newton_params(const int max_iters_, const double abs_tol_, const double rel_tol_) : max_iters(max_iters_), abs_tol(abs_tol_), rel_tol(rel_tol_) {} }; diff --git a/ode/unit_test/Test_ODE_BDF.hpp b/ode/unit_test/Test_ODE_BDF.hpp index 8360302971..8f8319cb1d 100644 --- a/ode/unit_test/Test_ODE_BDF.hpp +++ b/ode/unit_test/Test_ODE_BDF.hpp @@ -37,23 +37,19 @@ struct Logistic { Logistic(double r_, double K_) : r(r_), K(K_){}; template - KOKKOS_FUNCTION void evaluate_function(const double /*t*/, - const double /*dt*/, - const vec_type1& y, + KOKKOS_FUNCTION void evaluate_function(const double /*t*/, const double /*dt*/, const vec_type1& y, const vec_type2& f) const { f(0) = r * y(0) * (1.0 - y(0) / K); } template - KOKKOS_FUNCTION void evaluate_jacobian(const double /*t*/, - const double /*dt*/, const vec_type& y, + KOKKOS_FUNCTION void evaluate_jacobian(const double /*t*/, const double /*dt*/, const vec_type& y, const mat_type& jac) const { jac(0, 0) = r - 2 * r * y(0) / K; } template - KOKKOS_FUNCTION void solution(const double t, const vec_type& y0, - const vec_type& y) const { + KOKKOS_FUNCTION void solution(const double t, const vec_type& y0, const vec_type& y) const { y(0) = K / (1 + (K - y0) / y0 * Kokkos::exp(-r * t)); } @@ -78,17 +74,14 @@ struct LotkaVolterra { : alpha(alpha_), beta(beta_), 
delta(delta_), gamma(gamma_){}; template - KOKKOS_FUNCTION void evaluate_function(const double /*t*/, - const double /*dt*/, - const vec_type1& y, + KOKKOS_FUNCTION void evaluate_function(const double /*t*/, const double /*dt*/, const vec_type1& y, const vec_type2& f) const { f(0) = alpha * y(0) - beta * y(0) * y(1); f(1) = delta * y(0) * y(1) - gamma * y(1); } template - KOKKOS_FUNCTION void evaluate_jacobian(const double /*t*/, - const double /*dt*/, const vec_type& y, + KOKKOS_FUNCTION void evaluate_jacobian(const double /*t*/, const double /*dt*/, const vec_type& y, const mat_type& jac) const { jac(0, 0) = alpha - beta * y(1); jac(0, 1) = -beta * y(0); @@ -112,9 +105,7 @@ struct StiffChemistry { StiffChemistry() {} template - KOKKOS_FUNCTION void evaluate_function(const double /*t*/, - const double /*dt*/, - const vec_type1& y, + KOKKOS_FUNCTION void evaluate_function(const double /*t*/, const double /*dt*/, const vec_type1& y, const vec_type2& f) const { f(0) = -0.04 * y(0) + 1.e4 * y(1) * y(2); f(1) = 0.04 * y(0) - 1.e4 * y(1) * y(2) - 3.e7 * y(1) * y(1); @@ -122,8 +113,7 @@ struct StiffChemistry { } template - KOKKOS_FUNCTION void evaluate_jacobian(const double /*t*/, - const double /*dt*/, const vec_type& y, + KOKKOS_FUNCTION void evaluate_jacobian(const double /*t*/, const double /*dt*/, const vec_type& y, const mat_type& jac) const { jac(0, 0) = -0.04; jac(0, 1) = 1.e4 * y(2); @@ -137,8 +127,8 @@ struct StiffChemistry { } }; -template +template struct BDFSolve_wrapper { ode_type my_ode; scalar_type tstart, tend; @@ -147,12 +137,9 @@ struct BDFSolve_wrapper { mv_type y_vecs, kstack; mat_type temp, jac; - BDFSolve_wrapper(const ode_type& my_ode_, const scalar_type tstart_, - const scalar_type tend_, const int num_steps_, - const vec_type& y_old_, const vec_type& y_new_, - const vec_type& rhs_, const vec_type& update_, - const vec_type& scale_, const mv_type& y_vecs_, - const mv_type& kstack_, const mat_type& temp_, + BDFSolve_wrapper(const ode_type& 
my_ode_, const scalar_type tstart_, const scalar_type tend_, const int num_steps_, + const vec_type& y_old_, const vec_type& y_new_, const vec_type& rhs_, const vec_type& update_, + const vec_type& scale_, const mv_type& y_vecs_, const mv_type& kstack_, const mat_type& temp_, const mat_type& jac_) : my_ode(my_ode_), tstart(tstart_), @@ -170,9 +157,8 @@ struct BDFSolve_wrapper { KOKKOS_FUNCTION void operator()(const int /*idx*/) const { - KokkosODE::Experimental::BDF::Solve( - my_ode, tstart, tend, num_steps, y_old, y_new, rhs, update, scale, - y_vecs, kstack, temp, jac); + KokkosODE::Experimental::BDF::Solve(my_ode, tstart, tend, num_steps, y_old, y_new, rhs, update, scale, + y_vecs, kstack, temp, jac); } }; @@ -183,11 +169,9 @@ struct BDF_Solve_wrapper { const vec_type y0, y_new; const mat_type temp, temp2; - BDF_Solve_wrapper(const ode_type& my_ode_, const scalar_type& t_start_, - const scalar_type& t_end_, const scalar_type& dt_, - const scalar_type& max_step_, const vec_type& y0_, - const vec_type& y_new_, const mat_type& temp_, - const mat_type& temp2_) + BDF_Solve_wrapper(const ode_type& my_ode_, const scalar_type& t_start_, const scalar_type& t_end_, + const scalar_type& dt_, const scalar_type& max_step_, const vec_type& y0_, const vec_type& y_new_, + const mat_type& temp_, const mat_type& temp2_) : my_ode(my_ode_), t_start(t_start_), t_end(t_end_), @@ -199,8 +183,7 @@ struct BDF_Solve_wrapper { temp2(temp2_) {} KOKKOS_FUNCTION void operator()(const int) const { - KokkosODE::Experimental::BDFSolve(my_ode, t_start, t_end, dt, max_step, y0, - y_new, temp, temp2); + KokkosODE::Experimental::BDFSolve(my_ode, t_start, t_end, dt, max_step, y0, y_new, temp, temp2); } }; @@ -221,8 +204,7 @@ void test_BDF_Logistic() { vec_type y0("initial conditions", mySys.neqs), y_new("solution", mySys.neqs); vec_type rhs("rhs", mySys.neqs), update("update", mySys.neqs); vec_type scale("scaling factors", mySys.neqs); - mat_type jac("jacobian", mySys.neqs, mySys.neqs), - temp("temp 
storage", mySys.neqs, mySys.neqs + 4); + mat_type jac("jacobian", mySys.neqs, mySys.neqs), temp("temp storage", mySys.neqs, mySys.neqs + 4); mv_type kstack("Startup RK vectors", 6, mySys.neqs); Kokkos::deep_copy(scale, 1); @@ -239,26 +221,21 @@ void test_BDF_Logistic() { Kokkos::deep_copy(y0, 0.5); Kokkos::deep_copy(y_vecs, 0.5); - BDFSolve_wrapper - solve_wrapper(mySys, t_start, t_end, num_steps[idx], y0, y_new, rhs, - update, scale, y_vecs, kstack, temp, jac); + BDFSolve_wrapper + solve_wrapper(mySys, t_start, t_end, num_steps[idx], y0, y_new, rhs, update, scale, y_vecs, kstack, temp, jac); Kokkos::parallel_for(myPolicy, solve_wrapper); Kokkos::fence(); auto y_new_h = Kokkos::create_mirror_view(y_new); Kokkos::deep_copy(y_new_h, y_new); - errors[idx] = Kokkos::abs(y_new_h(0) - 1 / (1 + Kokkos::exp(-t_end))) / - Kokkos::abs(1 / (1 + Kokkos::exp(-t_end))); + errors[idx] = Kokkos::abs(y_new_h(0) - 1 / (1 + Kokkos::exp(-t_end))) / Kokkos::abs(1 / (1 + Kokkos::exp(-t_end))); } - measured_order = - Kokkos::pow(errors[num_tests - 1] / errors[0], 1.0 / (num_tests - 1)); + measured_order = Kokkos::pow(errors[num_tests - 1] / errors[0], 1.0 / (num_tests - 1)); EXPECT_NEAR_KK_REL(measured_order, 2.0, 0.15); #if defined(HAVE_KOKKOSKERNELS_DEBUG) std::cout << "expected ratio: 2, actual ratio: " << measured_order - << ", order error=" << Kokkos::abs(measured_order - 2.0) / 2.0 - << std::endl; + << ", order error=" << Kokkos::abs(measured_order - 2.0) / 2.0 << std::endl; #endif // Test BDF2 @@ -269,26 +246,21 @@ void test_BDF_Logistic() { mv_type y_vecs("history vectors", mySys.neqs, 2); Kokkos::deep_copy(y0, 0.5); - BDFSolve_wrapper - solve_wrapper(mySys, t_start, t_end, num_steps[idx], y0, y_new, rhs, - update, scale, y_vecs, kstack, temp, jac); + BDFSolve_wrapper + solve_wrapper(mySys, t_start, t_end, num_steps[idx], y0, y_new, rhs, update, scale, y_vecs, kstack, temp, jac); Kokkos::parallel_for(myPolicy, solve_wrapper); Kokkos::fence(); auto y_new_h = 
Kokkos::create_mirror_view(y_new); Kokkos::deep_copy(y_new_h, y_new); - errors[idx] = Kokkos::abs(y_new_h(0) - 1 / (1 + Kokkos::exp(-t_end))) / - Kokkos::abs(1 / (1 + Kokkos::exp(-t_end))); + errors[idx] = Kokkos::abs(y_new_h(0) - 1 / (1 + Kokkos::exp(-t_end))) / Kokkos::abs(1 / (1 + Kokkos::exp(-t_end))); } - measured_order = - Kokkos::pow(errors[num_tests - 1] / errors[0], 1.0 / (num_tests - 1)); + measured_order = Kokkos::pow(errors[num_tests - 1] / errors[0], 1.0 / (num_tests - 1)); EXPECT_NEAR_KK_REL(measured_order, 4.0, 0.15); #if defined(HAVE_KOKKOSKERNELS_DEBUG) std::cout << "expected ratio: 4, actual ratio: " << measured_order - << ", order error=" << Kokkos::abs(measured_order - 4.0) / 4.0 - << std::endl; + << ", order error=" << Kokkos::abs(measured_order - 4.0) / 4.0 << std::endl; #endif // Test BDF3 @@ -299,26 +271,21 @@ void test_BDF_Logistic() { mv_type y_vecs("history vectors", mySys.neqs, 3); Kokkos::deep_copy(y0, 0.5); - BDFSolve_wrapper - solve_wrapper(mySys, t_start, t_end, num_steps[idx], y0, y_new, rhs, - update, scale, y_vecs, kstack, temp, jac); + BDFSolve_wrapper + solve_wrapper(mySys, t_start, t_end, num_steps[idx], y0, y_new, rhs, update, scale, y_vecs, kstack, temp, jac); Kokkos::parallel_for(myPolicy, solve_wrapper); Kokkos::fence(); auto y_new_h = Kokkos::create_mirror_view(y_new); Kokkos::deep_copy(y_new_h, y_new); - errors[idx] = Kokkos::abs(y_new_h(0) - 1 / (1 + Kokkos::exp(-t_end))) / - Kokkos::abs(1 / (1 + Kokkos::exp(-t_end))); + errors[idx] = Kokkos::abs(y_new_h(0) - 1 / (1 + Kokkos::exp(-t_end))) / Kokkos::abs(1 / (1 + Kokkos::exp(-t_end))); } - measured_order = - Kokkos::pow(errors[num_tests - 1] / errors[0], 1.0 / (num_tests - 1)); + measured_order = Kokkos::pow(errors[num_tests - 1] / errors[0], 1.0 / (num_tests - 1)); EXPECT_NEAR_KK_REL(measured_order, 8.0, 0.15); #if defined(HAVE_KOKKOSKERNELS_DEBUG) std::cout << "expected ratio: 8, actual ratio: " << measured_order - << ", order error=" << Kokkos::abs(measured_order - 
8.0) / 8.0 - << std::endl; + << ", order error=" << Kokkos::abs(measured_order - 8.0) / 8.0 << std::endl; #endif // Test BDF4 @@ -329,25 +296,20 @@ void test_BDF_Logistic() { mv_type y_vecs("history vectors", mySys.neqs, 4); Kokkos::deep_copy(y0, 0.5); - BDFSolve_wrapper - solve_wrapper(mySys, t_start, t_end, num_steps[idx], y0, y_new, rhs, - update, scale, y_vecs, kstack, temp, jac); + BDFSolve_wrapper + solve_wrapper(mySys, t_start, t_end, num_steps[idx], y0, y_new, rhs, update, scale, y_vecs, kstack, temp, jac); Kokkos::parallel_for(myPolicy, solve_wrapper); Kokkos::fence(); auto y_new_h = Kokkos::create_mirror_view(y_new); Kokkos::deep_copy(y_new_h, y_new); - errors[idx] = Kokkos::abs(y_new_h(0) - 1 / (1 + Kokkos::exp(-t_end))) / - Kokkos::abs(1 / (1 + Kokkos::exp(-t_end))); + errors[idx] = Kokkos::abs(y_new_h(0) - 1 / (1 + Kokkos::exp(-t_end))) / Kokkos::abs(1 / (1 + Kokkos::exp(-t_end))); } - measured_order = - Kokkos::pow(errors[num_tests - 1] / errors[0], 1.0 / (num_tests - 1)); + measured_order = Kokkos::pow(errors[num_tests - 1] / errors[0], 1.0 / (num_tests - 1)); #if defined(HAVE_KOKKOSKERNELS_DEBUG) std::cout << "expected ratio: 16, actual ratio: " << measured_order - << ", order error=" << Kokkos::abs(measured_order - 16.0) / 16.0 - << std::endl; + << ", order error=" << Kokkos::abs(measured_order - 16.0) / 16.0 << std::endl; #endif // Test BDF5 @@ -358,25 +320,20 @@ void test_BDF_Logistic() { mv_type y_vecs("history vectors", mySys.neqs, 5); Kokkos::deep_copy(y0, 0.5); - BDFSolve_wrapper - solve_wrapper(mySys, t_start, t_end, num_steps[idx], y0, y_new, rhs, - update, scale, y_vecs, kstack, temp, jac); + BDFSolve_wrapper + solve_wrapper(mySys, t_start, t_end, num_steps[idx], y0, y_new, rhs, update, scale, y_vecs, kstack, temp, jac); Kokkos::parallel_for(myPolicy, solve_wrapper); Kokkos::fence(); auto y_new_h = Kokkos::create_mirror_view(y_new); Kokkos::deep_copy(y_new_h, y_new); - errors[idx] = Kokkos::abs(y_new_h(0) - 1 / (1 + Kokkos::exp(-t_end))) / 
- Kokkos::abs(1 / (1 + Kokkos::exp(-t_end))); + errors[idx] = Kokkos::abs(y_new_h(0) - 1 / (1 + Kokkos::exp(-t_end))) / Kokkos::abs(1 / (1 + Kokkos::exp(-t_end))); } - measured_order = - Kokkos::pow(errors[num_tests - 1] / errors[0], 1.0 / (num_tests - 1)); + measured_order = Kokkos::pow(errors[num_tests - 1] / errors[0], 1.0 / (num_tests - 1)); #if defined(HAVE_KOKKOSKERNELS_DEBUG) std::cout << "expected ratio: 32, actual ratio: " << measured_order - << ", order error=" << Kokkos::abs(measured_order - 32.0) / 32.0 - << std::endl; + << ", order error=" << Kokkos::abs(measured_order - 32.0) / 32.0 << std::endl; #endif } // test_BDF_Logistic @@ -394,8 +351,7 @@ void test_BDF_LotkaVolterra() { vec_type y0("initial conditions", mySys.neqs), y_new("solution", mySys.neqs); vec_type rhs("rhs", mySys.neqs), update("update", mySys.neqs); vec_type scale("scaling factors", mySys.neqs); - mat_type jac("jacobian", mySys.neqs, mySys.neqs), - temp("temp storage", mySys.neqs, mySys.neqs + 4); + mat_type jac("jacobian", mySys.neqs, mySys.neqs), temp("temp storage", mySys.neqs, mySys.neqs + 4); Kokkos::deep_copy(scale, 1); @@ -407,10 +363,8 @@ void test_BDF_LotkaVolterra() { Kokkos::deep_copy(y_vecs, 10.0); Kokkos::RangePolicy myPolicy(0, 1); - BDFSolve_wrapper - solve_wrapper(mySys, t_start, t_end, 1000, y0, y_new, rhs, update, scale, - y_vecs, kstack, temp, jac); + BDFSolve_wrapper + solve_wrapper(mySys, t_start, t_end, 1000, y0, y_new, rhs, update, scale, y_vecs, kstack, temp, jac); Kokkos::parallel_for(myPolicy, solve_wrapper); } @@ -427,8 +381,7 @@ void test_BDF_StiffChemistry() { vec_type y0("initial conditions", mySys.neqs), y_new("solution", mySys.neqs); vec_type rhs("rhs", mySys.neqs), update("update", mySys.neqs); vec_type scale("scaling factors", mySys.neqs); - mat_type jac("jacobian", mySys.neqs, mySys.neqs), - temp("temp storage", mySys.neqs, mySys.neqs + 4); + mat_type jac("jacobian", mySys.neqs, mySys.neqs), temp("temp storage", mySys.neqs, mySys.neqs + 4); 
Kokkos::deep_copy(scale, 1); @@ -444,10 +397,8 @@ void test_BDF_StiffChemistry() { Kokkos::deep_copy(y_vecs, 0.0); Kokkos::RangePolicy myPolicy(0, 1); - BDFSolve_wrapper - solve_wrapper(mySys, t_start, t_end, 110000, y0, y_new, rhs, update, - scale, y_vecs, kstack, temp, jac); + BDFSolve_wrapper + solve_wrapper(mySys, t_start, t_end, 110000, y0, y_new, rhs, update, scale, y_vecs, kstack, temp, jac); Kokkos::parallel_for(myPolicy, solve_wrapper); } @@ -559,8 +510,7 @@ void test_BDF_StiffChemistry() { // } template -void compute_coeffs(const int order, const scalar_type factor, - const mat_type& coeffs) { +void compute_coeffs(const int order, const scalar_type factor, const mat_type& coeffs) { std::cout << "compute_coeffs" << std::endl; coeffs(0, 0) = 1.0; @@ -568,35 +518,28 @@ void compute_coeffs(const int order, const scalar_type factor, coeffs(0, colIdx + 1) = 1.0; for (int rowIdx = 0; rowIdx < order; ++rowIdx) { coeffs(rowIdx + 1, colIdx + 1) = - ((rowIdx - factor * (colIdx + 1.0)) / (rowIdx + 1.0)) * - coeffs(rowIdx, colIdx + 1); + ((rowIdx - factor * (colIdx + 1.0)) / (rowIdx + 1.0)) * coeffs(rowIdx, colIdx + 1); } } } template -void update_D(const int order, const scalar_type factor, const mat_type& coeffs, - const mat_type& tempD, const mat_type& D) { - auto subD = - Kokkos::subview(D, Kokkos::pair(0, order + 1), Kokkos::ALL); - auto subTempD = - Kokkos::subview(tempD, Kokkos::pair(0, order + 1), Kokkos::ALL); +void update_D(const int order, const scalar_type factor, const mat_type& coeffs, const mat_type& tempD, + const mat_type& D) { + auto subD = Kokkos::subview(D, Kokkos::pair(0, order + 1), Kokkos::ALL); + auto subTempD = Kokkos::subview(tempD, Kokkos::pair(0, order + 1), Kokkos::ALL); compute_coeffs(order, factor, coeffs); - auto R = Kokkos::subview(coeffs, Kokkos::pair(0, order + 1), - Kokkos::pair(0, order + 1)); + auto R = Kokkos::subview(coeffs, Kokkos::pair(0, order + 1), Kokkos::pair(0, order + 1)); std::cout << "SerialGemm" << std::endl; - 
KokkosBatched::SerialGemm< - KokkosBatched::Trans::Transpose, KokkosBatched::Trans::NoTranspose, - KokkosBatched::Algo::Gemm::Blocked>::invoke(1.0, R, subD, 0.0, subTempD); + KokkosBatched::SerialGemm::invoke(1.0, R, subD, 0.0, subTempD); compute_coeffs(order, 1.0, coeffs); - auto U = Kokkos::subview(coeffs, Kokkos::pair(0, order + 1), - Kokkos::pair(0, order + 1)); + auto U = Kokkos::subview(coeffs, Kokkos::pair(0, order + 1), Kokkos::pair(0, order + 1)); std::cout << "SerialGemm" << std::endl; - KokkosBatched::SerialGemm< - KokkosBatched::Trans::Transpose, KokkosBatched::Trans::NoTranspose, - KokkosBatched::Algo::Gemm::Blocked>::invoke(1.0, U, subTempD, 0.0, subD); + KokkosBatched::SerialGemm::invoke(1.0, U, subTempD, 0.0, subD); } template @@ -604,10 +547,8 @@ void test_Nordsieck() { using execution_space = Kokkos::HostSpace; StiffChemistry mySys{}; - Kokkos::View R("coeffs", 6, 6), - U("coeffs", 6, 6); - Kokkos::View D("D", 8, mySys.neqs), - tempD("tmp", 8, mySys.neqs); + Kokkos::View R("coeffs", 6, 6), U("coeffs", 6, 6); + Kokkos::View D("D", 8, mySys.neqs), tempD("tmp", 8, mySys.neqs); int order = 1; double factor = 0.8; @@ -639,17 +580,13 @@ void test_Nordsieck() { } std::cout << "D before update:" << std::endl; - std::cout << " { " << D(0, 0) << ", " << D(0, 1) << ", " << D(0, 2) << " }" - << std::endl; - std::cout << " { " << D(1, 0) << ", " << D(1, 1) << ", " << D(1, 2) << " }" - << std::endl; + std::cout << " { " << D(0, 0) << ", " << D(0, 1) << ", " << D(0, 2) << " }" << std::endl; + std::cout << " { " << D(1, 0) << ", " << D(1, 1) << ", " << D(1, 2) << " }" << std::endl; update_D(order, factor, R, tempD, D); std::cout << "D after update:" << std::endl; - std::cout << " { " << D(0, 0) << ", " << D(0, 1) << ", " << D(0, 2) << " }" - << std::endl; - std::cout << " { " << D(1, 0) << ", " << D(1, 1) << ", " << D(1, 2) << " }" - << std::endl; + std::cout << " { " << D(0, 0) << ", " << D(0, 1) << ", " << D(0, 2) << " }" << std::endl; + std::cout << " { " << 
D(1, 0) << ", " << D(1, 1) << ", " << D(1, 2) << " }" << std::endl; } template @@ -668,8 +605,7 @@ void test_adaptive_BDF() { vec_type y0("initial conditions", mySys.neqs), y_new("solution", mySys.neqs); vec_type rhs("rhs", mySys.neqs), update("update", mySys.neqs); - mat_type temp("buffer1", mySys.neqs, 23 + 2 * mySys.neqs + 4), - temp2("buffer2", 6, 7); + mat_type temp("buffer1", mySys.neqs, 23 + 2 * mySys.neqs + 4), temp2("buffer2", 6, 7); // Initial condition Kokkos::deep_copy(y0, 0.5); @@ -688,13 +624,11 @@ void test_adaptive_BDF() { std::cout << "Initial conditions" << std::endl; std::cout << " y0=" << y0(0) << ", t=" << t << ", dt=" << dt << std::endl; - std::cout << "Initial D: {" << D(0, 0) << ", " << D(0, 1) << ", " << D(0, 2) - << ", " << D(0, 3) << ", " << D(0, 4) << ", " << D(0, 5) << ", " - << D(0, 6) << ", " << D(0, 7) << "}" << std::endl; + std::cout << "Initial D: {" << D(0, 0) << ", " << D(0, 1) << ", " << D(0, 2) << ", " << D(0, 3) << ", " << D(0, 4) + << ", " << D(0, 5) << ", " << D(0, 6) << ", " << D(0, 7) << "}" << std::endl; - KokkosODE::Impl::BDFStep(mySys, t, dt, t_end, order, num_equal_steps, - max_newton_iters, atol, rtol, 0.2, y0, y_new, rhs, - update, temp, temp2); + KokkosODE::Impl::BDFStep(mySys, t, dt, t_end, order, num_equal_steps, max_newton_iters, atol, rtol, 0.2, y0, y_new, + rhs, update, temp, temp2); for (int eqIdx = 0; eqIdx < mySys.neqs; ++eqIdx) { y0(eqIdx) = y_new(eqIdx); @@ -706,13 +640,11 @@ void test_adaptive_BDF() { std::cout << " y0=" << y0(0) << ", t=" << t << ", dt: " << dt << std::endl; - std::cout << "Initial D: {" << D(0, 0) << ", " << D(0, 1) << ", " << D(0, 2) - << ", " << D(0, 3) << ", " << D(0, 4) << ", " << D(0, 5) << ", " - << D(0, 6) << ", " << D(0, 7) << "}" << std::endl; + std::cout << "Initial D: {" << D(0, 0) << ", " << D(0, 1) << ", " << D(0, 2) << ", " << D(0, 3) << ", " << D(0, 4) + << ", " << D(0, 5) << ", " << D(0, 6) << ", " << D(0, 7) << "}" << std::endl; - KokkosODE::Impl::BDFStep(mySys, t, dt, 
t_end, order, num_equal_steps, - max_newton_iters, atol, rtol, 0.2, y0, y_new, rhs, - update, temp, temp2); + KokkosODE::Impl::BDFStep(mySys, t, dt, t_end, order, num_equal_steps, max_newton_iters, atol, rtol, 0.2, y0, y_new, + rhs, update, temp, temp2); for (int eqIdx = 0; eqIdx < mySys.neqs; ++eqIdx) { y0(eqIdx) = y_new(eqIdx); @@ -724,13 +656,11 @@ void test_adaptive_BDF() { std::cout << " y0=" << y0(0) << ", t=" << t << ", dt: " << dt << std::endl; - std::cout << "Initial D: {" << D(0, 0) << ", " << D(0, 1) << ", " << D(0, 2) - << ", " << D(0, 3) << ", " << D(0, 4) << ", " << D(0, 5) << ", " - << D(0, 6) << ", " << D(0, 7) << "}" << std::endl; + std::cout << "Initial D: {" << D(0, 0) << ", " << D(0, 1) << ", " << D(0, 2) << ", " << D(0, 3) << ", " << D(0, 4) + << ", " << D(0, 5) << ", " << D(0, 6) << ", " << D(0, 7) << "}" << std::endl; - KokkosODE::Impl::BDFStep(mySys, t, dt, t_end, order, num_equal_steps, - max_newton_iters, atol, rtol, 0.2, y0, y_new, rhs, - update, temp, temp2); + KokkosODE::Impl::BDFStep(mySys, t, dt, t_end, order, num_equal_steps, max_newton_iters, atol, rtol, 0.2, y0, y_new, + rhs, update, temp, temp2); std::cout << "Final t: " << t << ", y=" << y_new(0) << std::endl; @@ -751,22 +681,18 @@ void test_adaptive_BDF_v2() { vec_type y0("initial conditions", mySys.neqs), y_new("solution", mySys.neqs); Kokkos::deep_copy(y0, 0.5); - mat_type temp("buffer1", mySys.neqs, 23 + 2 * mySys.neqs + 4), - temp2("buffer2", 6, 7); + mat_type temp("buffer1", mySys.neqs, 23 + 2 * mySys.neqs + 4), temp2("buffer2", 6, 7); { scalar_type dt = KAT::zero(); vec_type f0("initial value f", mySys.neqs); mySys.evaluate_function(t_start, KAT::zero(), y0, f0); - KokkosODE::Impl::initial_step_size(mySys, 1, t_start, 1e-6, 1e-3, y0, f0, - temp, dt); + KokkosODE::Impl::initial_step_size(mySys, 1, t_start, 1e-6, 1e-3, y0, f0, temp, dt); std::cout << "Initial Step Size: dt=" << dt << std::endl; } - KokkosODE::Experimental::BDFSolve(mySys, t_start, t_end, 0.0117188, - (t_end 
- t_start) / 10, y0, y_new, temp, - temp2); + KokkosODE::Experimental::BDFSolve(mySys, t_start, t_end, 0.0117188, (t_end - t_start) / 10, y0, y_new, temp, temp2); } template @@ -789,42 +715,30 @@ void test_BDF_adaptive_stiff() { y0_h(2) = KAT::zero(); Kokkos::deep_copy(y0, y0_h); - mat_type temp("buffer1", mySys.neqs, 23 + 2 * mySys.neqs + 4), - temp2("buffer2", 6, 7); + mat_type temp("buffer1", mySys.neqs, 23 + 2 * mySys.neqs + 4), temp2("buffer2", 6, 7); Kokkos::RangePolicy policy(0, 1); - BDF_Solve_wrapper bdf_wrapper(mySys, t_start, t_end, dt, - (t_end - t_start) / 10, y0, y_new, temp, temp2); + BDF_Solve_wrapper bdf_wrapper(mySys, t_start, t_end, dt, (t_end - t_start) / 10, y0, y_new, temp, temp2); Kokkos::parallel_for(policy, bdf_wrapper); auto y_new_h = Kokkos::create_mirror_view(y_new); Kokkos::deep_copy(y_new_h, y_new); - std::cout << "Stiff Chemistry solution at t=500: {" << y_new_h(0) << ", " - << y_new_h(1) << ", " << y_new_h(2) << "}" << std::endl; + std::cout << "Stiff Chemistry solution at t=500: {" << y_new_h(0) << ", " << y_new_h(1) << ", " << y_new_h(2) << "}" + << std::endl; } } // namespace Test -TEST_F(TestCategory, BDF_Logistic_serial) { - ::Test::test_BDF_Logistic(); -} -TEST_F(TestCategory, BDF_LotkaVolterra_serial) { - ::Test::test_BDF_LotkaVolterra(); -} -TEST_F(TestCategory, BDF_StiffChemistry_serial) { - ::Test::test_BDF_StiffChemistry(); -} +TEST_F(TestCategory, BDF_Logistic_serial) { ::Test::test_BDF_Logistic(); } +TEST_F(TestCategory, BDF_LotkaVolterra_serial) { ::Test::test_BDF_LotkaVolterra(); } +TEST_F(TestCategory, BDF_StiffChemistry_serial) { ::Test::test_BDF_StiffChemistry(); } // TEST_F(TestCategory, BDF_parallel_serial) { // ::Test::test_BDF_parallel(); // } -TEST_F(TestCategory, BDF_Nordsieck) { - ::Test::test_Nordsieck(); -} +TEST_F(TestCategory, BDF_Nordsieck) { ::Test::test_Nordsieck(); } // TEST_F(TestCategory, BDF_adaptive) { // ::Test::test_adaptive_BDF(); // ::Test::test_adaptive_BDF_v2(); // } -TEST_F(TestCategory, 
BDF_StiffChemistry_adaptive) { - ::Test::test_BDF_adaptive_stiff(); -} +TEST_F(TestCategory, BDF_StiffChemistry_adaptive) { ::Test::test_BDF_adaptive_stiff(); } diff --git a/ode/unit_test/Test_ODE_Newton.hpp b/ode/unit_test/Test_ODE_Newton.hpp index 45dd4adb6a..c37142ee8f 100644 --- a/ode/unit_test/Test_ODE_Newton.hpp +++ b/ode/unit_test/Test_ODE_Newton.hpp @@ -21,8 +21,7 @@ namespace Test { -template +template struct NewtonSolve_wrapper { using newton_params = KokkosODE::Experimental::Newton_params; @@ -35,11 +34,9 @@ struct NewtonSolve_wrapper { scale_type scale; - NewtonSolve_wrapper(const system_type& my_nls_, const newton_params& params_, - const vec_type& x_, const vec_type& rhs_, - const vec_type& update_, const mat_type& J_, - const mat_type& tmp_, const status_view& status_, - const scale_type& scale_) + NewtonSolve_wrapper(const system_type& my_nls_, const newton_params& params_, const vec_type& x_, + const vec_type& rhs_, const vec_type& update_, const mat_type& J_, const mat_type& tmp_, + const status_view& status_, const scale_type& scale_) : my_nls(my_nls_), params(params_), x(x_), @@ -54,38 +51,27 @@ struct NewtonSolve_wrapper { void operator()(const int idx) const { // Take subviews to create the local problem auto local_x = Kokkos::subview( - x, Kokkos::pair(static_cast(my_nls.neqs * idx), - static_cast(my_nls.neqs * (idx + 1)))); + x, Kokkos::pair(static_cast(my_nls.neqs * idx), static_cast(my_nls.neqs * (idx + 1)))); auto local_rhs = Kokkos::subview( - rhs, Kokkos::pair(static_cast(my_nls.neqs * idx), - static_cast(my_nls.neqs * (idx + 1)))); + rhs, Kokkos::pair(static_cast(my_nls.neqs * idx), static_cast(my_nls.neqs * (idx + 1)))); auto local_update = Kokkos::subview( - update, - Kokkos::pair(static_cast(my_nls.neqs * idx), - static_cast(my_nls.neqs * (idx + 1)))); + update, Kokkos::pair(static_cast(my_nls.neqs * idx), static_cast(my_nls.neqs * (idx + 1)))); auto local_J = Kokkos::subview( - J, - Kokkos::pair(static_cast(my_nls.neqs * idx), - 
static_cast(my_nls.neqs * (idx + 1))), + J, Kokkos::pair(static_cast(my_nls.neqs * idx), static_cast(my_nls.neqs * (idx + 1))), Kokkos::ALL()); auto local_tmp = Kokkos::subview( - tmp, - Kokkos::pair(static_cast(my_nls.neqs * idx), - static_cast(my_nls.neqs * (idx + 1))), + tmp, Kokkos::pair(static_cast(my_nls.neqs * idx), static_cast(my_nls.neqs * (idx + 1))), Kokkos::ALL()); // Run Newton nonlinear solver - status(idx) = KokkosODE::Experimental::Newton::Solve( - my_nls, params, local_J, local_tmp, local_x, local_rhs, local_update, - scale); + status(idx) = KokkosODE::Experimental::Newton::Solve(my_nls, params, local_J, local_tmp, local_x, local_rhs, + local_update, scale); } }; template -void run_newton_test(const system_type& mySys, - KokkosODE::Experimental::Newton_params& params, - const scalar_type* const initial_val, - const scalar_type* const solution) { +void run_newton_test(const system_type& mySys, KokkosODE::Experimental::Newton_params& params, + const scalar_type* const initial_val, const scalar_type* const solution) { using execution_space = typename Device::execution_space; using newton_solver_status = KokkosODE::Experimental::newton_solver_status; using vec_type = typename Kokkos::View; @@ -96,14 +82,12 @@ void run_newton_test(const system_type& mySys, vec_type scale("scaling factors", mySys.neqs); Kokkos::deep_copy(scale, 1); - vec_type x("solution vector", mySys.neqs), - rhs("right hand side vector", mySys.neqs); + vec_type x("solution vector", mySys.neqs), rhs("right hand side vector", mySys.neqs); auto x_h = Kokkos::create_mirror_view(x); auto r_h = Kokkos::create_mirror_view(rhs); vec_type update("update", mySys.neqs); - mat_type J("jacobian", mySys.neqs, mySys.neqs), - tmp("temp mem", mySys.neqs, mySys.neqs + 4); + mat_type J("jacobian", mySys.neqs, mySys.neqs), tmp("temp mem", mySys.neqs, mySys.neqs + 4); // Initial values for (int eqIdx = 0; eqIdx < mySys.neqs; ++eqIdx) { @@ -112,8 +96,7 @@ void run_newton_test(const system_type& mySys, 
Kokkos::deep_copy(x, x_h); Kokkos::RangePolicy my_policy(0, 1); - NewtonSolve_wrapper solve_wrapper(mySys, params, x, rhs, update, J, tmp, - status, scale); + NewtonSolve_wrapper solve_wrapper(mySys, params, x, rhs, update, J, tmp, status, scale); Kokkos::parallel_for(my_policy, solve_wrapper); @@ -131,9 +114,7 @@ void run_newton_test(const system_type& mySys, } std::cout << " ), " << KokkosBlas::serial_nrm2(rhs) << ", ("; for (int eqIdx = 0; eqIdx < mySys.neqs; ++eqIdx) { - std::cout << " " - << Kokkos::abs(x_h(eqIdx) - solution[eqIdx]) / - Kokkos::abs(solution[eqIdx]); + std::cout << " " << Kokkos::abs(x_h(eqIdx) - solution[eqIdx]) / Kokkos::abs(solution[eqIdx]); } std::cout << " )]" << std::endl; #else @@ -154,13 +135,9 @@ struct QuadraticEquation { QuadraticEquation() {} - KOKKOS_FUNCTION void residual(const vec_type& y, const vec_type& f) const { - f(0) = y(0) * y(0) - y(0) - 2; - } + KOKKOS_FUNCTION void residual(const vec_type& y, const vec_type& f) const { f(0) = y(0) * y(0) - y(0) - 2; } - KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { - jac(0, 0) = 2 * y(0) - 1; - } + KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { jac(0, 0) = 2 * y(0) - 1; } }; // Trigonometric equation @@ -176,13 +153,9 @@ struct TrigonometricEquation { TrigonometricEquation() {} - KOKKOS_FUNCTION void residual(const vec_type& y, const vec_type& f) const { - f(0) = Kokkos::cos(y(0)) - y(0); - } + KOKKOS_FUNCTION void residual(const vec_type& y, const vec_type& f) const { f(0) = Kokkos::cos(y(0)) - y(0); } - KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { - jac(0, 0) = -Kokkos::sin(y(0)) - 1; - } + KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { jac(0, 0) = -Kokkos::sin(y(0)) - 1; } }; // Logarithmic equation @@ -202,9 +175,7 @@ struct LogarithmicEquation { f(0) = 7 * y(0) - Kokkos::log(7 * y(0)) - 1; } - KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& 
jac) const { - jac(0, 0) = 7 - 1 / y(0); - } + KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { jac(0, 0) = 7 - 1 / y(0); } }; template @@ -238,9 +209,8 @@ void test_newton_status() { #ifdef HAVE_KOKKOSKERNELS_DEBUG scalar_type solution[3] = {2.0, -1.0, 0.0}; #endif - newton_solver_status newton_status[3] = { - newton_solver_status::NLS_SUCCESS, newton_solver_status::NLS_DIVERGENCE, - newton_solver_status::LIN_SOLVE_FAIL}; + newton_solver_status newton_status[3] = {newton_solver_status::NLS_SUCCESS, newton_solver_status::NLS_DIVERGENCE, + newton_solver_status::LIN_SOLVE_FAIL}; vec_type x("solution vector", 1), rhs("right hand side vector", 1); auto x_h = Kokkos::create_mirror_view(x); auto r_h = Kokkos::create_mirror_view(rhs); @@ -253,8 +223,7 @@ void test_newton_status() { Kokkos::deep_copy(x, initial_value[idx]); Kokkos::RangePolicy my_policy(0, 1); - NewtonSolve_wrapper solve_wrapper(my_system, params, x, rhs, update, J, tmp, - status, scale); + NewtonSolve_wrapper solve_wrapper(my_system, params, x, rhs, update, J, tmp, status, scale); Kokkos::parallel_for(my_policy, solve_wrapper); Kokkos::deep_copy(status_h, status); @@ -263,10 +232,8 @@ void test_newton_status() { #ifdef HAVE_KOKKOSKERNELS_DEBUG Kokkos::deep_copy(x_h, x); Kokkos::deep_copy(r_h, rhs); - printf("Non-linear problem solution and residual with initial value %f:\n", - initial_value[idx]); - printf(" [%f, %g, %g]\n", x_h(0), r_h(0), - Kokkos::abs(x_h(0) - solution[idx]) / Kokkos::abs(solution[idx])); + printf("Non-linear problem solution and residual with initial value %f:\n", initial_value[idx]); + printf(" [%f, %g, %g]\n", x_h(0), r_h(0), Kokkos::abs(x_h(0) - solution[idx]) / Kokkos::abs(solution[idx])); #endif } } @@ -296,8 +263,7 @@ void test_simple_problems() { system_type mySys{}; scalar_type initial_value[2] = {1.0, -0.5}, solution[2] = {2.0, -1.0}; for (int idx = 0; idx < 2; ++idx) { - run_newton_test( - mySys, params, &(initial_value[idx]), &(solution[idx])); 
+ run_newton_test(mySys, params, &(initial_value[idx]), &(solution[idx])); } #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Finished Quadratic Equation problem" << std::endl; @@ -312,8 +278,7 @@ void test_simple_problems() { using system_type = TrigonometricEquation; system_type mySys{}; scalar_type initial_value[1] = {0.1}, solution[1] = {0.739085}; - run_newton_test(mySys, params, - initial_value, solution); + run_newton_test(mySys, params, initial_value, solution); #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Finished Trigonometric Equation problem" << std::endl; #endif @@ -327,10 +292,8 @@ void test_simple_problems() { using system_type = LogarithmicEquation; system_type mySys{}; scalar_type initial_value[1] = {static_cast(0.5)}, - solution[1] = {static_cast(1.0) / - static_cast(7.0)}; - run_newton_test(mySys, params, - initial_value, solution); + solution[1] = {static_cast(1.0) / static_cast(7.0)}; + run_newton_test(mySys, params, initial_value, solution); #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Finished Logarithmic Equation problem" << std::endl; #endif @@ -431,8 +394,7 @@ void test_simple_systems() { system_type mySys{}; scalar_type initial_values[2] = {1.5, 1.5}; scalar_type solution[2] = {10.75 / 6, 0.8887803753}; - run_newton_test(mySys, params, - initial_values, solution); + run_newton_test(mySys, params, initial_values, solution); #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Finished Circles Intersetcion problem" << std::endl; #endif @@ -441,8 +403,7 @@ void test_simple_systems() { { // Second problem: circle / hyperbola intersection #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "\nStarting Circle/Hyperbola Intersetcion problem" - << std::endl; + std::cout << "\nStarting Circle/Hyperbola Intersetcion problem" << std::endl; #endif using system_type = CircleHyperbolaIntersection; system_type mySys{}; @@ -450,12 +411,9 @@ void test_simple_systems() { scalar_type init_vals[2] = {0.0, 1.0}; scalar_type solutions[2] = { Kokkos::ArithTraits::one() / - 
Kokkos::sqrt(static_cast( - 4 + Kokkos::sqrt(static_cast(12.0)) / 2)), - Kokkos::sqrt(static_cast( - (4 + Kokkos::sqrt(static_cast(12.0))) / 2))}; - run_newton_test(mySys, params, init_vals, - solutions); + Kokkos::sqrt(static_cast(4 + Kokkos::sqrt(static_cast(12.0)) / 2)), + Kokkos::sqrt(static_cast((4 + Kokkos::sqrt(static_cast(12.0))) / 2))}; + run_newton_test(mySys, params, init_vals, solutions); #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Finished Circle/Hyperbola Intersetcion problem" << std::endl; #endif @@ -502,8 +460,7 @@ void test_newton_on_device() { mat_type J("jacobian", mySys.neqs * num_systems, mySys.neqs); mat_type tmp("temp mem", mySys.neqs * num_systems, mySys.neqs + 4); - Kokkos::View status("solver status", - num_systems); + Kokkos::View status("solver status", num_systems); auto x_h = Kokkos::create_mirror_view(x); auto r_h = Kokkos::create_mirror_view(rhs); @@ -517,8 +474,7 @@ void test_newton_on_device() { Kokkos::deep_copy(x, x_h); Kokkos::RangePolicy my_policy(0, num_systems); - NewtonSolve_wrapper solve_wrapper(mySys, params, x, rhs, update, J, tmp, - status, scale); + NewtonSolve_wrapper solve_wrapper(mySys, params, x, rhs, update, J, tmp, status, scale); Kokkos::parallel_for(my_policy, solve_wrapper); Kokkos::fence(); @@ -536,30 +492,14 @@ void test_newton_on_device() { // No ETI is performed for these device routines // Just pick scalar types at will... 
-TEST_F(TestCategory, Newton_status_float) { - ::Test::test_newton_status(); -} -TEST_F(TestCategory, Newton_status_double) { - ::Test::test_newton_status(); -} +TEST_F(TestCategory, Newton_status_float) { ::Test::test_newton_status(); } +TEST_F(TestCategory, Newton_status_double) { ::Test::test_newton_status(); } -TEST_F(TestCategory, Newton_simple_float) { - ::Test::test_simple_problems(); -} -TEST_F(TestCategory, Newton_simple_double) { - ::Test::test_simple_problems(); -} +TEST_F(TestCategory, Newton_simple_float) { ::Test::test_simple_problems(); } +TEST_F(TestCategory, Newton_simple_double) { ::Test::test_simple_problems(); } -TEST_F(TestCategory, Newton_system_float) { - ::Test::test_simple_systems(); -} -TEST_F(TestCategory, Newton_system_double) { - ::Test::test_simple_systems(); -} +TEST_F(TestCategory, Newton_system_float) { ::Test::test_simple_systems(); } +TEST_F(TestCategory, Newton_system_double) { ::Test::test_simple_systems(); } -TEST_F(TestCategory, Newton_parallel_float) { - ::Test::test_newton_on_device(); -} -TEST_F(TestCategory, Newton_parallel_double) { - ::Test::test_newton_on_device(); -} +TEST_F(TestCategory, Newton_parallel_float) { ::Test::test_newton_on_device(); } +TEST_F(TestCategory, Newton_parallel_double) { ::Test::test_newton_on_device(); } diff --git a/ode/unit_test/Test_ODE_RK.hpp b/ode/unit_test/Test_ODE_RK.hpp index c7d1a84865..90bec0e184 100644 --- a/ode/unit_test/Test_ODE_RK.hpp +++ b/ode/unit_test/Test_ODE_RK.hpp @@ -37,25 +37,17 @@ struct duho { const double a11 = 0, a12 = 1, a21, a22; duho(const double m_, const double c_, const double k_) - : m(m_), - c(c_), - k(k_), - d(k_ / m_ - (c_ * c_) / (4 * m_ * m_)), - a21(-k / m), - a22(-c / m){}; + : m(m_), c(c_), k(k_), d(k_ / m_ - (c_ * c_) / (4 * m_ * m_)), a21(-k / m), a22(-c / m){}; template - KOKKOS_FUNCTION void evaluate_function(const double /*t*/, - const double /*dt*/, - const vec_type1& y, + KOKKOS_FUNCTION void evaluate_function(const double /*t*/, const double 
/*dt*/, const vec_type1& y, const vec_type2& f) const { f(0) = a11 * y(0) + a12 * y(1); f(1) = a21 * y(0) + a22 * y(1); } template - KOKKOS_FUNCTION void solution(const double t, const vec_type& y0, - const vec_type& y) const { + KOKKOS_FUNCTION void solution(const double t, const vec_type& y0, const vec_type& y) const { using KAT = Kokkos::ArithTraits; const double gamma = c / (2 * m); @@ -64,8 +56,7 @@ struct duho { const double A = y0(0) / KAT::cos(phi); y(0) = A * KAT::cos(omega * t - phi) * KAT::exp(-t * gamma); - y(1) = -y(0) * gamma - - omega * A * KAT::sin(omega * t - phi) * KAT::exp(-t * gamma); + y(1) = -y(0) * gamma - omega * A * KAT::sin(omega * t - phi) * KAT::exp(-t * gamma); } }; // duho @@ -76,16 +67,14 @@ struct solution_wrapper { scalar_type t; vec_type y_old, y_ref; - solution_wrapper(const ode_type& ode_, const scalar_type t_, - const vec_type& y_old_, const vec_type& y_ref_) + solution_wrapper(const ode_type& ode_, const scalar_type t_, const vec_type& y_old_, const vec_type& y_ref_) : ode(ode_), t(t_), y_old(y_old_), y_ref(y_ref_){}; KOKKOS_FUNCTION void operator()(const int /*idx*/) const { ode.solution(t, y_old, y_ref); } }; -template +template struct RKSolve_wrapper { using ode_params = KokkosODE::Experimental::ODE_params; @@ -96,10 +85,9 @@ struct RKSolve_wrapper { vec_type y_old, y_new, tmp; mv_type kstack; - RKSolve_wrapper(const ode_type& my_ode_, const ode_params& params_, - const scalar_type tstart_, const scalar_type tend_, - const vec_type& y_old_, const vec_type& y_new_, - const vec_type& tmp_, const mv_type& kstack_) + RKSolve_wrapper(const ode_type& my_ode_, const ode_params& params_, const scalar_type tstart_, + const scalar_type tend_, const vec_type& y_old_, const vec_type& y_new_, const vec_type& tmp_, + const mv_type& kstack_) : my_ode(my_ode_), params(params_), tstart(tstart_), @@ -111,20 +99,15 @@ struct RKSolve_wrapper { KOKKOS_FUNCTION void operator()(const int /*idx*/) const { - 
KokkosODE::Experimental::RungeKutta::Solve( - my_ode, params, tstart, tend, y_old, y_new, tmp, kstack); + KokkosODE::Experimental::RungeKutta::Solve(my_ode, params, tstart, tend, y_old, y_new, tmp, kstack); } }; -template -void test_method(const std::string label, ode_type& my_ode, - const scalar_type& tstart, const scalar_type& tend, - const int num_steps, vec_type& y_old, vec_type& y_new, - const int order, const int num_stages, +template +void test_method(const std::string label, ode_type& my_ode, const scalar_type& tstart, const scalar_type& tend, + const int num_steps, vec_type& y_old, vec_type& y_new, const int order, const int num_stages, const Kokkos::View& ks, - const Kokkos::View& sol, - typename vec_type::HostMirror y_ref_h) { + const Kokkos::View& sol, typename vec_type::HostMirror y_ref_h) { using execution_space = typename vec_type::execution_space; using solver_type = KokkosODE::Experimental::RungeKutta; @@ -133,8 +116,8 @@ void test_method(const std::string label, ode_type& my_ode, mv_type kstack("k stack", solver_type::num_stages(), my_ode.neqs); Kokkos::RangePolicy my_policy(0, 1); - RKSolve_wrapper - solve_wrapper(my_ode, params, tstart, tend, y_old, y_new, tmp, kstack); + RKSolve_wrapper solve_wrapper(my_ode, params, tstart, tend, y_old, + y_new, tmp, kstack); Kokkos::parallel_for(my_policy, solve_wrapper); auto y_new_h = Kokkos::create_mirror_view(y_new); @@ -155,19 +138,16 @@ void test_method(const std::string label, ode_type& my_ode, EXPECT_NEAR_KK(ks(0, stageIdx), kstack_h(stageIdx, 0), 1e-8); EXPECT_NEAR_KK(ks(1, stageIdx), kstack_h(stageIdx, 1), 1e-8); #if defined(HAVE_KOKKOSKERNELS_DEBUG) - std::cout << " k" << stageIdx << "={" << kstack_h(stageIdx, 0) << ", " - << kstack_h(stageIdx, 1) << "}" << std::endl; + std::cout << " k" << stageIdx << "={" << kstack_h(stageIdx, 0) << ", " << kstack_h(stageIdx, 1) << "}" + << std::endl; #endif } EXPECT_NEAR_KK(sol(0), y_new_h(0), 1e-8); EXPECT_NEAR_KK(sol(1), y_new_h(1), 1e-8); #if 
defined(HAVE_KOKKOSKERNELS_DEBUG) std::cout << " y={" << y_new_h(0) << ", " << y_new_h(1) << "}" << std::endl; - std::cout << " error={" - << Kokkos::abs(y_new_h(0) - y_ref_h(0)) / Kokkos::abs(y_ref_h(0)) - << ", " - << Kokkos::abs(y_new_h(1) - y_ref_h(1)) / Kokkos::abs(y_ref_h(1)) - << "}" << std::endl; + std::cout << " error={" << Kokkos::abs(y_new_h(0) - y_ref_h(0)) / Kokkos::abs(y_ref_h(0)) << ", " + << Kokkos::abs(y_new_h(1) - y_ref_h(1)) / Kokkos::abs(y_ref_h(1)) << "}" << std::endl; #else (void)y_ref_h; #endif @@ -216,8 +196,7 @@ void test_RK() { Kokkos::deep_copy(y_ref_h, y_ref); #if defined(HAVE_KOKKOSKERNELS_DEBUG) std::cout << "\nAnalytical solution" << std::endl; - std::cout << " y={" << y_ref_h(0) << ", " << y_ref_h(1) << "}" - << std::endl; + std::cout << " y={" << y_ref_h(0) << ", " << y_ref_h(1) << "}" << std::endl; #endif } @@ -230,9 +209,8 @@ void test_RK() { Kokkos::View ks(ks_raw, 2, 1); double sol_raw[2] = {1, -0.04}; Kokkos::View sol(sol_raw, 2); - test_method( - "Euler-Forward", my_oscillator, tstart, tend, 1, y_old, y_new, 1, 1, ks, - sol, y_ref_h); + test_method("Euler-Forward", my_oscillator, tstart, tend, 1, y_old, + y_new, 1, 1, ks, sol, y_ref_h); } { @@ -241,9 +219,8 @@ void test_RK() { Kokkos::View ks(ks_raw, 2, 2); double sol_raw[2] = {0.9998, -0.0398}; Kokkos::View sol(sol_raw, 2); - test_method( - "Euler-Heun", my_oscillator, tstart, tend, 1, y_old, y_new, 2, 2, ks, - sol, y_ref_h); + test_method("Euler-Heun", my_oscillator, tstart, tend, 1, y_old, + y_new, 2, 2, ks, sol, y_ref_h); } { @@ -252,73 +229,59 @@ void test_RK() { Kokkos::View ks(ks_raw, 2, 3); double sol_raw[2] = {0.9998, -0.03979999}; Kokkos::View sol(sol_raw, 2); - test_method( - "RKF-12", my_oscillator, tstart, tend, 1, y_old, y_new, 2, 3, ks, sol, - y_ref_h); + test_method("RKF-12", my_oscillator, tstart, tend, 1, y_old, y_new, + 2, 3, ks, sol, y_ref_h); } { Kokkos::deep_copy(y_old, y_old_h); - double ks_raw[8] = {0, -0.02, -0.02985, -0.039798, - -4, -3.98, -3.96955, 
-3.95940467}; + double ks_raw[8] = {0, -0.02, -0.02985, -0.039798, -4, -3.98, -3.96955, -3.95940467}; Kokkos::View ks(ks_raw, 2, 4); double sol_raw[2] = {0.99980067, -0.039798}; Kokkos::View sol(sol_raw, 2); - test_method( - "RKBS", my_oscillator, tstart, tend, 1, y_old, y_new, 3, 4, ks, sol, - y_ref_h); + test_method("RKBS", my_oscillator, tstart, tend, 1, y_old, y_new, 3, + 4, ks, sol, y_ref_h); } { Kokkos::deep_copy(y_old, y_old_h); - double ks_raw[12] = {0, -0.01, -0.01497188, -0.03674986, - -0.03979499, -0.0199505, -4, -3.99, - -3.98491562, -3.96257222, -3.95941166, -3.97984883}; + double ks_raw[12] = {0, -0.01, -0.01497188, -0.03674986, -0.03979499, -0.0199505, + -4, -3.99, -3.98491562, -3.96257222, -3.95941166, -3.97984883}; Kokkos::View ks(ks_raw, 2, 6); double sol_raw[2] = {0.99980067, -0.03979801}; Kokkos::View sol(sol_raw, 2); - test_method( - "RKF-45", my_oscillator, tstart, tend, 1, y_old, y_new, 5, 6, ks, sol, - y_ref_h); + test_method("RKF-45", my_oscillator, tstart, tend, 1, y_old, y_new, + 5, 6, ks, sol, y_ref_h); } { Kokkos::deep_copy(y_old, y_old_h); - double ks_raw[12] = {0, -0.008, -0.011982, -0.02392735, - -0.03979862, -0.03484563, -4, -3.992, - -3.987946, -3.97578551, -3.95940328, -3.96454357}; + double ks_raw[12] = {0, -0.008, -0.011982, -0.02392735, -0.03979862, -0.03484563, + -4, -3.992, -3.987946, -3.97578551, -3.95940328, -3.96454357}; Kokkos::View ks(ks_raw, 2, 6); double sol_raw[2] = {0.99980067, -0.03979801}; Kokkos::View sol(sol_raw, 2); - test_method( - "Cash-Karp", my_oscillator, tstart, tend, 1, y_old, y_new, 5, 6, ks, - sol, y_ref_h); + test_method("Cash-Karp", my_oscillator, tstart, tend, 1, y_old, + y_new, 5, 6, ks, sol, y_ref_h); } { Kokkos::deep_copy(y_old, y_old_h); - double ks_raw[14] = {0, -0.008, -0.011982, -0.03187008, - -0.03539333, -0.0397954, -0.03979801, -4, - -3.992, -3.987946, -3.96762048, -3.96398013, - -3.95941068, -3.95940467}; + double ks_raw[14] = {0, -0.008, -0.011982, -0.03187008, -0.03539333, -0.0397954, 
-0.03979801, + -4, -3.992, -3.987946, -3.96762048, -3.96398013, -3.95941068, -3.95940467}; Kokkos::View ks(ks_raw, 2, 7); double sol_raw[2] = {0.99980067, -0.03979801}; Kokkos::View sol(sol_raw, 2); - test_method( - "Dormand-Prince", my_oscillator, tstart, tend, 1, y_old, y_new, 5, 7, - ks, sol, y_ref_h); + test_method("Dormand-Prince", my_oscillator, tstart, tend, 1, y_old, + y_new, 5, 7, ks, sol, y_ref_h); } } // test_RK -template -void test_rate(ode_type& my_ode, const scalar_type& tstart, - const scalar_type& tend, - Kokkos::View num_steps, - typename vec_type::HostMirror& y_old_h, - typename vec_type::HostMirror& y_ref_h, - typename vec_type::HostMirror& error) { +template +void test_rate(ode_type& my_ode, const scalar_type& tstart, const scalar_type& tend, + Kokkos::View num_steps, typename vec_type::HostMirror& y_old_h, + typename vec_type::HostMirror& y_ref_h, typename vec_type::HostMirror& error) { using execution_space = typename vec_type::execution_space; using solver_type = KokkosODE::Experimental::RungeKutta; @@ -334,8 +297,8 @@ void test_rate(ode_type& my_ode, const scalar_type& tstart, KokkosODE::Experimental::ODE_params params(num_steps(idx)); Kokkos::deep_copy(y_old, y_old_h); Kokkos::deep_copy(y_new, y_old_h); - RKSolve_wrapper - solve_wrapper(my_ode, params, tstart, tend, y_old, y_new, tmp, kstack); + RKSolve_wrapper solve_wrapper(my_ode, params, tstart, tend, + y_old, y_new, tmp, kstack); Kokkos::parallel_for(my_policy, solve_wrapper); Kokkos::deep_copy(y_new_h, y_new); @@ -343,8 +306,8 @@ void test_rate(ode_type& my_ode, const scalar_type& tstart, #if defined(HAVE_KOKKOSKERNELS_DEBUG) scalar_type dt = (tend - tstart) / num_steps(idx); - std::cout << "dt=" << dt << ", error=" << error(idx) << ", solution: {" - << y_new_h(0) << ", " << y_new_h(1) << "}" << std::endl; + std::cout << "dt=" << dt << ", error=" << error(idx) << ", solution: {" << y_new_h(0) << ", " << y_new_h(1) << "}" + << std::endl; #endif } @@ -399,67 +362,57 @@ void 
test_convergence_rate() { Kokkos::deep_copy(y_ref_h, y_ref); #if defined(HAVE_KOKKOSKERNELS_DEBUG) std::cout << "\nAnalytical solution" << std::endl; - std::cout << " y={" << y_ref_h(0) << ", " << y_ref_h(1) << "}" - << std::endl; + std::cout << " y={" << y_ref_h(0) << ", " << y_ref_h(1) << "}" << std::endl; #endif } typename vec_type::HostMirror error("error", num_steps.extent(0)); - test_rate( - my_oscillator, tstart, tend, num_steps, y_old_h, y_ref_h, error); + test_rate(my_oscillator, tstart, tend, num_steps, y_old_h, y_ref_h, + error); for (int idx = 1; idx < num_steps.extent_int(0) - 2; ++idx) { double expected_ratio = - Kokkos::pow(num_steps(idx) / num_steps(idx + 1), - KokkosODE::Impl::ButcherTableau<1, 1>::order); + Kokkos::pow(num_steps(idx) / num_steps(idx + 1), KokkosODE::Impl::ButcherTableau<1, 1>::order); double actual_ratio = error(idx + 1) / error(idx); EXPECT_NEAR_KK_REL(actual_ratio, expected_ratio, 0.15); #if defined(HAVE_KOKKOSKERNELS_DEBUG) - double rel_ratio_diff = Kokkos::abs(actual_ratio - expected_ratio) / - Kokkos::abs(expected_ratio); - std::cout << "error ratio: " << actual_ratio - << ", expected ratio: " << expected_ratio + double rel_ratio_diff = Kokkos::abs(actual_ratio - expected_ratio) / Kokkos::abs(expected_ratio); + std::cout << "error ratio: " << actual_ratio << ", expected ratio: " << expected_ratio << ", rel diff: " << rel_ratio_diff << std::endl; #endif } Kokkos::deep_copy(error, 0); - test_rate( - my_oscillator, tstart, tend, num_steps, y_old_h, y_ref_h, error); + test_rate(my_oscillator, tstart, tend, num_steps, y_old_h, y_ref_h, + error); for (int idx = 1; idx < num_steps.extent_int(0) - 2; ++idx) { double expected_ratio = - Kokkos::pow(num_steps(idx) / num_steps(idx + 1), - KokkosODE::Impl::ButcherTableau<2, 3>::order); + Kokkos::pow(num_steps(idx) / num_steps(idx + 1), KokkosODE::Impl::ButcherTableau<2, 3>::order); double actual_ratio = error(idx + 1) / error(idx); EXPECT_NEAR_KK_REL(actual_ratio, expected_ratio, 0.05); 
#if defined(HAVE_KOKKOSKERNELS_DEBUG) - double rel_ratio_diff = Kokkos::abs(actual_ratio - expected_ratio) / - Kokkos::abs(expected_ratio); - std::cout << "error ratio: " << actual_ratio - << ", expected ratio: " << expected_ratio + double rel_ratio_diff = Kokkos::abs(actual_ratio - expected_ratio) / Kokkos::abs(expected_ratio); + std::cout << "error ratio: " << actual_ratio << ", expected ratio: " << expected_ratio << ", rel diff: " << rel_ratio_diff << std::endl; #endif } Kokkos::deep_copy(error, 0); - test_rate( - my_oscillator, tstart, tend, num_steps, y_old_h, y_ref_h, error); + test_rate(my_oscillator, tstart, tend, num_steps, y_old_h, y_ref_h, + error); for (int idx = 1; idx < num_steps.extent_int(0) - 2; ++idx) { double expected_ratio = - Kokkos::pow(num_steps(idx) / num_steps(idx + 1), - KokkosODE::Impl::ButcherTableau<4, 5>::order); + Kokkos::pow(num_steps(idx) / num_steps(idx + 1), KokkosODE::Impl::ButcherTableau<4, 5>::order); double actual_ratio = error(idx + 1) / error(idx); EXPECT_NEAR_KK_REL(actual_ratio, expected_ratio, 0.05); #if defined(HAVE_KOKKOSKERNELS_DEBUG) - double rel_ratio_diff = Kokkos::abs(actual_ratio - expected_ratio) / - Kokkos::abs(expected_ratio); - std::cout << "error ratio: " << actual_ratio - << ", expected ratio: " << expected_ratio + double rel_ratio_diff = Kokkos::abs(actual_ratio - expected_ratio) / Kokkos::abs(expected_ratio); + std::cout << "error ratio: " << actual_ratio << ", expected ratio: " << expected_ratio << ", rel diff: " << rel_ratio_diff << std::endl; #endif } @@ -507,24 +460,19 @@ void test_adaptivity() { Kokkos::deep_copy(y_ref_h, y_ref); #if defined(HAVE_KOKKOSKERNELS_DEBUG) std::cout << "\nAnalytical solution" << std::endl; - std::cout << " y={" << y_ref_h(0) << ", " << y_ref_h(1) << "}" - << std::endl; + std::cout << " y={" << y_ref_h(0) << ", " << y_ref_h(1) << "}" << std::endl; #endif } vec_type tmp("tmp vector", neqs); - mv_type kstack( - "k stack", - KokkosODE::Experimental::RungeKutta::num_stages(), 
neqs); + mv_type kstack("k stack", KokkosODE::Experimental::RungeKutta::num_stages(), neqs); Kokkos::RangePolicy my_policy(0, 1); - KokkosODE::Experimental::ODE_params params(numSteps, maxSteps, absTol, relTol, - minStepSize); + KokkosODE::Experimental::ODE_params params(numSteps, maxSteps, absTol, relTol, minStepSize); Kokkos::deep_copy(y_old, y_old_h); Kokkos::deep_copy(y_new, y_old_h); - RKSolve_wrapper - solve_wrapper(my_oscillator, params, tstart, tend, y_old, y_new, tmp, - kstack); + RKSolve_wrapper solve_wrapper(my_oscillator, params, tstart, tend, + y_old, y_new, tmp, kstack); Kokkos::parallel_for(my_policy, solve_wrapper); auto y_new_h = Kokkos::create_mirror(y_new); @@ -547,8 +495,7 @@ void test_adaptivity() { for (int idx = 0; idx < y_new_h.extent_int(0); ++idx) { #if defined(HAVE_KOKKOSKERNELS_DEBUG) - error = - Kokkos::abs(y_new_h(idx) - y_ref_h(idx)) / Kokkos::abs(y_ref_h(idx)); + error = Kokkos::abs(y_new_h(idx) - y_ref_h(idx)) / Kokkos::abs(y_ref_h(idx)); std::cout << error << " "; #endif EXPECT_NEAR_KK_REL(y_new_h(idx), y_ref_h(idx), 1e-7); diff --git a/ode/unit_test/Test_ODE_RK_chem.hpp b/ode/unit_test/Test_ODE_RK_chem.hpp index 763f38a013..690e271c84 100644 --- a/ode/unit_test/Test_ODE_RK_chem.hpp +++ b/ode/unit_test/Test_ODE_RK_chem.hpp @@ -33,13 +33,11 @@ struct chem_model_1 { const double tstart, tend, T0, T1; - chem_model_1(const double tstart_ = 0, const double tend_ = 100, - const double T0_ = 300, const double T1_ = 800) + chem_model_1(const double tstart_ = 0, const double tend_ = 100, const double T0_ = 300, const double T1_ = 800) : tstart(tstart_), tend(tend_), T0(T0_), T1(T1_){}; template - KOKKOS_FUNCTION void evaluate_function(const double t, const double /*dt*/, - const vec_type1& y, + KOKKOS_FUNCTION void evaluate_function(const double t, const double /*dt*/, const vec_type1& y, const vec_type2& f) const { // First compute the temperature // using linear ramp from T0 to T1 @@ -61,13 +59,11 @@ struct chem_model_2 { const double 
tstart, tend, T0, T1; - chem_model_2(const double tstart_ = 0, const double tend_ = 1200, - const double T0_ = 300, const double T1_ = 1000) + chem_model_2(const double tstart_ = 0, const double tend_ = 1200, const double T0_ = 300, const double T1_ = 1000) : tstart(tstart_), tend(tend_), T0(T0_), T1(T1_){}; template - KOKKOS_FUNCTION void evaluate_function(const double t, const double /*dt*/, - const vec_type1& y, + KOKKOS_FUNCTION void evaluate_function(const double t, const double /*dt*/, const vec_type1& y, const vec_type2& f) const { // First compute the temperature // using linear ramp from T0 to T1 @@ -116,9 +112,8 @@ void test_chem() { Kokkos::deep_copy(y_new, y_old_h); Kokkos::RangePolicy my_policy(0, 1); - RKSolve_wrapper - solve_wrapper(chem_model, params, chem_model.tstart, chem_model.tend, - y_old, y_new, tmp, kstack); + RKSolve_wrapper solve_wrapper( + chem_model, params, chem_model.tstart, chem_model.tend, y_old, y_new, tmp, kstack); Kokkos::parallel_for(my_policy, solve_wrapper); auto y_new_h = Kokkos::create_mirror(y_new); @@ -126,15 +121,11 @@ void test_chem() { #if defined(HAVE_KOKKOSKERNELS_DEBUG) const double dt = (chem_model.tend - chem_model.tstart) / params.num_steps; std::cout << "\nChem model 1" << std::endl; - std::cout << " t0=" << chem_model.tstart << ", tn=" << chem_model.tend - << std::endl; - std::cout << " T0=" << chem_model.T0 << ", Tn=" << chem_model.T1 - << std::endl; + std::cout << " t0=" << chem_model.tstart << ", tn=" << chem_model.tend << std::endl; + std::cout << " T0=" << chem_model.T0 << ", Tn=" << chem_model.T1 << std::endl; std::cout << " dt=" << dt << std::endl; - std::cout << " y(t0)={" << y_old_h(0) << ", " << y_old_h(1) << "}" - << std::endl; - std::cout << " y(tn)={" << y_new_h(0) << ", " << y_new_h(1) << "}" - << std::endl; + std::cout << " y(t0)={" << y_old_h(0) << ", " << y_old_h(1) << "}" << std::endl; + std::cout << " y(tn)={" << y_new_h(0) << ", " << y_new_h(1) << "}" << std::endl; #endif } @@ -162,9 +153,8 @@ 
void test_chem() { Kokkos::deep_copy(y_new, y_old_h); Kokkos::RangePolicy my_policy(0, 1); - RKSolve_wrapper - solve_wrapper(chem_model, params, chem_model.tstart, chem_model.tend, - y_old, y_new, tmp, kstack); + RKSolve_wrapper solve_wrapper( + chem_model, params, chem_model.tstart, chem_model.tend, y_old, y_new, tmp, kstack); Kokkos::parallel_for(my_policy, solve_wrapper); auto y_new_h = Kokkos::create_mirror(y_new); @@ -172,17 +162,13 @@ void test_chem() { #if defined(HAVE_KOKKOSKERNELS_DEBUG) const double dt = (chem_model.tend - chem_model.tstart) / params.num_steps; std::cout << "\nChem model 2" << std::endl; - std::cout << " t0=" << chem_model.tstart << ", tn=" << chem_model.tend - << std::endl; - std::cout << " T0=" << chem_model.T0 << ", Tn=" << chem_model.T1 - << std::endl; + std::cout << " t0=" << chem_model.tstart << ", tn=" << chem_model.tend << std::endl; + std::cout << " T0=" << chem_model.T0 << ", Tn=" << chem_model.T1 << std::endl; std::cout << " dt=" << dt << std::endl; - std::cout << " y(t0)={" << y_old_h(0) << ", " << y_old_h(1) << ", " - << y_old_h(2) << ", " << y_old_h(3) << ", " << y_old_h(4) << ", " - << y_old_h(5) << ", " << y_old_h(6) << "}" << std::endl; - std::cout << " y(tn)={" << y_new_h(0) << ", " << y_new_h(1) << ", " - << y_new_h(2) << ", " << y_new_h(3) << ", " << y_new_h(4) << ", " - << y_new_h(5) << ", " << y_new_h(6) << "}" << std::endl; + std::cout << " y(t0)={" << y_old_h(0) << ", " << y_old_h(1) << ", " << y_old_h(2) << ", " << y_old_h(3) << ", " + << y_old_h(4) << ", " << y_old_h(5) << ", " << y_old_h(6) << "}" << std::endl; + std::cout << " y(tn)={" << y_new_h(0) << ", " << y_new_h(1) << ", " << y_new_h(2) << ", " << y_new_h(3) << ", " + << y_new_h(4) << ", " << y_new_h(5) << ", " << y_new_h(6) << "}" << std::endl; #endif } } // test_chem diff --git a/perf_test/Benchmark_Context.hpp b/perf_test/Benchmark_Context.hpp index adfc336576..e4a2416010 100644 --- a/perf_test/Benchmark_Context.hpp +++ 
b/perf_test/Benchmark_Context.hpp @@ -61,8 +61,7 @@ inline void add_kokkos_configuration(bool verbose) { auto val = remove_unwanted_characters(line.substr(found + 1)); // Ignore line without value, for example a category name if (!val.empty()) { - benchmark::AddCustomContext( - remove_unwanted_characters(line.substr(0, found)), val); + benchmark::AddCustomContext(remove_unwanted_characters(line.substr(0, found)), val); } } } @@ -75,18 +74,13 @@ inline void add_version_info() { if (!GIT_BRANCH.empty()) { benchmark::AddCustomContext("GIT_BRANCH", std::string(GIT_BRANCH)); - benchmark::AddCustomContext("GIT_COMMIT_HASH", - std::string(GIT_COMMIT_HASH)); - benchmark::AddCustomContext("GIT_CLEAN_STATUS", - std::string(GIT_CLEAN_STATUS)); - benchmark::AddCustomContext("GIT_COMMIT_DESCRIPTION", - std::string(GIT_COMMIT_DESCRIPTION)); - benchmark::AddCustomContext("GIT_COMMIT_DATE", - std::string(GIT_COMMIT_DATE)); + benchmark::AddCustomContext("GIT_COMMIT_HASH", std::string(GIT_COMMIT_HASH)); + benchmark::AddCustomContext("GIT_CLEAN_STATUS", std::string(GIT_CLEAN_STATUS)); + benchmark::AddCustomContext("GIT_COMMIT_DESCRIPTION", std::string(GIT_COMMIT_DESCRIPTION)); + benchmark::AddCustomContext("GIT_COMMIT_DATE", std::string(GIT_COMMIT_DATE)); } if (!BENCHMARK_VERSION.empty()) { - benchmark::AddCustomContext("GOOGLE_BENCHMARK_VERSION", - std::string(BENCHMARK_VERSION)); + benchmark::AddCustomContext("GOOGLE_BENCHMARK_VERSION", std::string(BENCHMARK_VERSION)); } } @@ -117,20 +111,16 @@ inline void add_benchmark_context(bool verbose = false) { } template -inline auto register_benchmark(const char* name, FuncType func, - std::vector arg_names, - std::vector args, int repeat, - ArgsToCallOp&&... func_args) { +inline auto register_benchmark(const char* name, FuncType func, std::vector arg_names, + std::vector args, int repeat, ArgsToCallOp&&... func_args) { if (repeat > 0) { - return benchmark::RegisterBenchmark( - name, func, std::forward(func_args)...) 
+ return benchmark::RegisterBenchmark(name, func, std::forward(func_args)...) ->ArgNames(arg_names) ->Args(args) ->UseManualTime() ->Iterations(repeat); } else { - return benchmark::RegisterBenchmark( - name, func, std::forward(func_args)...) + return benchmark::RegisterBenchmark(name, func, std::forward(func_args)...) ->ArgNames(arg_names) ->Args(args) ->UseManualTime(); @@ -138,20 +128,16 @@ inline auto register_benchmark(const char* name, FuncType func, } template -inline auto register_benchmark_real_time(const char* name, FuncType func, - std::vector arg_names, - std::vector args, int repeat, - ArgsToCallOp&&... func_args) { +inline auto register_benchmark_real_time(const char* name, FuncType func, std::vector arg_names, + std::vector args, int repeat, ArgsToCallOp&&... func_args) { if (repeat > 0) { - return benchmark::RegisterBenchmark( - name, func, std::forward(func_args)...) + return benchmark::RegisterBenchmark(name, func, std::forward(func_args)...) ->ArgNames(arg_names) ->Args(args) ->UseRealTime() ->Iterations(repeat); } else { - return benchmark::RegisterBenchmark( - name, func, std::forward(func_args)...) + return benchmark::RegisterBenchmark(name, func, std::forward(func_args)...) 
->ArgNames(arg_names) ->Args(args) ->UseRealTime(); diff --git a/perf_test/KokkosKernels_perf_test_instantiation.hpp b/perf_test/KokkosKernels_perf_test_instantiation.hpp index 6844922ddb..8a46754030 100644 --- a/perf_test/KokkosKernels_perf_test_instantiation.hpp +++ b/perf_test/KokkosKernels_perf_test_instantiation.hpp @@ -57,9 +57,7 @@ int main_instantiation(int argc, char** argv) { else if (params.use_sycl) device_id = params.use_sycl - 1; - Kokkos::initialize(Kokkos::InitializationSettings() - .set_num_threads(num_threads) - .set_device_id(device_id)); + Kokkos::initialize(Kokkos::InitializationSettings().set_num_threads(num_threads).set_device_id(device_id)); Kokkos::print_configuration(std::cout); std::cout << '\n'; @@ -112,8 +110,7 @@ int main_instantiation(int argc, char** argv) { if (params.use_sycl) { #if defined(KOKKOS_ENABLE_SYCL) std::cout << "Running on SYCL backend.\n"; - KOKKOSKERNELS_PERF_TEST_NAME(argc, argv, - params); + KOKKOSKERNELS_PERF_TEST_NAME(argc, argv, params); ran = true; #else std::cout << "ERROR: SYCL requested, but not available.\n"; diff --git a/perf_test/KokkosKernels_perf_test_utilities.hpp b/perf_test/KokkosKernels_perf_test_utilities.hpp index 1303b2370e..ec767c68f7 100644 --- a/perf_test/KokkosKernels_perf_test_utilities.hpp +++ b/perf_test/KokkosKernels_perf_test_utilities.hpp @@ -39,50 +39,49 @@ struct CommonInputParams { std::string list_common_options() { std::ostringstream common_options; - common_options - << "\t[Required] Backend: the available backends are:\n" + common_options << "\t[Required] Backend: the available backends are:\n" #ifdef KOKKOS_ENABLE_THREADS - << "\t\t'--threads [numThreads]'\n" + << "\t\t'--threads [numThreads]'\n" #endif #ifdef KOKKOS_ENABLE_OPENMP - << "\t\t'--openmp [numThreads]'\n" + << "\t\t'--openmp [numThreads]'\n" #endif #ifdef KOKKOS_ENABLE_CUDA - << "\t\t'--cuda [deviceIndex]'\n" + << "\t\t'--cuda [deviceIndex]'\n" #endif #ifdef KOKKOS_ENABLE_HIP - << "\t\t'--hip [deviceIndex]'\n" + << 
"\t\t'--hip [deviceIndex]'\n" #endif #ifdef KOKKOS_ENABLE_SYCL - << "\t\t'--sycl [deviceIndex]'\n" + << "\t\t'--sycl [deviceIndex]'\n" #endif #ifdef KOKKOS_ENABLE_SERIAL - << "\t\tIf no parallel backend is requested, Serial will be used.\n" + << "\t\tIf no parallel backend is requested, Serial will be used.\n" #endif - << "\n" - << "\t The following backends are not available because Kokkos was not " - "configured with them:\n" + << "\n" + << "\t The following backends are not available because Kokkos was not " + "configured with them:\n" #ifndef KOKKOS_ENABLE_THREADS - << "\t\t'--threads [numThreads]'\n" + << "\t\t'--threads [numThreads]'\n" #endif #ifndef KOKKOS_ENABLE_OPENMP - << "\t\t'--openmp [numThreads]'\n" + << "\t\t'--openmp [numThreads]'\n" #endif #ifndef KOKKOS_ENABLE_CUDA - << "\t\t'--cuda [deviceIndex]'\n" + << "\t\t'--cuda [deviceIndex]'\n" #endif #ifndef KOKKOS_ENABLE_HIP - << "\t\t'--hip [deviceIndex]'\n" + << "\t\t'--hip [deviceIndex]'\n" #endif #ifndef KOKKOS_ENABLE_SYCL - << "\t\t'--sycl [deviceIndex]'\n" + << "\t\t'--sycl [deviceIndex]'\n" #endif #ifndef KOKKOS_ENABLE_SERIAL - << "\t\tSerial is not enabled so a parallel backend must be selected.\n" + << "\t\tSerial is not enabled so a parallel backend must be selected.\n" #endif - << "\n" - << "\t[Optional]:\n" - << "\t\t'-h', '--help': show available options\n\n"; + << "\n" + << "\t[Optional]:\n" + << "\t\t'-h', '--help': show available options\n\n"; return common_options.str(); } @@ -94,15 +93,13 @@ void process_arg_int(char const* str_val, int& val) { if (str_val == ptr_end) { std::stringstream ss; - ss << "Error: cannot convert command line argument '" << str_val - << "' to an integer.\n"; + ss << "Error: cannot convert command line argument '" << str_val << "' to an integer.\n"; throw std::invalid_argument(ss.str()); } if (errno == ERANGE) { std::stringstream ss; - ss << "Error: converted value for command line argument '" << str_val - << "' falls out of range.\n"; + ss << "Error: converted 
value for command line argument '" << str_val << "' falls out of range.\n"; throw std::invalid_argument(ss.str()); } } @@ -114,21 +111,18 @@ void process_arg_double(char const* str_val, double& val) { if (str_val == ptr_end) { std::stringstream ss; - ss << "Error: cannot convert command line argument '" << str_val - << "' to a double.\n"; + ss << "Error: cannot convert command line argument '" << str_val << "' to a double.\n"; throw std::invalid_argument(ss.str()); } if (errno == ERANGE) { std::stringstream ss; - ss << "Error: converted value for command line argument '" << str_val - << "' falls out of range.\n"; + ss << "Error: converted value for command line argument '" << str_val << "' falls out of range.\n"; throw std::invalid_argument(ss.str()); } } -bool check_arg_int(int const i, int const argc, char** argv, char const* name, - int& val) { +bool check_arg_int(int const i, int const argc, char** argv, char const* name, int& val) { if (0 != Test::string_compare_no_case(argv[i], name)) { return false; } @@ -143,8 +137,7 @@ bool check_arg_int(int const i, int const argc, char** argv, char const* name, return true; } -bool check_arg_double(int const i, int const argc, char** argv, - char const* name, double& val) { +bool check_arg_double(int const i, int const argc, char** argv, char const* name, double& val) { if (0 != Test::string_compare_no_case(argv[i], name)) { return false; } @@ -159,8 +152,7 @@ bool check_arg_double(int const i, int const argc, char** argv, return true; } -bool check_arg_bool(int const i, int const /*argc*/, char** argv, - char const* name, bool& val) { +bool check_arg_bool(int const i, int const /*argc*/, char** argv, char const* name, bool& val) { if (0 != Test::string_compare_no_case(argv[i], name)) { return false; } @@ -168,8 +160,7 @@ bool check_arg_bool(int const i, int const /*argc*/, char** argv, return true; } -bool check_arg_str(int const i, int const argc, char** argv, char const* name, - std::string& val) { +bool 
check_arg_str(int const i, int const argc, char** argv, char const* name, std::string& val) { if (0 != Test::string_compare_no_case(argv[i], name)) { return false; } @@ -198,8 +189,7 @@ void parse_common_options(int& argc, char** argv, CommonInputParams& params) { int remove_flags = 0; if (check_arg_int(argIdx, argc, argv, "--threads", params.use_threads)) { remove_flags = 2; - } else if (check_arg_int(argIdx, argc, argv, "--openmp", - params.use_openmp)) { + } else if (check_arg_int(argIdx, argc, argv, "--openmp", params.use_openmp)) { remove_flags = 2; } else if (check_arg_int(argIdx, argc, argv, "--cuda", params.use_cuda)) { params.use_cuda++; @@ -213,8 +203,7 @@ void parse_common_options(int& argc, char** argv, CommonInputParams& params) { } else if (check_arg_int(argIdx, argc, argv, "--repeat", params.repeat)) { remove_flags = 2; } else if (check_arg_bool(argIdx, argc, argv, "-h", params.print_help) || - check_arg_bool(argIdx, argc, argv, "--help", - params.print_help)) { + check_arg_bool(argIdx, argc, argv, "--help", params.print_help)) { remove_flags = 1; } diff --git a/perf_test/PerfTestUtilities.cpp b/perf_test/PerfTestUtilities.cpp index c403d0213d..479d50d2ba 100644 --- a/perf_test/PerfTestUtilities.cpp +++ b/perf_test/PerfTestUtilities.cpp @@ -23,8 +23,6 @@ namespace test { std::string inputDataPath; -void set_input_data_path(const std::string& path_to_data) { - inputDataPath = path_to_data; -} +void set_input_data_path(const std::string& path_to_data) { inputDataPath = path_to_data; } std::string get_input_data_path() { return inputDataPath; } } // namespace test diff --git a/perf_test/PerfTestUtilities.hpp b/perf_test/PerfTestUtilities.hpp index 4de10312b6..f6531a76fb 100644 --- a/perf_test/PerfTestUtilities.hpp +++ b/perf_test/PerfTestUtilities.hpp @@ -36,8 +36,7 @@ std::string get_input_data_path(); namespace KokkosSparse { -template +template class CrsMatrix; } @@ -62,8 +61,7 @@ inline std::vector get_directories(std::string path) { std::string 
nname = std::string(dir->d_name); // Check to see if item is a directory // if (isDirectory(path + '/' + nname)) - if (nname != "." && nname != ".." && - isDirectory(path + '/' + dir->d_name)) + if (nname != "." && nname != ".." && isDirectory(path + '/' + dir->d_name)) // std::vector::emplace_back: insert a new element to the end of vector paths.emplace_back(dir->d_name); } @@ -75,18 +73,16 @@ inline std::vector get_directories(std::string path) { namespace readers { template -using matrix_type = - KokkosSparse::CrsMatrix; +using matrix_type = KokkosSparse::CrsMatrix; template struct test_reader; template struct test_reader> { - static matrix_type read( - const std::string &filename) { - return KokkosKernels::Impl::read_kokkos_crst_matrix< - matrix_type>(filename.c_str()); + static matrix_type read(const std::string &filename) { + return KokkosKernels::Impl::read_kokkos_crst_matrix>( + filename.c_str()); } }; @@ -100,30 +96,23 @@ struct data_retriever { std::tuple test_data; }; std::vector test_cases; - std::string make_full_path_to_data_file(std::string repo, - std::string path_to_data, - std::string dataset, + std::string make_full_path_to_data_file(std::string repo, std::string path_to_data, std::string dataset, std::string filename) { - return root_path + "/" + repo + "/" + path_to_data + dataset + "/" + - filename; + return root_path + "/" + repo + "/" + path_to_data + dataset + "/" + filename; } template - data_retriever(std::string path_to_data, Locations... locations) - : sub_path(path_to_data) { + data_retriever(std::string path_to_data, Locations... 
locations) : sub_path(path_to_data) { root_path = test::get_input_data_path(); // TODO: way to list the directories in the root path std::vector data_repos = get_directories(root_path + "/"); // TODO: list directories in subpaths for (auto repo : data_repos) { - std::vector datasets = - get_directories(root_path + "/" + repo + "/" + path_to_data + "/"); + std::vector datasets = get_directories(root_path + "/" + repo + "/" + path_to_data + "/"); for (auto dataset : datasets) { - test_cases.push_back( - test_case{repo + "/" + dataset, - std::make_tuple(readers::test_reader::read( - make_full_path_to_data_file( - repo, path_to_data, dataset, locations))...)}); + test_cases.push_back(test_case{repo + "/" + dataset, + std::make_tuple(readers::test_reader::read( + make_full_path_to_data_file(repo, path_to_data, dataset, locations))...)}); } } } diff --git a/perf_test/batched/dense/KokkosBatched_Test_BlockJacobi_Tutorial.cpp b/perf_test/batched/dense/KokkosBatched_Test_BlockJacobi_Tutorial.cpp index 53a6f8f173..5081017e46 100644 --- a/perf_test/batched/dense/KokkosBatched_Test_BlockJacobi_Tutorial.cpp +++ b/perf_test/batched/dense/KokkosBatched_Test_BlockJacobi_Tutorial.cpp @@ -50,8 +50,8 @@ using member_type = typename policy_type::member_type; using namespace KokkosBatched; template -val_type computeResidual(const ManyMatrixType &A, const ManyVectorType &x, - const ManyVectorType &b, const ManyVectorType &r) { +val_type computeResidual(const ManyMatrixType &A, const ManyVectorType &x, const ManyVectorType &b, + const ManyVectorType &r) { /// compute residual val_type residual(0); { @@ -66,17 +66,12 @@ val_type computeResidual(const ManyMatrixType &A, const ManyVectorType &x, auto xx = Kokkos::subview(x, i, Kokkos::ALL()); auto rr = Kokkos::subview(r, i, Kokkos::ALL()); - TeamGemv::invoke(member, -one, AA, xx, one, - rr); + TeamGemv::invoke(member, -one, AA, xx, one, rr); val_type sum(0); Kokkos::parallel_reduce( Kokkos::TeamThreadRange(member, rr.extent(0)), - [&](const 
int &k, val_type &lsum) { - lsum += Kokkos::ArithTraits::abs(rr(k)); - }, - sum); + [&](const int &k, val_type &lsum) { lsum += Kokkos::ArithTraits::abs(rr(k)); }, sum); Kokkos::single(Kokkos::PerTeam(member), [&]() { update += sum; }); }, residual); @@ -132,8 +127,8 @@ struct Task1SolveLowerTriangular { const val_type one(1); auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL()); auto TT = Kokkos::subview(__T, i, Kokkos::ALL(), Kokkos::ALL()); - TeamTrsm::invoke(member, one, TT, AA); + TeamTrsm::invoke( + member, one, TT, AA); } }; @@ -152,9 +147,8 @@ struct Task1SolveUpperTriangular { const val_type one(1); auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL()); auto TT = Kokkos::subview(__T, i, Kokkos::ALL(), Kokkos::ALL()); - TeamTrsm::invoke(member, one, TT, - AA); + TeamTrsm::invoke( + member, one, TT, AA); } }; } // namespace ConstructBlockJacobi @@ -176,8 +170,7 @@ struct Task1ApplyBlockJacobi { auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL()); auto xx = Kokkos::subview(__x, i, Kokkos::ALL()); auto bb = Kokkos::subview(__b, i, Kokkos::ALL()); - TeamGemv::invoke( - member, one, AA, bb, zero, xx); + TeamGemv::invoke(member, one, AA, bb, zero, xx); } }; @@ -200,11 +193,10 @@ struct Task2FactorizeInvert { TeamLU::invoke(member, AA); TeamCopy::invoke(member, AA, TT); TeamSetIdentity::invoke(member, AA); - TeamTrsm::invoke(member, one, TT, AA); - TeamTrsm::invoke(member, one, TT, - AA); + TeamTrsm::invoke( + member, one, TT, AA); + TeamTrsm::invoke( + member, one, TT, AA); } }; @@ -225,8 +217,7 @@ struct Task2ApplyBlockJacobi { auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL()); auto xx = Kokkos::subview(__x, i, Kokkos::ALL()); auto bb = Kokkos::subview(__b, i, Kokkos::ALL()); - TeamGemv::invoke( - member, one, AA, bb, zero, xx); + TeamGemv::invoke(member, one, AA, bb, zero, xx); } }; @@ -260,22 +251,17 @@ int main(int argc, char *argv[]) { /// x - solution vector /// b - right hand side vector /// - Kokkos::View A( 
- "block diagonals", N, Blk, Blk); - Kokkos::View T( - "temporal block diagonals", N, Blk, Blk); - Kokkos::View x("x", N, - Blk); - Kokkos::View b("b", N, - Blk); + Kokkos::View A("block diagonals", N, Blk, Blk); + Kokkos::View T("temporal block diagonals", N, Blk, Blk); + Kokkos::View x("x", N, Blk); + Kokkos::View b("b", N, Blk); /// copy of A to check residual - Kokkos::View Acopy( - "Acopy", A.extent(0), A.extent(1), A.extent(2)); + Kokkos::View Acopy("Acopy", A.extent(0), A.extent(1), + A.extent(2)); /// residual vector - Kokkos::View r( - "r", b.extent(0), b.extent(1)); + Kokkos::View r("r", b.extent(0), b.extent(1)); /// The block diagonal matrices are assumed to be extracted from a block /// sparse matrix. Here we set the blocks with random values @@ -308,23 +294,15 @@ int main(int argc, char *argv[]) { { policy_type policy(A.extent(0), Kokkos::AUTO()); timer.reset(); - Kokkos::parallel_for( - "task1.factorize", policy, - ConstructBlockJacobi::Task1Factorize(A)); + Kokkos::parallel_for("task1.factorize", policy, ConstructBlockJacobi::Task1Factorize(A)); Kokkos::deep_copy(T, A); - Kokkos::parallel_for( - "task1.set-identity", policy, - ConstructBlockJacobi::Task1SetIdentity(A)); + Kokkos::parallel_for("task1.set-identity", policy, ConstructBlockJacobi::Task1SetIdentity(A)); Kokkos::fence(); - Kokkos::parallel_for( - "task1.solve-lower-triangular", policy, - ConstructBlockJacobi::Task1SolveLowerTriangular(A, T)); + Kokkos::parallel_for("task1.solve-lower-triangular", policy, + ConstructBlockJacobi::Task1SolveLowerTriangular(A, T)); Kokkos::fence(); - Kokkos::parallel_for( - "task1.solve-upper-triangular", policy, - ConstructBlockJacobi::Task1SolveUpperTriangular(A, T)); + Kokkos::parallel_for("task1.solve-upper-triangular", policy, + ConstructBlockJacobi::Task1SolveUpperTriangular(A, T)); Kokkos::fence(); const double t = timer.seconds(); printf( @@ -337,10 +315,8 @@ int main(int argc, char *argv[]) { { timer.reset(); policy_type policy(A.extent(0), 
Kokkos::AUTO()); - Kokkos::parallel_for( - "task1.apply-block-jacobi", policy, - Task1ApplyBlockJacobi(A, x, - b)); + Kokkos::parallel_for("task1.apply-block-jacobi", policy, + Task1ApplyBlockJacobi(A, x, b)); const double t = timer.seconds(); printf( "task 1: application of jacobi time = %f , # of applications per " @@ -374,9 +350,7 @@ int main(int argc, char *argv[]) { { policy_type policy(A.extent(0), Kokkos::AUTO()); timer.reset(); - Kokkos::parallel_for( - "task2.factorize-invert", policy, - Task2FactorizeInvert(A, T)); + Kokkos::parallel_for("task2.factorize-invert", policy, Task2FactorizeInvert(A, T)); Kokkos::fence(); const double t = timer.seconds(); printf( @@ -389,10 +363,8 @@ int main(int argc, char *argv[]) { { timer.reset(); policy_type policy(A.extent(0), Kokkos::AUTO()); - Kokkos::parallel_for( - "task2.apply-block-jacobi", policy, - Task2ApplyBlockJacobi(A, x, - b)); + Kokkos::parallel_for("task2.apply-block-jacobi", policy, + Task2ApplyBlockJacobi(A, x, b)); const double t = timer.seconds(); printf( "task 2: application of jacobi time = %f , # of applications per " diff --git a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp index f3eb0dd8ac..810112baa3 100644 --- a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp +++ b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp @@ -66,11 +66,9 @@ using member_type = typename policy_type::member_type; /// using namespace KokkosBatched; -static constexpr int vector_length = - DefaultVectorLength::value; +static constexpr int vector_length = DefaultVectorLength::value; #if defined(KOKKOSBATCHED_USE_128BIT_MEMORY_INST) -static constexpr int internal_vector_length = - DefaultInternalVectorLength::value; +static constexpr int internal_vector_length = DefaultInternalVectorLength::value; #else static constexpr int internal_vector_length = 1; #endif @@ -169,15 +167,11 @@ struct SetTridiagToIdentity 
{ KOKKOS_INLINE_FUNCTION void operator()(const member_type &member) const { const int i = member.league_rank(); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, __AA.extent(1)), [&](const int &j) { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, __AA.extent(5)), - [&](const int &v) { - for (int k = 0, kend = __AA.extent(3); k < kend; ++k) - __AA(i, j, 1, k, k, v) = 1; - }); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, __AA.extent(1)), [&](const int &j) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, __AA.extent(5)), [&](const int &v) { + for (int k = 0, kend = __AA.extent(3); k < kend; ++k) __AA(i, j, 1, k, k, v) = 1; + }); + }); } }; @@ -192,46 +186,42 @@ struct Factorize { KOKKOS_INLINE_FUNCTION void operator()(const member_type &member) const { - typedef FactorizeModeAndAlgo - default_mode_and_algo_type; + typedef FactorizeModeAndAlgo default_mode_and_algo_type; typedef default_mode_and_algo_type::mode_type mode_type; typedef default_mode_and_algo_type::algo_type algo_type; const int i = member.league_rank(); - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, __AA.extent(5)), [&](const int &v) { - auto AAA = Kokkos::subview(__AA, i, Kokkos::ALL(), Kokkos::ALL(), - Kokkos::ALL(), Kokkos::ALL(), v); - - /// subview patterns - auto A = Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); - auto B = Kokkos::subview(AAA, 0, 2, Kokkos::ALL(), Kokkos::ALL()); - auto C = Kokkos::subview(AAA, 0, 0, Kokkos::ALL(), Kokkos::ALL()); - auto D = Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); - - if (__L == 1) { - A.assign_data(&AAA(0, 1, 0, 0)); - LU::invoke(member, A); - } else { - for (int k = 0; k < (__L - 1); ++k) { - A.assign_data(&AAA(k, 1, 0, 0)); - B.assign_data(&AAA(k, 2, 0, 0)); - C.assign_data(&AAA(k, 0, 0, 0)); - D.assign_data(&AAA(k + 1, 1, 0, 0)); - - LU::invoke(member, A); - Trsm::invoke(member, 1.0, A, B); - Trsm::invoke(member, 1.0, A, - C); - Gemm::invoke(member, -1.0, C, B, 1.0, D); - } 
- LU::invoke(member, D); - } - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, __AA.extent(5)), [&](const int &v) { + auto AAA = Kokkos::subview(__AA, i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), v); + + /// subview patterns + auto A = Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(AAA, 0, 2, Kokkos::ALL(), Kokkos::ALL()); + auto C = Kokkos::subview(AAA, 0, 0, Kokkos::ALL(), Kokkos::ALL()); + auto D = Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); + + if (__L == 1) { + A.assign_data(&AAA(0, 1, 0, 0)); + LU::invoke(member, A); + } else { + for (int k = 0; k < (__L - 1); ++k) { + A.assign_data(&AAA(k, 1, 0, 0)); + B.assign_data(&AAA(k, 2, 0, 0)); + C.assign_data(&AAA(k, 0, 0, 0)); + D.assign_data(&AAA(k + 1, 1, 0, 0)); + + LU::invoke(member, A); + Trsm::invoke( + member, 1.0, A, B); + Trsm::invoke( + member, 1.0, A, C); + Gemm::invoke(member, -1.0, C, B, + 1.0, D); + } + LU::invoke(member, D); + } + }); } }; @@ -275,58 +265,46 @@ int main(int argc, char *argv[]) { /// /// double 16 - Kokkos::View Av( - "A", N / vector_length, L, 3, Blk, Blk); + Kokkos::View Av("A", N / vector_length, L, 3, Blk, Blk); /// double Kokkos::View As( - (value_type *)Av.data(), Av.extent(0), Av.extent(1), Av.extent(2), - Av.extent(3), Av.extent(4), vector_length); + (value_type *)Av.data(), Av.extent(0), Av.extent(1), Av.extent(2), Av.extent(3), Av.extent(4), vector_length); /// double 2 - Kokkos::View - Ai((internal_vector_type *)Av.data(), Av.extent(0), Av.extent(1), - Av.extent(2), Av.extent(3), Av.extent(4), - vector_length / internal_vector_length); + Kokkos::View Ai( + (internal_vector_type *)Av.data(), Av.extent(0), Av.extent(1), Av.extent(2), Av.extent(3), Av.extent(4), + vector_length / internal_vector_length); /// double 16 - Kokkos::View xv( - "x", N / vector_length, Nvec, L, Blk); + Kokkos::View xv("x", N / vector_length, Nvec, L, Blk); /// double Kokkos::View xs( - (value_type *)xv.data(), 
xv.extent(0), xv.extent(1), xv.extent(2), - xv.extent(3), vector_length); + (value_type *)xv.data(), xv.extent(0), xv.extent(1), xv.extent(2), xv.extent(3), vector_length); /// double 2 - Kokkos::View - xi((internal_vector_type *)xv.data(), xv.extent(0), xv.extent(1), - xv.extent(2), xv.extent(3), vector_length / internal_vector_length); + Kokkos::View xi( + (internal_vector_type *)xv.data(), xv.extent(0), xv.extent(1), xv.extent(2), xv.extent(3), + vector_length / internal_vector_length); /// double 16 - Kokkos::View bv( - "b", N / vector_length, Nvec, L, Blk); + Kokkos::View bv("b", N / vector_length, Nvec, L, Blk); /// double Kokkos::View bs( - (value_type *)bv.data(), bv.extent(0), bv.extent(1), bv.extent(2), - bv.extent(3), vector_length); + (value_type *)bv.data(), bv.extent(0), bv.extent(1), bv.extent(2), bv.extent(3), vector_length); /// double 2 - Kokkos::View - bi((internal_vector_type *)bv.data(), bv.extent(0), bv.extent(1), - bv.extent(2), bv.extent(3), vector_length / internal_vector_length); + Kokkos::View bi( + (internal_vector_type *)bv.data(), bv.extent(0), bv.extent(1), bv.extent(2), bv.extent(3), + vector_length / internal_vector_length); /// double copy of A Kokkos::View Acopy( - "Acopy", As.extent(0), As.extent(1), As.extent(2), As.extent(3), - As.extent(4), As.extent(5)); + "Acopy", As.extent(0), As.extent(1), As.extent(2), As.extent(3), As.extent(4), As.extent(5)); - Kokkos::View rs( - "rs", bs.extent(0), bs.extent(1), bs.extent(2), bs.extent(3), - bs.extent(4)); + Kokkos::View rs("rs", bs.extent(0), bs.extent(1), + bs.extent(2), bs.extent(3), bs.extent(4)); #if defined(KOKKOSBATCHED_USE_128BIT_MEMORY_INST) auto AA = Ai; @@ -347,8 +325,7 @@ int main(int argc, char *argv[]) { #endif timer.reset(); policy_type policy(AA.extent(0), Kokkos::AUTO(), AA.extent(5)); - Kokkos::parallel_for("setTridiagToIdentity", policy, - SetTridiagToIdentity(AA)); + Kokkos::parallel_for("setTridiagToIdentity", policy, SetTridiagToIdentity(AA)); Kokkos::fence(); 
const double t = timer.seconds(); #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) @@ -385,16 +362,14 @@ int main(int argc, char *argv[]) { } policy_type policy(AA.extent(0), team_size, AA.extent(5)); - Kokkos::parallel_for("factorize", - policy.set_scratch_size(0, Kokkos::PerTeam(S)), + Kokkos::parallel_for("factorize", policy.set_scratch_size(0, Kokkos::PerTeam(S)), Factorize(AA, L)); Kokkos::fence(); const double t = timer.seconds(); #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) cudaProfilerStop(); #endif - printf("factorize time = %f , # of factorization per min = %f \n", t, - 1.0 / t * 60); + printf("factorize time = %f , # of factorization per min = %f \n", t, 1.0 / t * 60); } /// @@ -417,121 +392,96 @@ int main(int argc, char *argv[]) { policy_type policy(AA.extent(0), team_size, AA.extent(5)); for (int iter = 0; iter < niter; ++iter) { Kokkos::parallel_for( - "solve", policy.set_scratch_size(0, Kokkos::PerTeam(S)), - KOKKOS_LAMBDA(const member_type &member) { - typedef SolveModeAndAlgo - default_mode_and_algo_type; + "solve", policy.set_scratch_size(0, Kokkos::PerTeam(S)), KOKKOS_LAMBDA(const member_type &member) { + typedef SolveModeAndAlgo default_mode_and_algo_type; typedef default_mode_and_algo_type::mode_type mode_type; typedef default_mode_and_algo_type::algo_type algo_type; const int i = member.league_rank(); - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, AA.extent(5)), - [&](const int &v) { - auto A = Kokkos::subview(AA, i, Kokkos::ALL(), 1, - Kokkos::ALL(), Kokkos::ALL(), v); - auto B = Kokkos::subview(AA, i, Kokkos::ALL(), 2, - Kokkos::ALL(), Kokkos::ALL(), v); - auto C = Kokkos::subview(AA, i, Kokkos::ALL(), 0, - Kokkos::ALL(), Kokkos::ALL(), v); - - for (int jvec = 0; jvec < Nvec; ++jvec) { - auto x = Kokkos::subview(xx, i, jvec, Kokkos::ALL(), - Kokkos::ALL(), v); - auto b = Kokkos::subview(bb, i, jvec, Kokkos::ALL(), - Kokkos::ALL(), v); - - auto xt = Kokkos::subview(x, 0, Kokkos::ALL()); - 
auto xb = Kokkos::subview(x, 0, Kokkos::ALL()); - - /// - /// forward substitution - /// - { - // const bool is_same_x_and_b = (x.data() == b.data()); - auto LT = - Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()); - auto LB = - Kokkos::subview(C, 0, Kokkos::ALL(), Kokkos::ALL()); - - auto bk = Kokkos::subview(b, 0, Kokkos::ALL()); - { - { // if (!is_same_x_and_b) { - Copy::invoke(member, bk, xb); - member.team_barrier(); - } - } - const int kend = L - 1; - for (int k = 0; k < kend; ++k) { - LT.assign_data(&A(k, 0, 0)); - LB.assign_data(&C(k, 0, 0)); - - xt.assign_data(&x(k, 0)); - xb.assign_data(&x(k + 1, 0)); - - { // if (!is_same_x_and_b) { - bk.assign_data(&b(k + 1, 0)); - Copy::invoke(member, bk, xb); - } - - Trsv::invoke(member, - 1.0, - LT, - xt); - - Gemv::invoke(member, -1.0, LB, xt, 1.0, - xb); - } - { - LT.assign_data(&A(kend, 0, 0)); - xt.assign_data(&x(kend, 0)); - Trsv::invoke(member, - 1.0, - LT, - xt); - } - } /// end forward substitution - - /// - /// backward substitution - /// - { - auto UT = - Kokkos::subview(B, 0, Kokkos::ALL(), Kokkos::ALL()); - auto UB = - Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()); - - const int kbegin = L - 1; - for (int k = kbegin; k > 0; --k) { - UT.assign_data(&B(k - 1, 0, 0)); - UB.assign_data(&A(k, 0, 0)); - - xt.assign_data(&x(k - 1, 0)); - xb.assign_data(&x(k, 0)); - - Trsv::invoke(member, 1.0, UB, xb); - - Gemv::invoke(member, -1.0, UT, xb, 1.0, - xt); - } - { - UB.assign_data(&A(0, 0, 0)); - xb.assign_data(&x(0, 0)); - Trsv::invoke(member, 1.0, UB, xb); - } - } // end backward substitution + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, AA.extent(5)), [&](const int &v) { + auto A = Kokkos::subview(AA, i, Kokkos::ALL(), 1, Kokkos::ALL(), Kokkos::ALL(), v); + auto B = Kokkos::subview(AA, i, Kokkos::ALL(), 2, Kokkos::ALL(), Kokkos::ALL(), v); + auto C = Kokkos::subview(AA, i, Kokkos::ALL(), 0, Kokkos::ALL(), Kokkos::ALL(), v); + + for (int jvec = 0; jvec < Nvec; ++jvec) { + auto x = 
Kokkos::subview(xx, i, jvec, Kokkos::ALL(), Kokkos::ALL(), v); + auto b = Kokkos::subview(bb, i, jvec, Kokkos::ALL(), Kokkos::ALL(), v); + + auto xt = Kokkos::subview(x, 0, Kokkos::ALL()); + auto xb = Kokkos::subview(x, 0, Kokkos::ALL()); + + /// + /// forward substitution + /// + { + // const bool is_same_x_and_b = (x.data() == b.data()); + auto LT = Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()); + auto LB = Kokkos::subview(C, 0, Kokkos::ALL(), Kokkos::ALL()); + + auto bk = Kokkos::subview(b, 0, Kokkos::ALL()); + { + { // if (!is_same_x_and_b) { + Copy::invoke(member, bk, xb); + member.team_barrier(); + } + } + const int kend = L - 1; + for (int k = 0; k < kend; ++k) { + LT.assign_data(&A(k, 0, 0)); + LB.assign_data(&C(k, 0, 0)); + + xt.assign_data(&x(k, 0)); + xb.assign_data(&x(k + 1, 0)); + + { // if (!is_same_x_and_b) { + bk.assign_data(&b(k + 1, 0)); + Copy::invoke(member, bk, xb); + } + + Trsv::invoke( + member, 1.0, LT, xt); + + Gemv::invoke(member, -1.0, LB, xt, 1.0, + xb); + } + { + LT.assign_data(&A(kend, 0, 0)); + xt.assign_data(&x(kend, 0)); + Trsv::invoke( + member, 1.0, LT, xt); + } + } /// end forward substitution + + /// + /// backward substitution + /// + { + auto UT = Kokkos::subview(B, 0, Kokkos::ALL(), Kokkos::ALL()); + auto UB = Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()); + + const int kbegin = L - 1; + for (int k = kbegin; k > 0; --k) { + UT.assign_data(&B(k - 1, 0, 0)); + UB.assign_data(&A(k, 0, 0)); + + xt.assign_data(&x(k - 1, 0)); + xb.assign_data(&x(k, 0)); + + Trsv::invoke( + member, 1.0, UB, xb); + + Gemv::invoke(member, -1.0, UT, xb, 1.0, + xt); } - }); + { + UB.assign_data(&A(0, 0, 0)); + xb.assign_data(&x(0, 0)); + Trsv::invoke( + member, 1.0, UB, xb); + } + } // end backward substitution + } + }); }); Kokkos::fence(); } @@ -539,8 +489,7 @@ int main(int argc, char *argv[]) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) cudaProfilerStop(); #endif - printf("solve time = %f , # of solves per min = 
%f\n", t, - 1.0 / t * 60 * niter); + printf("solve time = %f , # of solves per min = %f\n", t, 1.0 / t * 60 * niter); } /// @@ -552,114 +501,77 @@ int main(int argc, char *argv[]) { Kokkos::parallel_for( "compute residual", policy, KOKKOS_LAMBDA(const member_type &member) { const int i = member.league_rank(); - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, Acopy.extent(5)), - [&](const int &v) { - auto A = Kokkos::subview(Acopy, i, Kokkos::ALL(), 1, - Kokkos::ALL(), Kokkos::ALL(), v); - auto B = Kokkos::subview(Acopy, i, Kokkos::ALL(), 2, - Kokkos::ALL(), Kokkos::ALL(), v); - auto C = Kokkos::subview(Acopy, i, Kokkos::ALL(), 0, - Kokkos::ALL(), Kokkos::ALL(), v); - - for (int jvec = 0, jvecend = rs.extent(1); jvec < jvecend; - ++jvec) { - auto x = Kokkos::subview(xs, i, jvec, Kokkos::ALL(), - Kokkos::ALL(), v); - auto b = Kokkos::subview(bs, i, jvec, Kokkos::ALL(), - Kokkos::ALL(), v); - auto r = Kokkos::subview(rs, i, jvec, Kokkos::ALL(), - Kokkos::ALL(), v); - - if (L == 1) { - auto A0 = - Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()); - auto x0 = Kokkos::subview(x, 0, Kokkos::ALL()); - auto b0 = Kokkos::subview(b, 0, Kokkos::ALL()); - auto r0 = Kokkos::subview(r, 0, Kokkos::ALL()); - - TeamCopy::invoke(member, - b0, r0); - TeamGemv::invoke(member, -1.0, A0, x0, 1.0, - r0); - } else { - int k = 0; - { - /// first row - auto A1 = - Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); - auto B2 = - Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); - - auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); - auto x2 = Kokkos::subview(x, k + 1, Kokkos::ALL()); - - auto bk = Kokkos::subview(b, k, Kokkos::ALL()); - auto rk = Kokkos::subview(r, k, Kokkos::ALL()); - TeamCopy::invoke( - member, bk, rk); - member.team_barrier(); - TeamGemv::invoke(member, -1.0, A1, x1, 1.0, - rk); - TeamGemv::invoke(member, -1.0, B2, x2, 1.0, - rk); - ++k; - } - for (; k < (L - 1); ++k) { - auto C0 = Kokkos::subview(C, k - 1, Kokkos::ALL(), - Kokkos::ALL()); - auto A1 = - 
Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); - auto B2 = - Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); - - auto x0 = Kokkos::subview(x, k - 1, Kokkos::ALL()); - auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); - auto x2 = Kokkos::subview(x, k + 1, Kokkos::ALL()); - - auto bk = Kokkos::subview(b, k, Kokkos::ALL()); - auto rk = Kokkos::subview(r, k, Kokkos::ALL()); - TeamCopy::invoke( - member, bk, rk); - member.team_barrier(); - TeamGemv::invoke(member, -1.0, C0, x0, 1.0, - rk); - TeamGemv::invoke(member, -1.0, A1, x1, 1.0, - rk); - TeamGemv::invoke(member, -1.0, B2, x2, 1.0, - rk); - } - { - // last row - auto C0 = Kokkos::subview(C, k - 1, Kokkos::ALL(), - Kokkos::ALL()); - auto A1 = - Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); - - auto x0 = Kokkos::subview(x, k - 1, Kokkos::ALL()); - auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); - - auto bk = Kokkos::subview(b, k, Kokkos::ALL()); - auto rk = Kokkos::subview(r, k, Kokkos::ALL()); - TeamCopy::invoke( - member, bk, rk); - member.team_barrier(); - TeamGemv::invoke(member, -1.0, C0, x0, 1.0, - rk); - TeamGemv::invoke(member, -1.0, A1, x1, 1.0, - rk); - } - } + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, Acopy.extent(5)), [&](const int &v) { + auto A = Kokkos::subview(Acopy, i, Kokkos::ALL(), 1, Kokkos::ALL(), Kokkos::ALL(), v); + auto B = Kokkos::subview(Acopy, i, Kokkos::ALL(), 2, Kokkos::ALL(), Kokkos::ALL(), v); + auto C = Kokkos::subview(Acopy, i, Kokkos::ALL(), 0, Kokkos::ALL(), Kokkos::ALL(), v); + + for (int jvec = 0, jvecend = rs.extent(1); jvec < jvecend; ++jvec) { + auto x = Kokkos::subview(xs, i, jvec, Kokkos::ALL(), Kokkos::ALL(), v); + auto b = Kokkos::subview(bs, i, jvec, Kokkos::ALL(), Kokkos::ALL(), v); + auto r = Kokkos::subview(rs, i, jvec, Kokkos::ALL(), Kokkos::ALL(), v); + + if (L == 1) { + auto A0 = Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()); + auto x0 = Kokkos::subview(x, 0, Kokkos::ALL()); + auto b0 = Kokkos::subview(b, 0, Kokkos::ALL()); + auto 
r0 = Kokkos::subview(r, 0, Kokkos::ALL()); + + TeamCopy::invoke(member, b0, r0); + TeamGemv::invoke(member, -1.0, A0, x0, 1.0, r0); + } else { + int k = 0; + { + /// first row + auto A1 = Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); + auto B2 = Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); + + auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); + auto x2 = Kokkos::subview(x, k + 1, Kokkos::ALL()); + + auto bk = Kokkos::subview(b, k, Kokkos::ALL()); + auto rk = Kokkos::subview(r, k, Kokkos::ALL()); + TeamCopy::invoke(member, bk, rk); + member.team_barrier(); + TeamGemv::invoke(member, -1.0, A1, x1, 1.0, rk); + TeamGemv::invoke(member, -1.0, B2, x2, 1.0, rk); + ++k; } - }); + for (; k < (L - 1); ++k) { + auto C0 = Kokkos::subview(C, k - 1, Kokkos::ALL(), Kokkos::ALL()); + auto A1 = Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); + auto B2 = Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); + + auto x0 = Kokkos::subview(x, k - 1, Kokkos::ALL()); + auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); + auto x2 = Kokkos::subview(x, k + 1, Kokkos::ALL()); + + auto bk = Kokkos::subview(b, k, Kokkos::ALL()); + auto rk = Kokkos::subview(r, k, Kokkos::ALL()); + TeamCopy::invoke(member, bk, rk); + member.team_barrier(); + TeamGemv::invoke(member, -1.0, C0, x0, 1.0, rk); + TeamGemv::invoke(member, -1.0, A1, x1, 1.0, rk); + TeamGemv::invoke(member, -1.0, B2, x2, 1.0, rk); + } + { + // last row + auto C0 = Kokkos::subview(C, k - 1, Kokkos::ALL(), Kokkos::ALL()); + auto A1 = Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); + + auto x0 = Kokkos::subview(x, k - 1, Kokkos::ALL()); + auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); + + auto bk = Kokkos::subview(b, k, Kokkos::ALL()); + auto rk = Kokkos::subview(r, k, Kokkos::ALL()); + TeamCopy::invoke(member, bk, rk); + member.team_barrier(); + TeamGemv::invoke(member, -1.0, C0, x0, 1.0, rk); + TeamGemv::invoke(member, -1.0, A1, x1, 1.0, rk); + } + } + } + }); }); Kokkos::fence(); auto rs_host = 
Kokkos::create_mirror_view(rs); @@ -669,13 +581,11 @@ int main(int argc, char *argv[]) { Kokkos::fence(); { double norm2 = 0, diff2 = 0; - for (int i0 = 0, i0end = rs.extent(0); i0 < i0end; - ++i0) // N/vector_length - for (int i1 = 0, i1end = rs.extent(1); i1 < i1end; ++i1) // Nvec - for (int i2 = 0, i2end = rs.extent(2); i2 < i2end; ++i2) // L - for (int i3 = 0, i3end = rs.extent(3); i3 < i3end; ++i3) // Blk - for (int i4 = 0, i4end = rs.extent(4); i4 < i4end; - ++i4) { // vector_length + for (int i0 = 0, i0end = rs.extent(0); i0 < i0end; ++i0) // N/vector_length + for (int i1 = 0, i1end = rs.extent(1); i1 < i1end; ++i1) // Nvec + for (int i2 = 0, i2end = rs.extent(2); i2 < i2end; ++i2) // L + for (int i3 = 0, i3end = rs.extent(3); i3 < i3end; ++i3) // Blk + for (int i4 = 0, i4end = rs.extent(4); i4 < i4end; ++i4) { // vector_length const auto val = bs_host(i0, i1, i2, i3, i4); const auto res = rs_host(i0, i1, i2, i3, i4); norm2 += val * val; diff --git a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp index 67a141578e..629c73924e 100644 --- a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp +++ b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp @@ -72,11 +72,9 @@ typedef double value_type; /// using namespace KokkosBatched; -static constexpr int vector_length = - DefaultVectorLength::value; +static constexpr int vector_length = DefaultVectorLength::value; #if defined(KOKKOSBATCHED_USE_128BIT_MEMORY_INST) -static constexpr int internal_vector_length = - DefaultInternalVectorLength::value; +static constexpr int internal_vector_length = DefaultInternalVectorLength::value; #else static constexpr int internal_vector_length = 1; #endif @@ -98,20 +96,17 @@ struct InverseDiagonalsModeAndAlgoHostImpl { #if defined(KOKKOS_ENABLE_SERIAL) template <> -struct InverseDiagonalsModeAndAlgo - : InverseDiagonalsModeAndAlgoHostImpl {}; +struct 
InverseDiagonalsModeAndAlgo : InverseDiagonalsModeAndAlgoHostImpl {}; #endif #if defined(KOKKOS_ENABLE_THREADS) template <> -struct InverseDiagonalsModeAndAlgo - : InverseDiagonalsModeAndAlgoHostImpl {}; +struct InverseDiagonalsModeAndAlgo : InverseDiagonalsModeAndAlgoHostImpl {}; #endif #if defined(KOKKOS_ENABLE_ONPENMP) template <> -struct InverseDiagonalsModeAndAlgo - : InverseDiagonalsModeAndAlgoHostImpl {}; +struct InverseDiagonalsModeAndAlgo : InverseDiagonalsModeAndAlgoHostImpl {}; #endif struct InverseDiagonalsModeAndAlgoDeviceImpl { @@ -121,14 +116,12 @@ struct InverseDiagonalsModeAndAlgoDeviceImpl { #if defined(KOKKOS_ENABLE_CUDA) template <> -struct InverseDiagonalsModeAndAlgo - : InverseDiagonalsModeAndAlgoDeviceImpl {}; +struct InverseDiagonalsModeAndAlgo : InverseDiagonalsModeAndAlgoDeviceImpl {}; #endif #if defined(KOKKOS_ENABLE_HIP) template <> -struct InverseDiagonalsModeAndAlgo - : InverseDiagonalsModeAndAlgoDeviceImpl {}; +struct InverseDiagonalsModeAndAlgo : InverseDiagonalsModeAndAlgoDeviceImpl {}; #endif template @@ -211,56 +204,46 @@ int main(int argc, char *argv[]) { /// /// double 16 - Kokkos::View Av( - "A", N / vector_length, L, 4, Blk, Blk); + Kokkos::View Av("A", N / vector_length, L, 4, Blk, Blk); /// double Kokkos::View As( - (value_type *)Av.data(), Av.extent(0), Av.extent(1), Av.extent(2), - Av.extent(3), Av.extent(4), vector_length); + (value_type *)Av.data(), Av.extent(0), Av.extent(1), Av.extent(2), Av.extent(3), Av.extent(4), vector_length); /// double 2 - Kokkos::View - Ai((internal_vector_type *)Av.data(), Av.extent(0), Av.extent(1), - Av.extent(2), Av.extent(3), Av.extent(4), - vector_length / internal_vector_length); + Kokkos::View Ai( + (internal_vector_type *)Av.data(), Av.extent(0), Av.extent(1), Av.extent(2), Av.extent(3), Av.extent(4), + vector_length / internal_vector_length); /// double 16 - Kokkos::View xv( - "x", N / vector_length, Nvec, 2, L, Blk); + Kokkos::View xv("x", N / vector_length, Nvec, 2, L, Blk); /// 
double Kokkos::View xs( - (value_type *)xv.data(), xv.extent(0), xv.extent(1), xv.extent(2), - xv.extent(3), xv.extent(4), vector_length); + (value_type *)xv.data(), xv.extent(0), xv.extent(1), xv.extent(2), xv.extent(3), xv.extent(4), vector_length); /// double 2 - Kokkos::View - xi((internal_vector_type *)xv.data(), xv.extent(0), xv.extent(1), - xv.extent(2), xv.extent(3), xv.extent(4), - vector_length / internal_vector_length); + Kokkos::View xi( + (internal_vector_type *)xv.data(), xv.extent(0), xv.extent(1), xv.extent(2), xv.extent(3), xv.extent(4), + vector_length / internal_vector_length); /// double 16 - Kokkos::View bv( - "b", N / vector_length, Nvec, L, Blk); + Kokkos::View bv("b", N / vector_length, Nvec, L, Blk); /// double Kokkos::View bs( - (value_type *)bv.data(), bv.extent(0), bv.extent(1), bv.extent(2), - bv.extent(3), vector_length); + (value_type *)bv.data(), bv.extent(0), bv.extent(1), bv.extent(2), bv.extent(3), vector_length); /// double 2 - Kokkos::View - bi((internal_vector_type *)bv.data(), bv.extent(0), bv.extent(1), - bv.extent(2), bv.extent(3), vector_length / internal_vector_length); + Kokkos::View bi( + (internal_vector_type *)bv.data(), bv.extent(0), bv.extent(1), bv.extent(2), bv.extent(3), + vector_length / internal_vector_length); /// double copy of A Kokkos::View Acopy( - "Acopy", As.extent(0), As.extent(1), As.extent(2), As.extent(3), - As.extent(4), As.extent(5)); + "Acopy", As.extent(0), As.extent(1), As.extent(2), As.extent(3), As.extent(4), As.extent(5)); - Kokkos::View rs( - "rs", bs.extent(0), bs.extent(1), bs.extent(2), bs.extent(3), - bs.extent(4)); + Kokkos::View rs("rs", bs.extent(0), bs.extent(1), bs.extent(2), + bs.extent(3), bs.extent(4)); #if defined(KOKKOSBATCHED_USE_128BIT_MEMORY_INST) auto AA = Ai; @@ -288,18 +271,13 @@ int main(int argc, char *argv[]) { using member_type = typename policy_type::member_type; policy_type policy(AA.extent(0) * L, Kokkos::AUTO(), AA.extent(5)); Kokkos::parallel_for( - "diagonal 
dominant", policy, - KOKKOS_LAMBDA(const member_type &member) { + "diagonal dominant", policy, KOKKOS_LAMBDA(const member_type &member) { const int i = member.league_rank() / L; const int k = member.league_rank() % L; - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, Blk), [&](const int &j) { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, AA.extent(5)), - [&](const int &v) { - AA(i, k, 1, j, j, v) += internal_vector_type(9 * Blk); - }); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, Blk), [&](const int &j) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, AA.extent(5)), + [&](const int &v) { AA(i, k, 1, j, j, v) += internal_vector_type(9 * Blk); }); + }); }); Kokkos::fence(); #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) @@ -318,16 +296,14 @@ int main(int argc, char *argv[]) { #endif timer.reset(); typedef internal_vector_type scratch_value_type; - typedef Kokkos::View scratch_view_type; - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; - const int per_team_scratch = - scratch_view_type::shmem_size(Blk, Blk, AA.extent(5)); - int team_size = 0; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + const int per_team_scratch = scratch_view_type::shmem_size(Blk, Blk, AA.extent(5)); + int team_size = 0; if (Blk < 8) { team_size = 32 / AA.extent(5); } else if (Blk < 12) { @@ -338,49 +314,37 @@ int main(int argc, char *argv[]) { policy_type policy(AA.extent(0) * L, team_size, AA.extent(5)); Kokkos::parallel_for( - "inverse diagonals", - policy.set_scratch_size( - 0, Kokkos::PerTeam(S < per_team_scratch ? per_team_scratch : S)), + "inverse diagonals", policy.set_scratch_size(0, Kokkos::PerTeam(S < per_team_scratch ? 
per_team_scratch : S)), KOKKOS_LAMBDA(const member_type &member) { - typedef InverseDiagonalsModeAndAlgo - default_mode_and_algo_type; + typedef InverseDiagonalsModeAndAlgo default_mode_and_algo_type; typedef default_mode_and_algo_type::mode_type mode_type; typedef default_mode_and_algo_type::algo_type algo_type; const int i = member.league_rank() / L; const int k = member.league_rank() % L; - scratch_view_type WW(member.team_scratch(0), Blk, Blk, - AA.extent(5)); - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, AA.extent(5)), - [&](const int &v) { - auto A = Kokkos::subview(AA, i, k, 1, Kokkos::ALL(), - Kokkos::ALL(), v); - auto D = Kokkos::subview(AA, i, k, 3, Kokkos::ALL(), - Kokkos::ALL(), v); - auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v); - - Copy::invoke( - member, A, W); - SetIdentity::invoke(member, D); - member.team_barrier(); - LU::invoke(member, W); - Trsm::invoke(member, 1.0, W, - D); - Trsm::invoke(member, 1.0, - W, D); - }); + scratch_view_type WW(member.team_scratch(0), Blk, Blk, AA.extent(5)); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, AA.extent(5)), [&](const int &v) { + auto A = Kokkos::subview(AA, i, k, 1, Kokkos::ALL(), Kokkos::ALL(), v); + auto D = Kokkos::subview(AA, i, k, 3, Kokkos::ALL(), Kokkos::ALL(), v); + auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v); + + Copy::invoke(member, A, W); + SetIdentity::invoke(member, D); + member.team_barrier(); + LU::invoke(member, W); + Trsm::invoke( + member, 1.0, W, D); + Trsm::invoke(member, 1.0, W, D); + }); }); Kokkos::fence(); const double t = timer.seconds(); #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) cudaProfilerStop(); #endif - printf("inverse time = %f , # of inverse per min = %f \n", t, - 1.0 / t * 60); + printf("inverse time = %f , # of inverse per min = %f \n", t, 1.0 / t * 60); } /// @@ -392,12 +356,10 @@ int main(int argc, char *argv[]) { #endif timer.reset(); typedef internal_vector_type scratch_value_type; - 
typedef Kokkos::View scratch_view_type; - const int per_team_scratch = - scratch_view_type::shmem_size(Blk, AA.extent(5)); + const int per_team_scratch = scratch_view_type::shmem_size(Blk, AA.extent(5)); using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; @@ -412,78 +374,53 @@ int main(int argc, char *argv[]) { policy_type policy(AA.extent(0) * L, team_size, AA.extent(5)); for (int iter = 0; iter < niter; ++iter) { - auto xxx = Kokkos::subview(xx, Kokkos::ALL(), Kokkos::ALL(), 0, - Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); - auto yyy = Kokkos::subview(xx, Kokkos::ALL(), Kokkos::ALL(), 1, - Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + auto xxx = Kokkos::subview(xx, Kokkos::ALL(), Kokkos::ALL(), 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + auto yyy = Kokkos::subview(xx, Kokkos::ALL(), Kokkos::ALL(), 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); for (int nis = 0; nis < nsweep; ++nis) { Kokkos::parallel_for( - "solve", - policy.set_scratch_size( - 0, - Kokkos::PerTeam(S < per_team_scratch ? per_team_scratch : S)), + "solve", policy.set_scratch_size(0, Kokkos::PerTeam(S < per_team_scratch ? per_team_scratch : S)), KOKKOS_LAMBDA(const member_type &member) { - typedef SolveModeAndAlgo - default_mode_and_algo_type; + typedef SolveModeAndAlgo default_mode_and_algo_type; typedef default_mode_and_algo_type::mode_type mode_type; typedef default_mode_and_algo_type::algo_type algo_type; scratch_view_type WW(member.team_scratch(0), Blk, AA.extent(5)); const int i = member.league_rank() / L; //%AA.extent(0); const int k = member.league_rank() % L; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, AA.extent(5)), - [&](const int &v) { - auto A = Kokkos::subview(AA, i, k, 1, Kokkos::ALL(), - Kokkos::ALL(), v); - auto D = Kokkos::subview(AA, i, k, 3, Kokkos::ALL(), - Kokkos::ALL(), v); - auto B = Kokkos::subview(AA, i, k, 2, Kokkos::ALL(), - Kokkos::ALL(), v); - auto C = Kokkos::subview(AA, i, k ? 
k - 1 : 0, 0, - Kokkos::ALL(), Kokkos::ALL(), v); - auto u = Kokkos::subview(WW, Kokkos::ALL(), v); - for (int jvec = 0; jvec < Nvec; ++jvec) { - auto x0 = Kokkos::subview( - xxx, i, jvec, k == 0 ? 0 : k - 1, Kokkos::ALL(), v); - auto x1 = - Kokkos::subview(xxx, i, jvec, k, Kokkos::ALL(), v); - auto x2 = Kokkos::subview(xxx, i, jvec, - k == L - 1 ? 0 : k + 1, - Kokkos::ALL(), v); - auto y1 = - Kokkos::subview(yyy, i, jvec, k, Kokkos::ALL(), v); - auto b = - Kokkos::subview(bb, i, jvec, k, Kokkos::ALL(), v); - - if (L == 1) { - Gemv::invoke(member, 1.0, D, b, 0.0, x1); - } else { - Copy::invoke(member, b, u); - if (k == 0) { - Gemv::invoke(member, -1.0, B, x2, 1.0, - u); - } else if (k == L - 1) { - Gemv::invoke(member, -1.0, C, x0, 1.0, - u); - } else { - Gemv::invoke(member, -1.0, B, x2, 1.0, - u); - Gemv::invoke(member, -1.0, C, x0, 1.0, - u); - } - Gemv::invoke(member, 1.0, D, u, 0.0, y1); - } + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, AA.extent(5)), [&](const int &v) { + auto A = Kokkos::subview(AA, i, k, 1, Kokkos::ALL(), Kokkos::ALL(), v); + auto D = Kokkos::subview(AA, i, k, 3, Kokkos::ALL(), Kokkos::ALL(), v); + auto B = Kokkos::subview(AA, i, k, 2, Kokkos::ALL(), Kokkos::ALL(), v); + auto C = Kokkos::subview(AA, i, k ? k - 1 : 0, 0, Kokkos::ALL(), Kokkos::ALL(), v); + auto u = Kokkos::subview(WW, Kokkos::ALL(), v); + for (int jvec = 0; jvec < Nvec; ++jvec) { + auto x0 = Kokkos::subview(xxx, i, jvec, k == 0 ? 0 : k - 1, Kokkos::ALL(), v); + auto x1 = Kokkos::subview(xxx, i, jvec, k, Kokkos::ALL(), v); + auto x2 = Kokkos::subview(xxx, i, jvec, k == L - 1 ? 
0 : k + 1, Kokkos::ALL(), v); + auto y1 = Kokkos::subview(yyy, i, jvec, k, Kokkos::ALL(), v); + auto b = Kokkos::subview(bb, i, jvec, k, Kokkos::ALL(), v); + + if (L == 1) { + Gemv::invoke(member, 1.0, D, b, 0.0, x1); + } else { + Copy::invoke(member, b, u); + if (k == 0) { + Gemv::invoke(member, -1.0, B, x2, 1.0, + u); + } else if (k == L - 1) { + Gemv::invoke(member, -1.0, C, x0, 1.0, + u); + } else { + Gemv::invoke(member, -1.0, B, x2, 1.0, + u); + Gemv::invoke(member, -1.0, C, x0, 1.0, + u); } - }); + Gemv::invoke(member, 1.0, D, u, 0.0, y1); + } + } + }); }); auto tmp = xxx; xxx = yyy; @@ -495,8 +432,7 @@ int main(int argc, char *argv[]) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) cudaProfilerStop(); #endif - printf("solve time = %f , # of solves per min = %f\n", t, - 1.0 / t * 60 * niter); + printf("solve time = %f , # of solves per min = %f\n", t, 1.0 / t * 60 * niter); } /// @@ -507,140 +443,87 @@ int main(int argc, char *argv[]) { using policy_type = Kokkos::TeamPolicy; policy_type policy(Acopy.extent(0), Kokkos::AUTO(), Acopy.extent(5)); Kokkos::parallel_for( - "compute residual", policy, - KOKKOS_LAMBDA(const typename policy_type::member_type &member) { + "compute residual", policy, KOKKOS_LAMBDA(const typename policy_type::member_type &member) { const int i = member.league_rank(); - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, Acopy.extent(5)), - [&](const int &v) { - auto A = Kokkos::subview(Acopy, i, Kokkos::ALL(), 1, - Kokkos::ALL(), Kokkos::ALL(), v); - auto B = Kokkos::subview(Acopy, i, Kokkos::ALL(), 2, - Kokkos::ALL(), Kokkos::ALL(), v); - auto C = Kokkos::subview(Acopy, i, Kokkos::ALL(), 0, - Kokkos::ALL(), Kokkos::ALL(), v); - - for (int jvec = 0, jvecend = rs.extent(1); jvec < jvecend; - ++jvec) { - auto x = Kokkos::subview(xs, i, jvec, nsweep % 2, - Kokkos::ALL(), Kokkos::ALL(), v); - auto b = Kokkos::subview(bs, i, jvec, Kokkos::ALL(), - Kokkos::ALL(), v); - auto r = Kokkos::subview(rs, i, jvec, 
Kokkos::ALL(), - Kokkos::ALL(), v); - - if (L == 1) { - auto A0 = - Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()); - auto x0 = Kokkos::subview(x, 0, Kokkos::ALL()); - auto b0 = Kokkos::subview(b, 0, Kokkos::ALL()); - auto r0 = Kokkos::subview(r, 0, Kokkos::ALL()); - - TeamCopy::invoke(member, b0, r0); - TeamGemv::invoke(member, - -1.0, A0, - x0, 1.0, - r0); - } else { - int k = 0; - { - /// first row - auto A1 = - Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); - auto B2 = - Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); - - auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); - auto x2 = Kokkos::subview(x, k + 1, Kokkos::ALL()); - - auto bk = Kokkos::subview(b, k, Kokkos::ALL()); - auto rk = Kokkos::subview(r, k, Kokkos::ALL()); - TeamCopy::invoke(member, bk, rk); - member.team_barrier(); - TeamGemv::invoke(member, - -1.0, - A1, x1, - 1.0, - rk); - TeamGemv::invoke(member, - -1.0, - B2, x2, - 1.0, - rk); - ++k; - } - for (; k < (L - 1); ++k) { - auto C0 = Kokkos::subview(C, k - 1, Kokkos::ALL(), - Kokkos::ALL()); - auto A1 = - Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); - auto B2 = - Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); - - auto x0 = Kokkos::subview(x, k - 1, Kokkos::ALL()); - auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); - auto x2 = Kokkos::subview(x, k + 1, Kokkos::ALL()); - - auto bk = Kokkos::subview(b, k, Kokkos::ALL()); - auto rk = Kokkos::subview(r, k, Kokkos::ALL()); - TeamCopy::invoke(member, bk, rk); - member.team_barrier(); - TeamGemv::invoke(member, - -1.0, - C0, x0, - 1.0, - rk); - TeamGemv::invoke(member, - -1.0, - A1, x1, - 1.0, - rk); - TeamGemv::invoke(member, - -1.0, - B2, x2, - 1.0, - rk); - } - { - // last row - auto C0 = Kokkos::subview(C, k - 1, Kokkos::ALL(), - Kokkos::ALL()); - auto A1 = - Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); - - auto x0 = Kokkos::subview(x, k - 1, Kokkos::ALL()); - auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); - - auto bk = Kokkos::subview(b, k, Kokkos::ALL()); - 
auto rk = Kokkos::subview(r, k, Kokkos::ALL()); - TeamCopy::invoke(member, bk, rk); - member.team_barrier(); - TeamGemv::invoke(member, - -1.0, - C0, x0, - 1.0, - rk); - TeamGemv::invoke(member, - -1.0, - A1, x1, - 1.0, - rk); - } - } + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, Acopy.extent(5)), [&](const int &v) { + auto A = Kokkos::subview(Acopy, i, Kokkos::ALL(), 1, Kokkos::ALL(), Kokkos::ALL(), v); + auto B = Kokkos::subview(Acopy, i, Kokkos::ALL(), 2, Kokkos::ALL(), Kokkos::ALL(), v); + auto C = Kokkos::subview(Acopy, i, Kokkos::ALL(), 0, Kokkos::ALL(), Kokkos::ALL(), v); + + for (int jvec = 0, jvecend = rs.extent(1); jvec < jvecend; ++jvec) { + auto x = Kokkos::subview(xs, i, jvec, nsweep % 2, Kokkos::ALL(), Kokkos::ALL(), v); + auto b = Kokkos::subview(bs, i, jvec, Kokkos::ALL(), Kokkos::ALL(), v); + auto r = Kokkos::subview(rs, i, jvec, Kokkos::ALL(), Kokkos::ALL(), v); + + if (L == 1) { + auto A0 = Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()); + auto x0 = Kokkos::subview(x, 0, Kokkos::ALL()); + auto b0 = Kokkos::subview(b, 0, Kokkos::ALL()); + auto r0 = Kokkos::subview(r, 0, Kokkos::ALL()); + + TeamCopy::invoke(member, b0, r0); + TeamGemv::invoke(member, -1.0, A0, + x0, 1.0, r0); + } else { + int k = 0; + { + /// first row + auto A1 = Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); + auto B2 = Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); + + auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); + auto x2 = Kokkos::subview(x, k + 1, Kokkos::ALL()); + + auto bk = Kokkos::subview(b, k, Kokkos::ALL()); + auto rk = Kokkos::subview(r, k, Kokkos::ALL()); + TeamCopy::invoke(member, bk, rk); + member.team_barrier(); + TeamGemv::invoke(member, -1.0, A1, + x1, 1.0, rk); + TeamGemv::invoke(member, -1.0, B2, + x2, 1.0, rk); + ++k; } - }); + for (; k < (L - 1); ++k) { + auto C0 = Kokkos::subview(C, k - 1, Kokkos::ALL(), Kokkos::ALL()); + auto A1 = Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); + auto B2 = Kokkos::subview(B, k, 
Kokkos::ALL(), Kokkos::ALL()); + + auto x0 = Kokkos::subview(x, k - 1, Kokkos::ALL()); + auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); + auto x2 = Kokkos::subview(x, k + 1, Kokkos::ALL()); + + auto bk = Kokkos::subview(b, k, Kokkos::ALL()); + auto rk = Kokkos::subview(r, k, Kokkos::ALL()); + TeamCopy::invoke(member, bk, rk); + member.team_barrier(); + TeamGemv::invoke(member, -1.0, C0, + x0, 1.0, rk); + TeamGemv::invoke(member, -1.0, A1, + x1, 1.0, rk); + TeamGemv::invoke(member, -1.0, B2, + x2, 1.0, rk); + } + { + // last row + auto C0 = Kokkos::subview(C, k - 1, Kokkos::ALL(), Kokkos::ALL()); + auto A1 = Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); + + auto x0 = Kokkos::subview(x, k - 1, Kokkos::ALL()); + auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); + + auto bk = Kokkos::subview(b, k, Kokkos::ALL()); + auto rk = Kokkos::subview(r, k, Kokkos::ALL()); + TeamCopy::invoke(member, bk, rk); + member.team_barrier(); + TeamGemv::invoke(member, -1.0, C0, + x0, 1.0, rk); + TeamGemv::invoke(member, -1.0, A1, + x1, 1.0, rk); + } + } + } + }); }); Kokkos::fence(); auto rs_host = Kokkos::create_mirror_view(rs); @@ -650,13 +533,11 @@ int main(int argc, char *argv[]) { Kokkos::fence(); { double norm2 = 0, diff2 = 0; - for (int i0 = 0, i0end = rs.extent(0); i0 < i0end; - ++i0) // N/vector_length - for (int i1 = 0, i1end = rs.extent(1); i1 < i1end; ++i1) // Nvec - for (int i2 = 0, i2end = rs.extent(2); i2 < i2end; ++i2) // L - for (int i3 = 0, i3end = rs.extent(3); i3 < i3end; ++i3) // Blk - for (int i4 = 0, i4end = rs.extent(4); i4 < i4end; - ++i4) { // vector_length + for (int i0 = 0, i0end = rs.extent(0); i0 < i0end; ++i0) // N/vector_length + for (int i1 = 0, i1end = rs.extent(1); i1 < i1end; ++i1) // Nvec + for (int i2 = 0, i2end = rs.extent(2); i2 < i2end; ++i2) // L + for (int i3 = 0, i3end = rs.extent(3); i3 < i3end; ++i3) // Blk + for (int i4 = 0, i4end = rs.extent(4); i4 < i4end; ++i4) { // vector_length const auto val = bs_host(i0, i1, i2, i3, i4); const 
auto res = rs_host(i0, i1, i2, i3, i4); norm2 += val * val; diff --git a/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Gemm_Cuda.cpp b/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Gemm_Cuda.cpp index 5f9c167b72..9ac7e82d3a 100644 --- a/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Gemm_Cuda.cpp +++ b/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Gemm_Cuda.cpp @@ -69,8 +69,7 @@ struct Functor { Functor() = default; KOKKOS_INLINE_FUNCTION - Functor(const ViewType &a, const ViewType &b, const ViewType &c) - : _a(a), _b(b), _c(c) {} + Functor(const ViewType &a, const ViewType &b, const ViewType &c) : _a(a), _b(b), _c(c) {} KOKKOS_INLINE_FUNCTION void operator()(const RangeTag &, const int k) const { @@ -78,98 +77,81 @@ struct Functor { auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); auto cc = Kokkos::subview(_c, k, Kokkos::ALL(), Kokkos::ALL()); - SerialGemm::invoke( - 1.0, aa, bb, 1.0, cc); + SerialGemm::invoke(1.0, aa, bb, 1.0, cc); } template - KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV1 &, - const MemberType &member) const { - const int kbeg = - (member.league_rank() * (member.team_size() * VectorLength) + - member.team_rank() * VectorLength); - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_c.extent(0))) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - auto cc = Kokkos::subview(_c, kk, Kokkos::ALL(), Kokkos::ALL()); - - SerialGemm::invoke(1.0, aa, bb, 1.0, cc); - } - }); + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV1 &, const MemberType &member) const { + const int kbeg = (member.league_rank() * (member.team_size() * VectorLength) + member.team_rank() * VectorLength); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_c.extent(0))) { + 
auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, kk, Kokkos::ALL(), Kokkos::ALL()); + + SerialGemm::invoke(1.0, aa, bb, 1.0, cc); + } + }); } template - KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV2 &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV2 &, const MemberType &member) const { const int kbeg = member.league_rank() * VectorLength; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_c.extent(0))) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - auto cc = Kokkos::subview(_c, kk, Kokkos::ALL(), Kokkos::ALL()); - - TeamGemm::invoke(member, 1.0, aa, bb, 1.0, cc); - } - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_c.extent(0))) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, kk, Kokkos::ALL(), Kokkos::ALL()); + + TeamGemm::invoke(member, 1.0, aa, bb, 1.0, cc); + } + }); } template - KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV3 &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV3 &, const MemberType &member) const { const int lvl = 0; - ScratchViewType sa(member.team_scratch(lvl), VectorLength, - _a.extent(1), _a.extent(2)); - ScratchViewType sb(member.team_scratch(lvl), VectorLength, - _b.extent(1), _b.extent(2)); + ScratchViewType sa(member.team_scratch(lvl), VectorLength, _a.extent(1), _a.extent(2)); + ScratchViewType sb(member.team_scratch(lvl), VectorLength, _b.extent(1), _b.extent(2)); const int kbeg = member.league_rank() * VectorLength; - 
Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_c.extent(0))) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - auto cc = Kokkos::subview(_c, kk, Kokkos::ALL(), Kokkos::ALL()); - - auto saa = Kokkos::subview(sa, k, Kokkos::ALL(), Kokkos::ALL()); - auto sbb = Kokkos::subview(sb, k, Kokkos::ALL(), Kokkos::ALL()); - - TeamCopy::invoke(member, aa, saa); - TeamCopy::invoke(member, bb, sbb); - member.team_barrier(); - - TeamGemm::invoke(member, 1.0, saa, sbb, 1.0, cc); - } - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_c.extent(0))) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, kk, Kokkos::ALL(), Kokkos::ALL()); + + auto saa = Kokkos::subview(sa, k, Kokkos::ALL(), Kokkos::ALL()); + auto sbb = Kokkos::subview(sb, k, Kokkos::ALL(), Kokkos::ALL()); + + TeamCopy::invoke(member, aa, saa); + TeamCopy::invoke(member, bb, sbb); + member.team_barrier(); + + TeamGemm::invoke(member, 1.0, saa, sbb, 1.0, + cc); + } + }); } template - KOKKOS_INLINE_FUNCTION void operator()(const TeamTagHandmade &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagHandmade &, const MemberType &member) const { const int kbeg = member.league_rank() * VectorLength; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_c.extent(0))) { - const int m = _c.extent(1), n = _c.extent(2), q = _a.extent(2); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, m * n), [&](const int &ij) { - const int i = ij % m, j = ij / m; - typename ViewType::non_const_value_type cval = 0; - for (int p = 0; p < q; 
++p) - cval += _a(kk, i, p) * _b(kk, p, j); - _c(kk, i, j) += cval; - }); - } + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_c.extent(0))) { + const int m = _c.extent(1), n = _c.extent(2), q = _a.extent(2); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, m * n), [&](const int &ij) { + const int i = ij % m, j = ij / m; + typename ViewType::non_const_value_type cval = 0; + for (int p = 0; p < q; ++p) cval += _a(kk, i, p) * _b(kk, p, j); + _c(kk, i, j) += cval; }); + } + }); } }; @@ -177,19 +159,15 @@ template void Gemm(const int NN, const int BlkSize) { typedef Kokkos::Schedule ScheduleType; - constexpr int VectorLength = - DefaultVectorLength::value; - const int N = NN / VectorLength; + constexpr int VectorLength = DefaultVectorLength::value; + const int N = NN / VectorLength; { std::string value_type_name; if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) - value_type_name = "Kokkos::complex"; + if (std::is_same >::value) value_type_name = "Kokkos::complex"; - std::cout << "SIMD is defined: datatype " << value_type_name - << " a vector length " << VectorLength << "\n"; + std::cout << "SIMD is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; } const double flop = (N * VectorLength) * FlopCount(BlkSize, BlkSize, BlkSize); @@ -201,10 +179,8 @@ void Gemm(const int NN, const int BlkSize) { const int iter_begin = -3, iter_end = 30; Kokkos::Timer timer; - Kokkos::View amat( - "amat", N * VectorLength, BlkSize, BlkSize), - bmat("bmat", N * VectorLength, BlkSize, BlkSize), - cref("cref", N * VectorLength, BlkSize, BlkSize); + Kokkos::View amat("amat", N * VectorLength, BlkSize, BlkSize), + bmat("bmat", N * VectorLength, BlkSize, BlkSize), cref("cref", N * VectorLength, BlkSize, BlkSize); { Random random; @@ -225,12 +201,9 @@ void Gemm(const int NN, const int BlkSize) { /// /// CUBLAS Strided version /// - 
const Kokkos::LayoutStride stride(N * VectorLength, BlkSize * BlkSize, - BlkSize, 1, BlkSize, BlkSize); + const Kokkos::LayoutStride stride(N * VectorLength, BlkSize * BlkSize, BlkSize, 1, BlkSize, BlkSize); - Kokkos::View a( - "a", stride), - b("b", stride), c("c", stride); + Kokkos::View a("a", stride), b("b", stride), c("c", stride); double tavg = 0, tmin = tmax; @@ -238,13 +211,10 @@ void Gemm(const int NN, const int BlkSize) { cublasHandle_t handle; stat = cublasCreate(&handle); - if (stat != CUBLAS_STATUS_SUCCESS) - Kokkos::abort("CUBLAS initialization failed\n"); + if (stat != CUBLAS_STATUS_SUCCESS) Kokkos::abort("CUBLAS initialization failed\n"); - auto amat_device = - Kokkos::create_mirror_view(DeviceMemorySpaceType(), amat); - auto bmat_device = - Kokkos::create_mirror_view(DeviceMemorySpaceType(), bmat); + auto amat_device = Kokkos::create_mirror_view(DeviceMemorySpaceType(), amat); + auto bmat_device = Kokkos::create_mirror_view(DeviceMemorySpaceType(), bmat); Kokkos::deep_copy(amat_device, amat); Kokkos::deep_copy(bmat_device, bmat); @@ -268,12 +238,10 @@ void Gemm(const int NN, const int BlkSize) { Kokkos::fence(); timer.reset(); - stat = cublasDgemmStridedBatched( - handle, CUBLAS_OP_N, CUBLAS_OP_N, BlkSize, BlkSize, BlkSize, &one, - (const value_type *)a.data(), BlkSize, BlkSize * BlkSize, - (const value_type *)b.data(), BlkSize, BlkSize * BlkSize, &zero, - (value_type *)c.data(), BlkSize, BlkSize * BlkSize, - N * VectorLength); + stat = cublasDgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, BlkSize, BlkSize, BlkSize, &one, + (const value_type *)a.data(), BlkSize, BlkSize * BlkSize, + (const value_type *)b.data(), BlkSize, BlkSize * BlkSize, &zero, + (value_type *)c.data(), BlkSize, BlkSize * BlkSize, N * VectorLength); Kokkos::fence(); const double t = timer.seconds(); @@ -282,16 +250,14 @@ void Gemm(const int NN, const int BlkSize) { } tavg /= iter_end; - auto csol = - Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); + 
auto csol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); Kokkos::deep_copy(csol, c); Kokkos::deep_copy(cref, csol); std::cout << std::setw(8) << "CUBLAS" << std::setw(8) << "Strided" << " BlkSize = " << std::setw(3) << BlkSize << " TeamSize = N/A" << " ScratchSize (KB) = 0" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin) << std::endl; } cublasDestroy(handle); @@ -303,15 +269,13 @@ void Gemm(const int NN, const int BlkSize) { /// Range policy version /// typedef Kokkos::View view_type; - view_type a("a", N * VectorLength, BlkSize, BlkSize), - b("b", N * VectorLength, BlkSize, BlkSize), + view_type a("a", N * VectorLength, BlkSize, BlkSize), b("b", N * VectorLength, BlkSize, BlkSize), c("c", N * VectorLength, BlkSize, BlkSize); double tavg = 0, tmin = tmax; { typedef Functor functor_type; - const Kokkos::RangePolicy policy( - 0, N * VectorLength); + const Kokkos::RangePolicy policy(0, N * VectorLength); for (int iter = iter_begin; iter < iter_end; ++iter) { // flush @@ -325,8 +289,7 @@ void Gemm(const int NN, const int BlkSize) { Kokkos::fence(); timer.reset(); - Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::RangeTag", - policy, functor_type(a, b, c)); + Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::RangeTag", policy, functor_type(a, b, c)); Kokkos::fence(); const double t = timer.seconds(); @@ -335,22 +298,19 @@ void Gemm(const int NN, const int BlkSize) { } tavg /= iter_end; - auto csol = - Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); + auto csol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); Kokkos::deep_copy(csol, c); double diff = 0; for (int i = 0, iend = cref.extent(0); i < iend; ++i) for (int j = 0, jend = cref.extent(1); j < jend; ++j) for (int k = 0, kend = cref.extent(2); k < kend; ++k) - diff += 
Kokkos::ArithTraits::abs(cref(i, j, k) - - csol(i, j, k)); + diff += Kokkos::ArithTraits::abs(cref(i, j, k) - csol(i, j, k)); std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Range" << " BlkSize = " << std::setw(3) << BlkSize << " TeamSize = N/A" << " ScratchSize (KB) = 0" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) std::cout << " diff to ref = " << diff; @@ -365,21 +325,18 @@ void Gemm(const int NN, const int BlkSize) { /// expect the same performance as range policy /// typedef Kokkos::View view_type; - view_type a("a", N * VectorLength, BlkSize, BlkSize), - b("b", N * VectorLength, BlkSize, BlkSize), + view_type a("a", N * VectorLength, BlkSize, BlkSize), b("b", N * VectorLength, BlkSize, BlkSize), c("c", N * VectorLength, BlkSize, BlkSize); double tavg = 0, tmin = tmax; { - typedef Kokkos::TeamPolicy - policy_type; + typedef Kokkos::TeamPolicy policy_type; typedef Functor functor_type; // 128 is rough estimates - const int team_size = - policy_type(N / 32, Kokkos::AUTO, VectorLength) - .team_size_recommended(functor_type(), Kokkos::ParallelForTag()); + const int team_size = policy_type(N / 32, Kokkos::AUTO, VectorLength) + .team_size_recommended(functor_type(), Kokkos::ParallelForTag()); const policy_type policy(N / team_size, team_size, VectorLength); for (int iter = iter_begin; iter < iter_end; ++iter) { @@ -394,8 +351,7 @@ void Gemm(const int NN, const int BlkSize) { Kokkos::fence(); timer.reset(); - Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyV1", - policy, functor_type(a, b, c)); + Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyV1", policy, functor_type(a, b, c)); Kokkos::fence(); const double t = timer.seconds(); @@ -404,23 +360,19 @@ void Gemm(const int NN, const int BlkSize) { } tavg /= iter_end; - auto 
csol = - Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); + auto csol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); Kokkos::deep_copy(csol, c); double diff = 0; for (int i = 0, iend = cref.extent(0); i < iend; ++i) for (int j = 0, jend = cref.extent(1); j < jend; ++j) for (int k = 0, kend = cref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs(cref(i, j, k) - - csol(i, j, k)); + diff += Kokkos::ArithTraits::abs(cref(i, j, k) - csol(i, j, k)); std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V1" - << " BlkSize = " << std::setw(3) << BlkSize - << " TeamSize = " << std::setw(3) << team_size + << " BlkSize = " << std::setw(3) << BlkSize << " TeamSize = " << std::setw(3) << team_size << " ScratchSize (KB) = 0" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) std::cout << " diff to ref = " << diff; @@ -434,26 +386,21 @@ void Gemm(const int NN, const int BlkSize) { /// Team policy V2 - team parallel /// typedef Kokkos::View view_type; - view_type a("a", N * VectorLength, BlkSize, BlkSize), - b("b", N * VectorLength, BlkSize, BlkSize), + view_type a("a", N * VectorLength, BlkSize, BlkSize), b("b", N * VectorLength, BlkSize, BlkSize), c("c", N * VectorLength, BlkSize, BlkSize); double tavg = 0, tmin = tmax; { - typedef Kokkos::TeamPolicy - policy_type; + typedef Kokkos::TeamPolicy policy_type; typedef Functor functor_type; - const int is_blocked_algo = - (std::is_same::value), - mb = Algo::Gemm::Blocked::mb(), - mp = BlkSize % mb > 0; + const int is_blocked_algo = (std::is_same::value), + mb = Algo::Gemm::Blocked::mb(), mp = BlkSize % mb > 0; const int mblk = is_blocked_algo ? 
(BlkSize / mb + mp) : BlkSize; const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength) - .team_size_max(functor_type(), Kokkos::ParallelForTag()); + policy_type(N, Kokkos::AUTO, VectorLength).team_size_max(functor_type(), Kokkos::ParallelForTag()); const int team_size = std::min(std::max(mblk * mblk, 4), max_team_size); policy_type policy(N, team_size, VectorLength); @@ -469,8 +416,7 @@ void Gemm(const int NN, const int BlkSize) { Kokkos::fence(); timer.reset(); - Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyV2", - policy, functor_type(a, b, c)); + Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyV2", policy, functor_type(a, b, c)); Kokkos::fence(); const double t = timer.seconds(); @@ -479,23 +425,19 @@ void Gemm(const int NN, const int BlkSize) { } tavg /= iter_end; - auto csol = - Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); + auto csol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); Kokkos::deep_copy(csol, c); double diff = 0; for (int i = 0, iend = cref.extent(0); i < iend; ++i) for (int j = 0, jend = cref.extent(1); j < jend; ++j) for (int k = 0, kend = cref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs(cref(i, j, k) - - csol(i, j, k)); + diff += Kokkos::ArithTraits::abs(cref(i, j, k) - csol(i, j, k)); std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V2" - << " BlkSize = " << std::setw(3) << BlkSize - << " TeamSize = " << std::setw(3) << team_size + << " BlkSize = " << std::setw(3) << BlkSize << " TeamSize = " << std::setw(3) << team_size << " ScratchSize (KB) = 0" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) std::cout << " diff to ref = " << diff; @@ -509,37 +451,29 @@ void Gemm(const int NN, const int BlkSize) { /// Team policy 
V3 - team parallel + scratch /// typedef Kokkos::View view_type; - view_type a("a", N * VectorLength, BlkSize, BlkSize), - b("b", N * VectorLength, BlkSize, BlkSize), + view_type a("a", N * VectorLength, BlkSize, BlkSize), b("b", N * VectorLength, BlkSize, BlkSize), c("c", N * VectorLength, BlkSize, BlkSize); double tavg = 0, tmin = tmax; { - typedef Kokkos::TeamPolicy - policy_type; + typedef Kokkos::TeamPolicy policy_type; typedef Functor functor_type; - const int lvl = 0, - per_team_scratch = 2 * ScratchViewType::shmem_size( - VectorLength, BlkSize, BlkSize); + const int lvl = 0, per_team_scratch = 2 * ScratchViewType::shmem_size(VectorLength, BlkSize, BlkSize); // std::cout << "per team scratch " << per_team_scratch << "\n"; if (per_team_scratch / 1024 < 48) { - const int is_blocked_algo = - (std::is_same::value), - mb = Algo::Gemm::Blocked::mb(), - mp = BlkSize % mb > 0; + const int is_blocked_algo = (std::is_same::value), + mb = Algo::Gemm::Blocked::mb(), mp = BlkSize % mb > 0; const int mblk = is_blocked_algo ? 
(BlkSize / mb + mp) : BlkSize; - const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength) - .set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)) - .team_size_max(functor_type(), Kokkos::ParallelForTag()); + const int max_team_size = policy_type(N, Kokkos::AUTO, VectorLength) + .set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)) + .team_size_max(functor_type(), Kokkos::ParallelForTag()); const int team_size = std::min(std::max(mblk * mblk, 4), max_team_size); policy_type policy = - policy_type(N, team_size, VectorLength) - .set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)); + policy_type(N, team_size, VectorLength).set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)); for (int iter = iter_begin; iter < iter_end; ++iter) { // flush flush.run(); @@ -552,9 +486,7 @@ void Gemm(const int NN, const int BlkSize) { Kokkos::fence(); timer.reset(); - Kokkos::parallel_for( - "KokkosBatched::PerfTest::GemmCuda::TeamPolicyV3", policy, - functor_type(a, b, c)); + Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyV3", policy, functor_type(a, b, c)); Kokkos::fence(); const double t = timer.seconds(); @@ -563,23 +495,19 @@ void Gemm(const int NN, const int BlkSize) { } tavg /= iter_end; - auto csol = Kokkos::create_mirror_view( - typename HostSpaceType::memory_space(), c); + auto csol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); Kokkos::deep_copy(csol, c); double diff = 0; for (int i = 0, iend = cref.extent(0); i < iend; ++i) for (int j = 0, jend = cref.extent(1); j < jend; ++j) for (int k = 0, kend = cref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs(cref(i, j, k) - - csol(i, j, k)); + diff += Kokkos::ArithTraits::abs(cref(i, j, k) - csol(i, j, k)); std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V3" - << " BlkSize = " << std::setw(3) << BlkSize - << " TeamSize = " << std::setw(3) << team_size - << " ScratchSize (KB) = " << std::setw(3) - << (per_team_scratch / 1024) << " 
time = " << std::scientific - << tmin << " avg flop/s = " << (flop / tavg) + << " BlkSize = " << std::setw(3) << BlkSize << " TeamSize = " << std::setw(3) << team_size + << " ScratchSize (KB) = " << std::setw(3) << (per_team_scratch / 1024) + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) std::cout << " diff to ref = " << diff; @@ -587,8 +515,7 @@ void Gemm(const int NN, const int BlkSize) { std::cout << std::endl; } else { std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V3" - << " Scratch per team is too big:" << std::setw(3) - << (per_team_scratch / 1024) << std::endl; + << " Scratch per team is too big:" << std::setw(3) << (per_team_scratch / 1024) << std::endl; } } } @@ -598,19 +525,16 @@ void Gemm(const int NN, const int BlkSize) { /// Team policy - handmade /// typedef Kokkos::View view_type; - view_type a("a", N * VectorLength, BlkSize, BlkSize), - b("b", N * VectorLength, BlkSize, BlkSize), + view_type a("a", N * VectorLength, BlkSize, BlkSize), b("b", N * VectorLength, BlkSize, BlkSize), c("c", N * VectorLength, BlkSize, BlkSize); double tavg = 0, tmin = tmax; { - typedef Kokkos::TeamPolicy - policy_type; + typedef Kokkos::TeamPolicy policy_type; typedef Functor functor_type; const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength) - .team_size_max(functor_type(), Kokkos::ParallelForTag()); + policy_type(N, Kokkos::AUTO, VectorLength).team_size_max(functor_type(), Kokkos::ParallelForTag()); const int team_size = std::min(max_team_size, BlkSize * BlkSize); @@ -627,9 +551,7 @@ void Gemm(const int NN, const int BlkSize) { Kokkos::fence(); timer.reset(); - Kokkos::parallel_for( - "KokkosBatched::PerfTest::GemmCuda::TeamPolicyHandmade", policy, - functor_type(a, b, c)); + Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyHandmade", policy, functor_type(a, b, c)); Kokkos::fence(); const double t = 
timer.seconds(); @@ -638,23 +560,19 @@ void Gemm(const int NN, const int BlkSize) { } tavg /= iter_end; - auto csol = - Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); + auto csol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); Kokkos::deep_copy(csol, c); double diff = 0; for (int i = 0, iend = cref.extent(0); i < iend; ++i) for (int j = 0, jend = cref.extent(1); j < jend; ++j) for (int k = 0, kend = cref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs(cref(i, j, k) - - csol(i, j, k)); + diff += Kokkos::ArithTraits::abs(cref(i, j, k) - csol(i, j, k)); std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team HM" - << " BlkSize = " << std::setw(3) << BlkSize - << " TeamSize = " << std::setw(3) << team_size + << " BlkSize = " << std::setw(3) << BlkSize << " TeamSize = " << std::setw(3) << team_size << " ScratchSize (KB) = 0" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) std::cout << " diff to ref = " << diff; diff --git a/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Gemm_Host.hpp b/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Gemm_Host.hpp index 225e10f63b..cfcbb176fa 100644 --- a/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Gemm_Host.hpp +++ b/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Gemm_Host.hpp @@ -35,7 +35,7 @@ #include "KokkosBatched_Gemm_Decl.hpp" #include "KokkosBatched_Gemm_Serial_Impl.hpp" -//#undef __KOKKOSBATCHED_INTEL_MKL_BATCHED__ +// #undef __KOKKOSBATCHED_INTEL_MKL_BATCHED__ namespace KokkosBatched { namespace PerfTest { @@ -66,25 +66,20 @@ template void Gemm(const int NN) { typedef Kokkos::Schedule ScheduleType; - constexpr int VectorLength = - DefaultVectorLength::value; - const int N = NN / VectorLength; + constexpr int VectorLength = 
DefaultVectorLength::value; + const int N = NN / VectorLength; { std::string value_type_name; if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) - value_type_name = "Kokkos::complex"; + if (std::is_same >::value) value_type_name = "Kokkos::complex"; #if defined(__AVX512F__) - std::cout << "AVX512 is defined: datatype " << value_type_name - << " a vector length " << VectorLength << "\n"; + std::cout << "AVX512 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; #elif defined(__AVX__) || defined(__AVX2__) - std::cout << "AVX or AVX2 is defined: datatype " << value_type_name - << " a vector length " << VectorLength << "\n"; + std::cout << "AVX or AVX2 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; #else - std::cout << "SIMD (compiler vectorization) is defined: datatype " - << value_type_name << " a vector length " << VectorLength << "\n"; + std::cout << "SIMD (compiler vectorization) is defined: datatype " << value_type_name << " a vector length " + << VectorLength << "\n"; #endif } @@ -95,8 +90,7 @@ void Gemm(const int NN) { Kokkos::Timer timer; Kokkos::View cref; - Kokkos::View amat( - "amat", N * VectorLength, BlkSize, BlkSize), + Kokkos::View amat("amat", N * VectorLength, BlkSize, BlkSize), bmat("bmat", N * VectorLength, BlkSize, BlkSize); Kokkos::Random_XorShift64_Pool random(13718); @@ -104,13 +98,11 @@ void Gemm(const int NN) { Kokkos::fill_random(bmat, random, value_type(1.0)); typedef Vector, VectorLength> VectorType; - Kokkos::View amat_simd( - "amat_simd", N, BlkSize, BlkSize), + Kokkos::View amat_simd("amat_simd", N, BlkSize, BlkSize), bmat_simd("bmat_simd", N, BlkSize, BlkSize); Kokkos::parallel_for( - "KokkosBatched::PerfTest::GemmHost::Pack", - Kokkos::RangePolicy(0, N * VectorLength), + "KokkosBatched::PerfTest::GemmHost::Pack", Kokkos::RangePolicy(0, N * VectorLength), KOKKOS_LAMBDA(const int k) { const int k0 = k / VectorLength, k1 = k % 
VectorLength; for (int i = 0; i < BlkSize; ++i) @@ -129,14 +121,11 @@ void Gemm(const int NN) { /// #if defined(__KOKKOSBATCHED_INTEL_MKL__) { - Kokkos::View a( - "a", N * VectorLength, BlkSize, BlkSize), - b("b", N * VectorLength, BlkSize, BlkSize), - c("c", N * VectorLength, BlkSize, BlkSize); + Kokkos::View a("a", N * VectorLength, BlkSize, BlkSize), + b("b", N * VectorLength, BlkSize, BlkSize), c("c", N * VectorLength, BlkSize, BlkSize); { - const Kokkos::RangePolicy policy( - 0, N * VectorLength); + const Kokkos::RangePolicy policy(0, N * VectorLength); double tavg = 0, tmin = tmax; for (int iter = iter_begin; iter < iter_end; ++iter) { @@ -152,24 +141,20 @@ void Gemm(const int NN) { timer.reset(); Kokkos::parallel_for( - "KokkosBatched::PerfTest::GemmHost::CblasOpenMP", policy, - KOKKOS_LAMBDA(const int k) { + "KokkosBatched::PerfTest::GemmHost::CblasOpenMP", policy, KOKKOS_LAMBDA(const int k) { auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); auto cc = Kokkos::subview(c, k, Kokkos::ALL(), Kokkos::ALL()); const double one = 1.0; if (std::is_same::value) { - cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, BlkSize, - BlkSize, BlkSize, one, (double *)aa.data(), - aa.stride_0(), (double *)bb.data(), bb.stride_0(), - one, (double *)cc.data(), cc.stride_0()); - } else if (std::is_same >::value) { - cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, BlkSize, - BlkSize, BlkSize, (void *)&one, (void *)aa.data(), - aa.stride_0(), (void *)bb.data(), bb.stride_0(), - (void *)&one, (void *)cc.data(), cc.stride_0()); + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, BlkSize, BlkSize, BlkSize, one, + (double *)aa.data(), aa.stride_0(), (double *)bb.data(), bb.stride_0(), one, + (double *)cc.data(), cc.stride_0()); + } else if (std::is_same >::value) { + cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, BlkSize, BlkSize, BlkSize, (void *)&one, + (void *)aa.data(), 
aa.stride_0(), (void *)bb.data(), bb.stride_0(), (void *)&one, + (void *)cc.data(), cc.stride_0()); } }); @@ -181,10 +166,8 @@ void Gemm(const int NN) { tavg /= iter_end; std::cout << std::setw(12) << "MKL DGEMM" - << " BlkSize = " << std::setw(3) << BlkSize - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) - << " max flop/s = " << (flop / tmin) << std::endl; + << " BlkSize = " << std::setw(3) << BlkSize << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin) << std::endl; cref = c; } @@ -192,14 +175,11 @@ void Gemm(const int NN) { #if defined(__KOKKOSBATCHED_INTEL_MKL_BATCHED__) { - typedef Kokkos::View - ViewType; - ViewType a("a", N * VectorLength, BlkSize, BlkSize), - b("b", N * VectorLength, BlkSize, BlkSize), + typedef Kokkos::View ViewType; + ViewType a("a", N * VectorLength, BlkSize, BlkSize), b("b", N * VectorLength, BlkSize, BlkSize), c("c", N * VectorLength, BlkSize, BlkSize); - value_type *aa[N * VectorLength], *bb[N * VectorLength], - *cc[N * VectorLength]; + value_type *aa[N * VectorLength], *bb[N * VectorLength], *cc[N * VectorLength]; for (int k = 0; k < N * VectorLength; ++k) { aa[k] = &a(k, 0, 0); @@ -234,15 +214,11 @@ void Gemm(const int NN) { timer.reset(); if (std::is_same::value) { - cblas_dgemm_batch(CblasRowMajor, transA, transB, blksize, blksize, - blksize, one, (const double **)aa, lda, - (const double **)bb, ldb, one, (double **)cc, ldc, - 1, size_per_grp); + cblas_dgemm_batch(CblasRowMajor, transA, transB, blksize, blksize, blksize, one, (const double **)aa, lda, + (const double **)bb, ldb, one, (double **)cc, ldc, 1, size_per_grp); } else if (std::is_same >::value) { - cblas_zgemm_batch(CblasRowMajor, transA, transB, blksize, blksize, - blksize, one, (const void **)aa, lda, - (const void **)bb, ldb, one, (void **)cc, ldc, 1, - size_per_grp); + cblas_zgemm_batch(CblasRowMajor, transA, transB, blksize, blksize, blksize, one, (const void **)aa, 
lda, + (const void **)bb, ldb, one, (void **)cc, ldc, 1, size_per_grp); } HostSpaceType().fence(); @@ -255,22 +231,18 @@ void Gemm(const int NN) { double diff = 0; for (int i = 0, iend = cref.extent(0); i < iend; ++i) for (int j = 0, jend = cref.extent(1); j < jend; ++j) - for (int k = 0, kend = cref.extent(2); k < kend; ++k) - diff += abs(cref(i, j, k) - c(i, j, k)); + for (int k = 0, kend = cref.extent(2); k < kend; ++k) diff += abs(cref(i, j, k) - c(i, j, k)); std::cout << std::setw(12) << "MKL Batch" - << " BlkSize = " << std::setw(3) << BlkSize - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) - << " max flop/s = " << (flop / tmin) - << " diff to ref = " << diff << std::endl; + << " BlkSize = " << std::setw(3) << BlkSize << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin) << " diff to ref = " << diff + << std::endl; } } #endif #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__) { - Kokkos::View a( - "a", N, BlkSize, BlkSize), + Kokkos::View a("a", N, BlkSize, BlkSize), b("b", N, BlkSize, BlkSize), c("c", N, BlkSize, BlkSize); { @@ -306,19 +278,15 @@ void Gemm(const int NN) { timer.reset(); if (std::is_same::value) { - mkl_dgemm_compact(MKL_ROW_MAJOR, MKL_NOTRANS, MKL_NOTRANS, BlkSize, - BlkSize, BlkSize, done, (const double *)a.data(), - (MKL_INT)a.stride_1(), (const double *)b.data(), - (MKL_INT)b.stride_1(), done, (double *)c.data(), - (MKL_INT)c.stride_1(), format, N * VectorLength); - } else if (std::is_same >::value) { - mkl_zgemm_compact(MKL_ROW_MAJOR, MKL_NOTRANS, MKL_NOTRANS, BlkSize, - BlkSize, BlkSize, (MKL_Complex16 *)&zone, - (const double *)a.data(), (MKL_INT)a.stride_1(), - (const double *)b.data(), (MKL_INT)b.stride_1(), - (MKL_Complex16 *)&zone, (double *)c.data(), - (MKL_INT)c.stride_1(), format, N * VectorLength); + mkl_dgemm_compact(MKL_ROW_MAJOR, MKL_NOTRANS, MKL_NOTRANS, BlkSize, BlkSize, BlkSize, done, + (const double *)a.data(), 
(MKL_INT)a.stride_1(), (const double *)b.data(), + (MKL_INT)b.stride_1(), done, (double *)c.data(), (MKL_INT)c.stride_1(), format, + N * VectorLength); + } else if (std::is_same >::value) { + mkl_zgemm_compact(MKL_ROW_MAJOR, MKL_NOTRANS, MKL_NOTRANS, BlkSize, BlkSize, BlkSize, + (MKL_Complex16 *)&zone, (const double *)a.data(), (MKL_INT)a.stride_1(), + (const double *)b.data(), (MKL_INT)b.stride_1(), (MKL_Complex16 *)&zone, + (double *)c.data(), (MKL_INT)c.stride_1(), format, N * VectorLength); } HostSpaceType().fence(); @@ -332,15 +300,12 @@ void Gemm(const int NN) { for (int i = 0, iend = cref.extent(0); i < iend; ++i) for (int j = 0, jend = cref.extent(1); j < jend; ++j) for (int k = 0, kend = cref.extent(2); k < kend; ++k) - diff += abs(cref(i, j, k) - - c(i / VectorLength, j, k)[i % VectorLength]); + diff += abs(cref(i, j, k) - c(i / VectorLength, j, k)[i % VectorLength]); std::cout << std::setw(12) << "MKL Cmpct" - << " BlkSize = " << std::setw(3) << BlkSize - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) - << " max flop/s = " << (flop / tmin) - << " diff to ref = " << diff << std::endl; + << " BlkSize = " << std::setw(3) << BlkSize << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin) << " diff to ref = " << diff + << std::endl; } } } @@ -351,16 +316,13 @@ void Gemm(const int NN) { { libxsmm_init(); - Kokkos::View a( - "a", N * VectorLength, BlkSize, BlkSize), - b("b", N * VectorLength, BlkSize, BlkSize), - c("c", N * VectorLength, BlkSize, BlkSize); + Kokkos::View a("a", N * VectorLength, BlkSize, BlkSize), + b("b", N * VectorLength, BlkSize, BlkSize), c("c", N * VectorLength, BlkSize, BlkSize); libxsmm_blasint lda = a.stride_1(), ldb = b.stride_1(), ldc = c.stride_1(); { - const Kokkos::RangePolicy policy( - 0, N * VectorLength); + const Kokkos::RangePolicy policy(0, N * VectorLength); double tavg = 0, tmin = tmax; @@ -382,19 +344,15 @@ void Gemm(const int NN) 
{ timer.reset(); Kokkos::parallel_for( - "KokkosBatched::PerfTest::GemmHost::libxswmmOpenMP", policy, - KOKKOS_LAMBDA(const int k) { + "KokkosBatched::PerfTest::GemmHost::libxswmmOpenMP", policy, KOKKOS_LAMBDA(const int k) { auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); auto cc = Kokkos::subview(c, k, Kokkos::ALL(), Kokkos::ALL()); // column major - libxsmm_gemm((const char *)&transA, (const char *)&transB, - blksize, blksize, blksize, (const double *)&one, - (const double *)bb.data(), - (const libxsmm_blasint *)&ldb, - (const double *)aa.data(), - (const libxsmm_blasint *)&lda, (const double *)&one, + libxsmm_gemm((const char *)&transA, (const char *)&transB, blksize, blksize, blksize, + (const double *)&one, (const double *)bb.data(), (const libxsmm_blasint *)&ldb, + (const double *)aa.data(), (const libxsmm_blasint *)&lda, (const double *)&one, (double *)cc.data(), (const libxsmm_blasint *)&ldc); }); @@ -409,15 +367,12 @@ void Gemm(const int NN) { double diff = 0; for (int i = 0, iend = cref.extent(0); i < iend; ++i) for (int j = 0, jend = cref.extent(1); j < jend; ++j) - for (int k = 0, kend = cref.extent(2); k < kend; ++k) - diff += abs(cref(i, j, k) - c(i, j, k)); + for (int k = 0, kend = cref.extent(2); k < kend; ++k) diff += abs(cref(i, j, k) - c(i, j, k)); std::cout << std::setw(12) << "libxsmm" - << " BlkSize = " << std::setw(3) << BlkSize - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) - << " max flop/s = " << (flop / tmin) - << " diff to ref = " << diff << std::endl; + << " BlkSize = " << std::setw(3) << BlkSize << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin) << " diff to ref = " << diff + << std::endl; } libxsmm_finalize(); } @@ -488,8 +443,7 @@ void Gemm(const int NN) { /// Serial SIMD with appropriate data layout /// { - Kokkos::View a( - "a", N, BlkSize, BlkSize), + 
Kokkos::View a("a", N, BlkSize, BlkSize), b("b", N, BlkSize, BlkSize), c("c", N, BlkSize, BlkSize); { @@ -510,14 +464,12 @@ void Gemm(const int NN) { timer.reset(); Kokkos::parallel_for( - "KokkosBatched::PerfTest::GemmHost::SIMDSerialOpenMP", policy, - KOKKOS_LAMBDA(const int k) { + "KokkosBatched::PerfTest::GemmHost::SIMDSerialOpenMP", policy, KOKKOS_LAMBDA(const int k) { auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); auto cc = Kokkos::subview(c, k, Kokkos::ALL(), Kokkos::ALL()); - SerialGemm::invoke(1.0, aa, bb, 1.0, cc); + SerialGemm::invoke(1.0, aa, bb, 1.0, cc); }); HostSpaceType().fence(); @@ -531,15 +483,12 @@ void Gemm(const int NN) { for (int i = 0, iend = cref.extent(0); i < iend; ++i) for (int j = 0, jend = cref.extent(1); j < jend; ++j) for (int k = 0, kend = cref.extent(2); k < kend; ++k) - diff += abs(cref(i, j, k) - - c(i / VectorLength, j, k)[i % VectorLength]); + diff += abs(cref(i, j, k) - c(i / VectorLength, j, k)[i % VectorLength]); std::cout << std::setw(12) << "KK Vector" - << " BlkSize = " << std::setw(3) << BlkSize - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) - << " max flop/s = " << (flop / tmin) - << " diff to ref = " << diff << std::endl; + << " BlkSize = " << std::setw(3) << BlkSize << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin) << " diff to ref = " << diff + << std::endl; } } std::cout << std::endl; diff --git a/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Gemv_Host.hpp b/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Gemv_Host.hpp index 9ae401f03f..e368e8c00b 100644 --- a/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Gemv_Host.hpp +++ b/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Gemv_Host.hpp @@ -15,8 +15,8 @@ //@HEADER /// \author Kyungjoo Kim (kyukim@sandia.gov) -//#define __KOKKOSBATCHED_INTEL_MKL__ -//#define 
__KOKKOSBATCHED_INTEL_MKL_BATCHED__ +// #define __KOKKOSBATCHED_INTEL_MKL__ +// #define __KOKKOSBATCHED_INTEL_MKL_BATCHED__ #include @@ -60,47 +60,38 @@ double FlopCount(int mm, int nn) { return (FLOP_MUL * (m * n) + FLOP_ADD * (m * n)); } -template +template void Gemv(const int NN) { typedef Kokkos::Schedule ScheduleType; // typedef Kokkos::Schedule ScheduleType; - constexpr int VectorLength = - DefaultVectorLength::value; - const int N = NN / VectorLength; + constexpr int VectorLength = DefaultVectorLength::value; + const int N = NN / VectorLength; { std::string value_type_name; if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) - value_type_name = "Kokkos::complex"; + if (std::is_same >::value) value_type_name = "Kokkos::complex"; #if defined(__AVX512F__) - std::cout << "AVX512 is defined: datatype " << value_type_name - << " a vector length " << VectorLength << "\n"; + std::cout << "AVX512 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; #elif defined(__AVX__) || defined(__AVX2__) - std::cout << "AVX or AVX2 is defined: datatype " << value_type_name - << " a vector length " << VectorLength << "\n"; + std::cout << "AVX or AVX2 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; #else - std::cout << "SIMD (compiler vectorization) is defined: datatype " - << value_type_name << " a vector length " << VectorLength << "\n"; + std::cout << "SIMD (compiler vectorization) is defined: datatype " << value_type_name << " a vector length " + << VectorLength << "\n"; #endif } - const double flop = - (N * VectorLength) * FlopCount(BlkSize, BlkSize) * NumVecs; + const double flop = (N * VectorLength) * FlopCount(BlkSize, BlkSize) * NumVecs; // const double tmax = 1.0e15; const int iter_begin = -10, iter_end = 100; Kokkos::Timer timer; Kokkos::View yref; - Kokkos::View amat( - "amat", N * VectorLength, BlkSize, BlkSize); - Kokkos::View xvec( - "xvec", N * VectorLength, 
NumVecs, BlkSize); + Kokkos::View amat("amat", N * VectorLength, BlkSize, BlkSize); + Kokkos::View xvec("xvec", N * VectorLength, NumVecs, BlkSize); Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(xvec, random, value_type(1.0)); @@ -115,14 +106,11 @@ void Gemv(const int NN) { /// #if defined(__KOKKOSBATCHED_INTEL_MKL__) { - Kokkos::View a( - "a", N * VectorLength, BlkSize, BlkSize), - x("x", N * VectorLength, NumVecs, BlkSize), - y("y", N * VectorLength, NumVecs, BlkSize); + Kokkos::View a("a", N * VectorLength, BlkSize, BlkSize), + x("x", N * VectorLength, NumVecs, BlkSize), y("y", N * VectorLength, NumVecs, BlkSize); { - const Kokkos::RangePolicy policy( - 0, N * VectorLength); + const Kokkos::RangePolicy policy(0, N * VectorLength); double t = 0; for (int iter = iter_begin; iter < iter_end; ++iter) { @@ -138,17 +126,14 @@ void Gemv(const int NN) { timer.reset(); Kokkos::parallel_for( - "KokkosBatched::PerfTest::GemvHost::CblasOpenMP", policy, - KOKKOS_LAMBDA(const int k) { + "KokkosBatched::PerfTest::GemvHost::CblasOpenMP", policy, KOKKOS_LAMBDA(const int k) { auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); for (int j = 0; j < NumVecs; ++j) { auto xx = Kokkos::subview(x, k, j, Kokkos::ALL()); auto yy = Kokkos::subview(y, k, j, Kokkos::ALL()); - cblas_dgemv(CblasRowMajor, CblasNoTrans, BlkSize, BlkSize, 1.0, - (double*)aa.data(), aa.stride_0(), - (double*)xx.data(), xx.stride_0(), 1.0, - (double*)yy.data(), yy.stride_0()); + cblas_dgemv(CblasRowMajor, CblasNoTrans, BlkSize, BlkSize, 1.0, (double*)aa.data(), aa.stride_0(), + (double*)xx.data(), xx.stride_0(), 1.0, (double*)yy.data(), yy.stride_0()); } }); @@ -158,10 +143,8 @@ void Gemv(const int NN) { t /= iter_end; std::cout << std::setw(12) << "MKL DGEMV" - << " BlkSize = " << std::setw(3) << BlkSize - << " NumVecs = " << std::setw(3) << NumVecs - << " time = " << std::scientific << t - << " flop/s = " << (flop / t) << std::endl; + << " BlkSize = " << std::setw(3) << BlkSize << " 
NumVecs = " << std::setw(3) << NumVecs + << " time = " << std::scientific << t << " flop/s = " << (flop / t) << std::endl; yref = y; } @@ -172,14 +155,11 @@ void Gemv(const int NN) { /// Plain version (comparable to micro BLAS version) /// { - Kokkos::View a( - "a", N * VectorLength, BlkSize, BlkSize), - x("x", N * VectorLength, NumVecs, BlkSize), - y("y", N * VectorLength, NumVecs, BlkSize); + Kokkos::View a("a", N * VectorLength, BlkSize, BlkSize), + x("x", N * VectorLength, NumVecs, BlkSize), y("y", N * VectorLength, NumVecs, BlkSize); { - const Kokkos::RangePolicy policy( - 0, N * VectorLength); + const Kokkos::RangePolicy policy(0, N * VectorLength); double t = 0; for (int iter = iter_begin; iter < iter_end; ++iter) { @@ -195,16 +175,14 @@ void Gemv(const int NN) { timer.reset(); Kokkos::parallel_for( - "KokkosBatched::PerfTest::GemvHost::SerialOpenMP", policy, - KOKKOS_LAMBDA(const int k) { + "KokkosBatched::PerfTest::GemvHost::SerialOpenMP", policy, KOKKOS_LAMBDA(const int k) { auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); for (int j = 0; j < NumVecs; ++j) { auto xx = Kokkos::subview(x, k, j, Kokkos::ALL()); auto yy = Kokkos::subview(y, k, j, Kokkos::ALL()); - SerialGemv::invoke(1.0, aa, xx, - 1.0, yy); + SerialGemv::invoke(1.0, aa, xx, 1.0, yy); } }); @@ -217,38 +195,31 @@ void Gemv(const int NN) { for (int i = 0, iend = yref.extent(0); i < iend; ++i) for (int j = 0, jend = yref.extent(1); j < jend; ++j) for (int k = 0, kend = yref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs(yref(i, j, k) - - y(i, j, k)); + diff += Kokkos::ArithTraits::abs(yref(i, j, k) - y(i, j, k)); std::cout << std::setw(12) << "Plain" - << " BlkSize = " << std::setw(3) << BlkSize - << " NumVecs = " << std::setw(3) << NumVecs - << " time = " << std::scientific << t - << " flop/s = " << (flop / t) << " diff to ref = " << diff + << " BlkSize = " << std::setw(3) << BlkSize << " NumVecs = " << std::setw(3) << NumVecs + << " time = " << std::scientific << t 
<< " flop/s = " << (flop / t) << " diff to ref = " << diff << std::endl; } } typedef Vector, VectorLength> VectorType; - Kokkos::View amat_simd( - "amat_simd", N, BlkSize, BlkSize), + Kokkos::View amat_simd("amat_simd", N, BlkSize, BlkSize), xvec_simd("xvec_simd", N, NumVecs, BlkSize); for (int k0 = 0; k0 < N; ++k0) for (int k1 = 0; k1 < VectorLength; ++k1) for (int i = 0; i < BlkSize; ++i) { - for (int j = 0; j < NumVecs; ++j) - xvec_simd(k0, j, i)[k1] = xvec(k0 * VectorLength + k1, j, i); - for (int j = 0; j < BlkSize; ++j) - amat_simd(k0, i, j)[k1] = amat(k0 * VectorLength + k1, i, j); + for (int j = 0; j < NumVecs; ++j) xvec_simd(k0, j, i)[k1] = xvec(k0 * VectorLength + k1, j, i); + for (int j = 0; j < BlkSize; ++j) amat_simd(k0, i, j)[k1] = amat(k0 * VectorLength + k1, i, j); } /// /// Serial SIMD with appropriate data layout /// { - Kokkos::View a( - "a", N, BlkSize, BlkSize), + Kokkos::View a("a", N, BlkSize, BlkSize), x("x", N, NumVecs, BlkSize), y("y", N, NumVecs, BlkSize); { @@ -268,16 +239,14 @@ void Gemv(const int NN) { timer.reset(); Kokkos::parallel_for( - "KokkosBatched::PerfTest::GemvHost::SIMDSerialOpenMP", policy, - KOKKOS_LAMBDA(const int k) { + "KokkosBatched::PerfTest::GemvHost::SIMDSerialOpenMP", policy, KOKKOS_LAMBDA(const int k) { auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); for (int j = 0; j < NumVecs; ++j) { auto xx = Kokkos::subview(x, k, j, Kokkos::ALL()); auto yy = Kokkos::subview(y, k, j, Kokkos::ALL()); - SerialGemv::invoke(1.0, aa, xx, - 1.0, yy); + SerialGemv::invoke(1.0, aa, xx, 1.0, yy); } }); @@ -290,14 +259,11 @@ void Gemv(const int NN) { for (int i = 0, iend = yref.extent(0); i < iend; ++i) for (int j = 0, jend = yref.extent(1); j < jend; ++j) for (int k = 0, kend = yref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs( - yref(i, j, k) - y(i / VectorLength, j, k)[i % VectorLength]); + diff += Kokkos::ArithTraits::abs(yref(i, j, k) - y(i / VectorLength, j, k)[i % VectorLength]); std::cout << 
std::setw(12) << "Serial SIMD" - << " BlkSize = " << std::setw(3) << BlkSize - << " NumVecs = " << std::setw(3) << NumVecs - << " time = " << std::scientific << t - << " flop/s = " << (flop / t) << " diff to ref = " << diff + << " BlkSize = " << std::setw(3) << BlkSize << " NumVecs = " << std::setw(3) << NumVecs + << " time = " << std::scientific << t << " flop/s = " << (flop / t) << " diff to ref = " << diff << std::endl; } } diff --git a/perf_test/batched/dense/do-not-use/KokkosBatched_Test_LU_Cuda.cpp b/perf_test/batched/dense/do-not-use/KokkosBatched_Test_LU_Cuda.cpp index 9909afd943..4d3f7c8fd0 100644 --- a/perf_test/batched/dense/do-not-use/KokkosBatched_Test_LU_Cuda.cpp +++ b/perf_test/batched/dense/do-not-use/KokkosBatched_Test_LU_Cuda.cpp @@ -48,15 +48,11 @@ double FlopCount(int mm, int nn) { double m = (double)mm; double n = (double)nn; if (m > n) - return (FLOP_MUL * (0.5 * m * n * n - (1.0 / 6.0) * n * n * n + - 0.5 * m * n - 0.5 * n * n + (2.0 / 3.0) * n) + - FLOP_ADD * (0.5 * m * n * n - (1.0 / 6.0) * n * n * n - - 0.5 * m * n + (1.0 / 6.0) * n)); + return (FLOP_MUL * (0.5 * m * n * n - (1.0 / 6.0) * n * n * n + 0.5 * m * n - 0.5 * n * n + (2.0 / 3.0) * n) + + FLOP_ADD * (0.5 * m * n * n - (1.0 / 6.0) * n * n * n - 0.5 * m * n + (1.0 / 6.0) * n)); else - return (FLOP_MUL * (0.5 * n * m * m - (1.0 / 6.0) * m * m * m + - 0.5 * n * m - 0.5 * m * m + (2.0 / 3.0) * m) + - FLOP_ADD * (0.5 * n * m * m - (1.0 / 6.0) * m * m * m - - 0.5 * n * m + (1.0 / 6.0) * m)); + return (FLOP_MUL * (0.5 * n * m * m - (1.0 / 6.0) * m * m * m + 0.5 * n * m - 0.5 * m * m + (2.0 / 3.0) * m) + + FLOP_ADD * (0.5 * n * m * m - (1.0 / 6.0) * m * m * m - 0.5 * n * m + (1.0 / 6.0) * m)); } struct RangeTag {}; @@ -82,57 +78,48 @@ struct Functor { } template - KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV1 &, - const MemberType &member) const { - const int kbeg = - (member.league_rank() * (member.team_size() * VectorLength) + - member.team_rank() * VectorLength); - 
Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { - const int kk = kbeg + k; - if (kk < _a.extent_int(0)) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - SerialLU::invoke(aa); - } - }); + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV1 &, const MemberType &member) const { + const int kbeg = (member.league_rank() * (member.team_size() * VectorLength) + member.team_rank() * VectorLength); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < _a.extent_int(0)) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + SerialLU::invoke(aa); + } + }); } template - KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV2 &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV2 &, const MemberType &member) const { const int kbeg = member.league_rank() * VectorLength; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { - const int kk = kbeg + k; - if (kk < _a.extent_int(0)) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - TeamLU::invoke(member, aa); - } - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < _a.extent_int(0)) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + TeamLU::invoke(member, aa); + } + }); } template - KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV3 &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV3 &, const MemberType &member) const { const int lvl = 0; - ScratchViewType sa(member.team_scratch(lvl), VectorLength, - _a.extent(1), _a.extent(2)); + ScratchViewType sa(member.team_scratch(lvl), VectorLength, _a.extent(1), _a.extent(2)); const int kbeg = member.league_rank() * VectorLength; - Kokkos::parallel_for( - 
Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { - const int kk = kbeg + k; - if (kk < _a.extent_int(0)) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto saa = Kokkos::subview(sa, k, Kokkos::ALL(), Kokkos::ALL()); - - TeamCopy::invoke(member, aa, saa); - member.team_barrier(); - TeamLU::invoke(member, saa); - member.team_barrier(); - TeamCopy::invoke(member, saa, aa); - } - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < _a.extent_int(0)) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto saa = Kokkos::subview(sa, k, Kokkos::ALL(), Kokkos::ALL()); + + TeamCopy::invoke(member, aa, saa); + member.team_barrier(); + TeamLU::invoke(member, saa); + member.team_barrier(); + TeamCopy::invoke(member, saa, aa); + } + }); } }; @@ -140,19 +127,15 @@ template void LU(const int NN, const int BlkSize) { typedef Kokkos::Schedule ScheduleType; - constexpr int VectorLength = - DefaultVectorLength::value; - const int N = NN / VectorLength; + constexpr int VectorLength = DefaultVectorLength::value; + const int N = NN / VectorLength; { std::string value_type_name; if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) - value_type_name = "Kokkos::complex"; + if (std::is_same >::value) value_type_name = "Kokkos::complex"; - std::cout << "SIMD is defined: datatype " << value_type_name - << " a vector length " << VectorLength << "\n"; + std::cout << "SIMD is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; } const double flop = (N * VectorLength) * FlopCount(BlkSize, BlkSize); @@ -164,8 +147,7 @@ void LU(const int NN, const int BlkSize) { const int iter_begin = -3, iter_end = 50; Kokkos::Timer timer; - Kokkos::View amat( - "amat", N * VectorLength, BlkSize, BlkSize), + Kokkos::View amat("amat", N * VectorLength, BlkSize, BlkSize), aref("aref", N * VectorLength, BlkSize, 
BlkSize); { @@ -202,22 +184,18 @@ void LU(const int NN, const int BlkSize) { /// /// CUBLAS Batch version /// - const Kokkos::LayoutStride stride(N * VectorLength, BlkSize * BlkSize, - BlkSize, 1, BlkSize, BlkSize); + const Kokkos::LayoutStride stride(N * VectorLength, BlkSize * BlkSize, BlkSize, 1, BlkSize, BlkSize); - Kokkos::View a( - "a", stride); + Kokkos::View a("a", stride); Kokkos::View info("info", N * VectorLength); cublasStatus_t stat; cublasHandle_t handle; stat = cublasCreate(&handle); - if (stat != CUBLAS_STATUS_SUCCESS) - Kokkos::abort("CUBLAS initialization failed\n"); + if (stat != CUBLAS_STATUS_SUCCESS) Kokkos::abort("CUBLAS initialization failed\n"); - auto amat_device = Kokkos::create_mirror_view( - typename DeviceSpaceType::memory_space(), amat); + auto amat_device = Kokkos::create_mirror_view(typename DeviceSpaceType::memory_space(), amat); Kokkos::deep_copy(amat_device, amat); Kokkos::fence(); @@ -229,12 +207,10 @@ void LU(const int NN, const int BlkSize) { aa[k] = a.data() + k * a.stride_0(); } value_type **aa_device; - if (cudaMalloc(&aa_device, N * VectorLength * sizeof(value_type *)) != - cudaSuccess) { + if (cudaMalloc(&aa_device, N * VectorLength * sizeof(value_type *)) != cudaSuccess) { Kokkos::abort("CUDA memory allocation failed\n"); } - if (cudaMemcpy(aa_device, aa, sizeof(value_type *) * N * VectorLength, - cudaMemcpyHostToDevice) != cudaSuccess) { + if (cudaMemcpy(aa_device, aa, sizeof(value_type *) * N * VectorLength, cudaMemcpyHostToDevice) != cudaSuccess) { Kokkos::abort("CUDA memcpy failed\n"); } Kokkos::fence(); @@ -248,8 +224,7 @@ void LU(const int NN, const int BlkSize) { Kokkos::fence(); timer.reset(); - stat = cublasDgetrfBatched(handle, BlkSize, (value_type **)aa_device, - BlkSize, NULL, (int *)info.data(), + stat = cublasDgetrfBatched(handle, BlkSize, (value_type **)aa_device, BlkSize, NULL, (int *)info.data(), N * VectorLength); if (stat != CUBLAS_STATUS_SUCCESS) { Kokkos::abort("CUBLAS LU Batched failed\n"); @@ -262,8 
+237,7 @@ void LU(const int NN, const int BlkSize) { } tavg /= iter_end; - auto asol = - Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); + auto asol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); Kokkos::deep_copy(asol, a); Kokkos::deep_copy(aref, asol); @@ -274,8 +248,7 @@ void LU(const int NN, const int BlkSize) { std::cout << std::setw(8) << "CUBLAS" << std::setw(8) << "Batch" << " BlkSize = " << std::setw(3) << BlkSize << " TeamSize = N/A" << " ScratchSize (KB) = N/A" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin) << std::endl; } } @@ -291,8 +264,7 @@ void LU(const int NN, const int BlkSize) { double tavg = 0, tmin = tmax; { typedef Functor functor_type; - const Kokkos::RangePolicy policy( - 0, N * VectorLength); + const Kokkos::RangePolicy policy(0, N * VectorLength); for (int iter = iter_begin; iter < iter_end; ++iter) { // flush @@ -304,8 +276,7 @@ void LU(const int NN, const int BlkSize) { Kokkos::fence(); timer.reset(); - Kokkos::parallel_for("KokkosBatched::PerfTest::LUCuda::RangeTag", - policy, functor_type(a)); + Kokkos::parallel_for("KokkosBatched::PerfTest::LUCuda::RangeTag", policy, functor_type(a)); Kokkos::fence(); const double t = timer.seconds(); @@ -314,22 +285,19 @@ void LU(const int NN, const int BlkSize) { } tavg /= iter_end; - auto asol = - Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); + auto asol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); Kokkos::deep_copy(asol, a); double diff = 0; for (int i = 0, iend = aref.extent(0); i < iend; ++i) for (int j = 0, jend = aref.extent(1); j < jend; ++j) for (int k = 0, kend = aref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs(aref(i, j, k) - - asol(i, j, k)); + diff += Kokkos::ArithTraits::abs(aref(i, j, k) - asol(i, j, k)); std::cout << 
std::setw(8) << "Kokkos" << std::setw(8) << "Range" << " BlkSize = " << std::setw(3) << BlkSize << " TeamSize = N/A" << " ScratchSize (KB) = 0" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) std::cout << " diff to ref = " << diff; @@ -346,13 +314,11 @@ void LU(const int NN, const int BlkSize) { double tavg = 0, tmin = tmax; { - typedef Kokkos::TeamPolicy - policy_type; + typedef Kokkos::TeamPolicy policy_type; typedef Functor functor_type; - const int team_size = - policy_type(N / 32, Kokkos::AUTO, VectorLength) - .team_size_recommended(functor_type(), Kokkos::ParallelForTag()); + const int team_size = policy_type(N / 32, Kokkos::AUTO, VectorLength) + .team_size_recommended(functor_type(), Kokkos::ParallelForTag()); const policy_type policy(N / team_size, team_size, VectorLength); for (int iter = iter_begin; iter < iter_end; ++iter) { @@ -365,8 +331,7 @@ void LU(const int NN, const int BlkSize) { Kokkos::fence(); timer.reset(); - Kokkos::parallel_for("KokkosBatched::PerfTest::LUCuda::TeamTagV1", - policy, functor_type(a)); + Kokkos::parallel_for("KokkosBatched::PerfTest::LUCuda::TeamTagV1", policy, functor_type(a)); Kokkos::fence(); const double t = timer.seconds(); @@ -375,23 +340,19 @@ void LU(const int NN, const int BlkSize) { } tavg /= iter_end; - auto asol = - Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); + auto asol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); Kokkos::deep_copy(asol, a); double diff = 0; for (int i = 0, iend = aref.extent(0); i < iend; ++i) for (int j = 0, jend = aref.extent(1); j < jend; ++j) for (int k = 0, kend = aref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs(aref(i, j, k) - - asol(i, j, k)); + diff += Kokkos::ArithTraits::abs(aref(i, j, k) - asol(i, j, k)); std::cout << 
std::setw(8) << "Kokkos" << std::setw(8) << "Team V1" - << " BlkSize = " << std::setw(3) << BlkSize - << " TeamSize = " << std::setw(3) << team_size + << " BlkSize = " << std::setw(3) << BlkSize << " TeamSize = " << std::setw(3) << team_size << " ScratchSize (KB) = 0" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) std::cout << " diff to ref = " << diff; @@ -408,13 +369,11 @@ void LU(const int NN, const int BlkSize) { double tavg = 0, tmin = tmax; { - typedef Kokkos::TeamPolicy - policy_type; + typedef Kokkos::TeamPolicy policy_type; typedef Functor functor_type; - const int is_blocked_algo = - (std::is_same::value), - mb = Algo::LU::Blocked::mb(); + const int is_blocked_algo = (std::is_same::value), + mb = Algo::LU::Blocked::mb(); // mp = BlkSize%mb > 0; const int @@ -422,8 +381,7 @@ void LU(const int NN, const int BlkSize) { mblk = is_blocked_algo ? 
(BlkSize - mb) : (BlkSize - 1); const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength) - .team_size_max(functor_type(), Kokkos::ParallelForTag()); + policy_type(N, Kokkos::AUTO, VectorLength).team_size_max(functor_type(), Kokkos::ParallelForTag()); const int team_size = std::min(std::max(mblk * 2, 1), max_team_size); const policy_type policy(N, team_size, VectorLength); @@ -437,8 +395,7 @@ void LU(const int NN, const int BlkSize) { Kokkos::fence(); timer.reset(); - Kokkos::parallel_for("KokkosBatched::PerfTest::LUCuda::TeamTagV2", - policy, functor_type(a)); + Kokkos::parallel_for("KokkosBatched::PerfTest::LUCuda::TeamTagV2", policy, functor_type(a)); Kokkos::fence(); const double t = timer.seconds(); @@ -447,23 +404,19 @@ void LU(const int NN, const int BlkSize) { } tavg /= iter_end; - auto asol = - Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); + auto asol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); Kokkos::deep_copy(asol, a); double diff = 0; for (int i = 0, iend = aref.extent(0); i < iend; ++i) for (int j = 0, jend = aref.extent(1); j < jend; ++j) for (int k = 0, kend = aref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs(aref(i, j, k) - - asol(i, j, k)); + diff += Kokkos::ArithTraits::abs(aref(i, j, k) - asol(i, j, k)); std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V2" - << " BlkSize = " << std::setw(3) << BlkSize - << " TeamSize = " << std::setw(3) << team_size + << " BlkSize = " << std::setw(3) << BlkSize << " TeamSize = " << std::setw(3) << team_size << " ScratchSize (KB) = 0" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) std::cout << " diff to ref = " << diff; @@ -480,27 +433,22 @@ void LU(const int NN, const int BlkSize) { double tavg = 0, tmin = tmax; { - 
typedef Kokkos::TeamPolicy - policy_type; + typedef Kokkos::TeamPolicy policy_type; typedef Functor functor_type; - const int lvl = 0, - per_team_scratch = ScratchViewType::shmem_size( - VectorLength, BlkSize, BlkSize); + const int lvl = 0, per_team_scratch = ScratchViewType::shmem_size(VectorLength, BlkSize, BlkSize); if (per_team_scratch / 1024 < 48) { - const int is_blocked_algo = - (std::is_same::value), - mb = Algo::LU::Blocked::mb(); + const int is_blocked_algo = (std::is_same::value), + mb = Algo::LU::Blocked::mb(); // mp = BlkSize%mb > 0; const int // mblk = is_blocked_algo ? (BlkSize/mb + mp) : BlkSize; mblk = is_blocked_algo ? (BlkSize - mb) : (BlkSize - 1); - const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength) - .set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)) - .team_size_max(functor_type(), Kokkos::ParallelForTag()); + const int max_team_size = policy_type(N, Kokkos::AUTO, VectorLength) + .set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)) + .team_size_max(functor_type(), Kokkos::ParallelForTag()); const int team_size = std::min(std::max(mblk * 2, 1), max_team_size); policy_type policy(N, team_size, VectorLength); @@ -514,10 +462,8 @@ void LU(const int NN, const int BlkSize) { Kokkos::fence(); timer.reset(); - Kokkos::parallel_for( - "KokkosBatched::PerfTest::LUCuda::TeamTagV3", - policy.set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)), - functor_type(a)); + Kokkos::parallel_for("KokkosBatched::PerfTest::LUCuda::TeamTagV3", + policy.set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)), functor_type(a)); Kokkos::fence(); const double t = timer.seconds(); @@ -526,23 +472,19 @@ void LU(const int NN, const int BlkSize) { } tavg /= iter_end; - auto asol = Kokkos::create_mirror_view( - typename HostSpaceType::memory_space(), a); + auto asol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); Kokkos::deep_copy(asol, a); double diff = 0; for (int i = 0, iend = aref.extent(0); i < iend; ++i) for 
(int j = 0, jend = aref.extent(1); j < jend; ++j) for (int k = 0, kend = aref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs(aref(i, j, k) - - asol(i, j, k)); + diff += Kokkos::ArithTraits::abs(aref(i, j, k) - asol(i, j, k)); std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V3" - << " BlkSize = " << std::setw(3) << BlkSize - << " TeamSize = " << std::setw(3) << team_size - << " ScratchSize (KB) = " << std::setw(3) - << (per_team_scratch / 1024) << " time = " << std::scientific - << tmin << " avg flop/s = " << (flop / tavg) + << " BlkSize = " << std::setw(3) << BlkSize << " TeamSize = " << std::setw(3) << team_size + << " ScratchSize (KB) = " << std::setw(3) << (per_team_scratch / 1024) + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) std::cout << " diff to ref = " << diff; @@ -550,8 +492,7 @@ void LU(const int NN, const int BlkSize) { std::cout << std::endl; } else { std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V3" - << " Scratch per team is too big (KB): " - << (per_team_scratch / 1024) << std::endl; + << " Scratch per team is too big (KB): " << (per_team_scratch / 1024) << std::endl; } } } diff --git a/perf_test/batched/dense/do-not-use/KokkosBatched_Test_LU_Host.hpp b/perf_test/batched/dense/do-not-use/KokkosBatched_Test_LU_Host.hpp index d17f9b9003..f27365694a 100644 --- a/perf_test/batched/dense/do-not-use/KokkosBatched_Test_LU_Host.hpp +++ b/perf_test/batched/dense/do-not-use/KokkosBatched_Test_LU_Host.hpp @@ -15,8 +15,8 @@ //@HEADER /// \author Kyungjoo Kim (kyukim@sandia.gov) -//#define __KOKKOSBATCHED_INTEL_MKL__ -//#define __KOKKOSBATCHED_INTEL_MKL_BATCHED__ +// #define __KOKKOSBATCHED_INTEL_MKL__ +// #define __KOKKOSBATCHED_INTEL_MKL_BATCHED__ #include #include "KokkosBatched_Util.hpp" @@ -57,15 +57,11 @@ double FlopCount(int mm, int nn) { double m = (double)mm; double n = (double)nn; if (m > n) - 
return (FLOP_MUL * (0.5 * m * n * n - (1.0 / 6.0) * n * n * n + - 0.5 * m * n - 0.5 * n * n + (2.0 / 3.0) * n) + - FLOP_ADD * (0.5 * m * n * n - (1.0 / 6.0) * n * n * n - - 0.5 * m * n + (1.0 / 6.0) * n)); + return (FLOP_MUL * (0.5 * m * n * n - (1.0 / 6.0) * n * n * n + 0.5 * m * n - 0.5 * n * n + (2.0 / 3.0) * n) + + FLOP_ADD * (0.5 * m * n * n - (1.0 / 6.0) * n * n * n - 0.5 * m * n + (1.0 / 6.0) * n)); else - return (FLOP_MUL * (0.5 * n * m * m - (1.0 / 6.0) * m * m * m + - 0.5 * n * m - 0.5 * m * m + (2.0 / 3.0) * m) + - FLOP_ADD * (0.5 * n * m * m - (1.0 / 6.0) * m * m * m - - 0.5 * n * m + (1.0 / 6.0) * m)); + return (FLOP_MUL * (0.5 * n * m * m - (1.0 / 6.0) * m * m * m + 0.5 * n * m - 0.5 * m * m + (2.0 / 3.0) * m) + + FLOP_ADD * (0.5 * n * m * m - (1.0 / 6.0) * m * m * m - 0.5 * n * m + (1.0 / 6.0) * m)); } template @@ -73,26 +69,21 @@ void LU(const int NN) { typedef Kokkos::Schedule ScheduleType; // typedef Kokkos::Schedule ScheduleType; - constexpr int VectorLength = - DefaultVectorLength::value; - const int N = NN / VectorLength; + constexpr int VectorLength = DefaultVectorLength::value; + const int N = NN / VectorLength; { std::string value_type_name; if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) - value_type_name = "Kokkos::complex"; + if (std::is_same >::value) value_type_name = "Kokkos::complex"; #if defined(__AVX512F__) - std::cout << "AVX512 is defined: datatype " << value_type_name - << " a vector length " << VectorLength << "\n"; + std::cout << "AVX512 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; #elif defined(__AVX__) || defined(__AVX2__) - std::cout << "AVX or AVX2 is defined: datatype " << value_type_name - << " a vector length " << VectorLength << "\n"; + std::cout << "AVX or AVX2 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; #else - std::cout << "SIMD (compiler vectorization) is defined: datatype " - << 
value_type_name << " a vector length " << VectorLength << "\n"; + std::cout << "SIMD (compiler vectorization) is defined: datatype " << value_type_name << " a vector length " + << VectorLength << "\n"; #endif } @@ -106,8 +97,7 @@ void LU(const int NN) { /// Reference version using MKL DGETRF /// Kokkos::View aref; - Kokkos::View amat( - "amat", N * VectorLength, BlkSize, BlkSize); + Kokkos::View amat("amat", N * VectorLength, BlkSize, BlkSize); Random random; @@ -124,12 +114,11 @@ void LU(const int NN) { } typedef Vector, VectorLength> VectorType; - Kokkos::View amat_simd( - "amat_simd", N, BlkSize, BlkSize); //, a("a", N, BlkSize, BlkSize); + Kokkos::View amat_simd("amat_simd", N, BlkSize, + BlkSize); //, a("a", N, BlkSize, BlkSize); Kokkos::parallel_for( - "KokkosBatched::PerfTest::LUHost::Pack", - Kokkos::RangePolicy(0, N * VectorLength), + "KokkosBatched::PerfTest::LUHost::Pack", Kokkos::RangePolicy(0, N * VectorLength), KOKKOS_LAMBDA(const int k) { const int k0 = k / VectorLength, k1 = k % VectorLength; for (int i = 0; i < BlkSize; ++i) @@ -147,10 +136,8 @@ void LU(const int NN) { /// #if defined(__KOKKOSBATCHED_INTEL_MKL__) { - Kokkos::View a( - "a", N * VectorLength, BlkSize, BlkSize); - Kokkos::View p( - "p", N * VectorLength, BlkSize); + Kokkos::View a("a", N * VectorLength, BlkSize, BlkSize); + Kokkos::View p("p", N * VectorLength, BlkSize); { double tavg = 0, tmin = tmax; for (int iter = iter_begin; iter < iter_end; ++iter) { @@ -163,16 +150,12 @@ void LU(const int NN) { HostSpaceType().fence(); timer.reset(); - Kokkos::RangePolicy policy( - 0, N * VectorLength); + Kokkos::RangePolicy policy(0, N * VectorLength); Kokkos::parallel_for( - "KokkosBatched::PerfTest::LUHost::LAPACKE_dgetrfOpenMP", policy, - KOKKOS_LAMBDA(const int k) { + "KokkosBatched::PerfTest::LUHost::LAPACKE_dgetrfOpenMP", policy, KOKKOS_LAMBDA(const int k) { auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); auto pp = Kokkos::subview(p, k, Kokkos::ALL()); - 
LAPACKE_dgetrf(LAPACK_ROW_MAJOR, BlkSize, BlkSize, - (double*)aa.data(), aa.stride_0(), - (int*)pp.data()); + LAPACKE_dgetrf(LAPACK_ROW_MAJOR, BlkSize, BlkSize, (double*)aa.data(), aa.stride_0(), (int*)pp.data()); }); HostSpaceType().fence(); @@ -183,10 +166,8 @@ void LU(const int NN) { tavg /= iter_end; std::cout << std::setw(10) << "MKL LU" - << " BlkSize = " << std::setw(3) << BlkSize - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) - << " max flop/s = " << (flop / tmin) << std::endl; + << " BlkSize = " << std::setw(3) << BlkSize << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin) << std::endl; } aref = a; @@ -197,8 +178,7 @@ void LU(const int NN) { #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__) { - Kokkos::View a( - "a", N, BlkSize, BlkSize); + Kokkos::View a("a", N, BlkSize, BlkSize); { double tavg = 0, tmin = tmax; @@ -220,8 +200,7 @@ void LU(const int NN) { HostSpaceType().fence(); timer.reset(); - mkl_dgetrfnp_compact(MKL_ROW_MAJOR, BlkSize, BlkSize, - (double*)a.data(), a.stride_1(), (MKL_INT*)&info, + mkl_dgetrfnp_compact(MKL_ROW_MAJOR, BlkSize, BlkSize, (double*)a.data(), a.stride_1(), (MKL_INT*)&info, format, (MKL_INT)N * VectorLength); HostSpaceType().fence(); @@ -235,15 +214,12 @@ void LU(const int NN) { for (int i = 0, iend = aref.extent(0); i < iend; ++i) for (int j = 0, jend = aref.extent(1); j < jend; ++j) for (int k = 0, kend = aref.extent(2); k < kend; ++k) - diff += abs(aref(i, j, k) - - a(i / VectorLength, j, k)[i % VectorLength]); + diff += abs(aref(i, j, k) - a(i / VectorLength, j, k)[i % VectorLength]); std::cout << std::setw(10) << "MKL Cmpt" - << " BlkSize = " << std::setw(3) << BlkSize - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) - << " max flop/s = " << (flop / tmin) - << " diff to ref = " << diff << std::endl; + << " BlkSize = " << std::setw(3) << BlkSize << " time = " << std::scientific << 
tmin + << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin) << " diff to ref = " << diff + << std::endl; } } } @@ -307,8 +283,7 @@ void LU(const int NN) { /// { - Kokkos::View a( - "a", N, BlkSize, BlkSize); + Kokkos::View a("a", N, BlkSize, BlkSize); { double tavg = 0, tmin = tmax; @@ -324,8 +299,7 @@ void LU(const int NN) { Kokkos::RangePolicy policy(0, N); Kokkos::parallel_for( - "KokkosBatched::PerfTest::LUHost::SIMDSerialOpenMP", policy, - KOKKOS_LAMBDA(const int k) { + "KokkosBatched::PerfTest::LUHost::SIMDSerialOpenMP", policy, KOKKOS_LAMBDA(const int k) { auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); SerialLU::invoke(aa); @@ -342,14 +316,11 @@ void LU(const int NN) { for (int i = 0, iend = aref.extent(0); i < iend; ++i) for (int j = 0, jend = aref.extent(1); j < jend; ++j) for (int k = 0, kend = aref.extent(2); k < kend; ++k) - diff += abs(aref(i, j, k) - - a(i / VectorLength, j, k)[i % VectorLength]); + diff += abs(aref(i, j, k) - a(i / VectorLength, j, k)[i % VectorLength]); std::cout << std::setw(10) << "SIMD" - << " BlkSize = " << std::setw(3) << BlkSize - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) - << " max flop/s = " << (flop / tmin) - << " diff to ref = " << diff << std::endl; + << " BlkSize = " << std::setw(3) << BlkSize << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin) << " diff to ref = " << diff + << std::endl; } } } diff --git a/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp b/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp index f99ee9dc80..99f1a1d537 100644 --- a/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp +++ b/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp @@ -50,15 +50,13 @@ typedef double value_type; double FlopCountLower(int mm, int nn) { double m = (double)mm; double n = (double)nn; - return (FLOP_MUL * (0.5 * m 
* n * (n + 1.0)) + - FLOP_ADD * (0.5 * m * n * (n - 1.0))); + return (FLOP_MUL * (0.5 * m * n * (n + 1.0)) + FLOP_ADD * (0.5 * m * n * (n - 1.0))); } double FlopCountUpper(int mm, int nn) { double m = (double)mm; double n = (double)nn; - return (FLOP_MUL * (0.5 * m * n * (n + 1.0)) + - FLOP_ADD * (0.5 * m * n * (n - 1.0))); + return (FLOP_MUL * (0.5 * m * n * (n + 1.0)) + FLOP_ADD * (0.5 * m * n * (n - 1.0))); } struct RangeTag {}; @@ -67,8 +65,7 @@ struct TeamTagV2 {}; struct TeamTagV3 {}; struct TeamTagHandmade {}; -template +template struct Functor { ConstUnmanagedViewType _a; UnmanagedViewType _b; @@ -86,160 +83,131 @@ struct Functor { switch (test) { case 0: - SerialTrsm::invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, bb); break; case 1: - SerialTrsm::invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, bb); break; case 2: - SerialTrsm::invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, bb); break; case 3: - SerialTrsm::invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, bb); break; case 4: - SerialTrsm::invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, bb); break; } } template - KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV1 &, - const MemberType &member) const { - const int kbeg = - (member.league_rank() * (member.team_size() * VectorLength) + - member.team_rank() * VectorLength); - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_b.extent(0))) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - - switch (test) { - case 0: - SerialTrsm::invoke(1.0, aa, bb); - break; - case 1: - SerialTrsm::invoke(1.0, aa, bb); - break; - case 2: - SerialTrsm::invoke(1.0, aa, bb); - break; - case 3: - SerialTrsm::invoke(1.0, aa, bb); - break; - case 4: - SerialTrsm::invoke(1.0, aa, bb); - break; - } - } - }); + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV1 &, const MemberType &member) 
const { + const int kbeg = (member.league_rank() * (member.team_size() * VectorLength) + member.team_rank() * VectorLength); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_b.extent(0))) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); + + switch (test) { + case 0: + SerialTrsm::invoke(1.0, aa, bb); + break; + case 1: + SerialTrsm::invoke(1.0, aa, bb); + break; + case 2: + SerialTrsm::invoke(1.0, aa, bb); + break; + case 3: + SerialTrsm::invoke(1.0, aa, bb); + break; + case 4: + SerialTrsm::invoke(1.0, aa, bb); + break; + } + } + }); } template - KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV2 &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV2 &, const MemberType &member) const { const int kbeg = member.league_rank() * VectorLength; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_b.extent(0))) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - - switch (test) { - case 0: - TeamTrsm::invoke(member, 1.0, aa, bb); - break; - case 1: - TeamTrsm::invoke(member, 1.0, aa, bb); - break; - case 2: - TeamTrsm::invoke(member, 1.0, aa, bb); - break; - case 3: - TeamTrsm::invoke(member, 1.0, aa, bb); - break; - case 4: - TeamTrsm::invoke(member, 1.0, aa, bb); - break; - } - } - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_b.extent(0))) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); + + switch (test) { + case 0: + TeamTrsm::invoke( + member, 1.0, aa, bb); + break; + case 1: + TeamTrsm::invoke( + member, 
1.0, aa, bb); + break; + case 2: + TeamTrsm::invoke( + member, 1.0, aa, bb); + break; + case 3: + TeamTrsm::invoke( + member, 1.0, aa, bb); + break; + case 4: + TeamTrsm::invoke( + member, 1.0, aa, bb); + break; + } + } + }); } template - KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV3 &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV3 &, const MemberType &member) const { const int lvl = 0; - ScratchViewType sa(member.team_scratch(lvl), VectorLength, - _a.extent(1), _a.extent(2)); + ScratchViewType sa(member.team_scratch(lvl), VectorLength, _a.extent(1), _a.extent(2)); // ScratchViewType sb(member.team_scratch(lvl), VectorLength, // _b.extent(1), _b.extent(2)); const int kbeg = member.league_rank() * VectorLength; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_b.extent(0))) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - - auto saa = Kokkos::subview(sa, k, Kokkos::ALL(), Kokkos::ALL()); - - TeamCopy::invoke(member, aa, saa); - member.team_barrier(); - - switch (test) { - case 0: - TeamTrsm::invoke(member, 1.0, saa, bb); - break; - case 1: - TeamTrsm::invoke(member, 1.0, saa, bb); - break; - case 2: - TeamTrsm::invoke(member, 1.0, saa, bb); - break; - case 3: - TeamTrsm::invoke(member, 1.0, saa, bb); - break; - case 4: - TeamTrsm::invoke(member, 1.0, saa, bb); - break; - } - } - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_b.extent(0))) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); + + auto saa = Kokkos::subview(sa, k, Kokkos::ALL(), Kokkos::ALL()); + + TeamCopy::invoke(member, aa, saa); + member.team_barrier(); + + switch (test) { + case 0: + 
TeamTrsm::invoke( + member, 1.0, saa, bb); + break; + case 1: + TeamTrsm::invoke( + member, 1.0, saa, bb); + break; + case 2: + TeamTrsm::invoke( + member, 1.0, saa, bb); + break; + case 3: + TeamTrsm::invoke( + member, 1.0, saa, bb); + break; + case 4: + TeamTrsm::invoke( + member, 1.0, saa, bb); + break; + } + } + }); } }; @@ -247,19 +215,15 @@ template void Trsm(const int NN, const int BlkSize, const int NumCols) { typedef Kokkos::Schedule ScheduleType; - constexpr int VectorLength = - DefaultVectorLength::value; - const int N = NN / VectorLength; + constexpr int VectorLength = DefaultVectorLength::value; + const int N = NN / VectorLength; { std::string value_type_name; if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) - value_type_name = "Kokkos::complex"; + if (std::is_same >::value) value_type_name = "Kokkos::complex"; - std::cout << "SIMD is defined: datatype " << value_type_name - << " a vector length " << VectorLength << "\n"; + std::cout << "SIMD is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; } switch (test) { @@ -288,17 +252,14 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { const int iter_begin = -3, iter_end = 30; Kokkos::Timer timer; - Kokkos::View amat( - "amat", N * VectorLength, BlkSize, BlkSize), - bmat("bmat", N * VectorLength, BlkSize, NumCols), - bref("bmat", N * VectorLength, BlkSize, NumCols); + Kokkos::View amat("amat", N * VectorLength, BlkSize, BlkSize), + bmat("bmat", N * VectorLength, BlkSize, NumCols), bref("bmat", N * VectorLength, BlkSize, NumCols); { Random random; for (int k = 0; k < N * VectorLength; ++k) { for (int i = 0; i < BlkSize; ++i) - for (int j = 0; j < BlkSize; ++j) - amat(k, i, j) = random.value() + 4.0 * (i == j); + for (int j = 0; j < BlkSize; ++j) amat(k, i, j) = random.value() + 4.0 * (i == j); for (int i = 0; i < BlkSize; ++i) for (int j = 0; j < NumCols; ++j) bmat(k, i, j) = random.value(); } @@ -313,24 +274,18 @@ void 
Trsm(const int NN, const int BlkSize, const int NumCols) { /// /// CUBLAS Batch version /// - const Kokkos::LayoutStride stride(N * VectorLength, BlkSize * BlkSize, - BlkSize, 1, BlkSize, BlkSize); + const Kokkos::LayoutStride stride(N * VectorLength, BlkSize * BlkSize, BlkSize, 1, BlkSize, BlkSize); - Kokkos::View a( - "a", stride), - b("b", stride); + Kokkos::View a("a", stride), b("b", stride); cublasStatus_t stat; cublasHandle_t handle; stat = cublasCreate(&handle); - if (stat != CUBLAS_STATUS_SUCCESS) - Kokkos::abort("CUBLAS initialization failed\n"); + if (stat != CUBLAS_STATUS_SUCCESS) Kokkos::abort("CUBLAS initialization failed\n"); - auto amat_device = Kokkos::create_mirror_view( - typename DeviceSpaceType::memory_space(), amat); - auto bmat_device = Kokkos::create_mirror_view( - typename DeviceSpaceType::memory_space(), bmat); + auto amat_device = Kokkos::create_mirror_view(typename DeviceSpaceType::memory_space(), amat); + auto bmat_device = Kokkos::create_mirror_view(typename DeviceSpaceType::memory_space(), bmat); Kokkos::deep_copy(amat_device, amat); Kokkos::deep_copy(bmat_device, bmat); @@ -346,16 +301,12 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { bb[k] = b.data() + k * b.stride_0(); } value_type **aa_device, **bb_device; - if (cudaMalloc(&aa_device, N * VectorLength * sizeof(value_type *)) != - cudaSuccess || - cudaMalloc(&bb_device, N * VectorLength * sizeof(value_type *)) != - cudaSuccess) { + if (cudaMalloc(&aa_device, N * VectorLength * sizeof(value_type *)) != cudaSuccess || + cudaMalloc(&bb_device, N * VectorLength * sizeof(value_type *)) != cudaSuccess) { Kokkos::abort("CUDA memory allocation failed\n"); } - if (cudaMemcpy(aa_device, aa, sizeof(value_type *) * N * VectorLength, - cudaMemcpyHostToDevice) != cudaSuccess || - cudaMemcpy(bb_device, bb, sizeof(value_type *) * N * VectorLength, - cudaMemcpyHostToDevice) != cudaSuccess) { + if (cudaMemcpy(aa_device, aa, sizeof(value_type *) * N * VectorLength, 
cudaMemcpyHostToDevice) != cudaSuccess || + cudaMemcpy(bb_device, bb, sizeof(value_type *) * N * VectorLength, cudaMemcpyHostToDevice) != cudaSuccess) { Kokkos::abort("CUDA memcpy failed\n"); } Kokkos::fence(); @@ -371,47 +322,37 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { switch (test) { case 0: { // Left, Lower, NoTrans, UnitDiag - stat = cublasDtrsmBatched( - handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, - CUBLAS_DIAG_UNIT, BlkSize, NumCols, &one, - (const value_type **)aa_device, BlkSize, - (value_type **)bb_device, BlkSize, N * VectorLength); + stat = cublasDtrsmBatched(handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, + BlkSize, NumCols, &one, (const value_type **)aa_device, BlkSize, + (value_type **)bb_device, BlkSize, N * VectorLength); break; } case 1: { // Left, Lower, NoTrans, NonUnitDiag - stat = cublasDtrsmBatched( - handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, - CUBLAS_DIAG_NON_UNIT, BlkSize, NumCols, &one, - (const value_type **)aa_device, BlkSize, - (value_type **)bb_device, BlkSize, N * VectorLength); + stat = cublasDtrsmBatched(handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, + CUBLAS_DIAG_NON_UNIT, BlkSize, NumCols, &one, (const value_type **)aa_device, + BlkSize, (value_type **)bb_device, BlkSize, N * VectorLength); break; } case 2: { // Right, Upper, NoTrans, UnitDiag - stat = cublasDtrsmBatched( - handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, - CUBLAS_DIAG_UNIT, BlkSize, NumCols, &one, - (const value_type **)aa_device, BlkSize, - (value_type **)bb_device, BlkSize, N * VectorLength); + stat = cublasDtrsmBatched(handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, + BlkSize, NumCols, &one, (const value_type **)aa_device, BlkSize, + (value_type **)bb_device, BlkSize, N * VectorLength); break; } case 3: { // Right, Upper, NoTrans, NonUnitDiag - stat = cublasDtrsmBatched( - handle, CUBLAS_SIDE_RIGHT, 
CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, - CUBLAS_DIAG_NON_UNIT, BlkSize, NumCols, &one, - (const value_type **)aa_device, BlkSize, - (value_type **)bb_device, BlkSize, N * VectorLength); + stat = cublasDtrsmBatched(handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, + CUBLAS_DIAG_NON_UNIT, BlkSize, NumCols, &one, (const value_type **)aa_device, + BlkSize, (value_type **)bb_device, BlkSize, N * VectorLength); break; } case 4: { // Left, Upper, NoTrans, NonUnitDiag - stat = cublasDtrsmBatched( - handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, - CUBLAS_DIAG_NON_UNIT, BlkSize, NumCols, &one, - (const value_type **)aa_device, BlkSize, - (value_type **)bb_device, BlkSize, N * VectorLength); + stat = cublasDtrsmBatched(handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, + CUBLAS_DIAG_NON_UNIT, BlkSize, NumCols, &one, (const value_type **)aa_device, + BlkSize, (value_type **)bb_device, BlkSize, N * VectorLength); break; } } @@ -426,22 +367,19 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { } tavg /= iter_end; - auto bsol = - Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); + auto bsol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); Kokkos::deep_copy(bsol, b); Kokkos::deep_copy(bref, bsol); - if (cudaFree(aa_device) != cudaSuccess || - cudaFree(bb_device) != cudaSuccess) { + if (cudaFree(aa_device) != cudaSuccess || cudaFree(bb_device) != cudaSuccess) { Kokkos::abort("CUDA memory free failed\n"); } std::cout << std::setw(8) << "CUBLAS" << std::setw(8) << "Batched" - << " BlkSize = " << std::setw(3) << BlkSize - << " NumCols = " << std::setw(3) << NumCols << " TeamSize = N/A" + << " BlkSize = " << std::setw(3) << BlkSize << " NumCols = " << std::setw(3) << NumCols + << " TeamSize = N/A" << " ScratchSize (KB) = N/A" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) 
<< " max flop/s = " << (flop / tmin) << std::endl; } cublasDestroy(handle); @@ -453,14 +391,12 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { /// Range policy version /// typedef Kokkos::View view_type; - view_type a("a", N * VectorLength, BlkSize, BlkSize), - b("b", N * VectorLength, BlkSize, NumCols); + view_type a("a", N * VectorLength, BlkSize, BlkSize), b("b", N * VectorLength, BlkSize, NumCols); double tavg = 0, tmin = tmax; { typedef Functor functor_type; - const Kokkos::RangePolicy policy( - 0, N * VectorLength); + const Kokkos::RangePolicy policy(0, N * VectorLength); for (int iter = iter_begin; iter < iter_end; ++iter) { // flush @@ -473,8 +409,7 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { Kokkos::fence(); timer.reset(); - Kokkos::parallel_for("KokkosBatched::PerfTest::RangeTag", policy, - functor_type(a, b)); + Kokkos::parallel_for("KokkosBatched::PerfTest::RangeTag", policy, functor_type(a, b)); Kokkos::fence(); const double t = timer.seconds(); @@ -483,23 +418,20 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { } tavg /= iter_end; - auto bsol = - Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); + auto bsol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); Kokkos::deep_copy(bsol, b); double diff = 0; for (int i = 0, iend = bref.extent(0); i < iend; ++i) for (int j = 0, jend = bref.extent(1); j < jend; ++j) for (int k = 0, kend = bref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs(bref(i, j, k) - - bsol(i, j, k)); + diff += Kokkos::ArithTraits::abs(bref(i, j, k) - bsol(i, j, k)); std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Range" - << " BlkSize = " << std::setw(3) << BlkSize - << " NumCols = " << std::setw(3) << NumCols << " TeamSize = N/A" + << " BlkSize = " << std::setw(3) << BlkSize << " NumCols = " << std::setw(3) << NumCols + << " TeamSize = N/A" << " ScratchSize (KB) = 0" - << " time = " << std::scientific << tmin - 
<< " avg flop/s = " << (flop / tavg) + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) std::cout << " diff to ref = " << diff; @@ -513,18 +445,15 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { /// Team policy V1 - almost same scheduling with range policy /// typedef Kokkos::View view_type; - view_type a("a", N * VectorLength, BlkSize, BlkSize), - b("b", N * VectorLength, BlkSize, NumCols); + view_type a("a", N * VectorLength, BlkSize, BlkSize), b("b", N * VectorLength, BlkSize, NumCols); double tavg = 0, tmin = tmax; { - typedef Kokkos::TeamPolicy - policy_type; + typedef Kokkos::TeamPolicy policy_type; typedef Functor functor_type; - const int team_size = - policy_type(N / 32, Kokkos::AUTO, VectorLength) - .team_size_recommended(functor_type(), Kokkos::ParallelForTag()); + const int team_size = policy_type(N / 32, Kokkos::AUTO, VectorLength) + .team_size_recommended(functor_type(), Kokkos::ParallelForTag()); const policy_type policy(N / team_size, team_size, VectorLength); for (int iter = iter_begin; iter < iter_end; ++iter) { @@ -538,8 +467,7 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { Kokkos::fence(); timer.reset(); - Kokkos::parallel_for("KokkosBatched::PerfTest::TeamTagV1", policy, - functor_type(a, b)); + Kokkos::parallel_for("KokkosBatched::PerfTest::TeamTagV1", policy, functor_type(a, b)); Kokkos::fence(); const double t = timer.seconds(); @@ -548,24 +476,19 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { } tavg /= iter_end; - auto bsol = - Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); + auto bsol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); Kokkos::deep_copy(bsol, b); double diff = 0; for (int i = 0, iend = bref.extent(0); i < iend; ++i) for (int j = 0, jend = bref.extent(1); j < jend; ++j) for (int k = 0, kend = bref.extent(2); k < 
kend; ++k) - diff += Kokkos::ArithTraits::abs(bref(i, j, k) - - bsol(i, j, k)); + diff += Kokkos::ArithTraits::abs(bref(i, j, k) - bsol(i, j, k)); std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V1" - << " BlkSize = " << std::setw(3) << BlkSize - << " NumCols = " << std::setw(3) << NumCols - << " TeamSize = " << std::setw(3) << team_size - << " ScratchSize (KB) = 0" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) + << " BlkSize = " << std::setw(3) << BlkSize << " NumCols = " << std::setw(3) << NumCols + << " TeamSize = " << std::setw(3) << team_size << " ScratchSize (KB) = 0" + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) std::cout << " diff to ref = " << diff; @@ -579,27 +502,21 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { /// Team policy V2 - team parallel /// typedef Kokkos::View view_type; - view_type a("a", N * VectorLength, BlkSize, BlkSize), - b("b", N * VectorLength, BlkSize, NumCols); + view_type a("a", N * VectorLength, BlkSize, BlkSize), b("b", N * VectorLength, BlkSize, NumCols); double tavg = 0, tmin = tmax; { - typedef Kokkos::TeamPolicy - policy_type; + typedef Kokkos::TeamPolicy policy_type; typedef Functor functor_type; - const int is_blocked_algo = - (std::is_same::value), - mb = Algo::Trsm::Blocked::mb(), - mp = BlkSize % mb > 0; + const int is_blocked_algo = (std::is_same::value), + mb = Algo::Trsm::Blocked::mb(), mp = BlkSize % mb > 0; const int mblk = is_blocked_algo ? 
(BlkSize / mb + mp) : BlkSize; const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength) - .team_size_max(functor_type(), Kokkos::ParallelForTag()); - const int team_size = - std::min(std::max(NumCols, (mblk - 1) * mblk), max_team_size); + policy_type(N, Kokkos::AUTO, VectorLength).team_size_max(functor_type(), Kokkos::ParallelForTag()); + const int team_size = std::min(std::max(NumCols, (mblk - 1) * mblk), max_team_size); const policy_type policy(N, team_size, VectorLength); for (int iter = iter_begin; iter < iter_end; ++iter) { @@ -613,8 +530,7 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { DeviceSpaceType().fence(); timer.reset(); - Kokkos::parallel_for("KokkosBatched::PerfTest::TeamTagV2", policy, - functor_type(a, b)); + Kokkos::parallel_for("KokkosBatched::PerfTest::TeamTagV2", policy, functor_type(a, b)); DeviceSpaceType().fence(); const double t = timer.seconds(); @@ -623,24 +539,19 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { } tavg /= iter_end; - auto bsol = - Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); + auto bsol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); Kokkos::deep_copy(bsol, b); double diff = 0; for (int i = 0, iend = bref.extent(0); i < iend; ++i) for (int j = 0, jend = bref.extent(1); j < jend; ++j) for (int k = 0, kend = bref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs(bref(i, j, k) - - bsol(i, j, k)); + diff += Kokkos::ArithTraits::abs(bref(i, j, k) - bsol(i, j, k)); std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V2" - << " BlkSize = " << std::setw(3) << BlkSize - << " NumCols = " << std::setw(3) << NumCols - << " TeamSize = " << std::setw(3) << team_size - << " ScratchSize (KB) = 0" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) + << " BlkSize = " << std::setw(3) << BlkSize << " NumCols = " << std::setw(3) << NumCols + << " TeamSize = " << std::setw(3) << team_size << 
" ScratchSize (KB) = 0" + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) std::cout << " diff to ref = " << diff; @@ -654,33 +565,25 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { /// Team policy V3 - team parallel + sratch /// typedef Kokkos::View view_type; - view_type a("a", N * VectorLength, BlkSize, BlkSize), - b("b", N * VectorLength, BlkSize, NumCols); + view_type a("a", N * VectorLength, BlkSize, BlkSize), b("b", N * VectorLength, BlkSize, NumCols); double tavg = 0, tmin = tmax; { - typedef Kokkos::TeamPolicy - policy_type; + typedef Kokkos::TeamPolicy policy_type; typedef Functor functor_type; - const int lvl = 0, - per_team_scratch = ScratchViewType::shmem_size( - VectorLength, BlkSize, BlkSize); + const int lvl = 0, per_team_scratch = ScratchViewType::shmem_size(VectorLength, BlkSize, BlkSize); if (per_team_scratch / 1024 < 48) { - const int is_blocked_algo = - (std::is_same::value), - mb = Algo::Trsm::Blocked::mb(), - mp = BlkSize % mb > 0; + const int is_blocked_algo = (std::is_same::value), + mb = Algo::Trsm::Blocked::mb(), mp = BlkSize % mb > 0; const int mblk = is_blocked_algo ? 
(BlkSize / mb + mp) : BlkSize; - const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength) - .set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)) - .team_size_max(functor_type(), Kokkos::ParallelForTag()); - const int team_size = - std::min(std::max(NumCols, (mblk - 1) * mblk), max_team_size); + const int max_team_size = policy_type(N, Kokkos::AUTO, VectorLength) + .set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)) + .team_size_max(functor_type(), Kokkos::ParallelForTag()); + const int team_size = std::min(std::max(NumCols, (mblk - 1) * mblk), max_team_size); policy_type policy(N, team_size, VectorLength); for (int iter = iter_begin; iter < iter_end; ++iter) { @@ -694,8 +597,7 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { DeviceSpaceType().fence(); timer.reset(); - Kokkos::parallel_for("KokkosBatched::PerfTest::TeamTagV3", policy, - functor_type(a, b)); + Kokkos::parallel_for("KokkosBatched::PerfTest::TeamTagV3", policy, functor_type(a, b)); DeviceSpaceType().fence(); const double t = timer.seconds(); @@ -704,33 +606,27 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { } tavg /= iter_end; - auto bsol = Kokkos::create_mirror_view( - typename HostSpaceType::memory_space(), b); + auto bsol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); Kokkos::deep_copy(bsol, b); double diff = 0; for (int i = 0, iend = bref.extent(0); i < iend; ++i) for (int j = 0, jend = bref.extent(1); j < jend; ++j) for (int k = 0, kend = bref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs(bref(i, j, k) - - bsol(i, j, k)); + diff += Kokkos::ArithTraits::abs(bref(i, j, k) - bsol(i, j, k)); std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V3" - << " BlkSize = " << std::setw(3) << BlkSize - << " NumCols = " << std::setw(3) << NumCols - << " TeamSize = " << std::setw(3) << team_size - << " ScratchSize (KB) = " << std::setw(3) - << (per_team_scratch / 1024) << " time = " << 
std::scientific - << tmin << " avg flop/s = " << (flop / tavg) - << " max flop/s = " << (flop / tmin); + << " BlkSize = " << std::setw(3) << BlkSize << " NumCols = " << std::setw(3) << NumCols + << " TeamSize = " << std::setw(3) << team_size << " ScratchSize (KB) = " << std::setw(3) + << (per_team_scratch / 1024) << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) std::cout << " diff to ref = " << diff; #endif std::cout << std::endl; } else { std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V3" - << " Scratch per team is too big (KB): " - << (per_team_scratch / 1024) << std::endl; + << " Scratch per team is too big (KB): " << (per_team_scratch / 1024) << std::endl; } } } diff --git a/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Trsm_Host.hpp b/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Trsm_Host.hpp index 52b2395b8d..5e8c6a6abc 100644 --- a/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Trsm_Host.hpp +++ b/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Trsm_Host.hpp @@ -30,7 +30,7 @@ #include "KokkosBatched_Trsm_Decl.hpp" #include "KokkosBatched_Trsm_Serial_Impl.hpp" -//#undef __KOKKOSBATCHED_INTEL_MKL_BATCHED__ +// #undef __KOKKOSBATCHED_INTEL_MKL_BATCHED__ namespace KokkosBatched { namespace PerfTest { @@ -54,41 +54,33 @@ typedef double value_type; double FlopCountLower(int mm, int nn) { double m = (double)mm; double n = (double)nn; - return (FLOP_MUL * (0.5 * m * n * (n + 1.0)) + - FLOP_ADD * (0.5 * m * n * (n - 1.0))); + return (FLOP_MUL * (0.5 * m * n * (n + 1.0)) + FLOP_ADD * (0.5 * m * n * (n - 1.0))); } double FlopCountUpper(int mm, int nn) { double m = (double)mm; double n = (double)nn; - return (FLOP_MUL * (0.5 * m * n * (n + 1.0)) + - FLOP_ADD * (0.5 * m * n * (n - 1.0))); + return (FLOP_MUL * (0.5 * m * n * (n + 1.0)) + FLOP_ADD * (0.5 * m * n * (n - 1.0))); } -template +template void Trsm(const 
int NN) { typedef Kokkos::Schedule ScheduleType; - constexpr int VectorLength = - DefaultVectorLength::value; - const int N = NN / VectorLength; + constexpr int VectorLength = DefaultVectorLength::value; + const int N = NN / VectorLength; { std::string value_type_name; if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) - value_type_name = "Kokkos::complex"; + if (std::is_same >::value) value_type_name = "Kokkos::complex"; #if defined(__AVX512F__) - std::cout << "AVX512 is defined: datatype " << value_type_name - << " a vector length " << VectorLength << "\n"; + std::cout << "AVX512 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; #elif defined(__AVX__) || defined(__AVX2__) - std::cout << "AVX or AVX2 is defined: datatype " << value_type_name - << " a vector length " << VectorLength << "\n"; + std::cout << "AVX or AVX2 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; #else - std::cout << "SIMD (compiler vectorization) is defined: datatype " - << value_type_name << " a vector length " << VectorLength << "\n"; + std::cout << "SIMD (compiler vectorization) is defined: datatype " << value_type_name << " a vector length " + << VectorLength << "\n"; #endif } @@ -120,13 +112,11 @@ void Trsm(const int NN) { /// Reference version using MKL DTRSM /// Kokkos::View bref; - Kokkos::View amat( - "amat", N * VectorLength, BlkSize, BlkSize), + Kokkos::View amat("amat", N * VectorLength, BlkSize, BlkSize), bmat("bmat", N * VectorLength, BlkSize, NumCols); typedef Vector, VectorLength> VectorType; - Kokkos::View amat_simd( - "amat_simd", N, BlkSize, BlkSize), + Kokkos::View amat_simd("amat_simd", N, BlkSize, BlkSize), bmat_simd("bmat_simd", N, BlkSize, NumCols); Random random; @@ -154,8 +144,7 @@ void Trsm(const int NN) { /// #if defined(__KOKKOSBATCHED_INTEL_MKL__) { - Kokkos::View a( - "a", N * VectorLength, BlkSize, BlkSize), + Kokkos::View a("a", N * VectorLength, 
BlkSize, BlkSize), b("b", N * VectorLength, BlkSize, NumCols); { @@ -171,44 +160,32 @@ void Trsm(const int NN) { HostSpaceType().fence(); timer.reset(); - Kokkos::RangePolicy policy( - 0, N * VectorLength); + Kokkos::RangePolicy policy(0, N * VectorLength); Kokkos::parallel_for( - "KokkosBatched::PerfTest::TrsmHost::MKLOpenMP", policy, - KOKKOS_LAMBDA(const int k) { + "KokkosBatched::PerfTest::TrsmHost::MKLOpenMP", policy, KOKKOS_LAMBDA(const int k) { auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); switch (test) { case 0: - cblas_dtrsm(CblasRowMajor, CblasLeft, CblasLower, - CblasNoTrans, CblasUnit, BlkSize, NumCols, 1.0, - (double *)aa.data(), aa.stride_0(), - (double *)bb.data(), bb.stride_0()); + cblas_dtrsm(CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit, BlkSize, NumCols, 1.0, + (double *)aa.data(), aa.stride_0(), (double *)bb.data(), bb.stride_0()); break; case 1: - cblas_dtrsm(CblasRowMajor, CblasLeft, CblasLower, - CblasNoTrans, CblasNonUnit, BlkSize, NumCols, 1.0, - (double *)aa.data(), aa.stride_0(), - (double *)bb.data(), bb.stride_0()); + cblas_dtrsm(CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, BlkSize, NumCols, 1.0, + (double *)aa.data(), aa.stride_0(), (double *)bb.data(), bb.stride_0()); break; case 2: - cblas_dtrsm(CblasRowMajor, CblasRight, CblasUpper, - CblasNoTrans, CblasUnit, BlkSize, NumCols, 1.0, - (double *)aa.data(), aa.stride_0(), - (double *)bb.data(), bb.stride_0()); + cblas_dtrsm(CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasUnit, BlkSize, NumCols, 1.0, + (double *)aa.data(), aa.stride_0(), (double *)bb.data(), bb.stride_0()); break; case 3: - cblas_dtrsm(CblasRowMajor, CblasRight, CblasUpper, - CblasNoTrans, CblasNonUnit, BlkSize, NumCols, 1.0, - (double *)aa.data(), aa.stride_0(), - (double *)bb.data(), bb.stride_0()); + cblas_dtrsm(CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, BlkSize, 
NumCols, 1.0, + (double *)aa.data(), aa.stride_0(), (double *)bb.data(), bb.stride_0()); break; case 4: - cblas_dtrsm(CblasRowMajor, CblasLeft, CblasUpper, - CblasNoTrans, CblasNonUnit, BlkSize, NumCols, 1.0, - (double *)aa.data(), aa.stride_0(), - (double *)bb.data(), bb.stride_0()); + cblas_dtrsm(CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, BlkSize, NumCols, 1.0, + (double *)aa.data(), aa.stride_0(), (double *)bb.data(), bb.stride_0()); break; } }); @@ -223,24 +200,19 @@ void Trsm(const int NN) { double sum = 0; for (int i = 0, iend = b.extent(0); i < iend; ++i) for (int j = 0, jend = b.extent(1); j < jend; ++j) - for (int k = 0, kend = b.extent(2); k < kend; ++k) - sum += Kokkos::ArithTraits::abs(bmat(i, j, k)); + for (int k = 0, kend = b.extent(2); k < kend; ++k) sum += Kokkos::ArithTraits::abs(bmat(i, j, k)); std::cout << std::setw(10) << "MKL TRSM" - << " BlkSize = " << std::setw(3) << BlkSize - << " NumCols = " << std::setw(3) << NumCols - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) - << " max flop/s = " << (flop / tmin) << " sum abs(B) = " << sum - << std::endl; + << " BlkSize = " << std::setw(3) << BlkSize << " NumCols = " << std::setw(3) << NumCols + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) << " sum abs(B) = " << sum << std::endl; bref = b; } } #if defined(__KOKKOSBATCHED_INTEL_MKL_BATCHED__) { - Kokkos::View a( - "a", N * VectorLength, BlkSize, BlkSize), + Kokkos::View a("a", N * VectorLength, BlkSize, BlkSize), b("b", N * VectorLength, BlkSize, NumCols); value_type *aa[N * VectorLength], *bb[N * VectorLength]; @@ -280,8 +252,7 @@ void Trsm(const int NN) { CBLAS_TRANSPOSE transA[1] = {CblasNoTrans}; CBLAS_DIAG diag[1] = {CblasUnit}; - cblas_dtrsm_batch(CblasRowMajor, side, uplo, transA, diag, blksize, - numcols, one, (const double **)aa, lda, + cblas_dtrsm_batch(CblasRowMajor, side, uplo, transA, diag, blksize, numcols, 
one, (const double **)aa, lda, (double **)bb, ldb, 1, size_per_grp); break; } @@ -291,8 +262,7 @@ void Trsm(const int NN) { CBLAS_TRANSPOSE transA[1] = {CblasNoTrans}; CBLAS_DIAG diag[1] = {CblasNonUnit}; - cblas_dtrsm_batch(CblasRowMajor, side, uplo, transA, diag, blksize, - numcols, one, (const double **)aa, lda, + cblas_dtrsm_batch(CblasRowMajor, side, uplo, transA, diag, blksize, numcols, one, (const double **)aa, lda, (double **)bb, ldb, 1, size_per_grp); break; } @@ -302,8 +272,7 @@ void Trsm(const int NN) { CBLAS_TRANSPOSE transA[1] = {CblasNoTrans}; CBLAS_DIAG diag[1] = {CblasUnit}; - cblas_dtrsm_batch(CblasRowMajor, side, uplo, transA, diag, blksize, - numcols, one, (const double **)aa, lda, + cblas_dtrsm_batch(CblasRowMajor, side, uplo, transA, diag, blksize, numcols, one, (const double **)aa, lda, (double **)bb, ldb, 1, size_per_grp); break; } @@ -313,8 +282,7 @@ void Trsm(const int NN) { CBLAS_TRANSPOSE transA[1] = {CblasNoTrans}; CBLAS_DIAG diag[1] = {CblasNonUnit}; - cblas_dtrsm_batch(CblasRowMajor, side, uplo, transA, diag, blksize, - numcols, one, (const double **)aa, lda, + cblas_dtrsm_batch(CblasRowMajor, side, uplo, transA, diag, blksize, numcols, one, (const double **)aa, lda, (double **)bb, ldb, 1, size_per_grp); break; } @@ -324,8 +292,7 @@ void Trsm(const int NN) { CBLAS_TRANSPOSE transA[1] = {CblasNoTrans}; CBLAS_DIAG diag[1] = {CblasNonUnit}; - cblas_dtrsm_batch(CblasRowMajor, side, uplo, transA, diag, blksize, - numcols, one, (const double **)aa, lda, + cblas_dtrsm_batch(CblasRowMajor, side, uplo, transA, diag, blksize, numcols, one, (const double **)aa, lda, (double **)bb, ldb, 1, size_per_grp); break; } @@ -342,24 +309,19 @@ void Trsm(const int NN) { for (int i = 0, iend = bref.extent(0); i < iend; ++i) for (int j = 0, jend = bref.extent(1); j < jend; ++j) for (int k = 0, kend = bref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs(bref(i, j, k) - - b(i, j, k)); + diff += Kokkos::ArithTraits::abs(bref(i, j, k) - b(i, j, k)); 
std::cout << std::setw(10) << "MKL Batch" - << " BlkSize = " << std::setw(3) << BlkSize - << " NumCols = " << std::setw(3) << NumCols - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) - << " max flop/s = " << (flop / tmin) - << " diff to ref = " << diff << std::endl; + << " BlkSize = " << std::setw(3) << BlkSize << " NumCols = " << std::setw(3) << NumCols + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) << " diff to ref = " << diff << std::endl; } } #endif #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__) { - Kokkos::View a( - "a", N, BlkSize, BlkSize), + Kokkos::View a("a", N, BlkSize, BlkSize), b("b", N, BlkSize, NumCols); { @@ -392,10 +354,9 @@ void Trsm(const int NN) { MKL_TRANSPOSE transA = MKL_NOTRANS; MKL_DIAG diag = MKL_UNIT; - mkl_dtrsm_compact(MKL_ROW_MAJOR, side, uplo, transA, diag, - BlkSize, NumCols, one, (const double *)a.data(), - a.stride_1(), (double *)b.data(), b.stride_1(), - format, (MKL_INT)N * VectorLength); + mkl_dtrsm_compact(MKL_ROW_MAJOR, side, uplo, transA, diag, BlkSize, NumCols, one, + (const double *)a.data(), a.stride_1(), (double *)b.data(), b.stride_1(), format, + (MKL_INT)N * VectorLength); break; } case 1: { @@ -404,10 +365,9 @@ void Trsm(const int NN) { MKL_TRANSPOSE transA = MKL_NOTRANS; MKL_DIAG diag = MKL_NONUNIT; - mkl_dtrsm_compact(MKL_ROW_MAJOR, side, uplo, transA, diag, - BlkSize, NumCols, one, (const double *)a.data(), - a.stride_1(), (double *)b.data(), b.stride_1(), - format, (MKL_INT)N * VectorLength); + mkl_dtrsm_compact(MKL_ROW_MAJOR, side, uplo, transA, diag, BlkSize, NumCols, one, + (const double *)a.data(), a.stride_1(), (double *)b.data(), b.stride_1(), format, + (MKL_INT)N * VectorLength); break; } case 2: { @@ -416,10 +376,9 @@ void Trsm(const int NN) { MKL_TRANSPOSE transA = MKL_NOTRANS; MKL_DIAG diag = MKL_UNIT; - mkl_dtrsm_compact(MKL_ROW_MAJOR, side, uplo, transA, diag, - BlkSize, NumCols, one, 
(const double *)a.data(), - a.stride_1(), (double *)b.data(), b.stride_1(), - format, (MKL_INT)N * VectorLength); + mkl_dtrsm_compact(MKL_ROW_MAJOR, side, uplo, transA, diag, BlkSize, NumCols, one, + (const double *)a.data(), a.stride_1(), (double *)b.data(), b.stride_1(), format, + (MKL_INT)N * VectorLength); break; } case 3: { @@ -428,10 +387,9 @@ void Trsm(const int NN) { MKL_TRANSPOSE transA = MKL_NOTRANS; MKL_DIAG diag = MKL_NONUNIT; - mkl_dtrsm_compact(MKL_ROW_MAJOR, side, uplo, transA, diag, - BlkSize, NumCols, one, (const double *)a.data(), - a.stride_1(), (double *)b.data(), b.stride_1(), - format, (MKL_INT)N * VectorLength); + mkl_dtrsm_compact(MKL_ROW_MAJOR, side, uplo, transA, diag, BlkSize, NumCols, one, + (const double *)a.data(), a.stride_1(), (double *)b.data(), b.stride_1(), format, + (MKL_INT)N * VectorLength); break; } case 4: { @@ -440,10 +398,9 @@ void Trsm(const int NN) { MKL_TRANSPOSE transA = MKL_NOTRANS; MKL_DIAG diag = MKL_NONUNIT; - mkl_dtrsm_compact(MKL_ROW_MAJOR, side, uplo, transA, diag, - BlkSize, NumCols, one, (const double *)a.data(), - a.stride_1(), (double *)b.data(), b.stride_1(), - format, (MKL_INT)N * VectorLength); + mkl_dtrsm_compact(MKL_ROW_MAJOR, side, uplo, transA, diag, BlkSize, NumCols, one, + (const double *)a.data(), a.stride_1(), (double *)b.data(), b.stride_1(), format, + (MKL_INT)N * VectorLength); break; } } @@ -459,16 +416,12 @@ void Trsm(const int NN) { for (int i = 0, iend = bref.extent(0); i < iend; ++i) for (int j = 0, jend = bref.extent(1); j < jend; ++j) for (int k = 0, kend = bref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs( - bref(i, j, k) - b(i / VectorLength, j, k)[i % VectorLength]); + diff += Kokkos::ArithTraits::abs(bref(i, j, k) - b(i / VectorLength, j, k)[i % VectorLength]); std::cout << std::setw(10) << "MKL Cmpt" - << " BlkSize = " << std::setw(3) << BlkSize - << " NumCols = " << std::setw(3) << NumCols - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / 
tavg) - << " max flop/s = " << (flop / tmin) - << " diff to ref = " << diff << std::endl; + << " BlkSize = " << std::setw(3) << BlkSize << " NumCols = " << std::setw(3) << NumCols + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) << " diff to ref = " << diff << std::endl; } } } @@ -557,8 +510,7 @@ void Trsm(const int NN) { /// SIMD with appropriate data layout /// { - Kokkos::View a( - "a", N, BlkSize, BlkSize), + Kokkos::View a("a", N, BlkSize, BlkSize), b("b", N, BlkSize, NumCols); { @@ -576,31 +528,29 @@ void Trsm(const int NN) { Kokkos::RangePolicy policy(0, N); Kokkos::parallel_for( - "KokkosBatched::PerfTest::TrsmHost::SIMDSerialOpenMP", policy, - KOKKOS_LAMBDA(const int k) { + "KokkosBatched::PerfTest::TrsmHost::SIMDSerialOpenMP", policy, KOKKOS_LAMBDA(const int k) { auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); switch (test) { case 0: - SerialTrsm::invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, bb); break; case 1: - SerialTrsm::invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, + bb); break; case 2: - SerialTrsm::invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, + bb); break; case 3: - SerialTrsm::invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, + bb); break; case 4: - SerialTrsm::invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, + bb); break; } }); @@ -616,16 +566,12 @@ void Trsm(const int NN) { for (int i = 0, iend = bref.extent(0); i < iend; ++i) for (int j = 0, jend = bref.extent(1); j < jend; ++j) for (int k = 0, kend = bref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs( - bref(i, j, k) - b(i / VectorLength, j, k)[i % VectorLength]); + diff += Kokkos::ArithTraits::abs(bref(i, j, k) - b(i / VectorLength, j, k)[i % VectorLength]); std::cout << std::setw(10) << "KK Vector" - << " BlkSize = " << std::setw(3) << BlkSize - << " NumCols = " << std::setw(3) << NumCols - << " time = " << 
std::scientific << tmin - << " avg flop/s = " << (flop / tavg) - << " max flop/s = " << (flop / tmin) - << " diff to ref = " << diff << std::endl; + << " BlkSize = " << std::setw(3) << BlkSize << " NumCols = " << std::setw(3) << NumCols + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) << " diff to ref = " << diff << std::endl; } } std::cout << "\n\n"; diff --git a/perf_test/batched/sparse/CG/Functor_TestBatchedTeamVectorCG_1.hpp b/perf_test/batched/sparse/CG/Functor_TestBatchedTeamVectorCG_1.hpp index e289f8fa52..5722480212 100644 --- a/perf_test/batched/sparse/CG/Functor_TestBatchedTeamVectorCG_1.hpp +++ b/perf_test/batched/sparse/CG/Functor_TestBatchedTeamVectorCG_1.hpp @@ -14,8 +14,8 @@ // //@HEADER -template +template struct Functor_TestBatchedTeamVectorCG_1 { const ValuesViewType _D; const IntView _r; @@ -26,12 +26,9 @@ struct Functor_TestBatchedTeamVectorCG_1 { KrylovHandleType _handle; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorCG_1(const ValuesViewType &D, const IntView &r, - const IntView &c, const VectorViewType &X, - const VectorViewType &B, const int N_team, - const int team_size, - const int vector_length, - KrylovHandleType &handle) + Functor_TestBatchedTeamVectorCG_1(const ValuesViewType &D, const IntView &r, const IntView &c, + const VectorViewType &X, const VectorViewType &B, const int N_team, + const int team_size, const int vector_length, KrylovHandleType &handle) : _D(D), _r(r), _c(c), @@ -47,20 +44,15 @@ struct Functor_TestBatchedTeamVectorCG_1 { const int first_matrix = _handle.first_index(member.league_rank()); const int last_matrix = _handle.last_index(member.league_rank()); - auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + auto d = 
Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); using Operator = KokkosBatched::CrsMatrix; Operator A(d, _r, _c); - KokkosBatched::TeamVectorCG::template invoke( - member, A, b, x, _handle); + KokkosBatched::TeamVectorCG::template invoke(member, A, b, x, _handle); } inline double run() { @@ -70,13 +62,10 @@ struct Functor_TestBatchedTeamVectorCG_1 { _handle.set_memory_strategy(1); - _handle.tmp_view = typename KrylovHandleType::TemporaryViewType( - "", _X.extent(0), 4 * _X.extent(1)); + _handle.tmp_view = typename KrylovHandleType::TemporaryViewType("", _X.extent(0), 4 * _X.extent(1)); - Kokkos::TeamPolicy auto_policy(_handle.get_number_of_teams(), - Kokkos::AUTO(), Kokkos::AUTO()); - Kokkos::TeamPolicy tuned_policy(_handle.get_number_of_teams(), - _team_size, _vector_length); + Kokkos::TeamPolicy auto_policy(_handle.get_number_of_teams(), Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy tuned_policy(_handle.get_number_of_teams(), _team_size, _vector_length); Kokkos::TeamPolicy policy; if (_team_size < 1) diff --git a/perf_test/batched/sparse/CG/Functor_TestBatchedTeamVectorCG_2.hpp b/perf_test/batched/sparse/CG/Functor_TestBatchedTeamVectorCG_2.hpp index b3451938c5..5749d640d0 100644 --- a/perf_test/batched/sparse/CG/Functor_TestBatchedTeamVectorCG_2.hpp +++ b/perf_test/batched/sparse/CG/Functor_TestBatchedTeamVectorCG_2.hpp @@ -14,8 +14,8 @@ // //@HEADER -template +template struct Functor_TestBatchedTeamVectorCG_2 { const ValuesViewType _D; const IntView _r; @@ -26,12 +26,9 @@ struct Functor_TestBatchedTeamVectorCG_2 { KrylovHandleType _handle; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorCG_2(const ValuesViewType &D, const IntView &r, - const IntView &c, const VectorViewType &X, - const VectorViewType &B, const int N_team, - const int 
team_size, - const int vector_length, - KrylovHandleType &handle) + Functor_TestBatchedTeamVectorCG_2(const ValuesViewType &D, const IntView &r, const IntView &c, + const VectorViewType &X, const VectorViewType &B, const int N_team, + const int team_size, const int vector_length, KrylovHandleType &handle) : _D(D), _r(r), _c(c), @@ -47,41 +44,27 @@ struct Functor_TestBatchedTeamVectorCG_2 { const int first_matrix = _handle.first_index(member.league_rank()); const int last_matrix = _handle.last_index(member.league_rank()); - using TeamVectorCopy1D = - KokkosBatched::TeamVectorCopy; + using TeamVectorCopy1D = KokkosBatched::TeamVectorCopy; - auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); - using ScratchPadIntViewType = - Kokkos::View; + using ScratchPadIntViewType = Kokkos::View; - using Operator = - KokkosBatched::CrsMatrix; + using Operator = KokkosBatched::CrsMatrix; - ScratchPadIntViewType tmp_1D_int(member.team_scratch(0), - _r.extent(0) + _c.extent(0)); + ScratchPadIntViewType tmp_1D_int(member.team_scratch(0), _r.extent(0) + _c.extent(0)); - auto r = - Kokkos::subview(tmp_1D_int, Kokkos::make_pair(0, (int)_r.extent(0))); - auto c = Kokkos::subview( - tmp_1D_int, - Kokkos::make_pair((int)_r.extent(0), (int)tmp_1D_int.extent(0))); + auto r = Kokkos::subview(tmp_1D_int, Kokkos::make_pair(0, (int)_r.extent(0))); + auto c = Kokkos::subview(tmp_1D_int, Kokkos::make_pair((int)_r.extent(0), (int)tmp_1D_int.extent(0))); TeamVectorCopy1D::invoke(member, _r, r); 
TeamVectorCopy1D::invoke(member, _c, c); Operator A(d, r, c); - KokkosBatched::TeamVectorCG::template invoke( - member, A, b, x, _handle); + KokkosBatched::TeamVectorCG::template invoke(member, A, b, x, _handle); } inline double run() { @@ -91,13 +74,10 @@ struct Functor_TestBatchedTeamVectorCG_2 { _handle.set_memory_strategy(1); - _handle.tmp_view = typename KrylovHandleType::TemporaryViewType( - "", _X.extent(0), 4 * _X.extent(1)); + _handle.tmp_view = typename KrylovHandleType::TemporaryViewType("", _X.extent(0), 4 * _X.extent(1)); - Kokkos::TeamPolicy auto_policy(_handle.get_number_of_teams(), - Kokkos::AUTO(), Kokkos::AUTO()); - Kokkos::TeamPolicy tuned_policy(_handle.get_number_of_teams(), - _team_size, _vector_length); + Kokkos::TeamPolicy auto_policy(_handle.get_number_of_teams(), Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy tuned_policy(_handle.get_number_of_teams(), _team_size, _vector_length); Kokkos::TeamPolicy policy; if (_team_size < 1) diff --git a/perf_test/batched/sparse/CG/Functor_TestBatchedTeamVectorCG_3.hpp b/perf_test/batched/sparse/CG/Functor_TestBatchedTeamVectorCG_3.hpp index 3dbfca7f15..9df01fd5f0 100644 --- a/perf_test/batched/sparse/CG/Functor_TestBatchedTeamVectorCG_3.hpp +++ b/perf_test/batched/sparse/CG/Functor_TestBatchedTeamVectorCG_3.hpp @@ -14,8 +14,8 @@ // //@HEADER -template +template struct Functor_TestBatchedTeamVectorCG_3 { const ValuesViewType _D; const IntView _r; @@ -26,12 +26,9 @@ struct Functor_TestBatchedTeamVectorCG_3 { KrylovHandleType _handle; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorCG_3(const ValuesViewType &D, const IntView &r, - const IntView &c, const VectorViewType &X, - const VectorViewType &B, const int N_team, - const int team_size, - const int vector_length, - KrylovHandleType &handle) + Functor_TestBatchedTeamVectorCG_3(const ValuesViewType &D, const IntView &r, const IntView &c, + const VectorViewType &X, const VectorViewType &B, const int N_team, + const int team_size, const int 
vector_length, KrylovHandleType &handle) : _D(D), _r(r), _c(c), @@ -47,41 +44,27 @@ struct Functor_TestBatchedTeamVectorCG_3 { const int first_matrix = _handle.first_index(member.league_rank()); const int last_matrix = _handle.last_index(member.league_rank()); - using TeamVectorCopy1D = - KokkosBatched::TeamVectorCopy; + using TeamVectorCopy1D = KokkosBatched::TeamVectorCopy; - auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); - using ScratchPadIntViewType = - Kokkos::View; + using ScratchPadIntViewType = Kokkos::View; - using Operator = - KokkosBatched::CrsMatrix; + using Operator = KokkosBatched::CrsMatrix; - ScratchPadIntViewType tmp_1D_int(member.team_scratch(0), - _r.extent(0) + _c.extent(0)); + ScratchPadIntViewType tmp_1D_int(member.team_scratch(0), _r.extent(0) + _c.extent(0)); - auto r = - Kokkos::subview(tmp_1D_int, Kokkos::make_pair(0, (int)_r.extent(0))); - auto c = Kokkos::subview( - tmp_1D_int, - Kokkos::make_pair((int)_r.extent(0), (int)tmp_1D_int.extent(0))); + auto r = Kokkos::subview(tmp_1D_int, Kokkos::make_pair(0, (int)_r.extent(0))); + auto c = Kokkos::subview(tmp_1D_int, Kokkos::make_pair((int)_r.extent(0), (int)tmp_1D_int.extent(0))); TeamVectorCopy1D::invoke(member, _r, r); TeamVectorCopy1D::invoke(member, _c, c); Operator A(d, r, c); - KokkosBatched::TeamVectorCG::template invoke( - member, A, b, x, _handle); + KokkosBatched::TeamVectorCG::template invoke(member, A, b, x, _handle); } inline double run() { @@ -91,10 +74,8 @@ struct 
Functor_TestBatchedTeamVectorCG_3 { _handle.set_memory_strategy(0); - Kokkos::TeamPolicy auto_policy(_handle.get_number_of_teams(), - Kokkos::AUTO(), Kokkos::AUTO()); - Kokkos::TeamPolicy tuned_policy(_handle.get_number_of_teams(), - _team_size, _vector_length); + Kokkos::TeamPolicy auto_policy(_handle.get_number_of_teams(), Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy tuned_policy(_handle.get_number_of_teams(), _team_size, _vector_length); Kokkos::TeamPolicy policy; if (_team_size < 1) @@ -106,7 +87,7 @@ struct Functor_TestBatchedTeamVectorCG_3 { size_t bytes_col_idc = IntView::shmem_size(_c.extent(0)); size_t bytes_int = bytes_row_ptr + bytes_col_idc; size_t bytes_0 = ValuesViewType::shmem_size(_N_team, 5); - size_t bytes_1 = ValuesViewType::shmem_size(_N_team, 4 * _X.extent(1)); + size_t bytes_1 = ValuesViewType::shmem_size(_N_team, 4 * _X.extent(1)); policy.set_scratch_size(0, Kokkos::PerTeam(bytes_int + bytes_0 + bytes_1)); diff --git a/perf_test/batched/sparse/CG/KokkosBatched_Test_CG.cpp b/perf_test/batched/sparse/CG/KokkosBatched_Test_CG.cpp index 5bf6061fe4..e0440ddbfd 100644 --- a/perf_test/batched/sparse/CG/KokkosBatched_Test_CG.cpp +++ b/perf_test/batched/sparse/CG/KokkosBatched_Test_CG.cpp @@ -73,50 +73,41 @@ int main(int argc, char *argv[]) { for (int i = 1; i < argc; ++i) { const std::string &token = argv[i]; if (token == std::string("--help") || token == std::string("-h")) { - std::cout - << "Kokkos Batched CG performance test options:" << std::endl - << "-A : Filename of the input batched matrix." - << std::endl - << "-B : Filename of the input batched right-hand " - "side." - << std::endl - << "-X : Filename of the output batched solution." - << std::endl - << "-res : Filename of the output residual history." - << std::endl - << "-timers : Filename of the output timers." - << std::endl - << "-n1 : Number of repetitions of the experience." - << std::endl - << "-n2 : Number of the kernel calls inside one " - "experience." 
- << std::endl - << "-team_size : Used team size." << std::endl - << "-n_implementations: Number of implementations to use: test " - "all " - "implementations [0, specified number -1]." - << std::endl - << "-implementation : Specify only one implementation at a time." - << std::endl - << " Note: implementation 0 : use scratch pad " - "only for scalar temporary variable." - << std::endl - << " Note: implementation 1 : use scratch pad " - "for scalar temporary variables and for the graph of the " - "matrices." - << std::endl - << " Note: implementation 2 : use scratch pad " - "for scalar and vector temporary variables and for the graph of " - "the matrices." - << std::endl - << "-l : Specify left layout." << std::endl - << "-r : Specify right layout." << std::endl - << "-C : Specify if the convergence is monitored." - << std::endl - << "-N_team : Specify the number of systems per team." - << std::endl - << "-vector_length : Specify the vector length." << std::endl - << std::endl; + std::cout << "Kokkos Batched CG performance test options:" << std::endl + << "-A : Filename of the input batched matrix." << std::endl + << "-B : Filename of the input batched right-hand " + "side." + << std::endl + << "-X : Filename of the output batched solution." << std::endl + << "-res : Filename of the output residual history." << std::endl + << "-timers : Filename of the output timers." << std::endl + << "-n1 : Number of repetitions of the experience." << std::endl + << "-n2 : Number of the kernel calls inside one " + "experience." + << std::endl + << "-team_size : Used team size." << std::endl + << "-n_implementations: Number of implementations to use: test " + "all " + "implementations [0, specified number -1]." + << std::endl + << "-implementation : Specify only one implementation at a time." << std::endl + << " Note: implementation 0 : use scratch pad " + "only for scalar temporary variable." 
+ << std::endl + << " Note: implementation 1 : use scratch pad " + "for scalar temporary variables and for the graph of the " + "matrices." + << std::endl + << " Note: implementation 2 : use scratch pad " + "for scalar and vector temporary variables and for the graph of " + "the matrices." + << std::endl + << "-l : Specify left layout." << std::endl + << "-r : Specify right layout." << std::endl + << "-C : Specify if the convergence is monitored." << std::endl + << "-N_team : Specify the number of systems per team." << std::endl + << "-vector_length : Specify the vector length." << std::endl + << std::endl; return 0; } if (token == std::string("-A")) name_A = argv[++i]; @@ -131,10 +122,8 @@ int main(int argc, char *argv[]) { if (token == std::string("-n1")) n_rep_1 = std::atoi(argv[++i]); if (token == std::string("-n2")) n_rep_2 = std::atoi(argv[++i]); if (token == std::string("-team_size")) team_size = std::atoi(argv[++i]); - if (token == std::string("-n_implementations")) - n_impl = std::atoi(argv[++i]); - if (token == std::string("-implementation")) - impls.push_back(std::atoi(argv[++i])); + if (token == std::string("-n_implementations")) n_impl = std::atoi(argv[++i]); + if (token == std::string("-implementation")) impls.push_back(std::atoi(argv[++i])); if (token == std::string("-l")) { layout_left = true; layout_right = false; @@ -144,10 +133,8 @@ int main(int argc, char *argv[]) { layout_right = true; } if (token == std::string("-C")) monitor_convergence = true; - if (token == std::string("-N_team")) - N_team_potential = std::atoi(argv[++i]); - if (token == std::string("-vector_length")) - vector_length = std::atoi(argv[++i]); + if (token == std::string("-N_team")) N_team_potential = std::atoi(argv[++i]); + if (token == std::string("-vector_length")) vector_length = std::atoi(argv[++i]); } int N, Blk, nnz, ncols; @@ -157,16 +144,14 @@ int main(int argc, char *argv[]) { if (impls.size() == 0) for (int i = 0; i < n_impl; ++i) impls.push_back(i); - std::cout << 
"N_team_potential = " << N_team_potential << ", n = " << Blk - << ", N = " << N << ", team_size = " << team_size - << ", vector_length = " << vector_length << std::endl; + std::cout << "N_team_potential = " << N_team_potential << ", n = " << Blk << ", N = " << N + << ", team_size = " << team_size << ", vector_length = " << vector_length << std::endl; // V100 L2 cache 6MB per core constexpr size_t LLC_CAPACITY = 80 * 6 * 1024 * 1024; KokkosBatched::Flush flush; - printf(" :::: CG Testing (N = %d, Blk = %d, nnz = %d, vl = %d, n = %d)\n", - N, Blk, nnz, vector_length, n_rep_1); + printf(" :::: CG Testing (N = %d, Blk = %d, nnz = %d, vl = %d, n = %d)\n", N, Blk, nnz, vector_length, n_rep_1); typedef Kokkos::LayoutRight LR; typedef Kokkos::LayoutLeft LL; @@ -193,12 +178,9 @@ int main(int argc, char *argv[]) { XYTypeLL yLL("values", N, Blk); if (layout_left) - printf(" :::: Testing left layout (team_size = %d, vector_length = %d)\n", - team_size, vector_length); + printf(" :::: Testing left layout (team_size = %d, vector_length = %d)\n", team_size, vector_length); if (layout_right) - printf( - " :::: Testing right layout (team_size = %d, vector_length = %d)\n", - team_size, vector_length); + printf(" :::: Testing right layout (team_size = %d, vector_length = %d)\n", team_size, vector_length); if (layout_left) { readCRSFromMM(name_A, valuesLL, rowOffsets, colIndices); @@ -226,9 +208,7 @@ int main(int argc, char *argv[]) { using Scalar3DViewType = Kokkos::View; using IntViewType = Kokkos::View; - using KrylovHandleType = - KokkosBatched::KrylovHandle; + using KrylovHandleType = KokkosBatched::KrylovHandle; KrylovHandleType handle(N, N_team); handle.set_scratch_pad_level(0); @@ -246,56 +226,38 @@ int main(int argc, char *argv[]) { if (i_impl == 0 && layout_left) { t_spmv += - Functor_TestBatchedTeamVectorCG_1( - valuesLL, rowOffsets, colIndices, xLL, yLL, N_team, - team_size, vector_length, handle) + Functor_TestBatchedTeamVectorCG_1( + valuesLL, rowOffsets, colIndices, 
xLL, yLL, N_team, team_size, vector_length, handle) .run(); } if (i_impl == 1 && layout_left) { t_spmv += - Functor_TestBatchedTeamVectorCG_2( - valuesLL, rowOffsets, colIndices, xLL, yLL, N_team, - team_size, vector_length, handle) + Functor_TestBatchedTeamVectorCG_2( + valuesLL, rowOffsets, colIndices, xLL, yLL, N_team, team_size, vector_length, handle) .run(); } if (i_impl == 2 && layout_left) { t_spmv += - Functor_TestBatchedTeamVectorCG_3( - valuesLL, rowOffsets, colIndices, xLL, yLL, N_team, - team_size, vector_length, handle) + Functor_TestBatchedTeamVectorCG_3( + valuesLL, rowOffsets, colIndices, xLL, yLL, N_team, team_size, vector_length, handle) .run(); } if (i_impl == 0 && layout_right) { t_spmv += - Functor_TestBatchedTeamVectorCG_1( - valuesLR, rowOffsets, colIndices, xLR, yLR, N_team, - team_size, vector_length, handle) + Functor_TestBatchedTeamVectorCG_1( + valuesLR, rowOffsets, colIndices, xLR, yLR, N_team, team_size, vector_length, handle) .run(); } if (i_impl == 1 && layout_right) { t_spmv += - Functor_TestBatchedTeamVectorCG_2( - valuesLR, rowOffsets, colIndices, xLR, yLR, N_team, - team_size, vector_length, handle) + Functor_TestBatchedTeamVectorCG_2( + valuesLR, rowOffsets, colIndices, xLR, yLR, N_team, team_size, vector_length, handle) .run(); } if (i_impl == 2 && layout_right) { t_spmv += - Functor_TestBatchedTeamVectorCG_3( - valuesLR, rowOffsets, colIndices, xLR, yLR, N_team, - team_size, vector_length, handle) + Functor_TestBatchedTeamVectorCG_3( + valuesLR, rowOffsets, colIndices, xLR, yLR, N_team, team_size, vector_length, handle) .run(); } exec_space().fence(); @@ -310,10 +272,8 @@ int main(int argc, char *argv[]) { { std::ofstream myfile; std::string name; - if (layout_left) - name = name_timer + "_" + std::to_string(i_impl) + "_left.txt"; - if (layout_right) - name = name_timer + "_" + std::to_string(i_impl) + "_right.txt"; + if (layout_left) name = name_timer + "_" + std::to_string(i_impl) + "_left.txt"; + if (layout_right) name = 
name_timer + "_" + std::to_string(i_impl) + "_right.txt"; myfile.open(name); @@ -326,15 +286,10 @@ int main(int argc, char *argv[]) { double average_time = 0.; - for (size_t i = 0; i < timers.size(); ++i) - average_time += timers[i] / timers.size(); + for (size_t i = 0; i < timers.size(); ++i) average_time += timers[i] / timers.size(); - if (layout_left) - printf("Left layout: Implementation %d: solve time = %f\n", i_impl, - average_time); - if (layout_right) - printf("Right layout: Implementation %d: solve time = %f\n", i_impl, - average_time); + if (layout_left) printf("Left layout: Implementation %d: solve time = %f\n", i_impl, average_time); + if (layout_right) printf("Right layout: Implementation %d: solve time = %f\n", i_impl, average_time); if (layout_left) { writeArrayToMM(name_X + std::to_string(i_impl) + "_l.mm", xLL); @@ -343,8 +298,7 @@ int main(int argc, char *argv[]) { writeArrayToMM(name_X + std::to_string(i_impl) + "_r.mm", xLR); } if (monitor_convergence) { - writeArrayToMM(name_conv + std::to_string(i_impl) + ".mm", - handle.residual_norms); + writeArrayToMM(name_conv + std::to_string(i_impl) + ".mm", handle.residual_norms); } } } diff --git a/perf_test/batched/sparse/GMRES/Functor_TestBatchedTeamVectorGMRES_1.hpp b/perf_test/batched/sparse/GMRES/Functor_TestBatchedTeamVectorGMRES_1.hpp index 0640ac8151..068960bbb6 100644 --- a/perf_test/batched/sparse/GMRES/Functor_TestBatchedTeamVectorGMRES_1.hpp +++ b/perf_test/batched/sparse/GMRES/Functor_TestBatchedTeamVectorGMRES_1.hpp @@ -14,8 +14,8 @@ // //@HEADER -template +template struct Functor_TestBatchedTeamVectorGMRES_1 { const ValuesViewType _D; const ValuesViewType _diag; @@ -32,12 +32,11 @@ struct Functor_TestBatchedTeamVectorGMRES_1 { KrylovHandleType _handle; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorGMRES_1( - const ValuesViewType &D, const IntView &r, const IntView &c, - const VectorViewType &X, const VectorViewType &B, const int N_team, - const int team_size, const int 
vector_length, const int N_iteration, - const double tol, const int ortho_strategy, const int arnoldi_level, - const int other_level, KrylovHandleType &handle) + Functor_TestBatchedTeamVectorGMRES_1(const ValuesViewType &D, const IntView &r, const IntView &c, + const VectorViewType &X, const VectorViewType &B, const int N_team, + const int team_size, const int vector_length, const int N_iteration, + const double tol, const int ortho_strategy, const int arnoldi_level, + const int other_level, KrylovHandleType &handle) : _D(D), _r(r), _c(c), @@ -54,12 +53,11 @@ struct Functor_TestBatchedTeamVectorGMRES_1 { _handle(handle) {} KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorGMRES_1( - const ValuesViewType &D, const ValuesViewType &diag, const IntView &r, - const IntView &c, const VectorViewType &X, const VectorViewType &B, - const int N_team, const int team_size, const int vector_length, - const int N_iteration, const double tol, int ortho_strategy, - const int arnoldi_level, const int other_level, KrylovHandleType &handle) + Functor_TestBatchedTeamVectorGMRES_1(const ValuesViewType &D, const ValuesViewType &diag, const IntView &r, + const IntView &c, const VectorViewType &X, const VectorViewType &B, + const int N_team, const int team_size, const int vector_length, + const int N_iteration, const double tol, int ortho_strategy, + const int arnoldi_level, const int other_level, KrylovHandleType &handle) : _D(D), _diag(diag), _r(r), @@ -81,31 +79,25 @@ struct Functor_TestBatchedTeamVectorGMRES_1 { const int first_matrix = _handle.first_index(member.league_rank()); const int last_matrix = _handle.last_index(member.league_rank()); - auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, 
last_matrix), Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); using Operator = KokkosBatched::CrsMatrix; Operator A(d, _r, _c); if (UsePrec) { - auto diag = Kokkos::subview( - _diag, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto diag = Kokkos::subview(_diag, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); using PrecOperator = KokkosBatched::JacobiPrec; PrecOperator P(diag); P.setComputedInverse(); - KokkosBatched::TeamVectorGMRES::template invoke< - Operator, VectorViewType, PrecOperator, KrylovHandleType>( - member, A, b, x, P, _handle); + KokkosBatched::TeamVectorGMRES::template invoke(member, A, b, x, P, _handle); } else { - KokkosBatched::TeamVectorGMRES::template invoke< - Operator, VectorViewType>(member, A, b, x, _handle); + KokkosBatched::TeamVectorGMRES::template invoke(member, A, b, x, _handle); } } @@ -118,13 +110,11 @@ struct Functor_TestBatchedTeamVectorGMRES_1 { _handle.set_memory_strategy(1); - _handle.tmp_view = typename KrylovHandleType::TemporaryViewType( - "", _X.extent(0), _X.extent(1) + maximum_iteration + 3); + _handle.tmp_view = + typename KrylovHandleType::TemporaryViewType("", _X.extent(0), _X.extent(1) + maximum_iteration + 3); - Kokkos::TeamPolicy auto_policy(_handle.get_number_of_teams(), - Kokkos::AUTO(), Kokkos::AUTO()); - Kokkos::TeamPolicy tuned_policy(_handle.get_number_of_teams(), - _team_size, _vector_length); + Kokkos::TeamPolicy auto_policy(_handle.get_number_of_teams(), Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy tuned_policy(_handle.get_number_of_teams(), _team_size, _vector_length); Kokkos::TeamPolicy policy; if (_team_size < 1) diff --git a/perf_test/batched/sparse/GMRES/Functor_TestBatchedTeamVectorGMRES_2.hpp b/perf_test/batched/sparse/GMRES/Functor_TestBatchedTeamVectorGMRES_2.hpp index 3970b7e94a..22e735c304 100644 --- 
a/perf_test/batched/sparse/GMRES/Functor_TestBatchedTeamVectorGMRES_2.hpp +++ b/perf_test/batched/sparse/GMRES/Functor_TestBatchedTeamVectorGMRES_2.hpp @@ -14,8 +14,8 @@ // //@HEADER -template +template struct Functor_TestBatchedTeamVectorGMRES_2 { const ValuesViewType _D; const ValuesViewType _diag; @@ -32,12 +32,11 @@ struct Functor_TestBatchedTeamVectorGMRES_2 { KrylovHandleType _handle; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorGMRES_2( - const ValuesViewType &D, const IntView &r, const IntView &c, - const VectorViewType &X, const VectorViewType &B, const int N_team, - const int team_size, const int vector_length, const int N_iteration, - const double tol, const int ortho_strategy, const int arnoldi_level, - const int other_level, KrylovHandleType &handle) + Functor_TestBatchedTeamVectorGMRES_2(const ValuesViewType &D, const IntView &r, const IntView &c, + const VectorViewType &X, const VectorViewType &B, const int N_team, + const int team_size, const int vector_length, const int N_iteration, + const double tol, const int ortho_strategy, const int arnoldi_level, + const int other_level, KrylovHandleType &handle) : _D(D), _r(r), _c(c), @@ -54,12 +53,11 @@ struct Functor_TestBatchedTeamVectorGMRES_2 { _handle(handle) {} KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorGMRES_2( - const ValuesViewType &D, const ValuesViewType &diag, const IntView &r, - const IntView &c, const VectorViewType &X, const VectorViewType &B, - const int N_team, const int team_size, const int vector_length, - const int N_iteration, const double tol, int ortho_strategy, - const int arnoldi_level, const int other_level, KrylovHandleType &handle) + Functor_TestBatchedTeamVectorGMRES_2(const ValuesViewType &D, const ValuesViewType &diag, const IntView &r, + const IntView &c, const VectorViewType &X, const VectorViewType &B, + const int N_team, const int team_size, const int vector_length, + const int N_iteration, const double tol, int ortho_strategy, + const int 
arnoldi_level, const int other_level, KrylovHandleType &handle) : _D(D), _diag(diag), _r(r), @@ -81,60 +79,41 @@ struct Functor_TestBatchedTeamVectorGMRES_2 { const int first_matrix = _handle.first_index(member.league_rank()); const int last_matrix = _handle.last_index(member.league_rank()); - using TeamVectorCopy1D = - KokkosBatched::TeamVectorCopy; - - auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - - using ScratchPadIntViewType = - Kokkos::View; - using ScratchPadValuesViewType = Kokkos::View< - typename ValuesViewType::non_const_value_type **, - typename ValuesViewType::array_layout, - typename ValuesViewType::execution_space::scratch_memory_space>; - using Operator = - KokkosBatched::CrsMatrix; - - ScratchPadIntViewType tmp_1D_int(member.team_scratch(0), - _r.extent(0) + _c.extent(0)); - - auto r = - Kokkos::subview(tmp_1D_int, Kokkos::make_pair(0, (int)_r.extent(0))); - auto c = Kokkos::subview( - tmp_1D_int, - Kokkos::make_pair((int)_r.extent(0), (int)tmp_1D_int.extent(0))); + using TeamVectorCopy1D = KokkosBatched::TeamVectorCopy; + + auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + + using ScratchPadIntViewType = Kokkos::View; + using ScratchPadValuesViewType = + Kokkos::View; + using Operator = KokkosBatched::CrsMatrix; + + ScratchPadIntViewType tmp_1D_int(member.team_scratch(0), _r.extent(0) + _c.extent(0)); + + auto r = Kokkos::subview(tmp_1D_int, Kokkos::make_pair(0, (int)_r.extent(0))); + auto c = Kokkos::subview(tmp_1D_int, Kokkos::make_pair((int)_r.extent(0), (int)tmp_1D_int.extent(0))); 
TeamVectorCopy1D::invoke(member, _r, r); TeamVectorCopy1D::invoke(member, _c, c); Operator A(d, r, c); if (UsePrec) { - ScratchPadValuesViewType diag( - member.team_scratch(0), last_matrix - first_matrix, _diag.extent(1)); + ScratchPadValuesViewType diag(member.team_scratch(0), last_matrix - first_matrix, _diag.extent(1)); using PrecOperator = KokkosBatched::JacobiPrec; KokkosBatched::TeamVectorCopy::invoke( - member, - Kokkos::subview(_diag, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL), - diag); + member, Kokkos::subview(_diag, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL), diag); PrecOperator P(diag); P.setComputedInverse(); - KokkosBatched::TeamVectorGMRES::template invoke< - Operator, VectorViewType, PrecOperator, KrylovHandleType>( - member, A, b, x, P, _handle); + KokkosBatched::TeamVectorGMRES::template invoke(member, A, b, x, P, _handle); } else { - KokkosBatched::TeamVectorGMRES::template invoke< - Operator, VectorViewType>(member, A, b, x, _handle); + KokkosBatched::TeamVectorGMRES::template invoke(member, A, b, x, _handle); } } @@ -143,10 +122,8 @@ struct Functor_TestBatchedTeamVectorGMRES_2 { Kokkos::Timer timer; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy auto_policy(_handle.get_number_of_teams(), - Kokkos::AUTO(), Kokkos::AUTO()); - Kokkos::TeamPolicy tuned_policy(_handle.get_number_of_teams(), - _team_size, _vector_length); + Kokkos::TeamPolicy auto_policy(_handle.get_number_of_teams(), Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy tuned_policy(_handle.get_number_of_teams(), _team_size, _vector_length); Kokkos::TeamPolicy policy; if (_team_size < 1) @@ -158,8 +135,8 @@ struct Functor_TestBatchedTeamVectorGMRES_2 { _handle.set_memory_strategy(1); - _handle.tmp_view = typename KrylovHandleType::TemporaryViewType( - "", _X.extent(0), _X.extent(1) + maximum_iteration + 3); + _handle.tmp_view = + typename KrylovHandleType::TemporaryViewType("", _X.extent(0), _X.extent(1) + maximum_iteration + 3); 
using ScalarType = typename ValuesViewType::non_const_value_type; using Layout = typename ValuesViewType::array_layout; diff --git a/perf_test/batched/sparse/GMRES/Functor_TestBatchedTeamVectorGMRES_3.hpp b/perf_test/batched/sparse/GMRES/Functor_TestBatchedTeamVectorGMRES_3.hpp index 013984b3d1..7c7d9103b2 100644 --- a/perf_test/batched/sparse/GMRES/Functor_TestBatchedTeamVectorGMRES_3.hpp +++ b/perf_test/batched/sparse/GMRES/Functor_TestBatchedTeamVectorGMRES_3.hpp @@ -14,8 +14,8 @@ // //@HEADER -template +template struct Functor_TestBatchedTeamVectorGMRES_3 { const ValuesViewType _D; const ValuesViewType _diag; @@ -32,12 +32,11 @@ struct Functor_TestBatchedTeamVectorGMRES_3 { KrylovHandleType _handle; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorGMRES_3( - const ValuesViewType &D, const IntView &r, const IntView &c, - const VectorViewType &X, const VectorViewType &B, const int N_team, - const int team_size, const int vector_length, const int N_iteration, - const double tol, const int ortho_strategy, const int arnoldi_level, - const int other_level, KrylovHandleType &handle) + Functor_TestBatchedTeamVectorGMRES_3(const ValuesViewType &D, const IntView &r, const IntView &c, + const VectorViewType &X, const VectorViewType &B, const int N_team, + const int team_size, const int vector_length, const int N_iteration, + const double tol, const int ortho_strategy, const int arnoldi_level, + const int other_level, KrylovHandleType &handle) : _D(D), _r(r), _c(c), @@ -54,12 +53,11 @@ struct Functor_TestBatchedTeamVectorGMRES_3 { _handle(handle) {} KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorGMRES_3( - const ValuesViewType &D, const ValuesViewType &diag, const IntView &r, - const IntView &c, const VectorViewType &X, const VectorViewType &B, - const int N_team, const int team_size, const int vector_length, - const int N_iteration, const double tol, int ortho_strategy, - const int arnoldi_level, const int other_level, KrylovHandleType &handle) + 
Functor_TestBatchedTeamVectorGMRES_3(const ValuesViewType &D, const ValuesViewType &diag, const IntView &r, + const IntView &c, const VectorViewType &X, const VectorViewType &B, + const int N_team, const int team_size, const int vector_length, + const int N_iteration, const double tol, int ortho_strategy, + const int arnoldi_level, const int other_level, KrylovHandleType &handle) : _D(D), _diag(diag), _r(r), @@ -81,60 +79,41 @@ struct Functor_TestBatchedTeamVectorGMRES_3 { const int first_matrix = _handle.first_index(member.league_rank()); const int last_matrix = _handle.last_index(member.league_rank()); - using TeamVectorCopy1D = - KokkosBatched::TeamVectorCopy; - - auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - - using ScratchPadIntViewType = - Kokkos::View; - using ScratchPadValuesViewType = Kokkos::View< - typename ValuesViewType::non_const_value_type **, - typename ValuesViewType::array_layout, - typename ValuesViewType::execution_space::scratch_memory_space>; - using Operator = - KokkosBatched::CrsMatrix; - - ScratchPadIntViewType tmp_1D_int(member.team_scratch(0), - _r.extent(0) + _c.extent(0)); - - auto r = - Kokkos::subview(tmp_1D_int, Kokkos::make_pair(0, (int)_r.extent(0))); - auto c = Kokkos::subview( - tmp_1D_int, - Kokkos::make_pair((int)_r.extent(0), (int)tmp_1D_int.extent(0))); + using TeamVectorCopy1D = KokkosBatched::TeamVectorCopy; + + auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + + using ScratchPadIntViewType = Kokkos::View; + using ScratchPadValuesViewType = + Kokkos::View; + using Operator = 
KokkosBatched::CrsMatrix; + + ScratchPadIntViewType tmp_1D_int(member.team_scratch(0), _r.extent(0) + _c.extent(0)); + + auto r = Kokkos::subview(tmp_1D_int, Kokkos::make_pair(0, (int)_r.extent(0))); + auto c = Kokkos::subview(tmp_1D_int, Kokkos::make_pair((int)_r.extent(0), (int)tmp_1D_int.extent(0))); TeamVectorCopy1D::invoke(member, _r, r); TeamVectorCopy1D::invoke(member, _c, c); Operator A(d, r, c); if (UsePrec) { - ScratchPadValuesViewType diag( - member.team_scratch(0), last_matrix - first_matrix, _diag.extent(1)); + ScratchPadValuesViewType diag(member.team_scratch(0), last_matrix - first_matrix, _diag.extent(1)); using PrecOperator = KokkosBatched::JacobiPrec; KokkosBatched::TeamVectorCopy::invoke( - member, - Kokkos::subview(_diag, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL), - diag); + member, Kokkos::subview(_diag, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL), diag); PrecOperator P(diag); P.setComputedInverse(); - KokkosBatched::TeamVectorGMRES::template invoke< - Operator, VectorViewType, PrecOperator, KrylovHandleType>( - member, A, b, x, P, _handle); + KokkosBatched::TeamVectorGMRES::template invoke(member, A, b, x, P, _handle); } else { - KokkosBatched::TeamVectorGMRES::template invoke< - Operator, VectorViewType>(member, A, b, x, _handle); + KokkosBatched::TeamVectorGMRES::template invoke(member, A, b, x, _handle); } } @@ -143,10 +122,8 @@ struct Functor_TestBatchedTeamVectorGMRES_3 { Kokkos::Timer timer; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy auto_policy(_handle.get_number_of_teams(), - Kokkos::AUTO(), Kokkos::AUTO()); - Kokkos::TeamPolicy tuned_policy(_handle.get_number_of_teams(), - _team_size, _vector_length); + Kokkos::TeamPolicy auto_policy(_handle.get_number_of_teams(), Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy tuned_policy(_handle.get_number_of_teams(), _team_size, _vector_length); Kokkos::TeamPolicy policy; if (_team_size < 1) @@ -168,14 +145,13 @@ struct 
Functor_TestBatchedTeamVectorGMRES_3 { size_t bytes_row_ptr = IntView::shmem_size(_r.extent(0)); size_t bytes_col_idc = IntView::shmem_size(_c.extent(0)); size_t bytes_2D_1 = ViewType2D::shmem_size(_N_team, _X.extent(1)); - size_t bytes_2D_2 = ViewType2D::shmem_size(_N_team, maximum_iteration + 1); + size_t bytes_2D_2 = ViewType2D::shmem_size(_N_team, maximum_iteration + 1); size_t bytes_int = bytes_row_ptr + bytes_col_idc; size_t bytes_diag = bytes_2D_1; size_t bytes_tmp = 2 * bytes_2D_1 + 2 * bytes_1D + bytes_2D_2; - policy.set_scratch_size( - 0, Kokkos::PerTeam(bytes_tmp + bytes_diag + bytes_int)); + policy.set_scratch_size(0, Kokkos::PerTeam(bytes_tmp + bytes_diag + bytes_int)); exec_space().fence(); timer.reset(); diff --git a/perf_test/batched/sparse/GMRES/KokkosBatched_Test_GMRES.cpp b/perf_test/batched/sparse/GMRES/KokkosBatched_Test_GMRES.cpp index c0ce8f0bd4..f69ccadd7e 100644 --- a/perf_test/batched/sparse/GMRES/KokkosBatched_Test_GMRES.cpp +++ b/perf_test/batched/sparse/GMRES/KokkosBatched_Test_GMRES.cpp @@ -82,60 +82,50 @@ int main(int argc, char *argv[]) { for (int i = 1; i < argc; ++i) { const std::string &token = argv[i]; if (token == std::string("--help") || token == std::string("-h")) { - std::cout - << "Kokkos Batched GMRES performance test options:" << std::endl - << "-A : Filename of the input batched matrix." - << std::endl - << "-B : Filename of the input batched right-hand " - "side." - << std::endl - << "-X : Filename of the output batched solution." - << std::endl - << "-res : Filename of the output residual history." - << std::endl - << "-timers : Filename of the output timers." - << std::endl - << "-ortho_strategy : Select the orthogonalization strategy." - << std::endl - << "-arnoldi_level : Select the scratch pad level (if used) " - "where Arnoldi vectors are stored." - << std::endl - << "-other_level : Select the scratch pad level (if used) " - "where everything except the Arnoldi vectors are stored." 
- << std::endl - << "-n1 : Number of repetitions of the experience." - << std::endl - << "-n2 : Number of the kernel calls inside one " - "experience." - << std::endl - << "-team_size : Used team size." << std::endl - << "-n_implementations: Number of implementations to use: test " - "all " - "implementations [0, specified number -1]." - << std::endl - << "-implementation : Specify only one implementation at a time." - << std::endl - << " Note: implementation 0 : does not use " - "scratch pad." - << std::endl - << " Note: implementation 1 : use scratch pad " - "for the graph and for the diagonal entries of the matrices." - << std::endl - << " Note: implementation 2 : use scratch pad " - "for the graph and for the diagonal entries of the matrices and " - "for the temporary variable but not for the Arnoldi vectors." - << std::endl - << "-l : Specify left layout." << std::endl - << "-r : Specify right layout." << std::endl - << "-P : Specify if a Jacobi preconditioner is " - "used." - << std::endl - << "-C : Specify if the convergence is monitored." - << std::endl - << "-N_team : Specify the number of systems per team." - << std::endl - << "-vector_length : Specify the vector length." << std::endl - << std::endl; + std::cout << "Kokkos Batched GMRES performance test options:" << std::endl + << "-A : Filename of the input batched matrix." << std::endl + << "-B : Filename of the input batched right-hand " + "side." + << std::endl + << "-X : Filename of the output batched solution." << std::endl + << "-res : Filename of the output residual history." << std::endl + << "-timers : Filename of the output timers." << std::endl + << "-ortho_strategy : Select the orthogonalization strategy." << std::endl + << "-arnoldi_level : Select the scratch pad level (if used) " + "where Arnoldi vectors are stored." + << std::endl + << "-other_level : Select the scratch pad level (if used) " + "where everything except the Arnoldi vectors are stored." 
+ << std::endl + << "-n1 : Number of repetitions of the experience." << std::endl + << "-n2 : Number of the kernel calls inside one " + "experience." + << std::endl + << "-team_size : Used team size." << std::endl + << "-n_implementations: Number of implementations to use: test " + "all " + "implementations [0, specified number -1]." + << std::endl + << "-implementation : Specify only one implementation at a time." << std::endl + << " Note: implementation 0 : does not use " + "scratch pad." + << std::endl + << " Note: implementation 1 : use scratch pad " + "for the graph and for the diagonal entries of the matrices." + << std::endl + << " Note: implementation 2 : use scratch pad " + "for the graph and for the diagonal entries of the matrices and " + "for the temporary variable but not for the Arnoldi vectors." + << std::endl + << "-l : Specify left layout." << std::endl + << "-r : Specify right layout." << std::endl + << "-P : Specify if a Jacobi preconditioner is " + "used." + << std::endl + << "-C : Specify if the convergence is monitored." << std::endl + << "-N_team : Specify the number of systems per team." << std::endl + << "-vector_length : Specify the vector length." 
<< std::endl + << std::endl; return 0; } if (token == std::string("-A")) name_A = argv[++i]; @@ -143,26 +133,18 @@ int main(int argc, char *argv[]) { if (token == std::string("-X")) name_X = argv[++i]; if (token == std::string("-res")) name_conv = argv[++i]; if (token == std::string("-timers")) name_timer = argv[++i]; - if (token == std::string("-ortho_strategy")) - ortho_strategy = std::atoi(argv[++i]); - if (token == std::string("-arnoldi_level")) - arnoldi_level = std::atoi(argv[++i]); - if (token == std::string("-other_level")) - other_level = std::atoi(argv[++i]); + if (token == std::string("-ortho_strategy")) ortho_strategy = std::atoi(argv[++i]); + if (token == std::string("-arnoldi_level")) arnoldi_level = std::atoi(argv[++i]); + if (token == std::string("-other_level")) other_level = std::atoi(argv[++i]); if (token == std::string("-n1")) n_rep_1 = std::atoi(argv[++i]); if (token == std::string("-n2")) n_rep_2 = std::atoi(argv[++i]); - if (token == std::string("-n_iterations")) - n_iterations = std::atoi(argv[++i]); + if (token == std::string("-n_iterations")) n_iterations = std::atoi(argv[++i]); if (token == std::string("-tol")) tol = std::stod(argv[++i]); if (token == std::string("-team_size")) team_size = std::atoi(argv[++i]); - if (token == std::string("-N_team")) - N_team_potential = std::atoi(argv[++i]); - if (token == std::string("-vector_length")) - vector_length = std::atoi(argv[++i]); - if (token == std::string("-n_implementations")) - n_impl = std::atoi(argv[++i]); - if (token == std::string("-implementation")) - impls.push_back(std::atoi(argv[++i])); + if (token == std::string("-N_team")) N_team_potential = std::atoi(argv[++i]); + if (token == std::string("-vector_length")) vector_length = std::atoi(argv[++i]); + if (token == std::string("-n_implementations")) n_impl = std::atoi(argv[++i]); + if (token == std::string("-implementation")) impls.push_back(std::atoi(argv[++i])); if (token == std::string("-l")) { layout_left = true; layout_right = 
false; @@ -179,9 +161,8 @@ int main(int argc, char *argv[]) { readSizesFromMM(name_A, Blk, ncols, nnz, N); - std::cout << "N_team_potential = " << N_team_potential << ", n = " << Blk - << ", N = " << N << ", team_size = " << team_size - << ", vector_length = " << vector_length << std::endl; + std::cout << "N_team_potential = " << N_team_potential << ", n = " << Blk << ", N = " << N + << ", team_size = " << team_size << ", vector_length = " << vector_length << std::endl; if (impls.size() == 0) for (int i = 0; i < n_impl; ++i) impls.push_back(i); @@ -190,9 +171,7 @@ int main(int argc, char *argv[]) { constexpr size_t LLC_CAPACITY = 80 * 6 * 1024 * 1024; KokkosBatched::Flush flush; - printf( - " :::: GMRES Testing (N = %d, Blk = %d, nnz = %d, vl = %d, n = %d)\n", - N, Blk, nnz, vector_length, n_rep_1); + printf(" :::: GMRES Testing (N = %d, Blk = %d, nnz = %d, vl = %d, n = %d)\n", N, Blk, nnz, vector_length, n_rep_1); typedef Kokkos::LayoutRight LR; typedef Kokkos::LayoutLeft LL; @@ -221,22 +200,18 @@ int main(int argc, char *argv[]) { XYTypeLL xLL("values", N, Blk); XYTypeLL yLL("values", N, Blk); - if (layout_left) - printf(" :::: Testing left layout (team_size = %d)\n", team_size); - if (layout_right) - printf(" :::: Testing right layout (team_size = %d)\n", team_size); + if (layout_left) printf(" :::: Testing left layout (team_size = %d)\n", team_size); + if (layout_right) printf(" :::: Testing right layout (team_size = %d)\n", team_size); if (layout_left) { readCRSFromMM(name_A, valuesLL, rowOffsets, colIndices); readArrayFromMM(name_B, yLL); - if (use_preconditioner) - getInvDiagFromCRS(valuesLL, rowOffsets, colIndices, diagLL); + if (use_preconditioner) getInvDiagFromCRS(valuesLL, rowOffsets, colIndices, diagLL); } if (layout_right) { readCRSFromMM(name_A, valuesLR, rowOffsets, colIndices); readArrayFromMM(name_B, yLR); - if (use_preconditioner) - getInvDiagFromCRS(valuesLR, rowOffsets, colIndices, diagLR); + if (use_preconditioner) getInvDiagFromCRS(valuesLR, 
rowOffsets, colIndices, diagLR); } for (auto i_impl : impls) { @@ -256,12 +231,9 @@ int main(int argc, char *argv[]) { using Scalar3DViewType = Kokkos::View; using IntViewType = Kokkos::View; - using KrylovHandleType = - KokkosBatched::KrylovHandle; + using KrylovHandleType = KokkosBatched::KrylovHandle; KrylovHandleType handle(N, N_team, n_iterations, true); - handle.Arnoldi_view = - Scalar3DViewType("", N, n_iterations, Blk + n_iterations + 3); + handle.Arnoldi_view = Scalar3DViewType("", N, n_iterations, Blk + n_iterations + 3); // handle.tmp_view = typename KrylovHandleType::TemporaryViewType( // "", N, Blk + n_iterations + 3); @@ -285,110 +257,86 @@ int main(int argc, char *argv[]) { if (i_impl == 0 && layout_left) { if (use_preconditioner) - t_spmv += Functor_TestBatchedTeamVectorGMRES_1< - exec_space, AMatrixValueViewLL, IntView, XYTypeLL, - KrylovHandleType, true>( - valuesLL, diagLL, rowOffsets, colIndices, xLL, yLL, - N_team, team_size, vector_length, n_iterations, tol, - ortho_strategy, arnoldi_level, other_level, handle) + t_spmv += Functor_TestBatchedTeamVectorGMRES_1( + valuesLL, diagLL, rowOffsets, colIndices, xLL, yLL, N_team, team_size, vector_length, + n_iterations, tol, ortho_strategy, arnoldi_level, other_level, handle) .run(); else - t_spmv += Functor_TestBatchedTeamVectorGMRES_1< - exec_space, AMatrixValueViewLL, IntView, XYTypeLL, - KrylovHandleType, false>( - valuesLL, rowOffsets, colIndices, xLL, yLL, N_team, - team_size, vector_length, n_iterations, tol, - ortho_strategy, arnoldi_level, other_level, handle) + t_spmv += Functor_TestBatchedTeamVectorGMRES_1( + valuesLL, rowOffsets, colIndices, xLL, yLL, N_team, team_size, vector_length, n_iterations, + tol, ortho_strategy, arnoldi_level, other_level, handle) .run(); } if (i_impl == 1 && layout_left) { if (use_preconditioner) - t_spmv += Functor_TestBatchedTeamVectorGMRES_2< - exec_space, AMatrixValueViewLL, IntView, XYTypeLL, - KrylovHandleType, true>( - valuesLL, diagLL, rowOffsets, 
colIndices, xLL, yLL, - N_team, team_size, vector_length, n_iterations, tol, - ortho_strategy, arnoldi_level, other_level, handle) + t_spmv += Functor_TestBatchedTeamVectorGMRES_2( + valuesLL, diagLL, rowOffsets, colIndices, xLL, yLL, N_team, team_size, vector_length, + n_iterations, tol, ortho_strategy, arnoldi_level, other_level, handle) .run(); else - t_spmv += Functor_TestBatchedTeamVectorGMRES_2< - exec_space, AMatrixValueViewLL, IntView, XYTypeLL, - KrylovHandleType, false>( - valuesLL, rowOffsets, colIndices, xLL, yLL, N_team, - team_size, vector_length, n_iterations, tol, - ortho_strategy, arnoldi_level, other_level, handle) + t_spmv += Functor_TestBatchedTeamVectorGMRES_2( + valuesLL, rowOffsets, colIndices, xLL, yLL, N_team, team_size, vector_length, n_iterations, + tol, ortho_strategy, arnoldi_level, other_level, handle) .run(); } if (i_impl == 2 && layout_left) { if (use_preconditioner) - t_spmv += Functor_TestBatchedTeamVectorGMRES_3< - exec_space, AMatrixValueViewLL, IntView, XYTypeLL, - KrylovHandleType, true>( - valuesLL, diagLL, rowOffsets, colIndices, xLL, yLL, - N_team, team_size, vector_length, n_iterations, tol, - ortho_strategy, arnoldi_level, other_level, handle) + t_spmv += Functor_TestBatchedTeamVectorGMRES_3( + valuesLL, diagLL, rowOffsets, colIndices, xLL, yLL, N_team, team_size, vector_length, + n_iterations, tol, ortho_strategy, arnoldi_level, other_level, handle) .run(); else - t_spmv += Functor_TestBatchedTeamVectorGMRES_3< - exec_space, AMatrixValueViewLL, IntView, XYTypeLL, - KrylovHandleType, false>( - valuesLL, rowOffsets, colIndices, xLL, yLL, N_team, - team_size, vector_length, n_iterations, tol, - ortho_strategy, arnoldi_level, other_level, handle) + t_spmv += Functor_TestBatchedTeamVectorGMRES_3( + valuesLL, rowOffsets, colIndices, xLL, yLL, N_team, team_size, vector_length, n_iterations, + tol, ortho_strategy, arnoldi_level, other_level, handle) .run(); } if (i_impl == 0 && layout_right) { if (use_preconditioner) - t_spmv += 
Functor_TestBatchedTeamVectorGMRES_1< - exec_space, AMatrixValueViewLR, IntView, XYTypeLR, - KrylovHandleType, true>( - valuesLR, diagLR, rowOffsets, colIndices, xLR, yLR, - N_team, team_size, vector_length, n_iterations, tol, - ortho_strategy, arnoldi_level, other_level, handle) + t_spmv += Functor_TestBatchedTeamVectorGMRES_1( + valuesLR, diagLR, rowOffsets, colIndices, xLR, yLR, N_team, team_size, vector_length, + n_iterations, tol, ortho_strategy, arnoldi_level, other_level, handle) .run(); else - t_spmv += Functor_TestBatchedTeamVectorGMRES_1< - exec_space, AMatrixValueViewLR, IntView, XYTypeLR, - KrylovHandleType, false>( - valuesLR, rowOffsets, colIndices, xLR, yLR, N_team, - team_size, vector_length, n_iterations, tol, - ortho_strategy, arnoldi_level, other_level, handle) + t_spmv += Functor_TestBatchedTeamVectorGMRES_1( + valuesLR, rowOffsets, colIndices, xLR, yLR, N_team, team_size, vector_length, n_iterations, + tol, ortho_strategy, arnoldi_level, other_level, handle) .run(); } if (i_impl == 1 && layout_right) { if (use_preconditioner) - t_spmv += Functor_TestBatchedTeamVectorGMRES_2< - exec_space, AMatrixValueViewLR, IntView, XYTypeLR, - KrylovHandleType, true>( - valuesLR, diagLR, rowOffsets, colIndices, xLR, yLR, - N_team, team_size, vector_length, n_iterations, tol, - ortho_strategy, arnoldi_level, other_level, handle) + t_spmv += Functor_TestBatchedTeamVectorGMRES_2( + valuesLR, diagLR, rowOffsets, colIndices, xLR, yLR, N_team, team_size, vector_length, + n_iterations, tol, ortho_strategy, arnoldi_level, other_level, handle) .run(); else - t_spmv += Functor_TestBatchedTeamVectorGMRES_2< - exec_space, AMatrixValueViewLR, IntView, XYTypeLR, - KrylovHandleType, false>( - valuesLR, rowOffsets, colIndices, xLR, yLR, N_team, - team_size, vector_length, n_iterations, tol, - ortho_strategy, arnoldi_level, other_level, handle) + t_spmv += Functor_TestBatchedTeamVectorGMRES_2( + valuesLR, rowOffsets, colIndices, xLR, yLR, N_team, team_size, vector_length, 
n_iterations, + tol, ortho_strategy, arnoldi_level, other_level, handle) .run(); } if (i_impl == 2 && layout_right) { if (use_preconditioner) - t_spmv += Functor_TestBatchedTeamVectorGMRES_3< - exec_space, AMatrixValueViewLR, IntView, XYTypeLR, - KrylovHandleType, true>( - valuesLR, diagLR, rowOffsets, colIndices, xLR, yLR, - N_team, team_size, vector_length, n_iterations, tol, - ortho_strategy, arnoldi_level, other_level, handle) + t_spmv += Functor_TestBatchedTeamVectorGMRES_3( + valuesLR, diagLR, rowOffsets, colIndices, xLR, yLR, N_team, team_size, vector_length, + n_iterations, tol, ortho_strategy, arnoldi_level, other_level, handle) .run(); else - t_spmv += Functor_TestBatchedTeamVectorGMRES_3< - exec_space, AMatrixValueViewLR, IntView, XYTypeLR, - KrylovHandleType, false>( - valuesLR, rowOffsets, colIndices, xLR, yLR, N_team, - team_size, vector_length, n_iterations, tol, - ortho_strategy, arnoldi_level, other_level, handle) + t_spmv += Functor_TestBatchedTeamVectorGMRES_3( + valuesLR, rowOffsets, colIndices, xLR, yLR, N_team, team_size, vector_length, n_iterations, + tol, ortho_strategy, arnoldi_level, other_level, handle) .run(); } exec_space().fence(); @@ -403,10 +351,8 @@ int main(int argc, char *argv[]) { { std::ofstream myfile; std::string name; - if (layout_left) - name = name_timer + "_" + std::to_string(i_impl) + "_left.txt"; - if (layout_right) - name = name_timer + "_" + std::to_string(i_impl) + "_right.txt"; + if (layout_left) name = name_timer + "_" + std::to_string(i_impl) + "_left.txt"; + if (layout_right) name = name_timer + "_" + std::to_string(i_impl) + "_right.txt"; myfile.open(name); @@ -419,15 +365,10 @@ int main(int argc, char *argv[]) { double average_time = 0.; - for (size_t i = 0; i < timers.size(); ++i) - average_time += timers[i] / timers.size(); + for (size_t i = 0; i < timers.size(); ++i) average_time += timers[i] / timers.size(); - if (layout_left) - printf("Left layout: Implementation %d: solve time = %f\n", i_impl, - 
average_time); - if (layout_right) - printf("Right layout: Implementation %d: solve time = %f\n", i_impl, - average_time); + if (layout_left) printf("Left layout: Implementation %d: solve time = %f\n", i_impl, average_time); + if (layout_right) printf("Right layout: Implementation %d: solve time = %f\n", i_impl, average_time); if (layout_left) { writeArrayToMM(name_X + std::to_string(i_impl) + "_l.mm", xLL); @@ -436,8 +377,7 @@ int main(int argc, char *argv[]) { writeArrayToMM(name_X + std::to_string(i_impl) + "_r.mm", xLR); } if (monitor_convergence) { - writeArrayToMM(name_conv + std::to_string(i_impl) + ".mm", - handle.residual_norms); + writeArrayToMM(name_conv + std::to_string(i_impl) + ".mm", handle.residual_norms); } } } diff --git a/perf_test/batched/sparse/KokkosBatched_Test_Sparse_Helper.hpp b/perf_test/batched/sparse/KokkosBatched_Test_Sparse_Helper.hpp index 1eaacbde5e..53f1c48f6c 100644 --- a/perf_test/batched/sparse/KokkosBatched_Test_Sparse_Helper.hpp +++ b/perf_test/batched/sparse/KokkosBatched_Test_Sparse_Helper.hpp @@ -36,11 +36,9 @@ void writeArrayToMM(std::string name, const XType x) { myfile.close(); } -void readSizesFromMM(std::string name, int &nrows, int &ncols, int &nnz, - int &N) { +void readSizesFromMM(std::string name, int &nrows, int &ncols, int &nnz, int &N) { std::ifstream input(name); - while (input.peek() == '%') - input.ignore(std::numeric_limits::max(), '\n'); + while (input.peek() == '%') input.ignore(std::numeric_limits::max(), '\n'); std::string line_sizes; @@ -67,8 +65,7 @@ template void readArrayFromMM(std::string name, const XType &x) { std::ifstream input(name); - while (input.peek() == '%') - input.ignore(std::numeric_limits::max(), '\n'); + while (input.peek() == '%') input.ignore(std::numeric_limits::max(), '\n'); input.ignore(std::numeric_limits::max(), '\n'); typename XType::HostMirror x_h = Kokkos::create_mirror_view(x); @@ -85,8 +82,7 @@ template void readDenseFromMM(std::string name, const AType &A) { std::ifstream 
input(name); - while (input.peek() == '%') - input.ignore(std::numeric_limits::max(), '\n'); + while (input.peek() == '%') input.ignore(std::numeric_limits::max(), '\n'); input.ignore(std::numeric_limits::max(), '\n'); typename AType::HostMirror A_h = Kokkos::create_mirror_view(A); @@ -113,12 +109,10 @@ void readDenseFromMM(std::string name, const AType &A) { } template -void readCRSFromMM(std::string name, const VType &V, const IntType &r, - const IntType &c) { +void readCRSFromMM(std::string name, const VType &V, const IntType &r, const IntType &c) { std::ifstream input(name); - while (input.peek() == '%') - input.ignore(std::numeric_limits::max(), '\n'); + while (input.peek() == '%') input.ignore(std::numeric_limits::max(), '\n'); input.ignore(std::numeric_limits::max(), '\n'); typename VType::HostMirror V_h = Kokkos::create_mirror_view(V); @@ -137,8 +131,7 @@ void readCRSFromMM(std::string name, const VType &V, const IntType &r, input >> read_row >> c_h(i); --read_row; --c_h(i); - for (int tmp_row = current_row + 1; tmp_row <= read_row; ++tmp_row) - r_h(tmp_row) = i; + for (int tmp_row = current_row + 1; tmp_row <= read_row; ++tmp_row) r_h(tmp_row) = i; current_row = read_row; // if (VType::rank == 1) @@ -157,8 +150,7 @@ void readCRSFromMM(std::string name, const VType &V, const IntType &r, } template -void getInvDiagFromCRS(const VType &V, const IntType &r, const IntType &c, - const VType &diag) { +void getInvDiagFromCRS(const VType &V, const IntType &r, const IntType &c, const VType &diag) { auto diag_values_host = Kokkos::create_mirror_view(diag); auto values_host = Kokkos::create_mirror_view(V); auto row_ptr_host = Kokkos::create_mirror_view(r); @@ -173,12 +165,10 @@ void getInvDiagFromCRS(const VType &V, const IntType &r, const IntType &c, int BlkSize = diag.extent(1); for (int i = 0; i < BlkSize; ++i) { - for (current_index = row_ptr_host(i); current_index < row_ptr_host(i + 1); - ++current_index) { + for (current_index = row_ptr_host(i); current_index < 
row_ptr_host(i + 1); ++current_index) { if (colIndices_host(current_index) == i) break; } - for (int j = 0; j < N; ++j) - diag_values_host(j, i) = 1. / values_host(j, current_index); + for (int j = 0; j < N; ++j) diag_values_host(j, i) = 1. / values_host(j, current_index); } Kokkos::deep_copy(diag, diag_values_host); diff --git a/perf_test/batched/sparse/SPMV/KokkosBatched_SPMV_View.hpp b/perf_test/batched/sparse/SPMV/KokkosBatched_SPMV_View.hpp index 17b8ad6d3e..c1cdec2778 100644 --- a/perf_test/batched/sparse/SPMV/KokkosBatched_SPMV_View.hpp +++ b/perf_test/batched/sparse/SPMV/KokkosBatched_SPMV_View.hpp @@ -14,8 +14,7 @@ // //@HEADER -template +template struct BSPMV_Functor_View { typedef typename AMatrix::execution_space exec_space; typedef typename AMatrix::non_const_value_type value_type; @@ -36,11 +35,9 @@ struct BSPMV_Functor_View { const int N; int implementation; - BSPMV_Functor_View(const value_type* alpha_, const AMatrix m_A_values_, - const IntView m_A_row_ptr_, const IntView m_A_col_indices_, - const XVector m_x_, const value_type* beta_, - const YVector m_y_, const int matrices_per_team_, - const int N_, const int implementation_ = 0) + BSPMV_Functor_View(const value_type* alpha_, const AMatrix m_A_values_, const IntView m_A_row_ptr_, + const IntView m_A_col_indices_, const XVector m_x_, const value_type* beta_, const YVector m_y_, + const int matrices_per_team_, const int N_, const int implementation_ = 0) : alpha(alpha_), m_A_values(m_A_values_), m_A_row_ptr(m_A_row_ptr_), @@ -51,23 +48,16 @@ struct BSPMV_Functor_View { matrices_per_team(matrices_per_team_), N(N_), implementation(implementation_) { - static_assert(static_cast(AMatrix::rank) == 2, - "AMatrix must be a rank 2 View."); - static_assert(static_cast(IntView::rank) == 1, - "IntView must be a rank 1 View."); - static_assert(static_cast(XVector::rank) == 2, - "XVector must be a rank 2 View."); - static_assert(static_cast(YVector::rank) == 2, - "YVector must be a rank 2 View."); + 
static_assert(static_cast(AMatrix::rank) == 2, "AMatrix must be a rank 2 View."); + static_assert(static_cast(IntView::rank) == 1, "IntView must be a rank 1 View."); + static_assert(static_cast(XVector::rank) == 2, "XVector must be a rank 2 View."); + static_assert(static_cast(YVector::rank) == 2, "YVector must be a rank 2 View."); } - KOKKOS_INLINE_FUNCTION void getIndices(const ordinal_type iTemp, - const ordinal_type n_rows, - const ordinal_type n_matrices, - ordinal_type& iRow, + KOKKOS_INLINE_FUNCTION void getIndices(const ordinal_type iTemp, const ordinal_type n_rows, + const ordinal_type n_matrices, ordinal_type& iRow, ordinal_type& iMatrix) const { - if (std::is_same::value) { + if (std::is_same::value) { iRow = iTemp / n_matrices; iMatrix = iTemp % n_matrices; } else { @@ -78,90 +68,72 @@ struct BSPMV_Functor_View { KOKKOS_INLINE_FUNCTION void operator()(const team_member& dev) const { if (implementation == 0) { - const int first_matrix = - static_cast(dev.league_rank()) * matrices_per_team; - const int last_matrix = - static_cast(dev.league_rank() + 1) * matrices_per_team < N - ? static_cast(dev.league_rank() + 1) * matrices_per_team - : N; + const int first_matrix = static_cast(dev.league_rank()) * matrices_per_team; + const int last_matrix = static_cast(dev.league_rank() + 1) * matrices_per_team < N + ? 
static_cast(dev.league_rank() + 1) * matrices_per_team + : N; const ordinal_type n_rows = m_A_row_ptr.extent(0) - 1; for (int i_matrix = first_matrix; i_matrix < last_matrix; ++i_matrix) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(dev, 0, n_rows), - [&](const ordinal_type& iRow) { - const ordinal_type row_length = - m_A_row_ptr(iRow + 1) - m_A_row_ptr(iRow); - value_type sum = 0; - - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(dev, row_length), - [&](const ordinal_type& iEntry, value_type& lsum) { - const value_type val = - m_A_values(i_matrix, m_A_row_ptr(iRow) + iEntry); - lsum += - val * m_x(i_matrix, - m_A_col_indices(m_A_row_ptr(iRow) + iEntry)); - }, - sum); - - Kokkos::single(Kokkos::PerThread(dev), [&]() { - sum *= alpha[i_matrix]; - - if (dobeta == 0) { - m_y(i_matrix, iRow) = sum; - } else { - m_y(i_matrix, iRow) = - beta[i_matrix] * m_y(i_matrix, iRow) + sum; - } - }); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, 0, n_rows), [&](const ordinal_type& iRow) { + const ordinal_type row_length = m_A_row_ptr(iRow + 1) - m_A_row_ptr(iRow); + value_type sum = 0; + + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(dev, row_length), + [&](const ordinal_type& iEntry, value_type& lsum) { + const value_type val = m_A_values(i_matrix, m_A_row_ptr(iRow) + iEntry); + lsum += val * m_x(i_matrix, m_A_col_indices(m_A_row_ptr(iRow) + iEntry)); + }, + sum); + + Kokkos::single(Kokkos::PerThread(dev), [&]() { + sum *= alpha[i_matrix]; + + if (dobeta == 0) { + m_y(i_matrix, iRow) = sum; + } else { + m_y(i_matrix, iRow) = beta[i_matrix] * m_y(i_matrix, iRow) + sum; + } + }); + }); } } if (implementation == 1) { - const int first_matrix = - static_cast(dev.league_rank()) * matrices_per_team; - const int last_matrix = - static_cast(dev.league_rank() + 1) * matrices_per_team < N - ? 
static_cast(dev.league_rank() + 1) * matrices_per_team - : N; + const int first_matrix = static_cast(dev.league_rank()) * matrices_per_team; + const int last_matrix = static_cast(dev.league_rank() + 1) * matrices_per_team < N + ? static_cast(dev.league_rank() + 1) * matrices_per_team + : N; const int n_matrices = last_matrix - first_matrix; const ordinal_type n_rows = m_A_row_ptr.extent(0) - 1; - Kokkos::parallel_for( - Kokkos::TeamVectorRange(dev, 0, n_rows * n_matrices), - [&](const ordinal_type& iTemp) { - ordinal_type iRow, iMatrix; - this->getIndices(iTemp, n_rows, n_matrices, iRow, iMatrix); - const int iGlobalMatrix = first_matrix + iMatrix; + Kokkos::parallel_for(Kokkos::TeamVectorRange(dev, 0, n_rows * n_matrices), [&](const ordinal_type& iTemp) { + ordinal_type iRow, iMatrix; + this->getIndices(iTemp, n_rows, n_matrices, iRow, iMatrix); + const int iGlobalMatrix = first_matrix + iMatrix; - const ordinal_type row_length = - m_A_row_ptr(iRow + 1) - m_A_row_ptr(iRow); - value_type sum = 0; + const ordinal_type row_length = m_A_row_ptr(iRow + 1) - m_A_row_ptr(iRow); + value_type sum = 0; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif - for (int iEntry = 0; iEntry < row_length; ++iEntry) { - sum += m_A_values(iGlobalMatrix, m_A_row_ptr(iRow) + iEntry) * - m_x(iGlobalMatrix, - m_A_col_indices(m_A_row_ptr(iRow) + iEntry)); - } - - sum *= alpha[iGlobalMatrix]; - - if (dobeta == 0) { - m_y(iGlobalMatrix, iRow) = sum; - } else { - m_y(iGlobalMatrix, iRow) = - beta[iGlobalMatrix] * m_y(iGlobalMatrix, iRow) + sum; - } - }); + for (int iEntry = 0; iEntry < row_length; ++iEntry) { + sum += m_A_values(iGlobalMatrix, m_A_row_ptr(iRow) + iEntry) * + m_x(iGlobalMatrix, m_A_col_indices(m_A_row_ptr(iRow) + iEntry)); + } + + sum *= alpha[iGlobalMatrix]; + + if (dobeta == 0) { + m_y(iGlobalMatrix, iRow) = sum; + } else { + m_y(iGlobalMatrix, iRow) = beta[iGlobalMatrix] * m_y(iGlobalMatrix, iRow) + sum; + } + }); } if (implementation == 2) { - using 
ScratchPadIntView = - Kokkos::View; + using ScratchPadIntView = Kokkos::View; const ordinal_type n_rows = m_A_row_ptr.extent(0) - 1; const ordinal_type nnz = m_A_col_indices.extent(0); @@ -169,51 +141,43 @@ struct BSPMV_Functor_View { ScratchPadIntView cols(dev.team_scratch(0), nnz); ScratchPadIntView row_map(dev.team_scratch(0), n_rows + 1); - Kokkos::parallel_for( - Kokkos::TeamVectorRange(dev, 0, n_rows + 1), - [&](const ordinal_type& i) { row_map(i) = m_A_row_ptr(i); }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(dev, 0, n_rows + 1), + [&](const ordinal_type& i) { row_map(i) = m_A_row_ptr(i); }); - Kokkos::parallel_for( - Kokkos::TeamVectorRange(dev, 0, nnz), - [&](const ordinal_type& i) { cols(i) = m_A_col_indices(i); }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(dev, 0, nnz), + [&](const ordinal_type& i) { cols(i) = m_A_col_indices(i); }); dev.team_barrier(); - const int first_matrix = - static_cast(dev.league_rank()) * matrices_per_team; - const int last_matrix = - static_cast(dev.league_rank() + 1) * matrices_per_team < N - ? static_cast(dev.league_rank() + 1) * matrices_per_team - : N; - const int n_matrices = last_matrix - first_matrix; + const int first_matrix = static_cast(dev.league_rank()) * matrices_per_team; + const int last_matrix = static_cast(dev.league_rank() + 1) * matrices_per_team < N + ? 
static_cast(dev.league_rank() + 1) * matrices_per_team + : N; + const int n_matrices = last_matrix - first_matrix; - Kokkos::parallel_for( - Kokkos::TeamVectorRange(dev, 0, n_rows * n_matrices), - [&](const ordinal_type& iTemp) { - ordinal_type iRow, iMatrix; - this->getIndices(iTemp, n_rows, n_matrices, iRow, iMatrix); - const int iGlobalMatrix = first_matrix + iMatrix; + Kokkos::parallel_for(Kokkos::TeamVectorRange(dev, 0, n_rows * n_matrices), [&](const ordinal_type& iTemp) { + ordinal_type iRow, iMatrix; + this->getIndices(iTemp, n_rows, n_matrices, iRow, iMatrix); + const int iGlobalMatrix = first_matrix + iMatrix; - const ordinal_type row_length = row_map(iRow + 1) - row_map(iRow); - value_type sum = 0; + const ordinal_type row_length = row_map(iRow + 1) - row_map(iRow); + value_type sum = 0; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif - for (int iEntry = 0; iEntry < row_length; ++iEntry) { - sum += m_A_values(iGlobalMatrix, row_map(iRow) + iEntry) * - m_x(iGlobalMatrix, cols(row_map(iRow) + iEntry)); - } - - sum *= alpha[iGlobalMatrix]; - - if (dobeta == 0) { - m_y(iGlobalMatrix, iRow) = sum; - } else { - m_y(iGlobalMatrix, iRow) = - beta[iGlobalMatrix] * m_y(iGlobalMatrix, iRow) + sum; - } - }); + for (int iEntry = 0; iEntry < row_length; ++iEntry) { + sum += m_A_values(iGlobalMatrix, row_map(iRow) + iEntry) * m_x(iGlobalMatrix, cols(row_map(iRow) + iEntry)); + } + + sum *= alpha[iGlobalMatrix]; + + if (dobeta == 0) { + m_y(iGlobalMatrix, iRow) = sum; + } else { + m_y(iGlobalMatrix, iRow) = beta[iGlobalMatrix] * m_y(iGlobalMatrix, iRow) + sum; + } + }); } } }; \ No newline at end of file diff --git a/perf_test/batched/sparse/SPMV/KokkosBatched_Test_SPMV.cpp b/perf_test/batched/sparse/SPMV/KokkosBatched_Test_SPMV.cpp index 06ea55e303..e93c65f7f9 100644 --- a/perf_test/batched/sparse/SPMV/KokkosBatched_Test_SPMV.cpp +++ b/perf_test/batched/sparse/SPMV/KokkosBatched_Test_SPMV.cpp @@ -31,9 +31,8 @@ typedef typename exec_space::memory_space 
memory_space; typedef Kokkos::DefaultHostExecutionSpace host_space; typedef typename Kokkos::Device device; -template +template struct Functor_TestBatchedTeamVectorSpmv { PolicyType _policy; const alphaViewType _alpha; @@ -46,10 +45,9 @@ struct Functor_TestBatchedTeamVectorSpmv { int _matrices_per_team; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorSpmv( - PolicyType policy, const alphaViewType &alpha, const DViewType &D, - const IntView &r, const IntView &c, const xViewType &X, - const betaViewType &beta, const yViewType &Y, const int matrices_per_team) + Functor_TestBatchedTeamVectorSpmv(PolicyType policy, const alphaViewType &alpha, const DViewType &D, const IntView &r, + const IntView &c, const xViewType &X, const betaViewType &beta, const yViewType &Y, + const int matrices_per_team) : _policy(policy), _alpha(alpha), _D(D), @@ -62,28 +60,19 @@ struct Functor_TestBatchedTeamVectorSpmv { template KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { - const int first_matrix = - static_cast(member.league_rank()) * _matrices_per_team; - const int N = _D.extent(0); - const int last_matrix = - (static_cast(member.league_rank() + 1) * _matrices_per_team < N - ? 
static_cast(member.league_rank() + 1) * _matrices_per_team - : N); - - auto alpha_team = - Kokkos::subview(_alpha, Kokkos::make_pair(first_matrix, last_matrix)); - auto D_team = Kokkos::subview( - _D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); - auto X_team = Kokkos::subview( - _X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); - auto beta_team = - Kokkos::subview(_beta, Kokkos::make_pair(first_matrix, last_matrix)); - auto Y_team = Kokkos::subview( - _Y, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); - - using ScratchPadIntView = - Kokkos::View; + const int first_matrix = static_cast(member.league_rank()) * _matrices_per_team; + const int N = _D.extent(0); + const int last_matrix = (static_cast(member.league_rank() + 1) * _matrices_per_team < N + ? static_cast(member.league_rank() + 1) * _matrices_per_team + : N); + + auto alpha_team = Kokkos::subview(_alpha, Kokkos::make_pair(first_matrix, last_matrix)); + auto D_team = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto X_team = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto beta_team = Kokkos::subview(_beta, Kokkos::make_pair(first_matrix, last_matrix)); + auto Y_team = Kokkos::subview(_Y, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + + using ScratchPadIntView = Kokkos::View; const int n_rows = _r.extent(0) - 1; const int nnz = _c.extent(0); @@ -91,31 +80,23 @@ struct Functor_TestBatchedTeamVectorSpmv { ScratchPadIntView cols(member.team_scratch(0), nnz); ScratchPadIntView row_map(member.team_scratch(0), n_rows + 1); - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, n_rows + 1), - [&](const int &i) { row_map(i) = _r(i); }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, n_rows + 1), [&](const int &i) { row_map(i) = _r(i); }); - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, nnz), - [&](const int &i) { cols(i) = _c(i); }); + 
Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, nnz), [&](const int &i) { cols(i) = _c(i); }); member.team_barrier(); if (last_matrix != N && _matrices_per_team == 8) - KokkosBatched::TeamVectorSpmv< - MemberType, KokkosBatched::Trans::NoTranspose, - 8>::template invoke( + KokkosBatched::TeamVectorSpmv::template invoke< + DViewType, ScratchPadIntView, xViewType, yViewType, alphaViewType, betaViewType, dobeta>( member, alpha_team, D_team, row_map, cols, X_team, beta_team, Y_team); else - KokkosBatched::TeamVectorSpmv< - MemberType, KokkosBatched::Trans::NoTranspose, - 1>::template invoke( + KokkosBatched::TeamVectorSpmv::template invoke< + DViewType, ScratchPadIntView, xViewType, yViewType, alphaViewType, betaViewType, dobeta>( member, alpha_team, D_team, row_map, cols, X_team, beta_team, Y_team); } - inline void run() { - Kokkos::parallel_for("KokkosSparse::PerfTest::BSpMV", _policy, *this); - } + inline void run() { Kokkos::parallel_for("KokkosSparse::PerfTest::BSpMV", _policy, *this); } }; int main(int argc, char *argv[]) { @@ -151,53 +132,46 @@ int main(int argc, char *argv[]) { for (int i = 1; i < argc; ++i) { const std::string &token = argv[i]; if (token == std::string("--help") || token == std::string("-h")) { - std::cout - << "Kokkos Batched SPMV performance test options:" << std::endl - << "-A : Filename of the input batched matrix." - << std::endl - << "-B : Filename of the input batched right-hand " - "side." - << std::endl - << "-X : Filename of the output batched solution." - << std::endl - << "-timers : Filename of the output timers." - << std::endl - << "-n1 : Number of repetitions of the experience." - << std::endl - << "-n2 : Number of the kernel calls inside one " - "experience." - << std::endl - << "-team_size : Used team size." << std::endl - << "-n_implementations: Number of implementations to use: test " - "all " - "implementations [0, specified number -1]." 
- << std::endl - << "-implementation : Specify only one implementation at a time." - << std::endl - << " Note: implementation 0 : use a Team " - "approach where a Team have to apply N_team SPMV. A given team " - "applies N_team SPMV sequentially and uses a ThreadRange over " - "the row and a VectorRange over the non zero entries of a given " - "row." - << std::endl - << " Note: implementation 1 : use a Team " - "approach where a Team have to apply N_team SPMV. A given team " - "uses a fused thread vector range policy to loop over the " - "independent fibers." - << std::endl - << " Note: implementation 2 : same as " - "implementation 1 but using scratch pad for the graph." - << std::endl - << " Note: implementation 3 : same as " - "implementation 1 but using the kernels from " - "batched/sparse/impl/*." - << std::endl - << "-l : Specify left layout." << std::endl - << "-r : Specify right layout." << std::endl - << "-N_team : Specify the number of systems per team." - << std::endl - << "-vector_length : Specify the vector length." << std::endl - << std::endl; + std::cout << "Kokkos Batched SPMV performance test options:" << std::endl + << "-A : Filename of the input batched matrix." << std::endl + << "-B : Filename of the input batched right-hand " + "side." + << std::endl + << "-X : Filename of the output batched solution." << std::endl + << "-timers : Filename of the output timers." << std::endl + << "-n1 : Number of repetitions of the experience." << std::endl + << "-n2 : Number of the kernel calls inside one " + "experience." + << std::endl + << "-team_size : Used team size." << std::endl + << "-n_implementations: Number of implementations to use: test " + "all " + "implementations [0, specified number -1]." + << std::endl + << "-implementation : Specify only one implementation at a time." << std::endl + << " Note: implementation 0 : use a Team " + "approach where a Team have to apply N_team SPMV. 
A given team " + "applies N_team SPMV sequentially and uses a ThreadRange over " + "the row and a VectorRange over the non zero entries of a given " + "row." + << std::endl + << " Note: implementation 1 : use a Team " + "approach where a Team have to apply N_team SPMV. A given team " + "uses a fused thread vector range policy to loop over the " + "independent fibers." + << std::endl + << " Note: implementation 2 : same as " + "implementation 1 but using scratch pad for the graph." + << std::endl + << " Note: implementation 3 : same as " + "implementation 1 but using the kernels from " + "batched/sparse/impl/*." + << std::endl + << "-l : Specify left layout." << std::endl + << "-r : Specify right layout." << std::endl + << "-N_team : Specify the number of systems per team." << std::endl + << "-vector_length : Specify the vector length." << std::endl + << std::endl; return 0; } if (token == std::string("-A")) name_A = argv[++i]; @@ -209,15 +183,11 @@ int main(int argc, char *argv[]) { if (token == std::string("-n1")) n_rep_1 = std::atoi(argv[++i]); if (token == std::string("-n2")) n_rep_2 = std::atoi(argv[++i]); - if (token == std::string("-vector_length")) - vector_length = std::atoi(argv[++i]); - if (token == std::string("-N_team")) - N_team_potential = std::atoi(argv[++i]); + if (token == std::string("-vector_length")) vector_length = std::atoi(argv[++i]); + if (token == std::string("-N_team")) N_team_potential = std::atoi(argv[++i]); if (token == std::string("-team_size")) team_size = std::atoi(argv[++i]); - if (token == std::string("-n_implementations")) - n_impl = std::atoi(argv[++i]); - if (token == std::string("-implementation")) - impls.push_back(std::atoi(argv[++i])); + if (token == std::string("-n_implementations")) n_impl = std::atoi(argv[++i]); + if (token == std::string("-implementation")) impls.push_back(std::atoi(argv[++i])); if (token == std::string("-l")) { layout_left = true; layout_right = false; @@ -244,8 +214,7 @@ int main(int argc, char 
*argv[]) { printf( " :::: Testing (N = %d, Blk = %d, nnz = %d, vl = %d, vi = %d, n = " "%d, N_team_potential = %d)\n", - N, Blk, nnz, vector_length, internal_vector_length, n_rep_1, - N_team_potential); + N, Blk, nnz, vector_length, internal_vector_length, n_rep_1, N_team_potential); typedef Kokkos::LayoutRight LR; typedef Kokkos::LayoutLeft LL; @@ -274,10 +243,8 @@ int main(int argc, char *argv[]) { double *s_a = new double[N]; double *s_b = new double[N]; - if (layout_left) - printf(" :::: Testing left layout (team_size = %d)\n", team_size); - if (layout_right) - printf(" :::: Testing right layout (team_size = %d)\n", team_size); + if (layout_left) printf(" :::: Testing left layout (team_size = %d)\n", team_size); + if (layout_right) printf(" :::: Testing right layout (team_size = %d)\n", team_size); if (layout_left) { readCRSFromMM(name_A, valuesLL, rowOffsets, colIndices); @@ -301,8 +268,7 @@ int main(int argc, char *argv[]) { Kokkos::deep_copy(alphaV, alphaV_h); Kokkos::deep_copy(betaV, betaV_h); - using ScratchPadIntView = - Kokkos::View; + using ScratchPadIntView = Kokkos::View; for (auto i_impl : impls) { std::vector timers; @@ -327,12 +293,9 @@ int main(int argc, char *argv[]) { if (layout_left) { using policy_type = Kokkos::TeamPolicy; - policy_type auto_policy(number_of_teams, Kokkos::AUTO(), - Kokkos::AUTO()); - policy_type tuned_policy(number_of_teams, team_size, - Kokkos::AUTO()); - policy_type tuned_policy_2(number_of_teams, team_size, - vector_length); + policy_type auto_policy(number_of_teams, Kokkos::AUTO(), Kokkos::AUTO()); + policy_type tuned_policy(number_of_teams, team_size, Kokkos::AUTO()); + policy_type tuned_policy_2(number_of_teams, team_size, vector_length); policy_type policy; if (team_size < 1) @@ -347,33 +310,24 @@ int main(int argc, char *argv[]) { size_t bytes_0 = ScratchPadIntView::shmem_size(Blk + 1); size_t bytes_1 = ScratchPadIntView::shmem_size(nnz); - if (i_impl > 1) - policy.set_scratch_size(0, Kokkos::PerTeam(bytes_0 + 
bytes_1)); + if (i_impl > 1) policy.set_scratch_size(0, Kokkos::PerTeam(bytes_0 + bytes_1)); // policy.set_scratch_size(1, Kokkos::PerTeam(bytes_1)); if (i_impl == 3) { - Functor_TestBatchedTeamVectorSpmv< - policy_type, AMatrixValueViewLL, IntView, XYTypeLL, XYTypeLL, - alphaViewType, alphaViewType, 0>(policy, alphaV, valuesLL, - rowOffsets, colIndices, xLL, - betaV, yLL, N_team) + Functor_TestBatchedTeamVectorSpmv(policy, alphaV, valuesLL, rowOffsets, + colIndices, xLL, betaV, yLL, N_team) .run(); } else { - Kokkos::parallel_for( - "KokkosSparse::PerfTest::BSpMV", policy, - BSPMV_Functor_View(s_a, valuesLL, rowOffsets, - colIndices, xLL, s_b, yLL, - N_team, N, i_impl)); + Kokkos::parallel_for("KokkosSparse::PerfTest::BSpMV", policy, + BSPMV_Functor_View( + s_a, valuesLL, rowOffsets, colIndices, xLL, s_b, yLL, N_team, N, i_impl)); } } if (layout_right) { using policy_type = Kokkos::TeamPolicy; - policy_type auto_policy(number_of_teams, Kokkos::AUTO(), - Kokkos::AUTO()); - policy_type tuned_policy(number_of_teams, team_size, - Kokkos::AUTO()); - policy_type tuned_policy_2(number_of_teams, team_size, - vector_length); + policy_type auto_policy(number_of_teams, Kokkos::AUTO(), Kokkos::AUTO()); + policy_type tuned_policy(number_of_teams, team_size, Kokkos::AUTO()); + policy_type tuned_policy_2(number_of_teams, team_size, vector_length); policy_type policy; if (team_size < 1) @@ -385,23 +339,17 @@ int main(int argc, char *argv[]) { size_t bytes_0 = ScratchPadIntView::shmem_size(Blk + 1); size_t bytes_1 = ScratchPadIntView::shmem_size(nnz); - if (i_impl > 1) - policy.set_scratch_size(0, Kokkos::PerTeam(bytes_0 + bytes_1)); + if (i_impl > 1) policy.set_scratch_size(0, Kokkos::PerTeam(bytes_0 + bytes_1)); // policy.set_scratch_size(1, Kokkos::PerTeam(bytes_1)); if (i_impl == 3) { - Functor_TestBatchedTeamVectorSpmv< - policy_type, AMatrixValueViewLR, IntView, XYTypeLR, XYTypeLR, - alphaViewType, alphaViewType, 0>(policy, alphaV, valuesLR, - rowOffsets, colIndices, xLR, - 
betaV, yLR, N_team) + Functor_TestBatchedTeamVectorSpmv(policy, alphaV, valuesLR, rowOffsets, + colIndices, xLR, betaV, yLR, N_team) .run(); } else { - Kokkos::parallel_for( - "KokkosSparse::PerfTest::BSpMV", policy, - BSPMV_Functor_View(s_a, valuesLR, rowOffsets, - colIndices, xLR, s_b, yLR, - N_team, N, i_impl)); + Kokkos::parallel_for("KokkosSparse::PerfTest::BSpMV", policy, + BSPMV_Functor_View( + s_a, valuesLR, rowOffsets, colIndices, xLR, s_b, yLR, N_team, N, i_impl)); } } exec_space().fence(); @@ -416,10 +364,8 @@ int main(int argc, char *argv[]) { { std::ofstream myfile; std::string name; - if (layout_left) - name = name_timer + "_" + std::to_string(i_impl) + "_left.txt"; - if (layout_right) - name = name_timer + "_" + std::to_string(i_impl) + "_right.txt"; + if (layout_left) name = name_timer + "_" + std::to_string(i_impl) + "_left.txt"; + if (layout_right) name = name_timer + "_" + std::to_string(i_impl) + "_right.txt"; myfile.open(name); @@ -432,8 +378,7 @@ int main(int argc, char *argv[]) { double average_time = 0.; - for (size_t i = 0; i < timers.size(); ++i) - average_time += timers[i] / timers.size(); + for (size_t i = 0; i < timers.size(); ++i) average_time += timers[i] / timers.size(); if (layout_left) printf( diff --git a/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverDn.cpp b/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverDn.cpp index 2294c23805..5e9bf13f8c 100644 --- a/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverDn.cpp +++ b/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverDn.cpp @@ -71,9 +71,7 @@ struct Functor_Test_BatchedDenseCuSolve { const VectorViewType _B; KOKKOS_INLINE_FUNCTION - Functor_Test_BatchedDenseCuSolve(const MatrixViewType &A, - const VectorViewType &X, - const VectorViewType &B) + Functor_Test_BatchedDenseCuSolve(const MatrixViewType &A, const VectorViewType &X, const VectorViewType &B) : _A(A), _X(X), _B(B) {} inline double run() { @@ -100,10 +98,8 @@ struct 
Functor_Test_BatchedDenseCuSolve { double **d_Aarray = nullptr; double **d_Barray = nullptr; - cudaMalloc(reinterpret_cast(&d_Aarray), - sizeof(double *) * batchSize); - cudaMalloc(reinterpret_cast(&d_Barray), - sizeof(double *) * batchSize); + cudaMalloc(reinterpret_cast(&d_Aarray), sizeof(double *) * batchSize); + cudaMalloc(reinterpret_cast(&d_Barray), sizeof(double *) * batchSize); std::vector Aarray(batchSize, nullptr); std::vector Barray(batchSize, nullptr); @@ -112,34 +108,26 @@ struct Functor_Test_BatchedDenseCuSolve { Barray[i] = Kokkos::subview(_X, i, Kokkos::ALL).data(); } - cudaMemcpyAsync(d_Aarray, Aarray.data(), sizeof(double *) * batchSize, - cudaMemcpyHostToDevice); - cudaMemcpyAsync(d_Barray, Barray.data(), sizeof(double *) * batchSize, - cudaMemcpyHostToDevice); + cudaMemcpyAsync(d_Aarray, Aarray.data(), sizeof(double *) * batchSize, cudaMemcpyHostToDevice); + cudaMemcpyAsync(d_Barray, Barray.data(), sizeof(double *) * batchSize, cudaMemcpyHostToDevice); cudaDeviceSynchronize(); exec_space().fence(); timer.reset(); - auto status1 = cusolverDnDpotrfBatched(handle, uplo, m, d_Aarray, lda, - d_infoArray, batchSize); + auto status1 = cusolverDnDpotrfBatched(handle, uplo, m, d_Aarray, lda, d_infoArray, batchSize); if (status1 != CUSOLVER_STATUS_SUCCESS) - std::cout << "Error in cusolverDnDpotrfBatched with batchSize = " - << batchSize << " and m = " << m << std::endl; + std::cout << "Error in cusolverDnDpotrfBatched with batchSize = " << batchSize << " and m = " << m << std::endl; cudaDeviceSynchronize(); - auto status2 = cusolverDnDpotrsBatched(handle, uplo, m, 1, d_Aarray, lda, - d_Barray, ldb, info, batchSize); + auto status2 = cusolverDnDpotrsBatched(handle, uplo, m, 1, d_Aarray, lda, d_Barray, ldb, info, batchSize); if (status2 != CUSOLVER_STATUS_SUCCESS) { if (status2 == CUSOLVER_STATUS_NOT_INITIALIZED) - std::cout << "Error in cusolverDnDpotrsBatched with batchSize = " - << batchSize << " and m = " << m + std::cout << "Error in 
cusolverDnDpotrsBatched with batchSize = " << batchSize << " and m = " << m << " CUSOLVER_STATUS_NOT_INITIALIZED " << std::endl; if (status2 == CUSOLVER_STATUS_INVALID_VALUE) - std::cout << "Error in cusolverDnDpotrsBatched with batchSize = " - << batchSize << " and m = " << m + std::cout << "Error in cusolverDnDpotrsBatched with batchSize = " << batchSize << " and m = " << m << " CUSOLVER_STATUS_INVALID_VALUE " << std::endl; if (status2 == CUSOLVER_STATUS_INTERNAL_ERROR) - std::cout << "Error in cusolverDnDpotrsBatched with batchSize = " - << batchSize << " and m = " << m + std::cout << "Error in cusolverDnDpotrsBatched with batchSize = " << batchSize << " and m = " << m << " CUSOLVER_STATUS_INTERNAL_ERROR " << std::endl; cudaDeviceSynchronize(); exec_space().fence(); @@ -189,12 +177,9 @@ int main(int argc, char *argv[]) { if (token == std::string("-X")) name_X = argv[++i]; if (token == std::string("-timers")) name_timer = argv[++i]; if (token == std::string("-team_size")) team_size = std::atoi(argv[++i]); - if (token == std::string("-vector_length")) - vector_length = std::atoi(argv[++i]); - if (token == std::string("-n_implementations")) - n_impl = std::atoi(argv[++i]); - if (token == std::string("-implementation")) - impls.push_back(std::atoi(argv[++i])); + if (token == std::string("-vector_length")) vector_length = std::atoi(argv[++i]); + if (token == std::string("-n_implementations")) n_impl = std::atoi(argv[++i]); + if (token == std::string("-implementation")) impls.push_back(std::atoi(argv[++i])); if (token == std::string("-l")) { layout_left = true; layout_right = false; @@ -219,8 +204,7 @@ int main(int argc, char *argv[]) { constexpr size_t LLC_CAPACITY = 80 * 6 * 1024 * 1024; KokkosBatched::Flush flush; - printf(" :::: CusolverDn Testing (N = %d, Blk = %d, vl = %d, n = %d)\n", N, - Blk, vector_length, n_rep_1); + printf(" :::: CusolverDn Testing (N = %d, Blk = %d, vl = %d, n = %d)\n", N, Blk, vector_length, n_rep_1); typedef Kokkos::LayoutRight LR; 
typedef Kokkos::LayoutLeft LL; @@ -240,10 +224,8 @@ int main(int argc, char *argv[]) { XYTypeLL xLL("values", N, Blk); XYTypeLL yLL("values", N, Blk); - if (layout_left) - printf(" :::: Testing left layout (team_size = %d)\n", team_size); - if (layout_right) - printf(" :::: Testing right layout (team_size = %d)\n", team_size); + if (layout_left) printf(" :::: Testing left layout (team_size = %d)\n", team_size); + if (layout_right) printf(" :::: Testing right layout (team_size = %d)\n", team_size); if (layout_left) { readDenseFromMM(name_A, aLL); @@ -269,9 +251,7 @@ int main(int argc, char *argv[]) { if (i_impl == 0) { if (layout_right) { - t_spmv = Functor_Test_BatchedDenseCuSolve(aLR, xLR, yLR) - .run(); + t_spmv = Functor_Test_BatchedDenseCuSolve(aLR, xLR, yLR).run(); } } exec_space().fence(); @@ -285,10 +265,8 @@ int main(int argc, char *argv[]) { { std::ofstream myfile; std::string name; - if (layout_left) - name = name_timer + "_" + std::to_string(i_impl) + "_left.txt"; - if (layout_right) - name = name_timer + "_" + std::to_string(i_impl) + "_right.txt"; + if (layout_left) name = name_timer + "_" + std::to_string(i_impl) + "_left.txt"; + if (layout_right) name = name_timer + "_" + std::to_string(i_impl) + "_right.txt"; myfile.open(name); @@ -301,15 +279,10 @@ int main(int argc, char *argv[]) { double average_time = 0.; - for (size_t i = 0; i < timers.size(); ++i) - average_time += timers[i] / timers.size(); + for (size_t i = 0; i < timers.size(); ++i) average_time += timers[i] / timers.size(); - if (layout_left) - printf("Left layout: Implementation %d: solve time = %f\n", i_impl, - average_time); - if (layout_right) - printf("Right layout: Implementation %d: solve time = %f\n", i_impl, - average_time); + if (layout_left) printf("Left layout: Implementation %d: solve time = %f\n", i_impl, average_time); + if (layout_right) printf("Right layout: Implementation %d: solve time = %f\n", i_impl, average_time); if (layout_left) { writeArrayToMM(name_X + 
std::to_string(i_impl) + "_l.mm", xLL); diff --git a/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverSp.cpp b/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverSp.cpp index 808e235edc..8b2b48c0f4 100644 --- a/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverSp.cpp +++ b/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverSp.cpp @@ -26,7 +26,7 @@ #include "Kokkos_Sort.hpp" // -//#define KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +// #define KOKKOSKERNELS_ENABLE_TPL_CUSPARSE #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE @@ -66,8 +66,7 @@ typedef typename exec_space::memory_space memory_space; typedef Kokkos::DefaultHostExecutionSpace host_space; typedef typename Kokkos::Device device; -template +template struct Functor_Test_SparseCuSolveQR { const MatrixViewType _A; const IntView _r; @@ -76,8 +75,7 @@ struct Functor_Test_SparseCuSolveQR { const VectorViewType _B; KOKKOS_INLINE_FUNCTION - Functor_Test_SparseCuSolveQR(const MatrixViewType &A, const IntView &r, - const IntView &c, const VectorViewType &X, + Functor_Test_SparseCuSolveQR(const MatrixViewType &A, const IntView &r, const IntView &c, const VectorViewType &X, const VectorViewType &B) : _A(A), _r(r), _c(c), _X(X), _B(B) {} @@ -94,10 +92,8 @@ struct Functor_Test_SparseCuSolveQR { cusparseMatDescr_t descrA = 0; KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&descrA)); - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); double tol = 1e-18; int reorder = 0; @@ -110,10 +106,8 @@ struct Functor_Test_SparseCuSolveQR { auto b = Kokkos::subview(_B, i, Kokkos::ALL).data(); auto x = Kokkos::subview(_X, i, Kokkos::ALL).data(); - cusolverSpDcsrlsvqr(handle, m, nnz, descrA, csrValA, _r.data(), 
_c.data(), - b, tol, reorder, x, singularity); - if (singularity[0] != -1) - std::cout << " Error ! " << singularity[0] << " " << m << std::endl; + cusolverSpDcsrlsvqr(handle, m, nnz, descrA, csrValA, _r.data(), _c.data(), b, tol, reorder, x, singularity); + if (singularity[0] != -1) std::cout << " Error ! " << singularity[0] << " " << m << std::endl; } exec_space().fence(); @@ -124,8 +118,7 @@ struct Functor_Test_SparseCuSolveQR { } }; -template +template struct Functor_Test_Block_SparseCuSolveQR { const MatrixViewType _A; const IntView _r; @@ -134,9 +127,8 @@ struct Functor_Test_Block_SparseCuSolveQR { const VectorViewType _B; KOKKOS_INLINE_FUNCTION - Functor_Test_Block_SparseCuSolveQR(const MatrixViewType &A, const IntView &r, - const IntView &c, const VectorViewType &X, - const VectorViewType &B) + Functor_Test_Block_SparseCuSolveQR(const MatrixViewType &A, const IntView &r, const IntView &c, + const VectorViewType &X, const VectorViewType &B) : _A(A), _r(r), _c(c), _X(X), _B(B) {} inline double run() { @@ -155,10 +147,8 @@ struct Functor_Test_Block_SparseCuSolveQR { cusparseMatDescr_t descrA = 0; KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&descrA)); - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); double tol = 1e-18; int reorder = 0; @@ -180,15 +170,12 @@ struct Functor_Test_Block_SparseCuSolveQR { rowOffsets_host(0) = 0; for (size_t i = 0; i < N; ++i) { for (size_t row = 0; row < m; ++row) { - const size_t current_row_index = i * m + row; - const size_t row_length = _r_host(row + 1) - _r_host(row); - rowOffsets_host(current_row_index + 1) = - rowOffsets_host(current_row_index) + row_length; + const size_t current_row_index = i * m + row; + const size_t 
row_length = _r_host(row + 1) - _r_host(row); + rowOffsets_host(current_row_index + 1) = rowOffsets_host(current_row_index) + row_length; for (size_t nnz_row = 0; nnz_row < row_length; ++nnz_row) { - const size_t current_block_nnz_index = - rowOffsets_host(current_row_index) + nnz_row; - const size_t current_block_col_index = - _c_host(_r_host(row) + nnz_row) + i * m; + const size_t current_block_nnz_index = rowOffsets_host(current_row_index) + nnz_row; + const size_t current_block_col_index = _c_host(_r_host(row) + nnz_row) + i * m; colIndices_host(current_block_nnz_index) = current_block_col_index; } } @@ -204,12 +191,10 @@ struct Functor_Test_Block_SparseCuSolveQR { auto b = _B.data(); auto x = _X.data(); - cusolverSpDcsrlsvqr(handle, block_m, block_nnz, descrA, csrValA, - rowOffsets.data(), colIndices.data(), b, tol, reorder, - x, singularity); + cusolverSpDcsrlsvqr(handle, block_m, block_nnz, descrA, csrValA, rowOffsets.data(), colIndices.data(), b, tol, + reorder, x, singularity); - if (singularity[0] != -1) - std::cout << " Error ! " << singularity[0] << " " << m << std::endl; + if (singularity[0] != -1) std::cout << " Error ! 
" << singularity[0] << " " << m << std::endl; exec_space().fence(); double sec = timer.seconds(); @@ -219,8 +204,7 @@ struct Functor_Test_Block_SparseCuSolveQR { } }; -template +template struct Functor_Test_SparseCuSolveChol { const MatrixViewType _A; const IntView _r; @@ -229,8 +213,7 @@ struct Functor_Test_SparseCuSolveChol { const VectorViewType _B; KOKKOS_INLINE_FUNCTION - Functor_Test_SparseCuSolveChol(const MatrixViewType &A, const IntView &r, - const IntView &c, const VectorViewType &X, + Functor_Test_SparseCuSolveChol(const MatrixViewType &A, const IntView &r, const IntView &c, const VectorViewType &X, const VectorViewType &B) : _A(A), _r(r), _c(c), _X(X), _B(B) {} @@ -247,10 +230,8 @@ struct Functor_Test_SparseCuSolveChol { cusparseMatDescr_t descrA = 0; KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&descrA)); - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); double tol = 1e-18; int reorder = 0; @@ -263,10 +244,8 @@ struct Functor_Test_SparseCuSolveChol { auto b = Kokkos::subview(_B, i, Kokkos::ALL).data(); auto x = Kokkos::subview(_X, i, Kokkos::ALL).data(); - cusolverSpDcsrlsvchol(handle, m, nnz, descrA, csrValA, _r.data(), - _c.data(), b, tol, reorder, x, singularity); - if (singularity[0] != -1) - std::cout << " Error ! " << singularity[0] << " " << m << std::endl; + cusolverSpDcsrlsvchol(handle, m, nnz, descrA, csrValA, _r.data(), _c.data(), b, tol, reorder, x, singularity); + if (singularity[0] != -1) std::cout << " Error ! 
" << singularity[0] << " " << m << std::endl; } exec_space().fence(); @@ -277,8 +256,7 @@ struct Functor_Test_SparseCuSolveChol { } }; -template +template struct Functor_Test_Block_SparseCuSolveChol { const MatrixViewType _A; const IntView _r; @@ -287,10 +265,8 @@ struct Functor_Test_Block_SparseCuSolveChol { const VectorViewType _B; KOKKOS_INLINE_FUNCTION - Functor_Test_Block_SparseCuSolveChol(const MatrixViewType &A, - const IntView &r, const IntView &c, - const VectorViewType &X, - const VectorViewType &B) + Functor_Test_Block_SparseCuSolveChol(const MatrixViewType &A, const IntView &r, const IntView &c, + const VectorViewType &X, const VectorViewType &B) : _A(A), _r(r), _c(c), _X(X), _B(B) {} inline double run() { @@ -309,10 +285,8 @@ struct Functor_Test_Block_SparseCuSolveChol { cusparseMatDescr_t descrA = 0; KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&descrA)); - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); double tol = 1e-18; int reorder = 0; @@ -334,15 +308,12 @@ struct Functor_Test_Block_SparseCuSolveChol { rowOffsets_host(0) = 0; for (size_t i = 0; i < N; ++i) { for (size_t row = 0; row < m; ++row) { - const size_t current_row_index = i * m + row; - const size_t row_length = _r_host(row + 1) - _r_host(row); - rowOffsets_host(current_row_index + 1) = - rowOffsets_host(current_row_index) + row_length; + const size_t current_row_index = i * m + row; + const size_t row_length = _r_host(row + 1) - _r_host(row); + rowOffsets_host(current_row_index + 1) = rowOffsets_host(current_row_index) + row_length; for (size_t nnz_row = 0; nnz_row < row_length; ++nnz_row) { - const size_t current_block_nnz_index = - rowOffsets_host(current_row_index) + nnz_row; - 
const size_t current_block_col_index = - _c_host(_r_host(row) + nnz_row) + i * m; + const size_t current_block_nnz_index = rowOffsets_host(current_row_index) + nnz_row; + const size_t current_block_col_index = _c_host(_r_host(row) + nnz_row) + i * m; colIndices_host(current_block_nnz_index) = current_block_col_index; } } @@ -358,11 +329,9 @@ struct Functor_Test_Block_SparseCuSolveChol { auto b = _B.data(); auto x = _X.data(); - cusolverSpDcsrlsvchol(handle, block_m, block_nnz, descrA, csrValA, - rowOffsets.data(), colIndices.data(), b, tol, reorder, - x, singularity); - if (singularity[0] != -1) - std::cout << " Error ! " << singularity[0] << " " << m << std::endl; + cusolverSpDcsrlsvchol(handle, block_m, block_nnz, descrA, csrValA, rowOffsets.data(), colIndices.data(), b, tol, + reorder, x, singularity); + if (singularity[0] != -1) std::cout << " Error ! " << singularity[0] << " " << m << std::endl; exec_space().fence(); double sec = timer.seconds(); @@ -407,12 +376,9 @@ int main(int argc, char *argv[]) { if (token == std::string("-X")) name_X = argv[++i]; if (token == std::string("-timers")) name_timer = argv[++i]; if (token == std::string("-team_size")) team_size = std::atoi(argv[++i]); - if (token == std::string("-vector_length")) - vector_length = std::atoi(argv[++i]); - if (token == std::string("-n_implementations")) - n_impl = std::atoi(argv[++i]); - if (token == std::string("-implementation")) - impls.push_back(std::atoi(argv[++i])); + if (token == std::string("-vector_length")) vector_length = std::atoi(argv[++i]); + if (token == std::string("-n_implementations")) n_impl = std::atoi(argv[++i]); + if (token == std::string("-implementation")) impls.push_back(std::atoi(argv[++i])); if (token == std::string("-l")) { layout_left = true; layout_right = false; @@ -437,8 +403,7 @@ int main(int argc, char *argv[]) { constexpr size_t LLC_CAPACITY = 80 * 6 * 1024 * 1024; KokkosBatched::Flush flush; - printf(" :::: CusolverSp Testing (N = %d, Blk = %d, vl = %d, n = 
%d)\n", N, - Blk, vector_length, n_rep_1); + printf(" :::: CusolverSp Testing (N = %d, Blk = %d, vl = %d, n = %d)\n", N, Blk, vector_length, n_rep_1); typedef Kokkos::LayoutRight LR; typedef Kokkos::LayoutLeft LL; @@ -460,10 +425,8 @@ int main(int argc, char *argv[]) { XYTypeLL xLL("values", N, Blk); XYTypeLL yLL("values", N, Blk); - if (layout_left) - printf(" :::: Testing left layout (team_size = %d)\n", team_size); - if (layout_right) - printf(" :::: Testing right layout (team_size = %d)\n", team_size); + if (layout_left) printf(" :::: Testing left layout (team_size = %d)\n", team_size); + if (layout_right) printf(" :::: Testing right layout (team_size = %d)\n", team_size); if (layout_left) { readCRSFromMM(name_A, valuesLL, rowOffsets, colIndices); @@ -490,34 +453,28 @@ int main(int argc, char *argv[]) { if (i_impl == 0) { if (layout_right) { - t_spmv = Functor_Test_SparseCuSolveQR( - valuesLR, rowOffsets, colIndices, xLR, yLR) + t_spmv = Functor_Test_SparseCuSolveQR(valuesLR, rowOffsets, + colIndices, xLR, yLR) .run(); } } if (i_impl == 1) { if (layout_right) { - t_spmv = - Functor_Test_SparseCuSolveChol( - valuesLR, rowOffsets, colIndices, xLR, yLR) - .run(); + t_spmv = Functor_Test_SparseCuSolveChol( + valuesLR, rowOffsets, colIndices, xLR, yLR) + .run(); } } if (i_impl == 2) { if (layout_right) { - t_spmv = - Functor_Test_Block_SparseCuSolveQR( - valuesLR, rowOffsets, colIndices, xLR, yLR) - .run(); + t_spmv = Functor_Test_Block_SparseCuSolveQR( + valuesLR, rowOffsets, colIndices, xLR, yLR) + .run(); } } if (i_impl == 3) { if (layout_right) { - t_spmv = Functor_Test_Block_SparseCuSolveChol< - exec_space, AMatrixValueViewLR, IntView, XYTypeLR>( + t_spmv = Functor_Test_Block_SparseCuSolveChol( valuesLR, rowOffsets, colIndices, xLR, yLR) .run(); } @@ -533,10 +490,8 @@ int main(int argc, char *argv[]) { { std::ofstream myfile; std::string name; - if (layout_left) - name = name_timer + "_" + std::to_string(i_impl) + "_left.txt"; - if (layout_right) - name = 
name_timer + "_" + std::to_string(i_impl) + "_right.txt"; + if (layout_left) name = name_timer + "_" + std::to_string(i_impl) + "_left.txt"; + if (layout_right) name = name_timer + "_" + std::to_string(i_impl) + "_right.txt"; myfile.open(name); @@ -549,15 +504,10 @@ int main(int argc, char *argv[]) { double average_time = 0.; - for (size_t i = 0; i < timers.size(); ++i) - average_time += timers[i] / timers.size(); + for (size_t i = 0; i < timers.size(); ++i) average_time += timers[i] / timers.size(); - if (layout_left) - printf("Left layout: Implementation %d: solve time = %f\n", i_impl, - average_time); - if (layout_right) - printf("Right layout: Implementation %d: solve time = %f\n", i_impl, - average_time); + if (layout_left) printf("Left layout: Implementation %d: solve time = %f\n", i_impl, average_time); + if (layout_right) printf("Right layout: Implementation %d: solve time = %f\n", i_impl, average_time); if (layout_left) { writeArrayToMM(name_X + std::to_string(i_impl) + "_l.mm", xLL); diff --git a/perf_test/blas/KokkosBlas_blas1.cpp b/perf_test/blas/KokkosBlas_blas1.cpp index 52d2cd4b42..b9471dee37 100644 --- a/perf_test/blas/KokkosBlas_blas1.cpp +++ b/perf_test/blas/KokkosBlas_blas1.cpp @@ -40,8 +40,7 @@ RCP