Skip to content

Commit

Permalink
Merge 'trilinos/Trilinos:develop' (5d032ab) into 'tcad-charon/Trilino…
Browse files Browse the repository at this point in the history
…s:develop' (d9299d4).

* trilinos-develop:
  Intrepid2: Fix some shadow variable warnings (trilinos#11142)
  MueLu: Update to fix additional MueLu linking
  Geminga: Pass max-stackframe to Valgrind to avoid Intrepid2 issues
  rol: remove volatile from join in ROL_TpetraBoundConstraint.hpp
  Changes:
  • Loading branch information
Charonops Jenkins Pipeline committed Oct 18, 2022
2 parents d9299d4 + 5d032ab commit aa3bbe3
Show file tree
Hide file tree
Showing 17 changed files with 182 additions and 89 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ MACRO(TRILINOS_SYSTEM_SPECIFIC_CTEST_DRIVER)

# Options for valgrind, if needed
SET(CTEST_MEMORYCHECK_COMMAND_OPTIONS
"--trace-children=yes --leak-check=full --gen-suppressions=all --error-limit=no" ${CTEST_MEMORYCHECK_COMMAND_OPTIONS} )
"--max-stackframe=3835488 --trace-children=yes --leak-check=full --gen-suppressions=all --error-limit=no" ${CTEST_MEMORYCHECK_COMMAND_OPTIONS} )
SET(CTEST_MEMORYCHECK_SUPPRESSIONS_FILE "${CTEST_SCRIPT_DIRECTORY}/valgrind_suppressions.txt")


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,8 @@ class ChebyshevKernel {
using vector_type = Tpetra::Vector<SC, LO, GO, NT>;

public:
ChebyshevKernel (const Teuchos::RCP<const operator_type>& A);
ChebyshevKernel (const Teuchos::RCP<const operator_type>& A,
const bool useNativeSpMV=false);

void
setMatrix (const Teuchos::RCP<const operator_type>& A);
Expand All @@ -113,6 +114,10 @@ class ChebyshevKernel {

Teuchos::RCP<vector_type> W_vec_, B_vec_, X_vec_;

// External override: when true, operations are not fused into a single
// kernel (canFuse() returns false) and native BLAS/SpMV operations are used.
bool useNativeSpMV_;

// Do the Import, if needed, and return the column Map version of X.
vector_type&
importVector (vector_type& X_domMap);
Expand Down
9 changes: 8 additions & 1 deletion packages/ifpack2/src/Ifpack2_Details_ChebyshevKernel_def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,9 @@ chebyshev_kernel_vector

template<class TpetraOperatorType>
ChebyshevKernel<TpetraOperatorType>::
ChebyshevKernel (const Teuchos::RCP<const operator_type>& A)
ChebyshevKernel (const Teuchos::RCP<const operator_type>& A,
const bool useNativeSpMV):
useNativeSpMV_(useNativeSpMV)
{
setMatrix (A);
}
Expand Down Expand Up @@ -388,6 +390,11 @@ bool
ChebyshevKernel<TpetraOperatorType>::
canFuse (const multivector_type& B) const
{
// If override is enabled
if(useNativeSpMV_)
return false;

// Some criteria must be met for fused kernel
return B.getNumVectors () == size_t (1) &&
! A_crs_.is_null () &&
exp_.is_null ();
Expand Down
4 changes: 4 additions & 0 deletions packages/ifpack2/src/Ifpack2_Details_Chebyshev_decl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -509,6 +509,10 @@ class Chebyshev : public Teuchos::Describable {
//! Whether apply() will compute and return the max residual norm.
bool computeMaxResNorm_;

/// If true, the ChebyshevKernel operator will not use a fused kernel
/// and will instead use native BLAS/SpMV operations.
bool ckUseNativeSpMV_;

/// \brief Output stream for debug output ONLY.
///
/// This is ONLY valid if debug_ is true.
Expand Down
13 changes: 11 additions & 2 deletions packages/ifpack2/src/Ifpack2_Details_Chebyshev_def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,7 @@ Chebyshev (Teuchos::RCP<const row_matrix_type> A) :
assumeMatrixUnchanged_ (false),
textbookAlgorithm_ (false),
computeMaxResNorm_ (false),
ckUseNativeSpMV_(false),
debug_ (false)
{
checkConstructorInput ();
Expand Down Expand Up @@ -337,6 +338,7 @@ Chebyshev (Teuchos::RCP<const row_matrix_type> A,
assumeMatrixUnchanged_ (false),
textbookAlgorithm_ (false),
computeMaxResNorm_ (false),
ckUseNativeSpMV_(false),
debug_ (false)
{
checkConstructorInput ();
Expand Down Expand Up @@ -382,6 +384,7 @@ setParameters (Teuchos::ParameterList& plist)
const bool defaultAssumeMatrixUnchanged = false;
const bool defaultTextbookAlgorithm = false;
const bool defaultComputeMaxResNorm = false;
const bool defaultCkUseNativeSpMV = false;
const bool defaultDebug = false;

// We'll set the instance data transactionally, after all reads
Expand All @@ -403,6 +406,7 @@ setParameters (Teuchos::ParameterList& plist)
bool assumeMatrixUnchanged = defaultAssumeMatrixUnchanged;
bool textbookAlgorithm = defaultTextbookAlgorithm;
bool computeMaxResNorm = defaultComputeMaxResNorm;
bool ckUseNativeSpMV = defaultCkUseNativeSpMV;
bool debug = defaultDebug;

// Fetch the parameters from the ParameterList. Defer all
Expand Down Expand Up @@ -481,6 +485,10 @@ setParameters (Teuchos::ParameterList& plist)
// userInvDiag.
}

// Load the kernel fuse override from the parameter list
if (plist.isParameter ("chebyshev: use native spmv"))
ckUseNativeSpMV = plist.get("chebyshev: use native spmv", ckUseNativeSpMV);

// Don't fill in defaults for the max or min eigenvalue, because
// this class uses the existence of those parameters to determine
// whether it should do eigenanalysis.
Expand Down Expand Up @@ -686,6 +694,7 @@ setParameters (Teuchos::ParameterList& plist)
assumeMatrixUnchanged_ = assumeMatrixUnchanged;
textbookAlgorithm_ = textbookAlgorithm;
computeMaxResNorm_ = computeMaxResNorm;
ckUseNativeSpMV_ = ckUseNativeSpMV;
debug_ = debug;

if (debug_) {
Expand Down Expand Up @@ -1355,7 +1364,7 @@ ifpackApplyImpl (const op_type& A,

if (ck_.is_null ()) {
Teuchos::RCP<const op_type> A_op = A_;
ck_ = Teuchos::rcp (new ChebyshevKernel<op_type> (A_op));
ck_ = Teuchos::rcp (new ChebyshevKernel<op_type> (A_op, ckUseNativeSpMV_));
}
// W := (1/theta)*D_inv*(B-A*X) and X := X + W.
// X := X + W
Expand All @@ -1374,7 +1383,7 @@ ifpackApplyImpl (const op_type& A,

if (numIters > 1 && ck_.is_null ()) {
Teuchos::RCP<const op_type> A_op = A_;
ck_ = Teuchos::rcp (new ChebyshevKernel<op_type> (A_op));
ck_ = Teuchos::rcp (new ChebyshevKernel<op_type> (A_op, ckUseNativeSpMV_));
}

// The rest of the iterations.
Expand Down
1 change: 1 addition & 0 deletions packages/ifpack2/src/Ifpack2_Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ void getValidParameters(Teuchos::ParameterList& params)
// params.set("chebyshev: operator inv diagonal",Teuchos::null);
params.set("chebyshev: min diagonal value", STS::eps());
params.set("chebyshev: zero starting solution", true);
params.set("chebyshev: use native spmv", false);

// Ifpack2_Amesos.cpp
params.set("amesos: solver type", "Amesos_Klu");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -467,9 +467,9 @@ namespace Intrepid2
// face functions
{
// relabel scratch views
auto & P = scratch0;
auto & P_2ip1 = scratch1;
auto &scratchP = scratch0;
auto &scratchP_2ip1 = scratch1;

const ordinal_type max_ij_sum = polyOrder_ - 1;

for (ordinal_type faceOrdinal=0; faceOrdinal<numFaces; faceOrdinal++)
Expand All @@ -481,22 +481,24 @@ namespace Intrepid2
computeFaceVectorWeight(vectorWeight_x, vectorWeight_y, vectorWeight_z, faceOrdinal, lambda, lambda_dx, lambda_dy, lambda_dz);

ordinal_type fieldOrdinal = faceOrdinal * numFaceFunctionsPerFace_;
computeFaceLegendre(P, faceOrdinal, lambda);
computeFaceLegendre(scratchP, faceOrdinal, lambda);

for (int ij_sum=0; ij_sum <= max_ij_sum; ij_sum++)
{
for (int i=0; i<=ij_sum; i++)
{
computeFaceJacobi(P_2ip1, faceOrdinal, i, lambda);
computeFaceJacobi(scratchP_2ip1, faceOrdinal, i, lambda);

const int j = ij_sum - i; // j >= 0

auto & output_x = output_(fieldOrdinal,pointOrdinal,0);
auto & output_y = output_(fieldOrdinal,pointOrdinal,1);
auto & output_z = output_(fieldOrdinal,pointOrdinal,2);

faceFunctionValue(output_x, output_y, output_z, i, j, P, P_2ip1, vectorWeight_x, vectorWeight_y, vectorWeight_z, lambda);


faceFunctionValue(output_x, output_y, output_z, i, j,
scratchP, scratchP_2ip1, vectorWeight_x,
vectorWeight_y, vectorWeight_z, lambda);

fieldOrdinal++;
} // i
} // ij_sum
Expand All @@ -506,10 +508,10 @@ namespace Intrepid2
// interior functions
{
// relabel scratch views
auto & P = scratch0;
auto & P_2ip1 = scratch1;
auto & L_2ipjp1 = scratch2; // L^{2(i+j+1)}, integrated Jacobi
auto &scratchP = scratch0;
auto &scratchP_2ip1 = scratch1;
auto &scratchL_2ipjp1 = scratch2; // L^{2(i+j+1)}, integrated Jacobi

const ordinal_type numInteriorFamilies = 3;
const ordinal_type min_ijk_sum = 1;
const ordinal_type max_ijk_sum = polyOrder_-1;
Expand All @@ -523,12 +525,14 @@ namespace Intrepid2
for (int interiorFamilyOrdinal=1; interiorFamilyOrdinal<=numInteriorFamilies; interiorFamilyOrdinal++)
{
// following ESEAS, we interleave the interior families. This groups all the interior dofs of a given degree together.

ordinal_type fieldOrdinal = numFaceFunctions_ + interiorFamilyOrdinal - 1;


ordinal_type interiorFamilyFieldOrdinal =
numFaceFunctions_ + interiorFamilyOrdinal - 1;

const ordinal_type relatedFaceOrdinal = faceOrdinalForInterior_[interiorFamilyOrdinal-1];

computeFaceLegendreForInterior(P, interiorFamilyOrdinal-1, lambda);

computeFaceLegendreForInterior(scratchP,
interiorFamilyOrdinal - 1, lambda);
computeFaceVectorWeight(vectorWeight_x, vectorWeight_y, vectorWeight_z, relatedFaceOrdinal, lambda, lambda_dx, lambda_dy, lambda_dz);

for (int ijk_sum=min_ijk_sum; ijk_sum <= max_ijk_sum; ijk_sum++)
Expand All @@ -540,22 +544,32 @@ namespace Intrepid2
const ordinal_type j = ij_sum-i;
const ordinal_type k = ijk_sum - ij_sum;

computeFaceJacobiForInterior(P_2ip1, interiorFamilyOrdinal-1, i, lambda);
computeInteriorIntegratedJacobi(L_2ipjp1, i, j, interiorFamilyOrdinal-1, lambda);

computeFaceJacobiForInterior(
scratchP_2ip1, interiorFamilyOrdinal - 1, i, lambda);
computeInteriorIntegratedJacobi(scratchL_2ipjp1, i, j,
interiorFamilyOrdinal - 1,
lambda);

OutputScalar V_x, V_y, V_z;

faceFunctionValue(V_x, V_y, V_z, i, j, P, P_2ip1, vectorWeight_x, vectorWeight_y, vectorWeight_z, lambda);

auto & output_x = output_(fieldOrdinal,pointOrdinal,0);
auto & output_y = output_(fieldOrdinal,pointOrdinal,1);
auto & output_z = output_(fieldOrdinal,pointOrdinal,2);

output_x = V_x * L_2ipjp1(k);
output_y = V_y * L_2ipjp1(k);
output_z = V_z * L_2ipjp1(k);

fieldOrdinal += numInteriorFamilies; // increment due to the interleaving.

faceFunctionValue(V_x, V_y, V_z, i, j, scratchP,
scratchP_2ip1, vectorWeight_x,
vectorWeight_y, vectorWeight_z, lambda);

auto &output_x =
output_(interiorFamilyFieldOrdinal, pointOrdinal, 0);
auto &output_y =
output_(interiorFamilyFieldOrdinal, pointOrdinal, 1);
auto &output_z =
output_(interiorFamilyFieldOrdinal, pointOrdinal, 2);

output_x = V_x * scratchL_2ipjp1(k);
output_y = V_y * scratchL_2ipjp1(k);
output_z = V_z * scratchL_2ipjp1(k);

interiorFamilyFieldOrdinal +=
numInteriorFamilies; // increment due to the
// interleaving.
}
}
}
Expand All @@ -567,27 +581,28 @@ namespace Intrepid2
case OPERATOR_DIV:
{
// rename the scratch memory to match our usage here:
auto & P = scratch0;
auto & P_2ip1 = scratch1;
auto &scratchP = scratch0;
auto &scratchP_2ip1 = scratch1;

// following ESEAS, we interleave the face families. This groups all the face dofs of a given degree together.
ordinal_type fieldOrdinal = 0;
for (int faceOrdinal=0; faceOrdinal<numFaces; faceOrdinal++)
{
const int max_ij_sum = polyOrder_ - 1;
computeFaceLegendre(P, faceOrdinal, lambda);
computeFaceLegendre(scratchP, faceOrdinal, lambda);
OutputScalar divWeight;
computeFaceDivWeight(divWeight, faceOrdinal, lambda_dx, lambda_dy, lambda_dz);
for (int ij_sum=0; ij_sum <= max_ij_sum; ij_sum++)
{
for (int i=0; i<=ij_sum; i++)
{
const int j = ij_sum - i; // j >= 0
computeFaceJacobi(P_2ip1, faceOrdinal, i, lambda);

computeFaceJacobi(scratchP_2ip1, faceOrdinal, i, lambda);
auto &outputValue = output_(fieldOrdinal,pointOrdinal);
faceFunctionDiv(outputValue, i, j, P, P_2ip1, divWeight, lambda);

faceFunctionDiv(outputValue, i, j, scratchP, scratchP_2ip1,
divWeight, lambda);

fieldOrdinal++;
} // i
} // ij_sum
Expand All @@ -596,20 +611,21 @@ namespace Intrepid2
// interior functions
{
// rename the scratch memory to match our usage here:
auto & P = scratch0;
auto & P_2ip1 = scratch1;
auto & L_2ipjp1 = scratch2;
auto & P_2ipjp1 = scratch3;
auto &scratchP = scratch0;
auto &scratchP_2ip1 = scratch1;
auto &scratchL_2ipjp1 = scratch2;
auto &scratchP_2ipjp1 = scratch3;

const int numInteriorFamilies = 3;
const int interiorFieldOrdinalOffset = numFaceFunctions_;
for (int interiorFamilyOrdinal=1; interiorFamilyOrdinal<=numInteriorFamilies; interiorFamilyOrdinal++)
{
// following ESEAS, we interleave the interior families. This groups all the interior dofs of a given degree together.

const ordinal_type relatedFaceOrdinal = faceOrdinalForInterior_[interiorFamilyOrdinal-1];

computeFaceLegendreForInterior(P, interiorFamilyOrdinal-1, lambda);

computeFaceLegendreForInterior(scratchP,
interiorFamilyOrdinal - 1, lambda);
OutputScalar divWeight;
computeFaceDivWeight(divWeight, relatedFaceOrdinal, lambda_dx, lambda_dy, lambda_dz);

Expand All @@ -632,24 +648,38 @@ namespace Intrepid2
{
const ordinal_type j = ij_sum-i;
const ordinal_type k = ijk_sum - ij_sum;
computeFaceJacobiForInterior(P_2ip1, interiorFamilyOrdinal-1, i, lambda);

computeFaceJacobiForInterior(
scratchP_2ip1, interiorFamilyOrdinal - 1, i, lambda);

OutputScalar faceDiv;
faceFunctionDiv(faceDiv, i, j, P, P_2ip1, divWeight, lambda);

faceFunctionDiv(faceDiv, i, j, scratchP, scratchP_2ip1,
divWeight, lambda);

OutputScalar faceValue_x, faceValue_y, faceValue_z;

faceFunctionValue(faceValue_x, faceValue_y, faceValue_z, i, j, P, P_2ip1, vectorWeight_x, vectorWeight_y, vectorWeight_z, lambda);
computeInteriorJacobi(P_2ipjp1, i, j, interiorFamilyOrdinal-1, lambda);

computeInteriorIntegratedJacobi(L_2ipjp1, i, j, interiorFamilyOrdinal-1, lambda);


faceFunctionValue(faceValue_x, faceValue_y, faceValue_z, i,
j, scratchP, scratchP_2ip1,
vectorWeight_x, vectorWeight_y,
vectorWeight_z, lambda);
computeInteriorJacobi(scratchP_2ipjp1, i, j,
interiorFamilyOrdinal - 1, lambda);

computeInteriorIntegratedJacobi(scratchL_2ipjp1, i, j,
interiorFamilyOrdinal - 1,
lambda);

OutputScalar L_2ipjp1_k_dx, L_2ipjp1_k_dy, L_2ipjp1_k_dz;
gradInteriorIntegratedJacobi(L_2ipjp1_k_dx, L_2ipjp1_k_dy, L_2ipjp1_k_dz, interiorFamilyOrdinal-1, j, k, P_2ipjp1, lambda, lambda_dx, lambda_dy, lambda_dz);

gradInteriorIntegratedJacobi(
L_2ipjp1_k_dx, L_2ipjp1_k_dy, L_2ipjp1_k_dz,
interiorFamilyOrdinal - 1, j, k, scratchP_2ipjp1,
lambda, lambda_dx, lambda_dy, lambda_dz);

auto & outputDiv = output_(fieldOrdinal,pointOrdinal);
interiorFunctionDiv(outputDiv, L_2ipjp1(k), faceDiv, L_2ipjp1_k_dx, L_2ipjp1_k_dy, L_2ipjp1_k_dz, faceValue_x, faceValue_y, faceValue_z);

interiorFunctionDiv(outputDiv, scratchL_2ipjp1(k), faceDiv,
L_2ipjp1_k_dx, L_2ipjp1_k_dy,
L_2ipjp1_k_dz, faceValue_x, faceValue_y,
faceValue_z);

fieldOrdinal += numInteriorFamilies; // increment due to the interleaving.
}
}
Expand Down
Loading

0 comments on commit aa3bbe3

Please sign in to comment.