Skip to content

Commit

Permalink
Tpetra: merge two TAFC methods
Browse files Browse the repository at this point in the history
Merges Tpetra_CrsMatrix methods unpackAndCombineWithOwningPIDsCount and
unpackAndCombineIntoCrsArrays.  Reduces number of deep copies.  Part of
larger effort to have TAFC run on device.

Temporary change in Tpetra_CrsMatrix_def.hpp:
    destMat->numImportPacketsPerLID_.modify_host()

because numImportPacketsPerLID_ is a Kokkos::DualView and hasn't been
properly marked as modified on host

Addresses #11693 and #11694.
  • Loading branch information
jhux2 authored and csiefer2 committed Sep 12, 2023
1 parent 4fcc46d commit ee8dff1
Show file tree
Hide file tree
Showing 4 changed files with 394 additions and 10 deletions.
59 changes: 57 additions & 2 deletions packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4534,7 +4534,7 @@ CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
);
this->checkInternalState ();
}
}
} //fillComplete(domainMap, rangeMap, params)

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void
Expand Down Expand Up @@ -8450,11 +8450,19 @@ CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
}
}

/*********************************************************************/
/**** 3) Copy all of the Same/Permute/Remote data into CSR_arrays ****/
/*********************************************************************/

// Backwards compatibility measure. We'll use this again below.

// TODO JHU Need to track down why numImportPacketsPerLID_ has not been correctly marked as modified on host (which it has been)
// TODO JHU somewhere above, e.g., call to Distor.doPostsAndWaits().
// TODO JHU This only becomes apparent as we begin to convert TAFC to run on device.
destMat->numImportPacketsPerLID_.modify_host(); //FIXME

#define TPETRA_NEW_TAFC_UNPACK_AND_COMBINE
#ifndef TPETRA_NEW_TAFC_UNPACK_AND_COMBINE

#ifdef HAVE_TPETRA_MMM_TIMINGS
RCP<TimeMonitor> tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize"))));
#endif
Expand Down Expand Up @@ -8538,9 +8546,56 @@ CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
Teuchos::av_reinterpret_cast<impl_scalar_type> (CSR_vals ()),
SourcePids (),
TargetPids);
#else
# ifdef HAVE_TPETRA_MMM_TIMINGS
RCP<TimeMonitor> tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize + copy same-perm-remote data"))));
# endif
ArrayRCP<size_t> CSR_rowptr;
ArrayRCP<GO> CSR_colind_GID;
ArrayRCP<LO> CSR_colind_LID;
ArrayRCP<Scalar> CSR_vals;

destMat->imports_.sync_device ();
destMat->numImportPacketsPerLID_.sync_device ();

size_t N = BaseRowMap->getLocalNumElements ();

TEUCHOS_TEST_FOR_EXCEPTION
(destMat->numImportPacketsPerLID_.need_sync_device(), std::logic_error, "The "
"input Kokkos::DualView was most recently modified on host, but TAFC "
"needs the device view of the data to be the most recently modified.");

Details::unpackAndCombineIntoCrsArrays_new(
*this,
RemoteLIDs,
destMat->imports_.view_device(), //hostImports
destMat->numImportPacketsPerLID_.view_device(), //numImportPacketsPerLID
NumSameIDs,
PermuteToLIDs,
PermuteFromLIDs,
N,
MyPID,
CSR_rowptr,
CSR_colind_GID,
CSR_vals,
SourcePids(),
TargetPids);

// If LO and GO are the same, we can reuse memory when
// converting the column indices from global to local indices.
if (typeid (LO) == typeid (GO)) {
CSR_colind_LID = Teuchos::arcp_reinterpret_cast<LO> (CSR_colind_GID);
}
else {
CSR_colind_LID.resize (CSR_colind_GID.size());
}
CSR_colind_LID.resize (CSR_colind_GID.size());
size_t mynnz = CSR_vals.size();
#endif //ifndef TPETRA_NEW_TAFC_UNPACK_AND_COMBINE ... else

// On return from unpackAndCombineIntoCrsArrays TargetPids[i] == -1 for locally
// owned entries. Convert them to the actual PID.
// JHU FIXME This can be done within unpackAndCombineIntoCrsArrays_new with a parallel_for.
for(size_t i=0; i<static_cast<size_t>(TargetPids.size()); i++)
{
if(TargetPids[i] == -1) TargetPids[i] = MyPID;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,39 @@ unpackAndCombineIntoCrsArrays (
const Teuchos::ArrayView<const int>& SourcePids,
Teuchos::Array<int>& TargetPids);


/// \brief Count the local nonzeros, allocate the CRS arrays, and unpack
///   and combine into them in a single call.
///
/// Note: The SourcePids vector (on input) should contain owning PIDs
/// for each column in the (source) ColMap, as from
/// Tpetra::Import_Util::getPids, with the "-1 for local" option being
/// used.
///
/// Note: The TargetPids vector (on output) will contain owning PIDs
/// for each entry in the matrix, with the "-1 for local" for locally
/// owned entries.
///
/// Note: This method does the work of unpackAndCombineWithOwningPIDsCount,
/// namely, calculating the local number of nonzeros, and allocates CRS
/// arrays of the correct sizes.
///
/// \param sourceMatrix [in] The source matrix.
/// \param importLIDs [in] Local indices associated with the imported
///   data (presumably the remote LIDs of the target -- TODO confirm
///   against the definition).
/// \param imports_d [in] Packed import buffer, as a Kokkos::View on
///   Node::device_type.
/// \param num_packets_per_lid_d [in] Kokkos::View on Node::device_type;
///   presumably the number of packets received per import LID -- TODO
///   confirm.
/// \param numSameIDs [in] Presumably the number of leading IDs that are
///   identical in source and target -- TODO confirm.
/// \param permuteToLIDs [in] Presumably the target-side LIDs of the
///   permuted rows -- TODO confirm.
/// \param permuteFromLIDs [in] Presumably the source-side LIDs of the
///   permuted rows -- TODO confirm.
/// \param TargetNumRows [in] Number of rows in the target (presumably
///   the local row count -- TODO confirm).
/// \param MyTargetPID [in] Presumably the calling process' rank in the
///   target -- TODO confirm.
/// \param CRS_rowptr [out] Row offsets; allocated by this function
///   (see the last Note above).
/// \param CRS_colind [out] Column indices, stored as GlobalOrdinal;
///   allocated by this function.
/// \param CRS_vals [out] Matrix values; allocated by this function.
/// \param SourcePids [in] See the first Note above.
/// \param TargetPids [out] See the second Note above.
template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
void
unpackAndCombineIntoCrsArrays_new (
const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> & sourceMatrix,
const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
const Kokkos::View<const char*, typename Node::device_type>& imports_d,
const Kokkos::View<const size_t*, typename Node::device_type>& num_packets_per_lid_d,
const size_t numSameIDs,
const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
size_t TargetNumRows,
const int MyTargetPID,
Teuchos::ArrayRCP<size_t>& CRS_rowptr,
Teuchos::ArrayRCP<GlobalOrdinal>& CRS_colind,
Teuchos::ArrayRCP<Scalar>& CRS_vals,
const Teuchos::ArrayView<const int>& SourcePids,
Teuchos::Array<int>& TargetPids);

} // namespace Details
} // namespace Tpetra

Expand Down
Loading

0 comments on commit ee8dff1

Please sign in to comment.