41#ifndef TPETRA_DETAILS_RESIDUAL_HPP
42#define TPETRA_DETAILS_RESIDUAL_HPP
44#include "TpetraCore_config.h"
45#include "Tpetra_CrsMatrix.hpp"
46#include "Tpetra_LocalCrsMatrixOperator.hpp"
47#include "Tpetra_MultiVector.hpp"
48#include "Teuchos_RCP.hpp"
49#include "Teuchos_ScalarTraits.hpp"
52#include "KokkosSparse_spmv_impl.hpp"
70template<
class AMatrix,
class MV,
class ConstMV,
class Offsets,
bool is_MV,
bool restrictedMode,
bool skipOffRank>
73 using execution_space =
typename AMatrix::execution_space;
74 using LO =
typename AMatrix::non_const_ordinal_type;
75 using value_type =
typename AMatrix::non_const_value_type;
76 using team_policy =
typename Kokkos::TeamPolicy<execution_space>;
77 using team_member =
typename team_policy::member_type;
78 using ATV = Kokkos::ArithTraits<value_type>;
109 Kokkos::parallel_for(Kokkos::TeamThreadRange (
dev, 0, rows_per_team),[&] (
const LO&
loop) {
110 const LO
lclRow =
static_cast<LO
> (
dev.league_rank ()) * rows_per_team +
loop;
112 if (
lclRow >= A_lcl.numRows ()) {
118 value_type
A_x = ATV::zero ();
133 const size_t start = A_lcl.graph.row_map(
lclRow);
134 const size_t end = A_lcl.graph.row_map(
lclRow+1);
136 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (
dev, start,
end), [&] (
const LO
iEntry, value_type&
lsum) {
146 Kokkos::single(Kokkos::PerThread(
dev),[&] () {
152 const LO
numVectors =
static_cast<LO
>(X_colmap_lcl.extent(1));
156 value_type
A_x = ATV::zero ();
170 const size_t start = A_lcl.graph.row_map(
lclRow);
171 const size_t end = A_lcl.graph.row_map(
lclRow+1);
173 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (
dev, start,
end), [&] (
const LO
iEntry, value_type&
lsum) {
183 Kokkos::single(Kokkos::PerThread(
dev),[&] () {
195template<
class AMatrix,
class MV,
class ConstMV,
class Offsets,
bool is_MV>
198 using execution_space =
typename AMatrix::execution_space;
199 using LO =
typename AMatrix::non_const_ordinal_type;
200 using value_type =
typename AMatrix::non_const_value_type;
201 using team_policy =
typename Kokkos::TeamPolicy<execution_space>;
202 using team_member =
typename team_policy::member_type;
203 using ATV = Kokkos::ArithTraits<value_type>;
228 Kokkos::parallel_for(Kokkos::TeamThreadRange (
dev, 0, rows_per_team),[&] (
const LO&
loop) {
229 const LO
lclRow =
static_cast<LO
> (
dev.league_rank ()) * rows_per_team +
loop;
231 if (
lclRow >= A_lcl.numRows ()) {
236 const size_t end = A_lcl.graph.row_map(
lclRow+1);
240 value_type
A_x = ATV::zero ();
248 Kokkos::single(Kokkos::PerThread(
dev),[&] () {
254 const LO
numVectors =
static_cast<LO
>(X_colmap_lcl.extent(1));
258 value_type
A_x = ATV::zero ();
266 Kokkos::single(Kokkos::PerThread(
dev),[&] () {
276template<
class SC,
class LO,
class GO,
class NO>
281 const Kokkos::View<const size_t*, typename NO::device_type>& offsets,
284 using Teuchos::NO_TRANS;
290 using offset_type = Kokkos::View<const size_t*, typename NO::device_type>;
292 local_matrix_device_type A_lcl =
A.getLocalMatrixDevice ();
301 (
X_colmap.getNumVectors () !=
R.getNumVectors (), std::runtime_error,
302 "X.getNumVectors() = " <<
X_colmap.getNumVectors () <<
" != "
303 "R.getNumVectors() = " <<
R.getNumVectors () <<
".");
306 A.getColMap ()->getLocalNumElements (), std::runtime_error,
307 "X has the wrong number of local rows. "
308 "X.getLocalLength() = " <<
X_colmap.getLocalLength () <<
" != "
309 "A.getColMap()->getLocalNumElements() = " <<
310 A.getColMap ()->getLocalNumElements () <<
".");
312 (
R.getLocalLength () !=
313 A.getRowMap ()->getLocalNumElements (), std::runtime_error,
314 "R has the wrong number of local rows. "
315 "R.getLocalLength() = " <<
R.getLocalLength () <<
" != "
316 "A.getRowMap()->getLocalNumElements() = " <<
317 A.getRowMap ()->getLocalNumElements () <<
".");
319 (
B.getLocalLength () !=
320 A.getRowMap ()->getLocalNumElements (), std::runtime_error,
321 "B has the wrong number of local rows. "
322 "B.getLocalLength() = " <<
B.getLocalLength () <<
" != "
323 "A.getRowMap()->getLocalNumElements() = " <<
324 A.getRowMap ()->getLocalNumElements () <<
".");
327 (!
A.isFillComplete (), std::runtime_error,
"The matrix A is not "
328 "fill complete. You must call fillComplete() (possibly with "
329 "domain and range Map arguments) without an intervening "
330 "resumeFill() call before you may call this method.");
332 (!
X_colmap.isConstantStride () || !
R.isConstantStride () || !
B.isConstantStride (),
333 std::runtime_error,
"X, Y and B must be constant stride.");
337 ((X_colmap_lcl.data () == R_lcl.data () && X_colmap_lcl.data () !=
nullptr) ||
338 (X_colmap_lcl.data () == B_lcl.data () && X_colmap_lcl.data () !=
nullptr),
339 std::runtime_error,
"X, Y and R may not alias one another.");
342 SC one = Teuchos::ScalarTraits<SC>::one(),
negone = -
one,
zero = Teuchos::ScalarTraits<SC>::zero();
343#ifdef TPETRA_DETAILS_USE_REFERENCE_RESIDUAL
350 if (A_lcl.numRows() == 0) {
354 int64_t numLocalRows = A_lcl.numRows ();
355 int64_t myNnz = A_lcl.nnz();
359 LO maxRowImbalance = 0;
360 if(numLocalRows != 0)
361 maxRowImbalance = A.getLocalMaxNumRowEntries() - (myNnz / numLocalRows);
365 auto lclOp = A.getLocalMultiplyOperator();
367 lclOp->applyImbalancedRows (X_colmap_lcl, R_lcl, Teuchos::NO_TRANS, one, zero);
368 R.update(one,B,negone);
373 int vector_length = -1;
374 int64_t rows_per_thread = -1;
377 using policy_type =
typename Kokkos::TeamPolicy<execution_space>;
379 int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
380 int64_t worksets = (B_lcl.extent (0) + rows_per_team - 1) / rows_per_team;
382 policy_type policy (1, 1);
384 policy = policy_type (worksets, Kokkos::AUTO, vector_length);
387 policy = policy_type (worksets, team_size, vector_length);
390 bool is_vector = (X_colmap_lcl.extent(1) == 1);
394 if (X_domainmap ==
nullptr) {
396 using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false,false,false>;
397 functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
398 Kokkos::parallel_for(
"residual-vector",policy,func);
403 X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
404 using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false,true,false>;
405 functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
406 Kokkos::parallel_for(
"residual-vector",policy,func);
412 if (X_domainmap ==
nullptr) {
414 using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true,false,false>;
415 functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
416 Kokkos::parallel_for(
"residual-multivector",policy,func);
421 X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
422 using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true,true,false>;
423 functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
424 Kokkos::parallel_for(
"residual-multivector",policy,func);
432template<
class SC,
class LO,
class GO,
class NO>
433void localResidualWithCommCompOverlap(
const CrsMatrix<SC,LO,GO,NO> & A,
434 MultiVector<SC,LO,GO,NO> & X_colmap,
435 const MultiVector<SC,LO,GO,NO> & B,
436 MultiVector<SC,LO,GO,NO> & R,
437 const Kokkos::View<const size_t*, typename NO::device_type>& offsets,
438 const MultiVector<SC,LO,GO,NO> & X_domainmap) {
440 using Teuchos::NO_TRANS;
444 ProfilingRegion regionLocalApply (
"Tpetra::CrsMatrix::localResidualWithCommCompOverlap");
447 using local_view_device_type =
typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::non_const_type;
448 using const_local_view_device_type =
typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::const_type;
449 using offset_type = Kokkos::View<const size_t*, typename NO::device_type>;
451 local_matrix_device_type A_lcl = A.getLocalMatrixDevice ();
452 const_local_view_device_type X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly);
453 const_local_view_device_type B_lcl = B.getLocalViewDevice(Access::ReadOnly);
454 local_view_device_type R_lcl = R.getLocalViewDevice(Access::OverwriteAll);
455 const_local_view_device_type X_domainmap_lcl = X_domainmap.getLocalViewDevice(Access::ReadOnly);;
459 TEUCHOS_TEST_FOR_EXCEPTION
460 (X_colmap.getNumVectors () != R.getNumVectors (), std::runtime_error,
461 "X.getNumVectors() = " << X_colmap.getNumVectors () <<
" != "
462 "R.getNumVectors() = " << R.getNumVectors () <<
".");
463 TEUCHOS_TEST_FOR_EXCEPTION
464 (X_colmap.getLocalLength () !=
465 A.getColMap ()->getLocalNumElements (), std::runtime_error,
466 "X has the wrong number of local rows. "
467 "X.getLocalLength() = " << X_colmap.getLocalLength () <<
" != "
468 "A.getColMap()->getLocalNumElements() = " <<
469 A.getColMap ()->getLocalNumElements () <<
".");
470 TEUCHOS_TEST_FOR_EXCEPTION
471 (R.getLocalLength () !=
472 A.getRowMap ()->getLocalNumElements (), std::runtime_error,
473 "R has the wrong number of local rows. "
474 "R.getLocalLength() = " << R.getLocalLength () <<
" != "
475 "A.getRowMap()->getLocalNumElements() = " <<
476 A.getRowMap ()->getLocalNumElements () <<
".");
477 TEUCHOS_TEST_FOR_EXCEPTION
478 (B.getLocalLength () !=
479 A.getRowMap ()->getLocalNumElements (), std::runtime_error,
480 "B has the wrong number of local rows. "
481 "B.getLocalLength() = " << B.getLocalLength () <<
" != "
482 "A.getRowMap()->getLocalNumElements() = " <<
483 A.getRowMap ()->getLocalNumElements () <<
".");
485 TEUCHOS_TEST_FOR_EXCEPTION
486 (! A.isFillComplete (), std::runtime_error,
"The matrix A is not "
487 "fill complete. You must call fillComplete() (possibly with "
488 "domain and range Map arguments) without an intervening "
489 "resumeFill() call before you may call this method.");
490 TEUCHOS_TEST_FOR_EXCEPTION
491 (! X_colmap.isConstantStride () || ! R.isConstantStride () || ! B.isConstantStride (),
492 std::runtime_error,
"X, Y and B must be constant stride.");
495 TEUCHOS_TEST_FOR_EXCEPTION
496 ((X_colmap_lcl.data () == R_lcl.data () && X_colmap_lcl.data () !=
nullptr) ||
497 (X_colmap_lcl.data () == B_lcl.data () && X_colmap_lcl.data () !=
nullptr),
498 std::runtime_error,
"X, Y and R may not alias one another.");
501 if (A_lcl.numRows() == 0) {
505 int64_t numLocalRows = A_lcl.numRows ();
506 int64_t myNnz = A_lcl.nnz();
524 int vector_length = -1;
525 int64_t rows_per_thread = -1;
528 using policy_type =
typename Kokkos::TeamPolicy<execution_space>;
530 int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
531 int64_t worksets = (B_lcl.extent (0) + rows_per_team - 1) / rows_per_team;
533 policy_type policy (1, 1);
535 policy = policy_type (worksets, Kokkos::AUTO, vector_length);
538 policy = policy_type (worksets, team_size, vector_length);
541 bool is_vector = (X_colmap_lcl.extent(1) == 1);
545 using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false,true,true>;
546 functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
547 Kokkos::parallel_for(
"residual-vector",policy,func);
549 RCP<const import_type> importer = A.getGraph ()->getImporter ();
550 X_colmap.endImport (X_domainmap, *importer,
INSERT,
true);
554 using functor_type2 = OffRankUpdateFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false>;
555 functor_type2 func2 (A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets);
556 Kokkos::parallel_for(
"residual-vector-offrank",policy,func2);
561 using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true,true,true>;
562 functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
563 Kokkos::parallel_for(
"residual-multivector",policy,func);
565 RCP<const import_type> importer = A.getGraph ()->getImporter ();
566 X_colmap.endImport (X_domainmap, *importer,
INSERT,
true);
570 using functor_type2 = OffRankUpdateFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true>;
571 functor_type2 func2 (A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets);
572 Kokkos::parallel_for(
"residual-vector-offrank",policy,func2);
579template<
class SC,
class LO,
class GO,
class NO>
587 using Teuchos::rcp_const_cast;
588 using Teuchos::rcpFromRef;
593 if (overlapCommunicationAndComputation)
606 SC one = Teuchos::ScalarTraits<SC>::one(),
negone = -
one,
zero = Teuchos::ScalarTraits<SC>::zero();
617 using offset_type =
typename graph_type::offset_device_view_type;
621 (!
R_in.isDistributed () &&
A.getComm ()->getSize () != 1);
637 if (!
X_in.isConstantStride ()) {
667 (!
importer->getTargetMap()->isLocallyFitted(*
importer->getSourceMap()), std::runtime_error,
668 "Source map and target map are not locally fitted, but Tpetra::residual thinks they are.");
677 A.getCrsGraph()->getLocalOffRankOffsets(offsets);
684 if (!
R_in.isConstantStride ()) {
699 if (!
B_in.isConstantStride ()) {
742 if (overlapCommunicationAndComputation) {
760 if (!
R_in.isConstantStride () ) {
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
Struct that holds views of the contents of a CrsMatrix.
typename device_type::execution_space execution_space
The Kokkos execution space.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
Import< LocalOrdinal, GlobalOrdinal, Node > import_type
The Import specialization suitable for this CrsMatrix specialization.
static bool debug()
Whether Tpetra is in debug mode.
static bool overlapCommunicationAndComputation()
Overlap communication and computation.
static size_t rowImbalanceThreshold()
Threshold for deciding if a local matrix is "imbalanced" in the number of entries per row....
static bool skipCopyAndPermuteIfPossible()
Skip copyAndPermute if possible.
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
One or more distributed dense vectors.
Implementation details of Tpetra.
void residual(const Operator< SC, LO, GO, NO > &A, const MultiVector< SC, LO, GO, NO > &X, const MultiVector< SC, LO, GO, NO > &B, MultiVector< SC, LO, GO, NO > &R)
Computes R = B - A * X.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
@ INSERT
Insert new values that don't currently exist.
Functor for computing the residual.
Functor for computing R -= A_offRank*X_colmap.