TpetraExt_TripleMatrixMultiply_def.hpp
// @HEADER
// ***********************************************************************
//
//          Tpetra: Templated Linear Algebra Services Package
//                 Copyright (2008) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
//
// ************************************************************************
// @HEADER
#ifndef TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
#define TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP

#include "TpetraExt_TripleMatrixMultiply_decl.hpp"
#include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp" //for UnmanagedView
#include "Teuchos_VerboseObject.hpp"
#include "Teuchos_Array.hpp"
#include "Tpetra_Util.hpp"
#include "Tpetra_ConfigDefs.hpp"
#include "Tpetra_CrsMatrix.hpp"
#include "Tpetra_Import.hpp"
#include "Tpetra_RowMatrixTransposer.hpp"
#include "Tpetra_Map.hpp"
#include "Tpetra_Export.hpp"
#include "Tpetra_Import_Util.hpp"
#include "Tpetra_Import_Util2.hpp"
#include "Tpetra_Details_Behavior.hpp"
#include <algorithm>
#include <cmath>
#include "Teuchos_FancyOStream.hpp"
// #include "KokkosSparse_spgemm.hpp"

/*********************************************************************************************************/
// Include the architecture-specific kernel partial specializations here
// NOTE: This needs to be outside all namespaces
#include "TpetraExt_MatrixMatrix_OpenMP.hpp"
#include "TpetraExt_MatrixMatrix_Cuda.hpp"
#include "TpetraExt_MatrixMatrix_HIP.hpp"
#include "TpetraExt_MatrixMatrix_SYCL.hpp"

namespace Tpetra {

  namespace TripleMatrixMultiply{

    //
    // This method forms the matrix-matrix product Ac = op(R) * op(A) * op(P), where
    // op(A) == A   if transposeA is false,
    // op(A) == A^T if transposeA is true,
    // and similarly for op(R) and op(P).
    //
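    //
    // Example (editorial sketch, not part of the original Tpetra source): the
    // common multigrid use case is the Galerkin coarse operator Ac = P^T * A * P,
    // which this routine computes when R aliases P and transposeR is true. The
    // matrix setup below is a placeholder; any fill-complete CrsMatrix works,
    // and the trailing arguments are left at their defaults.
    //
    //   typedef Tpetra::CrsMatrix<> crs_matrix_type;
    //   Teuchos::RCP<crs_matrix_type> A = ...;  // fill-complete fine-grid operator
    //   Teuchos::RCP<crs_matrix_type> P = ...;  // fill-complete prolongator
    //   Teuchos::RCP<crs_matrix_type> Ac =
    //     Teuchos::rcp(new crs_matrix_type(P->getDomainMap(), 0));
    //   Tpetra::TripleMatrixMultiply::MultiplyRAP(*P, true, *A, false, *P, false, *Ac);
    //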
    template <class Scalar,
              class LocalOrdinal,
              class GlobalOrdinal,
              class Node>
    void MultiplyRAP(const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& R,
                     bool transposeR,
                     const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
                     bool transposeA,
                     const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& P,
                     bool transposeP,
                     CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
                     bool call_FillComplete_on_result,
                     const std::string& label,
                     const Teuchos::RCP<Teuchos::ParameterList>& params)
    {
      using Teuchos::null;
      using Teuchos::RCP;
      typedef Scalar SC;
      typedef LocalOrdinal LO;
      typedef GlobalOrdinal GO;
      typedef Node NO;
      typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
      typedef Import<LO,GO,NO> import_type;
      typedef Export<LO,GO,NO> export_type;
      typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
      typedef Map<LO,GO,NO> map_type;
      typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;

#ifdef HAVE_TPETRA_MMM_TIMINGS
      std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
      using Teuchos::TimeMonitor;
      RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All Setup"))));
#endif

      const std::string prefix = "TpetraExt::TripleMatrixMultiply::MultiplyRAP(): ";

      // TEUCHOS_FUNC_TIME_MONITOR_DIFF("My Matrix Mult", mmm_multiply);

      // The input matrices R, A and P must all be fillComplete.
      TEUCHOS_TEST_FOR_EXCEPTION(!R.isFillComplete(), std::runtime_error, prefix << "Matrix R is not fill complete.");
      TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
      TEUCHOS_TEST_FOR_EXCEPTION(!P.isFillComplete(), std::runtime_error, prefix << "Matrix P is not fill complete.");

      // If transposeR is true, then Rprime will be the transpose of R
      // (computed explicitly via RowMatrixTransposer). Otherwise, Rprime
      // will just be a pointer to R.
      RCP<const crs_matrix_type> Rprime = null;
      // If transposeA is true, then Aprime will be the transpose of A
      // (computed explicitly via RowMatrixTransposer). Otherwise, Aprime
      // will just be a pointer to A.
      RCP<const crs_matrix_type> Aprime = null;
      // If transposeP is true, then Pprime will be the transpose of P
      // (computed explicitly via RowMatrixTransposer). Otherwise, Pprime
      // will just be a pointer to P.
      RCP<const crs_matrix_type> Pprime = null;

      // Is this a "clean" matrix?
      //
      // mfh 27 Sep 2016: Historically, if Epetra_CrsMatrix was neither
      // locally nor globally indexed, then it was empty. I don't like
      // this, because the most straightforward implementation presumes
      // lazy allocation of indices. However, historical precedent
      // demands that we keep around this predicate as a way to test
      // whether the matrix is empty.
      const bool newFlag = !Ac.getGraph()->isLocallyIndexed() && !Ac.getGraph()->isGloballyIndexed();

      using Teuchos::ParameterList;
      RCP<ParameterList> transposeParams (new ParameterList);
      transposeParams->set ("sort", false);

      if (transposeR && &R != &P) {
        transposer_type transposer(R, label + std::string("XP: "));
        Rprime = transposer.createTranspose (transposeParams);
      } else {
        Rprime = Teuchos::rcpFromRef (R);
      }

      if (transposeA) {
        transposer_type transposer(A, label + std::string("XP: "));
        Aprime = transposer.createTranspose (transposeParams);
      } else {
        Aprime = Teuchos::rcpFromRef (A);
      }

      if (transposeP) {
        transposer_type transposer(P, label + std::string("XP: "));
        Pprime = transposer.createTranspose (transposeParams);
      } else {
        Pprime = Teuchos::rcpFromRef (P);
      }

      // Check size compatibility
      global_size_t numRCols = R.getDomainMap()->getGlobalNumElements();
      global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
      global_size_t numPCols = P.getDomainMap()->getGlobalNumElements();
      global_size_t Rleft  = transposeR ? numRCols : R.getGlobalNumRows();
      global_size_t Rright = transposeR ? R.getGlobalNumRows() : numRCols;
      global_size_t Aleft  = transposeA ? numACols : A.getGlobalNumRows();
      global_size_t Aright = transposeA ? A.getGlobalNumRows() : numACols;
      global_size_t Pleft  = transposeP ? numPCols : P.getGlobalNumRows();
      global_size_t Pright = transposeP ? P.getGlobalNumRows() : numPCols;
      TEUCHOS_TEST_FOR_EXCEPTION(Rright != Aleft, std::runtime_error,
                                 prefix << "ERROR, inner dimensions of op(R) and op(A) "
                                 "must match for matrix-matrix product. op(R) is "
                                 << Rleft << "x" << Rright << ", op(A) is "<< Aleft << "x" << Aright);

      TEUCHOS_TEST_FOR_EXCEPTION(Aright != Pleft, std::runtime_error,
                                 prefix << "ERROR, inner dimensions of op(A) and op(P) "
                                 "must match for matrix-matrix product. op(A) is "
                                 << Aleft << "x" << Aright << ", op(P) is "<< Pleft << "x" << Pright);
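
      // For intuition (editorial example, not in the original source): with
      // transposeR == true, an R stored as 100x50 acts as op(R) = R^T, so
      // Rleft = 50 and Rright = 100; it then composes with a 100x100 op(A)
      // and a 100x50 op(P) to yield a 50x50 product Ac.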

      // The result matrix Ac must at least have a row-map that reflects the correct
      // row-size. Don't check the number of columns because rectangular matrices
      // which were constructed with only one map can still end up having the
      // correct capacity and dimensions when filled.
      TEUCHOS_TEST_FOR_EXCEPTION(Rleft > Ac.getGlobalNumRows(), std::runtime_error,
                                 prefix << "ERROR, dimensions of result Ac must "
                                 "match dimensions of op(R) * op(A) * op(P). Ac has " << Ac.getGlobalNumRows()
                                 << " rows, should have at least " << Rleft << std::endl);

      // It doesn't matter whether Ac is already Filled or not. If it is already
      // Filled, it must have space allocated for the positions that will be
      // referenced in forming Ac = op(R)*op(A)*op(P). If it doesn't have enough space,
      // we'll error out later when trying to store result values.

      // CGB: However, matrix must be in active-fill
      TEUCHOS_TEST_FOR_EXCEPT( Ac.isFillActive() == false );

      // We're going to need to import remotely-owned sections of P if
      // the communicator has more than one process.
      int numProcs = P.getComm()->getSize();

      // Declare a couple of structs that will be used to hold views of the data
      // of R, A and P, to be used for fast access during the matrix-multiplication.
      crs_matrix_struct_type Rview;
      crs_matrix_struct_type Aview;
      crs_matrix_struct_type Pview;

      RCP<const map_type> targetMap_R = Rprime->getRowMap();
      RCP<const map_type> targetMap_A = Aprime->getRowMap();
      RCP<const map_type> targetMap_P = Pprime->getRowMap();

#ifdef HAVE_TPETRA_MMM_TIMINGS
      MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All I&X"))));
#endif

      // Now import any needed remote rows and populate the Aview struct
      // NOTE: We assert that an import isn't needed --- since we do the transpose
      // above to handle that.
      RCP<const import_type> dummyImporter;

      if (!(transposeR && &R == &P))
        MMdetails::import_and_extract_views(*Rprime, targetMap_R, Rview, dummyImporter, true, label, params);

      MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, params);

      // We will also need local access to all rows of P that correspond to the
      // column-map of op(A).
      if (numProcs > 1)
        targetMap_P = Aprime->getColMap();

      // Import any needed remote rows and populate the Pview struct.
      MMdetails::import_and_extract_views(*Pprime, targetMap_P, Pview, Aprime->getGraph()->getImporter(), false, label, params);

      RCP<crs_matrix_type> Actemp;

      bool needs_final_export = !Pprime->getGraph()->getImporter().is_null();
      if (needs_final_export)
        Actemp = rcp(new crs_matrix_type(Pprime->getColMap(), 0));
      else
        Actemp = rcp(&Ac, false); // don't allow deallocation

#ifdef HAVE_TPETRA_MMM_TIMINGS
      MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All Multiply"))));
#endif

      // Call the appropriate method to perform the actual multiplication.
      if (call_FillComplete_on_result && newFlag) {
        if (transposeR && &R == &P)
          MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
        else
          MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
      } else if (call_FillComplete_on_result) {
        if (transposeR && &R == &P)
          MMdetails::mult_PT_A_P_reuse(Aview, Pview, *Actemp, label, params);
        else
          MMdetails::mult_R_A_P_reuse(Rview, Aview, Pview, *Actemp, label, params);
      } else {
        // mfh 27 Sep 2016: Is this the "slow" case? This
        // "CrsWrapper_CrsMatrix" thing could perhaps be made to support
        // thread-parallel inserts, but that may take some effort.
        // CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(Ac);

        // MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);

        // #ifdef HAVE_TPETRA_MMM_TIMINGS
        //   MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All FillComplete"))));
        // #endif
        // if (call_FillComplete_on_result) {
        //   // We'll call FillComplete on the C matrix before we exit, and give it a
        //   // domain-map and a range-map.
        //   // The domain-map will be the domain-map of B, unless
        //   // op(B)==transpose(B), in which case the range-map of B will be used.
        //   // The range-map will be the range-map of A, unless op(A)==transpose(A),
        //   // in which case the domain-map of A will be used.
        //   if (!C.isFillComplete())
        //     C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
        // }
        // Not implemented: fall back to the newmatrix kernels for now.
        if (transposeR && &R == &P)
          MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
        else
          MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
      }

      if (needs_final_export) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
        MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP exportAndFillComplete"))));
#endif
        Teuchos::ParameterList labelList;
        labelList.set("Timer Label", label);
        Teuchos::ParameterList& labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);

        RCP<crs_matrix_type> Acprime = Teuchos::rcpFromRef(Ac);
        int mm_optimization_core_count = ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
        bool isMM = true;
        bool overrideAllreduce = false;
        if(!params.is_null()) {
          Teuchos::ParameterList& params_sublist = params->sublist("matrixmatrix: kernel params",false);
          mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
          int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
          if(mm_optimization_core_count2 < mm_optimization_core_count) mm_optimization_core_count = mm_optimization_core_count2;
          isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
          overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);

          labelList.set("compute global constants",params->get("compute global constants",true));
        }
        labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count,"Core Count above which the optimized neighbor discovery is used");

        labelList_subList.set("isMatrixMatrix_TransferAndFillComplete",isMM,
                              "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
        labelList_subList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
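
        // For reference, a caller could steer these knobs through the params
        // list passed to MultiplyRAP (editorial sketch; values illustrative):
        //
        //   Teuchos::RCP<Teuchos::ParameterList> p = Teuchos::parameterList();
        //   p->sublist("matrixmatrix: kernel params")
        //     .set("MM_TAFC_OptimizationCoreCount", 3000);
        //   Tpetra::TripleMatrixMultiply::MultiplyRAP(R, false, A, false, P, false,
        //                                             Ac, true, "myRAP", p);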

        export_type exporter = export_type(*Pprime->getGraph()->getImporter());
        Actemp->exportAndFillComplete(Acprime,
                                      exporter,
                                      Acprime->getDomainMap(),
                                      Acprime->getRangeMap(),
                                      rcp(&labelList,false));

      }
#ifdef HAVE_TPETRA_MMM_STATISTICS
      printMultiplicationStatistics(Actemp->getGraph()->getExporter(), label+std::string(" RAP MMM"));
#endif

    }


  } //End namespace TripleMatrixMultiply

  namespace MMdetails{

    // Kernel method for computing the local portion of Ac = R*A*P
    template<class Scalar,
             class LocalOrdinal,
             class GlobalOrdinal,
             class Node>
    void mult_R_A_P_newmatrix(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
                              CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
                              CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
                              CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
                              const std::string& label,
                              const Teuchos::RCP<Teuchos::ParameterList>& params)
    {
      using Teuchos::Array;
      using Teuchos::ArrayRCP;
      using Teuchos::ArrayView;
      using Teuchos::RCP;
      using Teuchos::rcp;

      //typedef Scalar SC; // unused
      typedef LocalOrdinal LO;
      typedef GlobalOrdinal GO;
      typedef Node NO;

      typedef Import<LO,GO,NO> import_type;
      typedef Map<LO,GO,NO> map_type;

      // Kokkos typedefs
      typedef typename map_type::local_map_type local_map_type;
      typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KCRS;
      typedef typename KCRS::StaticCrsGraphType graph_t;
      typedef typename graph_t::row_map_type::non_const_type lno_view_t;
      typedef typename NO::execution_space execution_space;
      typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
      typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;

#ifdef HAVE_TPETRA_MMM_TIMINGS
      std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
      using Teuchos::TimeMonitor;
      RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP M5 Cmap")))));
#endif
      LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();

      // Build the final importer / column map, hash table lookups for Ac
      RCP<const import_type> Cimport;
      RCP<const map_type> Ccolmap;
      RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
      RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
      local_map_type Acolmap_local = Aview.colMap->getLocalMap();
      local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
      local_map_type Irowmap_local;  if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
      local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
      local_map_type Icolmap_local;  if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();


      // mfh 27 Sep 2016: Pcol2Ccol is a table that maps from local column
      // indices of P, to local column indices of Ac. (P and Ac have the
      // same number of columns.) The kernel uses this, instead of
      // copying the entire input matrix P and converting its column
      // indices to those of Ac.
      lo_view_t Pcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Pcol2Ccol"),Pview.colMap->getLocalNumElements()), Icol2Ccol;

      if (Pview.importMatrix.is_null()) {
        // mfh 27 Sep 2016: P has no "remotes," so P and Ac have the same column Map.
        Cimport = Pimport;
        Ccolmap = Pview.colMap;
        const LO colMapSize = static_cast<LO>(Pview.colMap->getLocalNumElements());
        // Pcol2Ccol is trivial
        Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_fill",
                             Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
                             KOKKOS_LAMBDA(const LO i) {
                               Pcol2Ccol(i) = i;
                             });
      }
      else {
        // mfh 27 Sep 2016: P has "remotes," so we need to build the
        // column Map of Ac, as well as Ac's Import object (from its domain
        // Map to its column Map). Ac's column Map is the union of the
        // column Maps of (the local part of) P, and the "remote" part of
        // P. Ditto for the Import. We have optimized this "setUnion"
        // operation on Import objects and Maps.

        // Choose the right variant of setUnion
        if (!Pimport.is_null() && !Iimport.is_null()) {
          Cimport = Pimport->setUnion(*Iimport);
        }
        else if (!Pimport.is_null() && Iimport.is_null()) {
          Cimport = Pimport->setUnion();
        }
        else if (Pimport.is_null() && !Iimport.is_null()) {
          Cimport = Iimport->setUnion();
        }
        else {
          throw std::runtime_error("TpetraExt::RAP status of matrix importers is nonsensical");
        }
        Ccolmap = Cimport->getTargetMap();

        // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
        // in general. We should get rid of it in order to reduce
        // communication costs of sparse matrix-matrix multiply.
        TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
                                   std::runtime_error, "Tpetra::RAP: Import setUnion messed with the DomainMap in an unfortunate way");

        // NOTE: This is not efficient and should be folded into setUnion
        //
        // mfh 27 Sep 2016: What the above comment means, is that the
        // setUnion operation on Import objects could also compute these
        // local index - to - local index look-up tables.
        Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getLocalNumElements());
        local_map_type Ccolmap_local = Ccolmap->getLocalMap();
        Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_getGlobalElement",range_type(0,Pview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
            Pcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
          });
        Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Pview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
            Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
          });

      }

      // Replace the column map
      //
      // mfh 27 Sep 2016: We do this because Ac was originally created
      // without a column Map. Now we have its column Map.
      Ac.replaceColMap(Ccolmap);

      // mfh 27 Sep 2016: Construct tables that map from local column
      // indices of A, to local row indices of either P_local (the locally
      // owned part of P), or P_remote (the "imported" remote part of P).
      //
      // For column index Aik in row i of A, if the corresponding row of P
      // exists in the local part of P ("orig") (which I'll call P_local),
      // then targetMapToOrigRow[Aik] is the local index of that row of P.
      // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
      //
      // For column index Aik in row i of A, if the corresponding row of P
      // exists in the remote part of P ("Import") (which I'll call
      // P_remote), then targetMapToImportRow[Aik] is the local index of
      // that row of P. Otherwise, targetMapToImportRow[Aik] is "invalid"
      // (a flag value).

      // Run through all the hash table lookups once and for all
      lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
      lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
      Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
          GO aidx = Acolmap_local.getGlobalElement(i);
          LO P_LID = Prowmap_local.getLocalElement(aidx);
          if (P_LID != LO_INVALID) {
            targetMapToOrigRow(i)   = P_LID;
            targetMapToImportRow(i) = LO_INVALID;
          } else {
            LO I_LID = Irowmap_local.getLocalElement(aidx);
            targetMapToOrigRow(i)   = LO_INVALID;
            targetMapToImportRow(i) = I_LID;
          }
        });
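
      // Editorial example of these tables (not from the original source): if
      // A's column map holds GIDs {7, 12}, where row 7 of P is owned locally
      // as local row 3 and row 12 exists only in the imported matrix as local
      // row 0, then targetMapToOrigRow = {3, LO_INVALID} and
      // targetMapToImportRow = {LO_INVALID, 0}.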

      // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
      // either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
      KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
        mult_R_A_P_newmatrix_kernel_wrapper(Rview, Aview, Pview,
                                            targetMapToOrigRow,targetMapToImportRow, Pcol2Ccol, Icol2Ccol,
                                            Ac, Cimport, label, params);
    }


    // Kernel method for computing the local portion of Ac = R*A*P (reuse mode)
    template<class Scalar,
             class LocalOrdinal,
             class GlobalOrdinal,
             class Node>
    void mult_R_A_P_reuse(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
                          CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
                          CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
                          CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
                          const std::string& label,
                          const Teuchos::RCP<Teuchos::ParameterList>& params)
    {
      using Teuchos::Array;
      using Teuchos::ArrayRCP;
      using Teuchos::ArrayView;
      using Teuchos::RCP;
      using Teuchos::rcp;

      //typedef Scalar SC; // unused
      typedef LocalOrdinal LO;
      typedef GlobalOrdinal GO;
      typedef Node NO;

      typedef Import<LO,GO,NO> import_type;
      typedef Map<LO,GO,NO> map_type;

      // Kokkos typedefs
      typedef typename map_type::local_map_type local_map_type;
      typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KCRS;
      typedef typename KCRS::StaticCrsGraphType graph_t;
      typedef typename graph_t::row_map_type::non_const_type lno_view_t;
      typedef typename NO::execution_space execution_space;
      typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
      typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;

#ifdef HAVE_TPETRA_MMM_TIMINGS
      std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
      using Teuchos::TimeMonitor;
      RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP M5 Cmap")))));
#endif
      LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();

      // Grab the existing importer / column map and build hash table lookups for Ac
      RCP<const import_type> Cimport = Ac.getGraph()->getImporter();
      RCP<const map_type> Ccolmap = Ac.getColMap();
      RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
      RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
      local_map_type Acolmap_local = Aview.colMap->getLocalMap();
      local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
      local_map_type Irowmap_local;  if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
      local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
      local_map_type Icolmap_local;  if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
      local_map_type Ccolmap_local = Ccolmap->getLocalMap();

      // Build the hash table lookups for Ac's columns
      lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Pview.colMap->getLocalNumElements()), Icol2Ccol;
      {
        // Bcol2Ccol may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix.
        // So, the column map of Ac may be a strict subset of the column map of P.
        Kokkos::parallel_for(range_type(0,Pview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
            Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
          });

        if (!Pview.importMatrix.is_null()) {
          TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
                                     std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");

          Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getLocalNumElements());
          Kokkos::parallel_for(range_type(0,Pview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
              Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
            });
        }
      }

      // Run through all the hash table lookups once and for all
      lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
      lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
      Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
          GO aidx = Acolmap_local.getGlobalElement(i);
          LO P_LID = Prowmap_local.getLocalElement(aidx);
          if (P_LID != LO_INVALID) {
            targetMapToOrigRow(i)   = P_LID;
            targetMapToImportRow(i) = LO_INVALID;
          } else {
            LO I_LID = Irowmap_local.getLocalElement(aidx);
            targetMapToOrigRow(i)   = LO_INVALID;
            targetMapToImportRow(i) = I_LID;
          }
        });

      // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
      // either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
      KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
        mult_R_A_P_reuse_kernel_wrapper(Rview, Aview, Pview,
                                        targetMapToOrigRow,targetMapToImportRow, Bcol2Ccol, Icol2Ccol,
                                        Ac, Cimport, label, params);
    }


    // Kernel method for computing the local portion of Ac = P^T*A*P
    template<class Scalar,
             class LocalOrdinal,
             class GlobalOrdinal,
             class Node>
    void mult_PT_A_P_newmatrix(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
                               CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
                               CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
                               const std::string& label,
                               const Teuchos::RCP<Teuchos::ParameterList>& params)
    {
      using Teuchos::Array;
      using Teuchos::ArrayRCP;
      using Teuchos::ArrayView;
      using Teuchos::RCP;
      using Teuchos::rcp;

      //typedef Scalar SC; // unused
      typedef LocalOrdinal LO;
      typedef GlobalOrdinal GO;
      typedef Node NO;

      typedef Import<LO,GO,NO> import_type;
      typedef Map<LO,GO,NO> map_type;

      // Kokkos typedefs
      typedef typename map_type::local_map_type local_map_type;
      typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KCRS;
      typedef typename KCRS::StaticCrsGraphType graph_t;
      typedef typename graph_t::row_map_type::non_const_type lno_view_t;
      typedef typename NO::execution_space execution_space;
      typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
      typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;

#ifdef HAVE_TPETRA_MMM_TIMINGS
      std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
      using Teuchos::TimeMonitor;
      RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP M5 Cmap")))));
#endif
      LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();

      // Build the final importer / column map, hash table lookups for Ac
      RCP<const import_type> Cimport;
      RCP<const map_type> Ccolmap;
      RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
      RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
      local_map_type Acolmap_local = Aview.colMap->getLocalMap();
      local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
      local_map_type Irowmap_local;  if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
      local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
      local_map_type Icolmap_local;  if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();


      // mfh 27 Sep 2016: Pcol2Ccol is a table that maps from local column
      // indices of P, to local column indices of Ac. (P and Ac have the
      // same number of columns.) The kernel uses this, instead of
      // copying the entire input matrix P and converting its column
      // indices to those of Ac.
      lo_view_t Pcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Pcol2Ccol"),Pview.colMap->getLocalNumElements()), Icol2Ccol;

      if (Pview.importMatrix.is_null()) {
        // mfh 27 Sep 2016: P has no "remotes," so P and Ac have the same column Map.
        Cimport = Pimport;
        Ccolmap = Pview.colMap;
        const LO colMapSize = static_cast<LO>(Pview.colMap->getLocalNumElements());
        // Pcol2Ccol is trivial
        Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_fill",
                             Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
                             KOKKOS_LAMBDA(const LO i) {
                               Pcol2Ccol(i) = i;
                             });
      }
      else {
        // mfh 27 Sep 2016: P has "remotes," so we need to build the
        // column Map of Ac, as well as Ac's Import object (from its domain
        // Map to its column Map). Ac's column Map is the union of the
        // column Maps of (the local part of) P, and the "remote" part of
        // P. Ditto for the Import. We have optimized this "setUnion"
        // operation on Import objects and Maps.

        // Choose the right variant of setUnion
        if (!Pimport.is_null() && !Iimport.is_null()) {
          Cimport = Pimport->setUnion(*Iimport);
        }
        else if (!Pimport.is_null() && Iimport.is_null()) {
          Cimport = Pimport->setUnion();
        }
        else if (Pimport.is_null() && !Iimport.is_null()) {
          Cimport = Iimport->setUnion();
        }
        else {
          throw std::runtime_error("TpetraExt::RAP status of matrix importers is nonsensical");
        }
        Ccolmap = Cimport->getTargetMap();

        // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
        // in general. We should get rid of it in order to reduce
        // communication costs of sparse matrix-matrix multiply.
        TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
                                   std::runtime_error, "Tpetra::RAP: Import setUnion messed with the DomainMap in an unfortunate way");

        // NOTE: This is not efficient and should be folded into setUnion
        //
        // mfh 27 Sep 2016: What the above comment means, is that the
        // setUnion operation on Import objects could also compute these
        // local index - to - local index look-up tables.
        Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getLocalNumElements());
        local_map_type Ccolmap_local = Ccolmap->getLocalMap();
        Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_getGlobalElement",range_type(0,Pview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
            Pcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
          });
        Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Pview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
            Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
          });

      }

      // Replace the column map
      //
      // mfh 27 Sep 2016: We do this because Ac was originally created
      // without a column Map. Now we have its column Map.
      Ac.replaceColMap(Ccolmap);

      // mfh 27 Sep 2016: Construct tables that map from local column
      // indices of A, to local row indices of either P_local (the locally
      // owned part of P), or P_remote (the "imported" remote part of P).
      //
      // For column index Aik in row i of A, if the corresponding row of P
      // exists in the local part of P ("orig") (which I'll call P_local),
      // then targetMapToOrigRow[Aik] is the local index of that row of P.
      // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
      //
      // For column index Aik in row i of A, if the corresponding row of P
      // exists in the remote part of P ("Import") (which I'll call
      // P_remote), then targetMapToImportRow[Aik] is the local index of
      // that row of P. Otherwise, targetMapToImportRow[Aik] is "invalid"
      // (a flag value).

      // Run through all the hash table lookups once and for all
      lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
      lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());

      Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
          GO aidx = Acolmap_local.getGlobalElement(i);
          LO P_LID = Prowmap_local.getLocalElement(aidx);
          if (P_LID != LO_INVALID) {
            targetMapToOrigRow(i)   = P_LID;
            targetMapToImportRow(i) = LO_INVALID;
          } else {
            LO I_LID = Irowmap_local.getLocalElement(aidx);
            targetMapToOrigRow(i)   = LO_INVALID;
            targetMapToImportRow(i) = I_LID;
          }
        });

      // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
      // either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
      KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
        mult_PT_A_P_newmatrix_kernel_wrapper(Aview, Pview,
                                             targetMapToOrigRow,targetMapToImportRow, Pcol2Ccol, Icol2Ccol,
                                             Ac, Cimport, label, params);
    }

    // Kernel method for computing the local portion of Ac = P^T*A*P (reuse mode)
    template<class Scalar,
             class LocalOrdinal,
             class GlobalOrdinal,
             class Node>
    void mult_PT_A_P_reuse(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
                           CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
                           CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
                           const std::string& label,
                           const Teuchos::RCP<Teuchos::ParameterList>& params)
    {
      using Teuchos::Array;
      using Teuchos::ArrayRCP;
      using Teuchos::ArrayView;
      using Teuchos::RCP;
      using Teuchos::rcp;

      //typedef Scalar SC; // unused
      typedef LocalOrdinal LO;
      typedef GlobalOrdinal GO;
      typedef Node NO;

      typedef Import<LO,GO,NO> import_type;
      typedef Map<LO,GO,NO> map_type;

      // Kokkos typedefs
      typedef typename map_type::local_map_type local_map_type;
      typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KCRS;
      typedef typename KCRS::StaticCrsGraphType graph_t;
      typedef typename graph_t::row_map_type::non_const_type lno_view_t;
      typedef typename NO::execution_space execution_space;
      typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
      typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;

#ifdef HAVE_TPETRA_MMM_TIMINGS
      std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
      using Teuchos::TimeMonitor;
      RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP M5 Cmap")))));
#endif
      LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();

      // Grab the existing importer / column map and build hash table lookups for Ac
      RCP<const import_type> Cimport = Ac.getGraph()->getImporter();
      RCP<const map_type> Ccolmap = Ac.getColMap();
      RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
      RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
      local_map_type Acolmap_local = Aview.colMap->getLocalMap();
      local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
      local_map_type Irowmap_local;  if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
      local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
      local_map_type Icolmap_local;  if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
      local_map_type Ccolmap_local = Ccolmap->getLocalMap();

      // Build the hash table lookups for Ac's columns
      lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Pview.colMap->getLocalNumElements()), Icol2Ccol;
      {
        // Bcol2Ccol may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix.
        // So, the column map of Ac may be a strict subset of the column map of P.
        Kokkos::parallel_for(range_type(0,Pview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
            Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
          });

        if (!Pview.importMatrix.is_null()) {
          TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
                                     std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");

          Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getLocalNumElements());
          Kokkos::parallel_for(range_type(0,Pview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
              Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
            });
        }
      }

      // Run through all the hash table lookups once and for all
      lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
      lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
      Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
          GO aidx = Acolmap_local.getGlobalElement(i);
          LO P_LID = Prowmap_local.getLocalElement(aidx);
          if (P_LID != LO_INVALID) {
            targetMapToOrigRow(i)   = P_LID;
            targetMapToImportRow(i) = LO_INVALID;
          } else {
            LO I_LID = Irowmap_local.getLocalElement(aidx);
            targetMapToOrigRow(i)   = LO_INVALID;
            targetMapToImportRow(i) = I_LID;
          }
        });

      // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
      // either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
      KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
        mult_PT_A_P_reuse_kernel_wrapper(Aview, Pview,
                                         targetMapToOrigRow,targetMapToImportRow, Bcol2Ccol, Icol2Ccol,
                                         Ac, Cimport, label, params);
    }


    /*********************************************************************************************************/
    // RAP NewMatrix Kernel wrappers (Default non-threaded version)
    // Computes R * A * P -> Ac using the classic Gustavson approach
    template<class Scalar,
             class LocalOrdinal,
             class GlobalOrdinal,
             class Node,
             class LocalOrdinalViewType>
    void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::
    mult_R_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
                                        CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
                                        CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
                                        const LocalOrdinalViewType & Acol2Prow_dev,
                                        const LocalOrdinalViewType & Acol2PIrow_dev,
                                        const LocalOrdinalViewType & Pcol2Accol_dev,
                                        const LocalOrdinalViewType & PIcol2Accol_dev,
                                        CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
                                        Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
                                        const std::string& label,
                                        const Teuchos::RCP<Teuchos::ParameterList>& params) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
      std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
      using Teuchos::TimeMonitor;
      Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix SerialCore"))));
#endif

      using Teuchos::Array;
      using Teuchos::ArrayRCP;
      using Teuchos::ArrayView;
      using Teuchos::RCP;
      using Teuchos::rcp;

      // Lots and lots of typedefs
      typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KCRS;
      typedef typename KCRS::StaticCrsGraphType graph_t;
      typedef typename graph_t::row_map_type::const_type c_lno_view_t;
      typedef typename graph_t::row_map_type::non_const_type lno_view_t;
      typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
      typedef typename KCRS::values_type::non_const_type scalar_view_t;

      typedef Scalar SC;
      typedef LocalOrdinal LO;
      typedef GlobalOrdinal GO;
      typedef Node NO;
      typedef Map<LO,GO,NO> map_type;
      const size_t ST_INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
      const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
      const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();

      // Sizes
      RCP<const map_type> Accolmap = Ac.getColMap();
      size_t m = Rview.origMatrix->getLocalNumRows();
      size_t n = Accolmap->getLocalNumElements();
      size_t p_max_nnz_per_row = Pview.origMatrix->getLocalMaxNumRowEntries();

      // Routine runs on host; have to put arguments on host, too
      auto Acol2Prow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
                                                           Acol2Prow_dev);
      auto Acol2PIrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
                                                            Acol2PIrow_dev);
      auto Pcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
                                                            Pcol2Accol_dev);
      auto PIcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
                                                             PIcol2Accol_dev);

      // Grab the Kokkos::SparseCrsMatrices & inner stuff
      const auto Amat = Aview.origMatrix->getLocalMatrixHost();
      const auto Pmat = Pview.origMatrix->getLocalMatrixHost();
      const auto Rmat = Rview.origMatrix->getLocalMatrixHost();

      auto Arowptr = Amat.graph.row_map;
      auto Prowptr = Pmat.graph.row_map;
      auto Rrowptr = Rmat.graph.row_map;
      const auto Acolind = Amat.graph.entries;
      const auto Pcolind = Pmat.graph.entries;
      const auto Rcolind = Rmat.graph.entries;
      const auto Avals = Amat.values;
      const auto Pvals = Pmat.values;
      const auto Rvals = Rmat.values;

      typename c_lno_view_t::HostMirror::const_type Irowptr;
      typename lno_nnz_view_t::HostMirror Icolind;
      typename scalar_view_t::HostMirror Ivals;
      if(!Pview.importMatrix.is_null()) {
        auto lclP = Pview.importMatrix->getLocalMatrixHost();
        Irowptr = lclP.graph.row_map;
        Icolind = lclP.graph.entries;
        Ivals = lclP.values;
        p_max_nnz_per_row = std::max(p_max_nnz_per_row,Pview.importMatrix->getLocalMaxNumRowEntries());
      }

#ifdef HAVE_TPETRA_MMM_TIMINGS
      RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix SerialCore - Compare"))));
#endif

      // Classic csr assembly (low memory edition)
      //
      // mfh 27 Sep 2016: Ac_estimate_nnz does not promise an upper bound.
      // The method loops over rows of R, and may resize after processing
      // each row. Chris Siefert says that this reflects experience in
      // ML; for the non-threaded case, ML found it faster to spend less
      // effort on estimation and risk an occasional reallocation.
      size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
      typename lno_view_t::HostMirror Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
      typename lno_nnz_view_t::HostMirror Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
      typename scalar_view_t::HostMirror Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);

      // mfh 27 Sep 2016: The ac_status array is an implementation detail
      // of the local sparse matrix-matrix multiply routine.

      // The status array will contain the index into colind where this entry was last deposited.
      //   ac_status[i] <  nnz - not in the row yet
      //   ac_status[i] >= nnz - this is the entry where you can find the data
      // We start with this filled with INVALID's indicating that there are no entries yet.
      // Sadly, this complicates the code due to the fact that size_t's are unsigned.
      const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
      Array<size_t> ac_status(n, ST_INVALID);
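
      // Editorial walk-through of the sparse-accumulator trick (not from the
      // original source): suppose row i of Ac receives contributions to
      // column 4 twice. On the first visit ac_status[4] is INVALID (or points
      // below nnz_old, i.e. into an earlier row), so a new entry is appended
      // at position nnz and ac_status[4] = nnz records where it went. On the
      // second visit ac_status[4] >= nnz_old, so the contribution is
      // accumulated into Cvals[ac_status[4]] instead of creating a duplicate.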

      // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
      // routine. The routine computes Ac := R * A * (P_local + P_remote).
      //
      // For column index Aik in row i of A, Acol2Prow[Aik] tells
      // you whether the corresponding row of P belongs to P_local
      // ("orig") or P_remote ("Import").

      // For each row of R
      size_t nnz = 0, nnz_old = 0;
      for (size_t i = 0; i < m; i++) {
        // mfh 27 Sep 2016: m is the number of rows in the input matrix R
        // on the calling process.
        Crowptr[i] = nnz;

        // mfh 27 Sep 2016: For each entry of R in the current row of R
        for (size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
          LO k = Rcolind[kk];       // local column index of current entry of R
          const SC Rik = Rvals[kk]; // value of current entry of R
          if (Rik == SC_ZERO)
            continue; // skip explicitly stored zero values in R
          // For each entry of A in the current row of A
          for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
            LO l = Acolind[ll];       // local column index of current entry of A
            const SC Akl = Avals[ll]; // value of current entry of A
            if (Akl == SC_ZERO)
              continue; // skip explicitly stored zero values in A

            if (Acol2Prow[l] != LO_INVALID) {
              // mfh 27 Sep 2016: If the entry of Acol2Prow
              // corresponding to the current entry of A is populated, then
              // the corresponding row of P is in P_local (i.e., it lives on
              // the calling process).

              // Local matrix
              size_t Pl = Teuchos::as<size_t>(Acol2Prow[l]);

              // mfh 27 Sep 2016: Go through all entries in that row of P_local.
              for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
                LO j = Pcolind[jj];
                LO Acj = Pcol2Accol[j];
                SC Plj = Pvals[jj];

                if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
#ifdef HAVE_TPETRA_DEBUG
                  // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
                  TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Ccolind.size()),
                                             std::runtime_error,
                                             label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Ccolind.extent(0) << std::endl);
#endif
                  // New entry
                  ac_status[Acj] = nnz;
                  Ccolind[nnz] = Acj;
                  Cvals[nnz] = Rik*Akl*Plj;
                  nnz++;
                } else {
                  Cvals[ac_status[Acj]] += Rik*Akl*Plj;
                }
              }
            } else {
              // mfh 27 Sep 2016: If the entry of Acol2Prow
              // corresponding to the current entry of A is NOT populated (has
              // a flag "invalid" value), then the corresponding row of P is
              // in P_remote (i.e., it does not live on the calling process).

              // Remote matrix
              size_t Il = Teuchos::as<size_t>(Acol2PIrow[l]);
              for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
                LO j = Icolind[jj];
                LO Acj = PIcol2Accol[j];
                SC Plj = Ivals[jj];

                if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
#ifdef HAVE_TPETRA_DEBUG
                  // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
                  TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Ccolind.size()),
                                             std::runtime_error,
                                             label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Ccolind.extent(0) << std::endl);
#endif
                  // New entry
                  ac_status[Acj] = nnz;
                  Ccolind[nnz] = Acj;
                  Cvals[nnz] = Rik*Akl*Plj;
                  nnz++;
                } else {
                  Cvals[ac_status[Acj]] += Rik*Akl*Plj;
                }
              }
            }
          }
        }
        // Resize for next pass if needed
        if (nnz + n > CSR_alloc) {
          CSR_alloc *= 2;
          Kokkos::resize(Ccolind,CSR_alloc);
          Kokkos::resize(Cvals,CSR_alloc);
        }
        nnz_old = nnz;
      }

      Crowptr[m] = nnz;

      // Downward resize
      Kokkos::resize(Ccolind,nnz);
      Kokkos::resize(Cvals,nnz);

#ifdef HAVE_TPETRA_MMM_TIMINGS
      MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix Final Sort"))));
#endif
      auto Crowptr_dev = Kokkos::create_mirror_view_and_copy(
                                                             typename KCRS::device_type(), Crowptr);
      auto Ccolind_dev = Kokkos::create_mirror_view_and_copy(
                                                             typename KCRS::device_type(), Ccolind);
      auto Cvals_dev = Kokkos::create_mirror_view_and_copy(
                                                           typename KCRS::device_type(), Cvals);

      // Final sort & set of CRS arrays
      if (params.is_null() || params->get("sort entries",true))
        Import_Util::sortCrsEntries(Crowptr_dev, Ccolind_dev, Cvals_dev);
      Ac.setAllValues(Crowptr_dev, Ccolind_dev, Cvals_dev);

#ifdef HAVE_TPETRA_MMM_TIMINGS
      MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix ESFC"))));
#endif

      // Final FillComplete
      //
      // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
      // Import (from domain Map to column Map) construction (which costs
      // lots of communication) by taking the previously constructed
      // Import object. We should be able to do this without interfering
      // with the implementation of the local part of sparse matrix-matrix
      // multiply above.
      RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
      labelList->set("Timer Label",label);
      if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
      RCP<const Export<LO,GO,NO> > dummyExport;
      Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
                                  Rview.origMatrix->getRangeMap(),
                                  Acimport,
                                  dummyExport,
                                  labelList);

    }

    /*********************************************************************************************************/
    // RAP Reuse Kernel wrappers (Default non-threaded version)
    // Computes R * A * P -> Ac, reusing the existing sparsity pattern of Ac (Gustavson-style)
    template<class Scalar,
             class LocalOrdinal,
             class GlobalOrdinal,
             class Node,
             class LocalOrdinalViewType>
    void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::
    mult_R_A_P_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
                                    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
                                    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
                                    const LocalOrdinalViewType & Acol2Prow_dev,
                                    const LocalOrdinalViewType & Acol2PIrow_dev,
                                    const LocalOrdinalViewType & Pcol2Accol_dev,
                                    const LocalOrdinalViewType & PIcol2Accol_dev,
                                    CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
                                    Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
                                    const std::string& label,
                                    const Teuchos::RCP<Teuchos::ParameterList>& params) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
      std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
      using Teuchos::TimeMonitor;
      Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse SerialCore"))));
#endif

      using Teuchos::Array;
      using Teuchos::ArrayRCP;
      using Teuchos::ArrayView;
      using Teuchos::RCP;
      using Teuchos::rcp;

      // Lots and lots of typedefs
      typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
      typedef typename KCRS::StaticCrsGraphType graph_t;
      typedef typename graph_t::row_map_type::const_type c_lno_view_t;
      typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
      typedef typename KCRS::values_type::non_const_type scalar_view_t;

      typedef Scalar SC;
      typedef LocalOrdinal LO;
      typedef GlobalOrdinal GO;
      typedef Node NO;
      typedef Map<LO,GO,NO> map_type;
      const size_t ST_INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
      const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
      const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();

      // Sizes
      RCP<const map_type> Accolmap = Ac.getColMap();
      size_t m = Rview.origMatrix->getLocalNumRows();
      size_t n = Accolmap->getLocalNumElements();
      size_t p_max_nnz_per_row = Pview.origMatrix->getLocalMaxNumRowEntries();

      // Routine runs on host; have to put arguments on host, too
      auto Acol2Prow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
                                                           Acol2Prow_dev);
      auto Acol2PIrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
                                                            Acol2PIrow_dev);
      auto Pcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
                                                            Pcol2Accol_dev);
      auto PIcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
                                                             PIcol2Accol_dev);

      // Grab the Kokkos::SparseCrsMatrices & inner stuff
      const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
      const KCRS & Pmat = Pview.origMatrix->getLocalMatrixHost();
      const KCRS & Rmat = Rview.origMatrix->getLocalMatrixHost();
      const KCRS & Cmat = Ac.getLocalMatrixHost();

      c_lno_view_t Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Rrowptr = Rmat.graph.row_map, Crowptr = Cmat.graph.row_map;
      const lno_nnz_view_t Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries, Rcolind = Rmat.graph.entries, Ccolind = Cmat.graph.entries;
      const scalar_view_t Avals = Amat.values, Pvals = Pmat.values, Rvals = Rmat.values;
      scalar_view_t Cvals = Cmat.values;

      c_lno_view_t Irowptr;
      lno_nnz_view_t Icolind;
      scalar_view_t Ivals;
      if(!Pview.importMatrix.is_null()) {
        auto lclP = Pview.importMatrix->getLocalMatrixHost();
        Irowptr = lclP.graph.row_map;
        Icolind = lclP.graph.entries;
        Ivals = lclP.values;
        p_max_nnz_per_row = std::max(p_max_nnz_per_row,Pview.importMatrix->getLocalMaxNumRowEntries());
      }

#ifdef HAVE_TPETRA_MMM_TIMINGS
      RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse SerialCore - Compare"))));
#endif
1234
1235 // mfh 27 Sep 2016: The ac_status array is an implementation detail
1236 // of the local sparse matrix-matrix multiply routine.
1237
1238 // The status array will contain the index into colind where this entry was last deposited.
1239 // ac_status[i] < nnz - not in the row yet
1240 // ac_status[i] >= nnz - this is the entry where you can find the data
1241 // We start with this filled with INVALID's indicating that there are no entries yet.
1242 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
1243 Array<size_t> ac_status(n, ST_INVALID);
1244
1245 // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
1246 // routine. The routine computes Ac := R * A * (P_local + P_remote).
1247 //
1248 // For column index Aik in row i of A, Acol2Prow[Aik] tells
1249 // you whether the corresponding row of P belongs to P_local
1250 // ("orig") or P_remote ("Import").
1251
1252 // Necessary until the UVM host accesses below (e.g., Crowptr) are changed.
1253 // Probably also needed in mult_R_A_P_newmatrix_kernel_wrapper, though that has not yet been demonstrated by a test failure.
1254 Kokkos::fence();
1255
1256 // For each row of R
1257 size_t OLD_ip = 0, CSR_ip = 0;
1258 for (size_t i = 0; i < m; i++) {
1259 // First fill the ac_status array with the locations where we're allowed to
1260 // generate nonzeros for this row
1261 OLD_ip = Crowptr[i];
1262 CSR_ip = Crowptr[i+1];
1263 for (size_t k = OLD_ip; k < CSR_ip; k++) {
1264 ac_status[Ccolind[k]] = k;
1265
1266 // Reset values in the row of C
1267 Cvals[k] = SC_ZERO;
1268 }
1269
1270 // mfh 27 Sep 2016: For each entry of R in the current row of R
1271 for (size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
1272 LO k = Rcolind[kk]; // local column index of current entry of R
1273 const SC Rik = Rvals[kk]; // value of current entry of R
1274 if (Rik == SC_ZERO)
1275 continue; // skip explicitly stored zero values in R
1276 // For each entry of A in the current row of A
1277 for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1278 LO l = Acolind[ll]; // local column index of current entry of A
1279 const SC Akl = Avals[ll]; // value of current entry of A
1280 if (Akl == SC_ZERO)
1281 continue; // skip explicitly stored zero values in A
1282
1283
1284 if (Acol2Prow[l] != LO_INVALID) {
1285 // mfh 27 Sep 2016: If the entry of Acol2Prow
1286 // corresponding to the current entry of A is populated, then
1287 // the corresponding row of P is in P_local (i.e., it lives on
1288 // the calling process).
1289
1290 // Local matrix
1291 size_t Pl = Teuchos::as<size_t>(Acol2Prow[l]);
1292
1293 // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1294 for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1295 LO j = Pcolind[jj];
1296 LO Cij = Pcol2Accol[j];
1297 SC Plj = Pvals[jj];
1298
1299 TEUCHOS_TEST_FOR_EXCEPTION(ac_status[Cij] < OLD_ip || ac_status[Cij] >= CSR_ip,
1300 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
1301 "(ac_status = " << ac_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1302
1303 Cvals[ac_status[Cij]] += Rik*Akl*Plj;
1304 }
1305 } else {
1306 // mfh 27 Sep 2016: If the entry of Acol2Prow
1307 // corresponding to the current entry of A is NOT populated (has
1308 // a flag "invalid" value), then the corresponding row of P is
1309 // in P_remote (i.e., it does not live on the calling process).
1310
1311 // Remote matrix
1312 size_t Il = Teuchos::as<size_t>(Acol2PIrow[l]);
1313 for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1314 LO j = Icolind[jj];
1315 LO Cij = PIcol2Accol[j];
1316 SC Plj = Ivals[jj];
1317
1318 TEUCHOS_TEST_FOR_EXCEPTION(ac_status[Cij] < OLD_ip || ac_status[Cij] >= CSR_ip,
1319 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
1320 "(ac_status = " << ac_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1321
1322 Cvals[ac_status[Cij]] += Rik*Akl*Plj;
1323 }
1324 }
1325 }
1326 }
1327 }
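    // For reference, the loop nest above is the sparse analogue of the dense
    // triple product below (a sketch only; square n-by-n operands assumed,
    // with the sparse version skipping stored zeros and absent entries):
#if 0
    for (size_t i = 0; i < n; ++i)
      for (size_t k = 0; k < n; ++k)
        for (size_t l = 0; l < n; ++l)
          for (size_t j = 0; j < n; ++j)
            Ac[i][j] += R[i][k] * A[k][l] * P[l][j];
#endif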
1328
1329#ifdef HAVE_TPETRA_MMM_TIMINGS
1330 auto MM3 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse ESFC"))));
1331#endif
1332
1333 Ac.fillComplete(Ac.getDomainMap(), Ac.getRangeMap());
1334
1335}
1336
1337
1338 /*********************************************************************************************************/
1339 // PT_A_P NewMatrix Kernel wrappers (Default, general, non-threaded version)
1340 // Computes P.T * A * P -> Ac
1341 template<class Scalar,
1342 class LocalOrdinal,
1343 class GlobalOrdinal,
1344 class Node,
1345 class LocalOrdinalViewType>
1346 void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_PT_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1347 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1348 const LocalOrdinalViewType & Acol2Prow,
1349 const LocalOrdinalViewType & Acol2PIrow,
1350 const LocalOrdinalViewType & Pcol2Accol,
1351 const LocalOrdinalViewType & PIcol2Accol,
1352 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1353 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1354 const std::string& label,
1355 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1356#ifdef HAVE_TPETRA_MMM_TIMINGS
1357 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1358 using Teuchos::TimeMonitor;
1359 Teuchos::TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose")));
1360#endif
1361
1362 // We don't need a kernel-level PTAP; we just transpose P here and call the R*A*P kernel.
1363 typedef RowMatrixTransposer<Scalar,LocalOrdinal,GlobalOrdinal, Node> transposer_type;
1364 transposer_type transposer (Pview.origMatrix,label+std::string("XP: "));
1365
1366 using Teuchos::ParameterList;
1367 using Teuchos::RCP;
1368 RCP<ParameterList> transposeParams (new ParameterList);
1369 transposeParams->set ("sort", false);
1370
1371 if (! params.is_null ()) {
1372 transposeParams->set ("compute global constants",
1373 params->get ("compute global constants: temporaries",
1374 false));
1375 }
1376 RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1377 transposer.createTransposeLocal (transposeParams);
1378 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node> Rview;
1379 Rview.origMatrix = Ptrans;
1380
1381 mult_R_A_P_newmatrix_kernel_wrapper(Rview,Aview,Pview,Acol2Prow,Acol2PIrow,Pcol2Accol,PIcol2Accol,Ac,Acimport,label,params);
1382 }
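  // A sketch of how this wrapper is typically reached from the public entry
  // point: passing P (transposed) as the "R" argument of MultiplyRAP requests
  // the Galerkin product Ac = P^T * A * P. The template arguments below are
  // placeholders, not the library's required types.
#if 0
  using crs_matrix_type = Tpetra::CrsMatrix<double, int, long long>;
  void galerkin_product(const crs_matrix_type& A,
                        const crs_matrix_type& P,
                        crs_matrix_type& Ac) {
    Tpetra::TripleMatrixMultiply::MultiplyRAP(P, true,   // R := P^T
                                              A, false,
                                              P, false,
                                              Ac);
  }
#endif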
1383
1384 /*********************************************************************************************************/
1385 // PT_A_P Reuse Kernel wrappers (Default, general, non-threaded version)
1386 // Computes P.T * A * P -> Ac
1387 template<class Scalar,
1388 class LocalOrdinal,
1389 class GlobalOrdinal,
1390 class Node,
1391 class LocalOrdinalViewType>
1392 void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_PT_A_P_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1393 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1394 const LocalOrdinalViewType & Acol2Prow,
1395 const LocalOrdinalViewType & Acol2PIrow,
1396 const LocalOrdinalViewType & Pcol2Accol,
1397 const LocalOrdinalViewType & PIcol2Accol,
1398 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1399 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1400 const std::string& label,
1401 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1402#ifdef HAVE_TPETRA_MMM_TIMINGS
1403 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1404 using Teuchos::TimeMonitor;
1405 Teuchos::TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose")));
1406#endif
1407
1408 // We don't need a kernel-level PTAP; we just transpose P here and call the R*A*P kernel.
1409 typedef RowMatrixTransposer<Scalar,LocalOrdinal,GlobalOrdinal, Node> transposer_type;
1410 transposer_type transposer (Pview.origMatrix,label+std::string("XP: "));
1411
1412 using Teuchos::ParameterList;
1413 using Teuchos::RCP;
1414 RCP<ParameterList> transposeParams (new ParameterList);
1415 transposeParams->set ("sort", false);
1416
1417 if (! params.is_null ()) {
1418 transposeParams->set ("compute global constants",
1419 params->get ("compute global constants: temporaries",
1420 false));
1421 }
1422 RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1423 transposer.createTransposeLocal (transposeParams);
1424 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node> Rview;
1425 Rview.origMatrix = Ptrans;
1426
1427 mult_R_A_P_reuse_kernel_wrapper(Rview,Aview,Pview,Acol2Prow,Acol2PIrow,Pcol2Accol,PIcol2Accol,Ac,Acimport,label,params);
1428 }
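  // A sketch of the reuse pattern this wrapper serves (assumed usage): after
  // a first MultiplyRAP call has built and fill-completed Ac, a later call
  // with the same Ac keeps its graph and only recomputes the stored values.
#if 0
  using crs_matrix_type = Tpetra::CrsMatrix<double, int, long long>;
  void reuse_example(const crs_matrix_type& A, const crs_matrix_type& P,
                     crs_matrix_type& Ac) {
    Tpetra::TripleMatrixMultiply::MultiplyRAP(P, true, A, false, P, false, Ac);
    // ... the values of A change, but its sparsity pattern does not ...
    Tpetra::TripleMatrixMultiply::MultiplyRAP(P, true, A, false, P, false, Ac);
  }
#endif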
1429
1430 /*********************************************************************************************************/
1431 // PT_A_P NewMatrix Kernel wrappers (Default non-threaded version)
1432 // Computes P.T * A * P -> Ac using a 2-pass algorithm.
1433 // This turned out to be slower on SerialNode, but it might still be helpful when going to Kokkos, so I left it in.
1434 // Currently, this implementation never gets called.
1435 template<class Scalar,
1436 class LocalOrdinal,
1437 class GlobalOrdinal,
1438 class Node>
1439 void KernelWrappers3MMM<Scalar,LocalOrdinal,GlobalOrdinal,Node>::mult_PT_A_P_newmatrix_kernel_wrapper_2pass(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1440 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1441 const Teuchos::Array<LocalOrdinal> & Acol2PRow,
1442 const Teuchos::Array<LocalOrdinal> & Acol2PRowImport,
1443 const Teuchos::Array<LocalOrdinal> & Pcol2Accol,
1444 const Teuchos::Array<LocalOrdinal> & PIcol2Accol,
1445 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1446 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1447 const std::string& label,
1448 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1449#ifdef HAVE_TPETRA_MMM_TIMINGS
1450 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1451 using Teuchos::TimeMonitor;
1452 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix SerialCore"))));
1453#endif
1454
1455 using Teuchos::Array;
1456 using Teuchos::ArrayRCP;
1457 using Teuchos::ArrayView;
1458 using Teuchos::RCP;
1459 using Teuchos::rcp;
1460
1461 typedef Scalar SC;
1462 typedef LocalOrdinal LO;
1463 typedef GlobalOrdinal GO;
1464 typedef Node NO;
1465 typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1466 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1467 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1468
1469 // number of rows on the process of the fine matrix
1470 // size_t m = Pview.origMatrix->getLocalNumRows();
1471 // number of rows on the process of the coarse matrix
1472 size_t n = Ac.getRowMap()->getLocalNumElements();
1473 LO maxAccol = Ac.getColMap()->getMaxLocalIndex();
1474
1475 // Get Data Pointers
1476 ArrayRCP<size_t> Acrowptr_RCP;
1477 ArrayRCP<LO> Accolind_RCP;
1478 ArrayRCP<SC> Acvals_RCP;
1479
1480 // mfh 27 Sep 2016: get the three CSR arrays
1481 // out of the CrsMatrix. This code computes R * A * (P_local +
1482 // P_remote), where P_local contains the locally owned rows of P,
1483 // and P_remote the (previously Import'ed) remote rows of P.
1484
1485 auto Arowptr = Aview.origMatrix->getLocalRowPtrsHost();
1486 auto Acolind = Aview.origMatrix->getLocalIndicesHost();
1487 auto Avals = Aview.origMatrix->getLocalValuesHost(
1488 Tpetra::Access::ReadOnly);
1489 auto Prowptr = Pview.origMatrix->getLocalRowPtrsHost();
1490 auto Pcolind = Pview.origMatrix->getLocalIndicesHost();
1491 auto Pvals = Pview.origMatrix->getLocalValuesHost(
1492 Tpetra::Access::ReadOnly);
1493 decltype(Prowptr) Irowptr;
1494 decltype(Pcolind) Icolind;
1495 decltype(Pvals) Ivals;
1496
1497 if (!Pview.importMatrix.is_null()) {
1498 Irowptr = Pview.importMatrix->getLocalRowPtrsHost();
1499 Icolind = Pview.importMatrix->getLocalIndicesHost();
1500 Ivals = Pview.importMatrix->getLocalValuesHost(
1501 Tpetra::Access::ReadOnly);
1502 }
1503
1504 // mfh 27 Sep 2016: Remark below "For efficiency" refers to an issue
1505 // where Teuchos::ArrayRCP::operator[] may be slower than
1506 // Teuchos::ArrayView::operator[].
1507
1508 // For efficiency
1509 ArrayView<size_t> Acrowptr;
1510 ArrayView<LO> Accolind;
1511 ArrayView<SC> Acvals;
1512
1513 ////////////////////////////////////////////////////////////////////////////////
1514 // In a first pass, determine the graph of Ac.
1515 ////////////////////////////////////////////////////////////////////////////////
1516
1517 ////////////////////////////////////////////////////////////////////////////////
1518 // Get the graph of Ac. This gets the local transpose of P,
1519 // then loops over R, A, P to get the graph of Ac.
1520 ////////////////////////////////////////////////////////////////////////////////
1521
1522#ifdef HAVE_TPETRA_MMM_TIMINGS
1523 MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose"))));
1524#endif
1525
1526 ////////////////////////////////////////////////////////////////////////////////
1527 // Get the local transpose of the graph of P by locally transposing
1528 // all of P
1529
1530 transposer_type transposer (Pview.origMatrix,label+std::string("XP: "));
1531
1532 using Teuchos::ParameterList;
1533 RCP<ParameterList> transposeParams (new ParameterList);
1534 transposeParams->set ("sort", false);
1535 if (! params.is_null ()) {
1536 transposeParams->set ("compute global constants",
1537 params->get ("compute global constants: temporaries",
1538 false));
1539 }
1540 RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1541 transposer.createTransposeLocal (transposeParams);
1542
1543 auto Rrowptr = Ptrans->getLocalRowPtrsHost();
1544 auto Rcolind = Ptrans->getLocalIndicesHost();
1545 auto Rvals = Ptrans->getLocalValuesHost(Tpetra::Access::ReadOnly);
1546
1547 ////////////////////////////////////////////////////////////////////////////////
1548 // Construct graph
1549
1550 #ifdef HAVE_TPETRA_MMM_TIMINGS
1551 MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP graph"))));
1552 #endif
1553
1554 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1555 Array<size_t> ac_status(maxAccol + 1, ST_INVALID);
1556
1557 size_t nnz_alloc = std::max(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
1558 size_t nnzPerRowA = 100;
1559 if (Aview.origMatrix->getLocalNumEntries() > 0)
1560 nnzPerRowA = Aview.origMatrix->getLocalNumEntries()/Aview.origMatrix->getLocalNumRows();
1561 Acrowptr_RCP.resize(n+1);
1562 Acrowptr = Acrowptr_RCP();
1563 Accolind_RCP.resize(nnz_alloc);
1564 Accolind = Accolind_RCP();
1565
1566 size_t nnz = 0, nnz_old = 0;
1567 for (size_t i = 0; i < n; i++) {
1568 // mfh 27 Sep 2016: n is the number of rows in the local transpose
1569 // R = P^T (and in Ac) on the calling process.
1570 Acrowptr[i] = nnz;
1571
1572 // mfh 27 Sep 2016: For each entry of R in the current row of R
1573 for (size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
1574 LO k = Rcolind[kk]; // local column index of current entry of R
1575 // For each entry of A in the current row of A
1576 for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1577 LO l = Acolind[ll]; // local column index of current entry of A
1578
1579 if (Acol2PRow[l] != LO_INVALID) {
1580 // mfh 27 Sep 2016: If the entry of Acol2PRow
1581 // corresponding to the current entry of A is populated, then
1582 // the corresponding row of P is in P_local (i.e., it lives on
1583 // the calling process).
1584
1585 // Local matrix
1586 size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
1587
1588 // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1589 for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1590 LO j = Pcolind[jj];
1591 LO Acj = Pcol2Accol[j];
1592
1593 if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old) {
1594 // New entry
1595 ac_status[Acj] = nnz;
1596 Accolind[nnz] = Acj;
1597 nnz++;
1598 }
1599 }
1600 } else {
1601 // mfh 27 Sep 2016: If the entry of Acol2PRow
1602 // corresponding to the current entry of A is NOT populated (has
1603 // a flag "invalid" value), then the corresponding row of P is
1604 // in P_remote (i.e., it does not live on the calling process).
1605
1606 // Remote matrix
1607 size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
1608 for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1609 LO j = Icolind[jj];
1610 LO Acj = PIcol2Accol[j];
1611
1612 if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old){
1613 // New entry
1614 ac_status[Acj] = nnz;
1615 Accolind[nnz] = Acj;
1616 nnz++;
1617 }
1618 }
1619 }
1620 }
1621 }
1622 // Resize for next pass if needed
1623 // cag: Maybe we can do something more subtle here, and not double
1624 // the size right away.
1625 if (nnz + std::max(5*nnzPerRowA, n) > nnz_alloc) {
1626 nnz_alloc *= 2;
1627 nnz_alloc = std::max(nnz_alloc, nnz + std::max(5*nnzPerRowA, n));
1628 Accolind_RCP.resize(nnz_alloc); Accolind = Accolind_RCP();
1629 Acvals_RCP.resize(nnz_alloc); Acvals = Acvals_RCP();
1630 }
1631 nnz_old = nnz;
1632 }
1633 Acrowptr[n] = nnz;
1634
1635 // Downward resize
1636 Accolind_RCP.resize(nnz);
1637 Accolind = Accolind_RCP();
1638
1639 // Allocate Acvals
1640 Acvals_RCP.resize(nnz, SC_ZERO);
1641 Acvals = Acvals_RCP();
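    // A sketch of the growth policy used in the pass above (hypothetical
    // helper): when the next row could overflow the allocation, at least
    // double it, which keeps the total copying amortized-linear in the final
    // nnz. (Would require <algorithm> and <vector> at file scope.)
#if 0
    size_t ensure_capacity(size_t nnz, size_t row_budget, size_t nnz_alloc,
                           std::vector<int>& colind) {
      if (nnz + row_budget > nnz_alloc) {
        nnz_alloc = std::max(2 * nnz_alloc, nnz + row_budget);
        colind.resize(nnz_alloc);  // the values array grows the same way
      }
      return nnz_alloc;
    }
#endif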
1642
1643
1644 ////////////////////////////////////////////////////////////////////////////////
1645 // In a second pass, enter the values into Acvals.
1646 ////////////////////////////////////////////////////////////////////////////////
1647
1648 #ifdef HAVE_TPETRA_MMM_TIMINGS
1649 MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix Fill Matrix"))));
1650 #endif
1651
1652
1653 for (size_t k = 0; k < n; k++) {
1654 for (size_t ii = Prowptr[k]; ii < Prowptr[k+1]; ii++) {
1655 LO i = Pcolind[ii];
1656 const SC Pki = Pvals[ii];
1657 for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1658 LO l = Acolind[ll];
1659 const SC Akl = Avals[ll];
1660 if (Akl == SC_ZERO)
1661 continue; // skip explicitly stored zero values in A
1662 if (Acol2PRow[l] != LO_INVALID) {
1663 // mfh 27 Sep 2016: If the entry of Acol2PRow
1664 // corresponding to the current entry of A is populated, then
1665 // the corresponding row of P is in P_local (i.e., it lives on
1666 // the calling process).
1667
1668 // Local matrix
1669 size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
1670 for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1671 LO j = Pcolind[jj];
1672 LO Acj = Pcol2Accol[j];
1673 size_t pp;
1674 for (pp = Acrowptr[i]; pp < Acrowptr[i+1]; pp++)
1675 if (Accolind[pp] == Acj)
1676 break;
1677 // TEUCHOS_TEST_FOR_EXCEPTION(Accolind[pp] != Acj,
1678 // std::runtime_error, "problem with Ac column indices");
1679 Acvals[pp] += Pki*Akl*Pvals[jj];
1680 }
1681 } else {
1682 // mfh 27 Sep 2016: If the entry of Acol2PRow
1683 // corresponding to the current entry of A is NOT populated (has
1684 // a flag "invalid" value), then the corresponding row of P is
1685 // in P_remote (i.e., it does not live on the calling process).
1686
1687 // Remote matrix
1688 size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
1689 for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1690 LO j = Icolind[jj];
1691 LO Acj = PIcol2Accol[j];
1692 size_t pp;
1693 for (pp = Acrowptr[i]; pp < Acrowptr[i+1]; pp++)
1694 if (Accolind[pp] == Acj)
1695 break;
1696 // TEUCHOS_TEST_FOR_EXCEPTION(Accolind[pp] != Acj,
1697 // std::runtime_error, "problem with Ac column indices");
1698 Acvals[pp] += Pki*Akl*Ivals[jj];
1699 }
1700 }
1701 }
1702 }
1703 }
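    // The value pass above locates each target slot by a linear scan of row i
    // of Ac; as a stand-alone helper (names illustrative) the lookup is:
#if 0
    size_t find_slot(const size_t* rowptr, const int* colind,
                     size_t row, int col) {
      size_t pp = rowptr[row];
      while (pp < rowptr[row + 1] && colind[pp] != col)
        ++pp;
      return pp;  // == rowptr[row + 1] if the column is absent from the row
    }
#endif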
1704
1705
1706#ifdef HAVE_TPETRA_MMM_TIMINGS
1707 MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP sort"))));
1708#endif
1709
1710 // Final sort & set of CRS arrays
1711 //
1712 // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
1713 // matrix-matrix multiply routine sort the entries for us?
1714 Import_Util::sortCrsEntries(Acrowptr_RCP(), Accolind_RCP(), Acvals_RCP());
1715
1716 // mfh 27 Sep 2016: This just sets pointers.
1717 Ac.setAllValues(Acrowptr_RCP, Accolind_RCP, Acvals_RCP);
1718
1719#ifdef HAVE_TPETRA_MMM_TIMINGS
1720 MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix ESFC"))));
1721#endif
1722
1723 // Final FillComplete
1724 //
1725 // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
1726 // Import (from domain Map to column Map) construction (which costs
1727 // lots of communication) by taking the previously constructed
1728 // Import object. We should be able to do this without interfering
1729 // with the implementation of the local part of sparse matrix-matrix
1730 // multiply above.
1731 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
1732 labelList->set("Timer Label",label);
1733 // labelList->set("Sort column Map ghost GIDs")
1734 if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
1735 RCP<const Export<LO,GO,NO> > dummyExport;
1736 Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1737 Pview.origMatrix->getDomainMap(),
1738 Acimport,
1739 dummyExport, labelList);
1740 }
1741
1742
1743
1744 } // End namespace MMdetails
1745
1746} //End namespace Tpetra
1747//
1748// Explicit instantiation macro
1749//
1750// Must be expanded from within the Tpetra namespace!
1751//
1752
1753#define TPETRA_TRIPLEMATRIXMULTIPLY_INSTANT(SCALAR,LO,GO,NODE) \
1754 \
1755 template \
1756 void TripleMatrixMultiply::MultiplyRAP( \
1757 const CrsMatrix< SCALAR , LO , GO , NODE >& R, \
1758 bool transposeR, \
1759 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
1760 bool transposeA, \
1761 const CrsMatrix< SCALAR , LO , GO , NODE >& P, \
1762 bool transposeP, \
1763 CrsMatrix< SCALAR , LO , GO , NODE >& Ac, \
1764 bool call_FillComplete_on_result, \
1765 const std::string & label, \
1766 const Teuchos::RCP<Teuchos::ParameterList>& params); \
1767 \
1768
1769
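// A sketch of the intended use of the macro above (the node type below is a
// placeholder; real expansions are generated by Tpetra's explicit template
// instantiation system and, as noted above, must occur inside namespace
// Tpetra):
#if 0
namespace Tpetra {
  TPETRA_TRIPLEMATRIXMULTIPLY_INSTANT(double, int, long long,
                                      Tpetra::CrsMatrix<>::node_type)
}
#endif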
1770#endif // TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP