42#ifndef TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
43#define TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
45#ifdef HAVE_TPETRA_INST_CUDA
54 class LocalOrdinalViewType>
55struct KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType> {
56 static inline void mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
57 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
58 const LocalOrdinalViewType & Acol2Brow,
59 const LocalOrdinalViewType & Acol2Irow,
60 const LocalOrdinalViewType & Bcol2Ccol,
61 const LocalOrdinalViewType & Icol2Ccol,
62 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
63 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
64 const std::string& label = std::string(),
65 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
69 static inline void mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
70 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
71 const LocalOrdinalViewType & Acol2Brow,
72 const LocalOrdinalViewType & Acol2Irow,
73 const LocalOrdinalViewType & Bcol2Ccol,
74 const LocalOrdinalViewType & Icol2Ccol,
75 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
76 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
77 const std::string& label = std::string(),
78 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
85 class GlobalOrdinal,
class LocalOrdinalViewType>
86struct KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType> {
87 static inline void jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
88 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
89 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
90 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
91 const LocalOrdinalViewType & Acol2Brow,
92 const LocalOrdinalViewType & Acol2Irow,
93 const LocalOrdinalViewType & Bcol2Ccol,
94 const LocalOrdinalViewType & Icol2Ccol,
95 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
96 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
97 const std::string& label = std::string(),
98 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
100 static inline void jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
101 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
102 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
103 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
104 const LocalOrdinalViewType & Acol2Brow,
105 const LocalOrdinalViewType & Acol2Irow,
106 const LocalOrdinalViewType & Bcol2Ccol,
107 const LocalOrdinalViewType & Icol2Ccol,
108 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
109 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
110 const std::string& label = std::string(),
111 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
113 static inline void jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
114 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
115 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
116 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
117 const LocalOrdinalViewType & Acol2Brow,
118 const LocalOrdinalViewType & Acol2Irow,
119 const LocalOrdinalViewType & Bcol2Ccol,
120 const LocalOrdinalViewType & Icol2Ccol,
121 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
122 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
123 const std::string& label = std::string(),
124 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
130template<
class Scalar,
133 class LocalOrdinalViewType>
134void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
135 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
136 const LocalOrdinalViewType & Acol2Brow,
137 const LocalOrdinalViewType & Acol2Irow,
138 const LocalOrdinalViewType & Bcol2Ccol,
139 const LocalOrdinalViewType & Icol2Ccol,
140 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
141 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
142 const std::string& label,
143 const Teuchos::RCP<Teuchos::ParameterList>& params) {
146#ifdef HAVE_TPETRA_MMM_TIMINGS
147 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
148 using Teuchos::TimeMonitor;
149 Teuchos::RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix CudaWrapper")))));
152 typedef Kokkos::Compat::KokkosCudaWrapperNode Node;
153 std::string nodename(
"Cuda");
158 typedef typename KCRS::device_type device_t;
159 typedef typename KCRS::StaticCrsGraphType graph_t;
160 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
161 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
162 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
163 typedef typename KCRS::values_type::non_const_type scalar_view_t;
167 int team_work_size = 16;
168 std::string myalg(
"SPGEMM_KK_MEMORY");
169 if(!params.is_null()) {
170 if(params->isParameter(
"cuda: algorithm"))
171 myalg = params->get(
"cuda: algorithm",myalg);
172 if(params->isParameter(
"cuda: team work size"))
173 team_work_size = params->get(
"cuda: team work size",team_work_size);
177 typedef KokkosKernels::Experimental::KokkosKernelsHandle<
178 typename lno_view_t::const_value_type,
typename lno_nnz_view_t::const_value_type,
typename scalar_view_t::const_value_type,
179 typename device_t::execution_space,
typename device_t::memory_space,
typename device_t::memory_space > KernelHandle;
182 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
183 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
185 c_lno_view_t Arowptr = Amat.graph.row_map,
186 Browptr = Bmat.graph.row_map;
187 const lno_nnz_view_t Acolind = Amat.graph.entries,
188 Bcolind = Bmat.graph.entries;
189 const scalar_view_t Avals = Amat.values,
192 c_lno_view_t Irowptr;
193 lno_nnz_view_t Icolind;
195 if(!Bview.importMatrix.is_null()) {
196 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
197 Irowptr = lclB.graph.row_map;
198 Icolind = lclB.graph.entries;
204 std::string alg = nodename+std::string(
" algorithm");
206 if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
207 KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
210 const KCRS Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getLocalNumElements());
212#ifdef HAVE_TPETRA_MMM_TIMINGS
213 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix CudaCore"))));
217 typename KernelHandle::nnz_lno_t AnumRows = Amat.numRows();
218 typename KernelHandle::nnz_lno_t BnumRows = Bmerged.numRows();
219 typename KernelHandle::nnz_lno_t BnumCols = Bmerged.numCols();
221 lno_view_t row_mapC (Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), AnumRows + 1);
222 lno_nnz_view_t entriesC;
223 scalar_view_t valuesC;
225 kh.create_spgemm_handle(alg_enum);
226 kh.set_team_work_size(team_work_size);
228 KokkosSparse::Experimental::spgemm_symbolic(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,
false,Bmerged.graph.row_map,Bmerged.graph.entries,
false,row_mapC);
230 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
232 entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing(
"entriesC"), c_nnz_size);
233 valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing(
"valuesC"), c_nnz_size);
235 KokkosSparse::Experimental::spgemm_numeric(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,Amat.values,
false,Bmerged.graph.row_map,Bmerged.graph.entries,Bmerged.values,
false,row_mapC,entriesC,valuesC);
236 kh.destroy_spgemm_handle();
238#ifdef HAVE_TPETRA_MMM_TIMINGS
239 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix CudaSort"))));
243 if (params.is_null() || params->get(
"sort entries",
true))
244 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
245 C.setAllValues(row_mapC,entriesC,valuesC);
247#ifdef HAVE_TPETRA_MMM_TIMINGS
248 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix CudaESFC"))));
252 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
253 labelList->set(
"Timer Label",label);
254 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
255 RCP<const Export<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > dummyExport;
256 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
261template<
class Scalar,
264 class LocalOrdinalViewType>
265void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(
266 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
267 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
268 const LocalOrdinalViewType & targetMapToOrigRow_dev,
269 const LocalOrdinalViewType & targetMapToImportRow_dev,
270 const LocalOrdinalViewType & Bcol2Ccol_dev,
271 const LocalOrdinalViewType & Icol2Ccol_dev,
272 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
273 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
274 const std::string& label,
275 const Teuchos::RCP<Teuchos::ParameterList>& params) {
278 typedef Kokkos::Compat::KokkosCudaWrapperNode Node;
280#ifdef HAVE_TPETRA_MMM_TIMINGS
281 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
282 using Teuchos::TimeMonitor;
283 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse SerialCore"))));
284 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
291 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
292 typedef typename KCRS::StaticCrsGraphType graph_t;
293 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
294 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
295 typedef typename KCRS::values_type::non_const_type scalar_view_t;
298 typedef LocalOrdinal LO;
299 typedef GlobalOrdinal GO;
301 typedef Map<LO,GO,NO> map_type;
302 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
303 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
304 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
312 auto targetMapToOrigRow =
313 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
314 targetMapToOrigRow_dev);
315 auto targetMapToImportRow =
316 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
317 targetMapToImportRow_dev);
319 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
322 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
326 RCP<const map_type> Ccolmap = C.getColMap();
327 size_t m = Aview.origMatrix->getLocalNumRows();
328 size_t n = Ccolmap->getLocalNumElements();
331 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
332 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
333 const KCRS & Cmat = C.getLocalMatrixHost();
335 c_lno_view_t Arowptr = Amat.graph.row_map,
336 Browptr = Bmat.graph.row_map,
337 Crowptr = Cmat.graph.row_map;
338 const lno_nnz_view_t Acolind = Amat.graph.entries,
339 Bcolind = Bmat.graph.entries,
340 Ccolind = Cmat.graph.entries;
341 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
342 scalar_view_t Cvals = Cmat.values;
344 c_lno_view_t Irowptr;
345 lno_nnz_view_t Icolind;
347 if(!Bview.importMatrix.is_null()) {
348 auto lclB = Bview.importMatrix->getLocalMatrixHost();
349 Irowptr = lclB.graph.row_map;
350 Icolind = lclB.graph.entries;
354#ifdef HAVE_TPETRA_MMM_TIMINGS
355 MM2 = Teuchos::null; MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix SerialCore - Compare"))));
367 std::vector<size_t> c_status(n, ST_INVALID);
370 size_t CSR_ip = 0, OLD_ip = 0;
371 for (
size_t i = 0; i < m; i++) {
375 CSR_ip = Crowptr[i+1];
376 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
377 c_status[Ccolind[k]] = k;
383 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
385 const SC Aval = Avals[k];
389 if (targetMapToOrigRow[Aik] != LO_INVALID) {
391 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
393 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
395 LO Cij = Bcol2Ccol[Bkj];
397 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
398 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
399 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
401 Cvals[c_status[Cij]] += Aval * Bvals[j];
406 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
407 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
409 LO Cij = Icol2Ccol[Ikj];
411 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
412 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
413 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
415 Cvals[c_status[Cij]] += Aval * Ivals[j];
421 C.fillComplete(C.getDomainMap(), C.getRangeMap());
425template<
class Scalar,
428 class LocalOrdinalViewType>
429void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
430 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
431 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
432 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
433 const LocalOrdinalViewType & Acol2Brow,
434 const LocalOrdinalViewType & Acol2Irow,
435 const LocalOrdinalViewType & Bcol2Ccol,
436 const LocalOrdinalViewType & Icol2Ccol,
437 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
438 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
439 const std::string& label,
440 const Teuchos::RCP<Teuchos::ParameterList>& params) {
442#ifdef HAVE_TPETRA_MMM_TIMINGS
443 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
444 using Teuchos::TimeMonitor;
445 Teuchos::RCP<TimeMonitor> MM;
453 std::string myalg(
"KK");
454 if(!params.is_null()) {
455 if(params->isParameter(
"cuda: jacobi algorithm"))
456 myalg = params->get(
"cuda: jacobi algorithm",myalg);
459 if(myalg ==
"MSAK") {
460 ::Tpetra::MatrixMatrix::ExtraKernels::jacobi_A_B_newmatrix_MultiplyScaleAddKernel(omega,Dinv,Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
462 else if(myalg ==
"KK") {
463 jacobi_A_B_newmatrix_KokkosKernels(omega,Dinv,Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
466 throw std::runtime_error(
"Tpetra::MatrixMatrix::Jacobi newmatrix unknown kernel");
469#ifdef HAVE_TPETRA_MMM_TIMINGS
470 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix CudaESFC"))));
474 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
475 labelList->set(
"Timer Label",label);
476 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
479 if(!C.isFillComplete()) {
480 RCP<const Export<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > dummyExport;
481 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
489template<
class Scalar,
492 class LocalOrdinalViewType>
493void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
494 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
495 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
496 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
497 const LocalOrdinalViewType & targetMapToOrigRow_dev,
498 const LocalOrdinalViewType & targetMapToImportRow_dev,
499 const LocalOrdinalViewType & Bcol2Ccol_dev,
500 const LocalOrdinalViewType & Icol2Ccol_dev,
501 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
502 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
503 const std::string& label,
504 const Teuchos::RCP<Teuchos::ParameterList>& params) {
507 typedef Kokkos::Compat::KokkosCudaWrapperNode Node;
509#ifdef HAVE_TPETRA_MMM_TIMINGS
510 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
511 using Teuchos::TimeMonitor;
512 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse CudaCore"))));
513 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
519 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
520 typedef typename KCRS::StaticCrsGraphType graph_t;
521 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
522 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
523 typedef typename KCRS::values_type::non_const_type scalar_view_t;
524 typedef typename scalar_view_t::memory_space scalar_memory_space;
527 typedef LocalOrdinal LO;
528 typedef GlobalOrdinal GO;
530 typedef Map<LO,GO,NO> map_type;
531 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
532 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
533 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
541 auto targetMapToOrigRow =
542 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
543 targetMapToOrigRow_dev);
544 auto targetMapToImportRow =
545 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
546 targetMapToImportRow_dev);
548 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
551 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
556 RCP<const map_type> Ccolmap = C.getColMap();
557 size_t m = Aview.origMatrix->getLocalNumRows();
558 size_t n = Ccolmap->getLocalNumElements();
561 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
562 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
563 const KCRS & Cmat = C.getLocalMatrixHost();
565 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
566 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
567 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
568 scalar_view_t Cvals = Cmat.values;
570 c_lno_view_t Irowptr;
571 lno_nnz_view_t Icolind;
573 if(!Bview.importMatrix.is_null()) {
574 auto lclB = Bview.importMatrix->getLocalMatrixHost();
575 Irowptr = lclB.graph.row_map;
576 Icolind = lclB.graph.entries;
582 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
584#ifdef HAVE_TPETRA_MMM_TIMINGS
585 MM2 = Teuchos::null; MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse CudaCore - Compare"))));
593 std::vector<size_t> c_status(n, ST_INVALID);
596 size_t CSR_ip = 0, OLD_ip = 0;
597 for (
size_t i = 0; i < m; i++) {
602 CSR_ip = Crowptr[i+1];
603 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
604 c_status[Ccolind[k]] = k;
610 SC minusOmegaDval = -omega*Dvals(i,0);
613 for (
size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
614 Scalar Bval = Bvals[j];
618 LO Cij = Bcol2Ccol[Bij];
620 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
621 std::runtime_error,
"Trying to insert a new entry into a static graph");
623 Cvals[c_status[Cij]] = Bvals[j];
627 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
629 const SC Aval = Avals[k];
633 if (targetMapToOrigRow[Aik] != LO_INVALID) {
635 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
637 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
639 LO Cij = Bcol2Ccol[Bkj];
641 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
642 std::runtime_error,
"Trying to insert a new entry into a static graph");
644 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
649 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
650 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
652 LO Cij = Icol2Ccol[Ikj];
654 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
655 std::runtime_error,
"Trying to insert a new entry into a static graph");
657 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
663#ifdef HAVE_TPETRA_MMM_TIMINGS
665 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse ESFC"))));
668 C.fillComplete(C.getDomainMap(), C.getRangeMap());
673template<
class Scalar,
676 class LocalOrdinalViewType>
677void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
678 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
679 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
680 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
681 const LocalOrdinalViewType & Acol2Brow,
682 const LocalOrdinalViewType & Acol2Irow,
683 const LocalOrdinalViewType & Bcol2Ccol,
684 const LocalOrdinalViewType & Icol2Ccol,
685 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
686 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
687 const std::string& label,
688 const Teuchos::RCP<Teuchos::ParameterList>& params) {
690#ifdef HAVE_TPETRA_MMM_TIMINGS
691 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
692 using Teuchos::TimeMonitor;
693 Teuchos::RCP<TimeMonitor> MM;
700 auto rowMap = Aview.origMatrix->getRowMap();
702 Aview.origMatrix->getLocalDiagCopy(diags);
703 size_t diagLength = rowMap->getLocalNumElements();
704 Teuchos::Array<Scalar> diagonal(diagLength);
705 diags.get1dCopy(diagonal());
707 for(
size_t i = 0; i < diagLength; ++i) {
708 TEUCHOS_TEST_FOR_EXCEPTION(diagonal[i] == Teuchos::ScalarTraits<Scalar>::zero(),
710 "Matrix A has a zero/missing diagonal: " << diagonal[i] << std::endl <<
711 "KokkosKernels Jacobi-fused SpGEMM requires nonzero diagonal entries in A" << std::endl);
716 using device_t =
typename Kokkos::Compat::KokkosCudaWrapperNode::device_type;
718 using graph_t =
typename matrix_t::StaticCrsGraphType;
719 using lno_view_t =
typename graph_t::row_map_type::non_const_type;
720 using c_lno_view_t =
typename graph_t::row_map_type::const_type;
721 using lno_nnz_view_t =
typename graph_t::entries_type::non_const_type;
722 using scalar_view_t =
typename matrix_t::values_type::non_const_type;
725 using handle_t =
typename KokkosKernels::Experimental::KokkosKernelsHandle<
726 typename lno_view_t::const_value_type,
typename lno_nnz_view_t::const_value_type,
typename scalar_view_t::const_value_type,
727 typename device_t::execution_space,
typename device_t::memory_space,
typename device_t::memory_space >;
730 c_lno_view_t Irowptr;
731 lno_nnz_view_t Icolind;
733 if(!Bview.importMatrix.is_null()) {
734 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
735 Irowptr = lclB.graph.row_map;
736 Icolind = lclB.graph.entries;
741 const matrix_t Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getLocalNumElements());
744 const matrix_t & Amat = Aview.origMatrix->getLocalMatrixDevice();
745 const matrix_t & Bmat = Bview.origMatrix->getLocalMatrixDevice();
747 typename handle_t::nnz_lno_t AnumRows = Amat.numRows();
748 typename handle_t::nnz_lno_t BnumRows = Bmerged.numRows();
749 typename handle_t::nnz_lno_t BnumCols = Bmerged.numCols();
751 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmerged.graph.row_map;
752 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmerged.graph.entries;
753 const scalar_view_t Avals = Amat.values, Bvals = Bmerged.values;
756 lno_view_t row_mapC (Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), AnumRows + 1);
757 lno_nnz_view_t entriesC;
758 scalar_view_t valuesC;
761 int team_work_size = 16;
762 std::string myalg(
"SPGEMM_KK_MEMORY");
763 if(!params.is_null()) {
764 if(params->isParameter(
"cuda: algorithm"))
765 myalg = params->get(
"cuda: algorithm",myalg);
766 if(params->isParameter(
"cuda: team work size"))
767 team_work_size = params->get(
"cuda: team work size",team_work_size);
771 std::string nodename(
"Cuda");
772 std::string alg = nodename + std::string(
" algorithm");
773 if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
774 KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
779 kh.create_spgemm_handle(alg_enum);
780 kh.set_team_work_size(team_work_size);
782 KokkosSparse::Experimental::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols,
783 Arowptr, Acolind,
false,
784 Browptr, Bcolind,
false,
787 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
789 entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing(
"entriesC"), c_nnz_size);
790 valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing(
"valuesC"), c_nnz_size);
793 KokkosSparse::Experimental::spgemm_jacobi(&kh, AnumRows, BnumRows, BnumCols,
794 Arowptr, Acolind, Avals,
false,
795 Browptr, Bcolind, Bvals,
false,
796 row_mapC, entriesC, valuesC,
797 omega, Dinv.getLocalViewDevice(Access::ReadOnly));
798 kh.destroy_spgemm_handle();
800#ifdef HAVE_TPETRA_MMM_TIMINGS
801 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix CudaSort"))));
805 if (params.is_null() || params->get(
"sort entries",
true))
806 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
807 C.setAllValues(row_mapC,entriesC,valuesC);
809#ifdef HAVE_TPETRA_MMM_TIMINGS
810 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix CudaESFC"))));
814 Teuchos::RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
815 labelList->set(
"Timer Label",label);
816 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
817 Teuchos::RCP<const Export<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > dummyExport;
818 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
Struct that holds views of the contents of a CrsMatrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
static bool debug()
Whether Tpetra is in debug mode.
Namespace Tpetra contains the class and methods constituting the Tpetra library.