Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
TpetraExt_MatrixMatrix_HIP.hpp
1// @HEADER
2// ***********************************************************************
3//
4// Tpetra: Templated Linear Algebra Services Package
5// Copyright (2008) Sandia Corporation
6//
7// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8// the U.S. Government retains certain rights in this software.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38//
39// ************************************************************************
40// @HEADER
41
42#ifndef TPETRA_MATRIXMATRIX_HIP_DEF_HPP
43#define TPETRA_MATRIXMATRIX_HIP_DEF_HPP
44
45#ifdef HAVE_TPETRA_INST_HIP
46namespace Tpetra {
47namespace MMdetails {
48
49/*********************************************************************************************************/
50// MMM KernelWrappers for Partial Specialization to HIP
51template<class Scalar,
52 class LocalOrdinal,
53 class GlobalOrdinal,
54 class LocalOrdinalViewType>
55struct KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode,LocalOrdinalViewType> {
56 static inline void mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Aview,
57 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Bview,
58 const LocalOrdinalViewType & Acol2Brow,
59 const LocalOrdinalViewType & Acol2Irow,
60 const LocalOrdinalViewType & Bcol2Ccol,
61 const LocalOrdinalViewType & Icol2Ccol,
62 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& C,
63 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> > Cimport,
64 const std::string& label = std::string(),
65 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
66
67
68
69 static inline void mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Aview,
70 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Bview,
71 const LocalOrdinalViewType & Acol2Brow,
72 const LocalOrdinalViewType & Acol2Irow,
73 const LocalOrdinalViewType & Bcol2Ccol,
74 const LocalOrdinalViewType & Icol2Ccol,
75 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& C,
76 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> > Cimport,
77 const std::string& label = std::string(),
78 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
79
80};
81
82// Jacobi KernelWrappers for Partial Specialization to HIP
83template<class Scalar,
84 class LocalOrdinal,
85 class GlobalOrdinal, class LocalOrdinalViewType>
86struct KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode,LocalOrdinalViewType> {
87 static inline void jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
88 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> & Dinv,
89 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Aview,
90 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Bview,
91 const LocalOrdinalViewType & Acol2Brow,
92 const LocalOrdinalViewType & Acol2Irow,
93 const LocalOrdinalViewType & Bcol2Ccol,
94 const LocalOrdinalViewType & Icol2Ccol,
95 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& C,
96 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> > Cimport,
97 const std::string& label = std::string(),
98 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
99
100 static inline void jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
101 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> & Dinv,
102 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Aview,
103 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Bview,
104 const LocalOrdinalViewType & Acol2Brow,
105 const LocalOrdinalViewType & Acol2Irow,
106 const LocalOrdinalViewType & Bcol2Ccol,
107 const LocalOrdinalViewType & Icol2Ccol,
108 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& C,
109 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> > Cimport,
110 const std::string& label = std::string(),
111 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
112
113 static inline void jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
114 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> & Dinv,
115 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Aview,
116 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Bview,
117 const LocalOrdinalViewType & Acol2Brow,
118 const LocalOrdinalViewType & Acol2Irow,
119 const LocalOrdinalViewType & Bcol2Ccol,
120 const LocalOrdinalViewType & Icol2Ccol,
121 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& C,
122 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> > Cimport,
123 const std::string& label = std::string(),
124 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
125};
126
127
128/*********************************************************************************************************/
129// AB NewMatrix Kernel wrappers (KokkosKernels/HIP Version)
130template<class Scalar,
131 class LocalOrdinal,
132 class GlobalOrdinal,
133 class LocalOrdinalViewType>
134void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Aview,
135 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Bview,
136 const LocalOrdinalViewType & Acol2Brow,
137 const LocalOrdinalViewType & Acol2Irow,
138 const LocalOrdinalViewType & Bcol2Ccol,
139 const LocalOrdinalViewType & Icol2Ccol,
140 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& C,
141 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> > Cimport,
142 const std::string& label,
143 const Teuchos::RCP<Teuchos::ParameterList>& params) {
144
145
146#ifdef HAVE_TPETRA_MMM_TIMINGS
147 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
148 using Teuchos::TimeMonitor;
149 Teuchos::RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix HIPWrapper")))));
150#endif
151 // Node-specific code
152 typedef Kokkos::Compat::KokkosHIPWrapperNode Node;
153 std::string nodename("HIP");
154
155 // Lots and lots of typedefs
156 using Teuchos::RCP;
158 typedef typename KCRS::device_type device_t;
159 typedef typename KCRS::StaticCrsGraphType graph_t;
160 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
161 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
162 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
163 typedef typename KCRS::values_type::non_const_type scalar_view_t;
164 //typedef typename graph_t::row_map_type::const_type lno_view_t_const;
165
166 // Options
167 int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer
168 std::string myalg("SPGEMM_KK_MEMORY");
169 if(!params.is_null()) {
170 if(params->isParameter("hip: algorithm"))
171 myalg = params->get("hip: algorithm",myalg);
172 if(params->isParameter("hip: team work size"))
173 team_work_size = params->get("hip: team work size",team_work_size);
174 }
175
176 // KokkosKernelsHandle
177 typedef KokkosKernels::Experimental::KokkosKernelsHandle<
178 typename lno_view_t::const_value_type,typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
179 typename device_t::execution_space, typename device_t::memory_space,typename device_t::memory_space > KernelHandle;
180
181 // Grab the Kokkos::SparseCrsMatrices
182 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
183 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
184
185 c_lno_view_t Arowptr = Amat.graph.row_map,
186 Browptr = Bmat.graph.row_map;
187 const lno_nnz_view_t Acolind = Amat.graph.entries,
188 Bcolind = Bmat.graph.entries;
189 const scalar_view_t Avals = Amat.values,
190 Bvals = Bmat.values;
191
192 c_lno_view_t Irowptr;
193 lno_nnz_view_t Icolind;
194 scalar_view_t Ivals;
195 if(!Bview.importMatrix.is_null()) {
196 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
197 Irowptr = lclB.graph.row_map;
198 Icolind = lclB.graph.entries;
199 Ivals = lclB.values;
200 }
201
202
203 // Get the algorithm mode
204 std::string alg = nodename+std::string(" algorithm");
205 // printf("DEBUG: Using kernel: %s\n",myalg.c_str());
206 if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
207 KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
208
209 // Merge the B and Bimport matrices
210 const KCRS Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getLocalNumElements());
211
212#ifdef HAVE_TPETRA_MMM_TIMINGS
213 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix HIPCore"))));
214#endif
215
216 // Do the multiply on whatever we've got
217 typename KernelHandle::nnz_lno_t AnumRows = Amat.numRows();
218 typename KernelHandle::nnz_lno_t BnumRows = Bmerged.numRows();
219 typename KernelHandle::nnz_lno_t BnumCols = Bmerged.numCols();
220
221 lno_view_t row_mapC (Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), AnumRows + 1);
222 lno_nnz_view_t entriesC;
223 scalar_view_t valuesC;
224 KernelHandle kh;
225 kh.create_spgemm_handle(alg_enum);
226 kh.set_team_work_size(team_work_size);
227
228 KokkosSparse::Experimental::spgemm_symbolic(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,false,Bmerged.graph.row_map,Bmerged.graph.entries,false,row_mapC);
229
230 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
231 if (c_nnz_size){
232 entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
233 valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
234 }
235 KokkosSparse::Experimental::spgemm_numeric(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,Amat.values,false,Bmerged.graph.row_map,Bmerged.graph.entries,Bmerged.values,false,row_mapC,entriesC,valuesC);
236 kh.destroy_spgemm_handle();
237
238#ifdef HAVE_TPETRA_MMM_TIMINGS
239 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix HIPSort"))));
240#endif
241
242 // Sort & set values
243 if (params.is_null() || params->get("sort entries",true))
244 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
245 C.setAllValues(row_mapC,entriesC,valuesC);
246
247#ifdef HAVE_TPETRA_MMM_TIMINGS
248 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix HIPESFC"))));
249#endif
250
251 // Final Fillcomplete
252 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
253 labelList->set("Timer Label",label);
254 if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
255 RCP<const Export<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> > dummyExport;
256 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
257}
258
259
260/*********************************************************************************************************/
261template<class Scalar,
262 class LocalOrdinal,
263 class GlobalOrdinal,
264 class LocalOrdinalViewType>
265void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Aview,
266 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Bview,
267 const LocalOrdinalViewType & targetMapToOrigRow_dev,
268 const LocalOrdinalViewType & targetMapToImportRow_dev,
269 const LocalOrdinalViewType & Bcol2Ccol_dev,
270 const LocalOrdinalViewType & Icol2Ccol_dev,
271 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& C,
272 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> > Cimport,
273 const std::string& label,
274 const Teuchos::RCP<Teuchos::ParameterList>& params) {
275
276 // FIXME: Right now, this is a cut-and-paste of the serial kernel
277 typedef Kokkos::Compat::KokkosHIPWrapperNode Node;
278
279#ifdef HAVE_TPETRA_MMM_TIMINGS
280 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
281 using Teuchos::TimeMonitor;
282 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse SerialCore"))));
283 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
284#endif
285 using Teuchos::RCP;
286 using Teuchos::rcp;
287
288
289 // Lots and lots of typedefs
290 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
291 typedef typename KCRS::StaticCrsGraphType graph_t;
292 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
293 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
294 typedef typename KCRS::values_type::non_const_type scalar_view_t;
295
296 typedef Scalar SC;
297 typedef LocalOrdinal LO;
298 typedef GlobalOrdinal GO;
299 typedef Node NO;
300 typedef Map<LO,GO,NO> map_type;
301 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
302 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
303 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
304
305 // KDDKDD UVM Without UVM, need to copy targetMap arrays to host.
306 // KDDKDD UVM Ideally, this function would run on device and use
307 // KDDKDD UVM KokkosKernels instead of this host implementation.
308 auto targetMapToOrigRow =
309 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
310 targetMapToOrigRow_dev);
311 auto targetMapToImportRow =
312 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
313 targetMapToImportRow_dev);
314 auto Bcol2Ccol =
315 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
316 Bcol2Ccol_dev);
317 auto Icol2Ccol =
318 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
319 Icol2Ccol_dev);
320
321 // Sizes
322 RCP<const map_type> Ccolmap = C.getColMap();
323 size_t m = Aview.origMatrix->getLocalNumRows();
324 size_t n = Ccolmap->getLocalNumElements();
325
326 // Grab the Kokkos::SparseCrsMatrices & inner stuff
327 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
328 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
329 const KCRS & Cmat = C.getLocalMatrixHost();
330
331 c_lno_view_t Arowptr = Amat.graph.row_map,
332 Browptr = Bmat.graph.row_map,
333 Crowptr = Cmat.graph.row_map;
334 const lno_nnz_view_t Acolind = Amat.graph.entries,
335 Bcolind = Bmat.graph.entries,
336 Ccolind = Cmat.graph.entries;
337 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
338 scalar_view_t Cvals = Cmat.values;
339
340 c_lno_view_t Irowptr;
341 lno_nnz_view_t Icolind;
342 scalar_view_t Ivals;
343 if(!Bview.importMatrix.is_null()) {
344 auto lclB = Bview.importMatrix->getLocalMatrixHost();
345 Irowptr = lclB.graph.row_map;
346 Icolind = lclB.graph.entries;
347 Ivals = lclB.values;
348 }
349
350#ifdef HAVE_TPETRA_MMM_TIMINGS
351 MM2 = Teuchos::null; MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
352#endif
353
354 // Classic csr assembly (low memory edition)
355 // mfh 27 Sep 2016: The c_status array is an implementation detail
356 // of the local sparse matrix-matrix multiply routine.
357
358 // The status array will contain the index into colind where this entry was last deposited.
359 // c_status[i] < CSR_ip - not in the row yet
360 // c_status[i] >= CSR_ip - this is the entry where you can find the data
361 // We start with this filled with INVALID's indicating that there are no entries yet.
362 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
363 std::vector<size_t> c_status(n, ST_INVALID);
364
365 // For each row of A/C
366 size_t CSR_ip = 0, OLD_ip = 0;
367 for (size_t i = 0; i < m; i++) {
368 // First fill the c_status array w/ locations where we're allowed to
369 // generate nonzeros for this row
370 OLD_ip = Crowptr[i];
371 CSR_ip = Crowptr[i+1];
372 for (size_t k = OLD_ip; k < CSR_ip; k++) {
373 c_status[Ccolind[k]] = k;
374
375 // Reset values in the row of C
376 Cvals[k] = SC_ZERO;
377 }
378
379 for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
380 LO Aik = Acolind[k];
381 const SC Aval = Avals[k];
382 if (Aval == SC_ZERO)
383 continue;
384
385 if (targetMapToOrigRow[Aik] != LO_INVALID) {
386 // Local matrix
387 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
388
389 for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
390 LO Bkj = Bcolind[j];
391 LO Cij = Bcol2Ccol[Bkj];
392
393 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
394 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
395 "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
396
397 Cvals[c_status[Cij]] += Aval * Bvals[j];
398 }
399
400 } else {
401 // Remote matrix
402 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
403 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
404 LO Ikj = Icolind[j];
405 LO Cij = Icol2Ccol[Ikj];
406
407 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
408 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
409 "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
410
411 Cvals[c_status[Cij]] += Aval * Ivals[j];
412 }
413 }
414 }
415 }
416
417 C.fillComplete(C.getDomainMap(), C.getRangeMap());
418}
419
420/*********************************************************************************************************/
421template<class Scalar,
422 class LocalOrdinal,
423 class GlobalOrdinal,
424 class LocalOrdinalViewType>
425void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
426 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> & Dinv,
427 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Aview,
428 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Bview,
429 const LocalOrdinalViewType & Acol2Brow,
430 const LocalOrdinalViewType & Acol2Irow,
431 const LocalOrdinalViewType & Bcol2Ccol,
432 const LocalOrdinalViewType & Icol2Ccol,
433 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& C,
434 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> > Cimport,
435 const std::string& label,
436 const Teuchos::RCP<Teuchos::ParameterList>& params) {
437
438#ifdef HAVE_TPETRA_MMM_TIMINGS
439 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
440 using Teuchos::TimeMonitor;
441 Teuchos::RCP<TimeMonitor> MM;
442#endif
443
444 // Node-specific code
445 using Teuchos::RCP;
446
447 // Options
448 //int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer // unreferenced
449 std::string myalg("KK");
450 if(!params.is_null()) {
451 if(params->isParameter("hip: jacobi algorithm"))
452 myalg = params->get("hip: jacobi algorithm",myalg);
453 }
454
455 if(myalg == "MSAK") {
456 ::Tpetra::MatrixMatrix::ExtraKernels::jacobi_A_B_newmatrix_MultiplyScaleAddKernel(omega,Dinv,Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
457 }
458 else if(myalg == "KK") {
459 jacobi_A_B_newmatrix_KokkosKernels(omega,Dinv,Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
460 }
461 else {
462 throw std::runtime_error("Tpetra::MatrixMatrix::Jacobi newmatrix unknown kernel");
463 }
464
465#ifdef HAVE_TPETRA_MMM_TIMINGS
466 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix HIPESFC"))));
467#endif
468
469 // Final Fillcomplete
470 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
471 labelList->set("Timer Label",label);
472 if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
473
474 // NOTE: MSAK already fillCompletes, so we have to check here
475 if(!C.isFillComplete()) {
476 RCP<const Export<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> > dummyExport;
477 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
478 }
479
480}
481
482
483
484/*********************************************************************************************************/
485template<class Scalar,
486 class LocalOrdinal,
487 class GlobalOrdinal,
488 class LocalOrdinalViewType>
489void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
490 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> & Dinv,
491 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Aview,
492 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Bview,
493 const LocalOrdinalViewType & targetMapToOrigRow_dev,
494 const LocalOrdinalViewType & targetMapToImportRow_dev,
495 const LocalOrdinalViewType & Bcol2Ccol_dev,
496 const LocalOrdinalViewType & Icol2Ccol_dev,
497 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& C,
498 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> > Cimport,
499 const std::string& label,
500 const Teuchos::RCP<Teuchos::ParameterList>& params) {
501
502 // FIXME: Right now, this is a cut-and-paste of the serial kernel
503 typedef Kokkos::Compat::KokkosHIPWrapperNode Node;
504
505#ifdef HAVE_TPETRA_MMM_TIMINGS
506 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
507 using Teuchos::TimeMonitor;
508 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse HIPCore"))));
509 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
510#endif
511 using Teuchos::RCP;
512 using Teuchos::rcp;
513
514 // Lots and lots of typedefs
515 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
516 typedef typename KCRS::StaticCrsGraphType graph_t;
517 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
518 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
519 typedef typename KCRS::values_type::non_const_type scalar_view_t;
520 typedef typename scalar_view_t::memory_space scalar_memory_space;
521
522 typedef Scalar SC;
523 typedef LocalOrdinal LO;
524 typedef GlobalOrdinal GO;
525 typedef Node NO;
526 typedef Map<LO,GO,NO> map_type;
527 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
528 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
529 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
530
531 // KDDKDD UVM Without UVM, need to copy targetMap arrays to host.
532 // KDDKDD UVM Ideally, this function would run on device and use
533 // KDDKDD UVM KokkosKernels instead of this host implementation.
534 auto targetMapToOrigRow =
535 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
536 targetMapToOrigRow_dev);
537 auto targetMapToImportRow =
538 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
539 targetMapToImportRow_dev);
540 auto Bcol2Ccol =
541 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
542 Bcol2Ccol_dev);
543 auto Icol2Ccol =
544 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
545 Icol2Ccol_dev);
546
547
548 // Sizes
549 RCP<const map_type> Ccolmap = C.getColMap();
550 size_t m = Aview.origMatrix->getLocalNumRows();
551 size_t n = Ccolmap->getLocalNumElements();
552
553 // Grab the Kokkos::SparseCrsMatrices & inner stuff
554 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
555 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
556 const KCRS & Cmat = C.getLocalMatrixHost();
557
558 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
559 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
560 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
561 scalar_view_t Cvals = Cmat.values;
562
563 c_lno_view_t Irowptr;
564 lno_nnz_view_t Icolind;
565 scalar_view_t Ivals;
566 if(!Bview.importMatrix.is_null()) {
567 auto lclB = Bview.importMatrix->getLocalMatrixHost();
568 Irowptr = lclB.graph.row_map;
569 Icolind = lclB.graph.entries;
570 Ivals = lclB.values;
571 }
572
573 // Jacobi-specific inner stuff
574 auto Dvals =
575 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
576
577#ifdef HAVE_TPETRA_MMM_TIMINGS
578 MM2 = Teuchos::null; MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse HIPCore - Compare"))));
579#endif
580
581 // The status array will contain the index into colind where this entry was last deposited.
582 // c_status[i] < CSR_ip - not in the row yet
583 // c_status[i] >= CSR_ip - this is the entry where you can find the data
584 // We start with this filled with INVALID's indicating that there are no entries yet.
585 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
586 std::vector<size_t> c_status(n, ST_INVALID);
587
588 // For each row of A/C
589 size_t CSR_ip = 0, OLD_ip = 0;
590 for (size_t i = 0; i < m; i++) {
591
592 // First fill the c_status array w/ locations where we're allowed to
593 // generate nonzeros for this row
594 OLD_ip = Crowptr[i];
595 CSR_ip = Crowptr[i+1];
596 for (size_t k = OLD_ip; k < CSR_ip; k++) {
597 c_status[Ccolind[k]] = k;
598
599 // Reset values in the row of C
600 Cvals[k] = SC_ZERO;
601 }
602
603 SC minusOmegaDval = -omega*Dvals(i,0);
604
605 // Entries of B
606 for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
607 Scalar Bval = Bvals[j];
608 if (Bval == SC_ZERO)
609 continue;
610 LO Bij = Bcolind[j];
611 LO Cij = Bcol2Ccol[Bij];
612
613 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
614 std::runtime_error, "Trying to insert a new entry into a static graph");
615
616 Cvals[c_status[Cij]] = Bvals[j];
617 }
618
619 // Entries of -omega * Dinv * A * B
620 for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
621 LO Aik = Acolind[k];
622 const SC Aval = Avals[k];
623 if (Aval == SC_ZERO)
624 continue;
625
626 if (targetMapToOrigRow[Aik] != LO_INVALID) {
627 // Local matrix
628 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
629
630 for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
631 LO Bkj = Bcolind[j];
632 LO Cij = Bcol2Ccol[Bkj];
633
634 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
635 std::runtime_error, "Trying to insert a new entry into a static graph");
636
637 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
638 }
639
640 } else {
641 // Remote matrix
642 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
643 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
644 LO Ikj = Icolind[j];
645 LO Cij = Icol2Ccol[Ikj];
646
647 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
648 std::runtime_error, "Trying to insert a new entry into a static graph");
649
650 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
651 }
652 }
653 }
654 }
655
656#ifdef HAVE_TPETRA_MMM_TIMINGS
657 MM2= Teuchos::null;
658 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse ESFC"))));
659#endif
660
661 C.fillComplete(C.getDomainMap(), C.getRangeMap());
662
663}
664
665/*********************************************************************************************************/
666template<class Scalar,
667 class LocalOrdinal,
668 class GlobalOrdinal,
669 class LocalOrdinalViewType>
670void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode,LocalOrdinalViewType>::jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
671 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> & Dinv,
672 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Aview,
673 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Bview,
674 const LocalOrdinalViewType & Acol2Brow,
675 const LocalOrdinalViewType & Acol2Irow,
676 const LocalOrdinalViewType & Bcol2Ccol,
677 const LocalOrdinalViewType & Icol2Ccol,
678 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& C,
679 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> > Cimport,
680 const std::string& label,
681 const Teuchos::RCP<Teuchos::ParameterList>& params) {
682
683#ifdef HAVE_TPETRA_MMM_TIMINGS
684 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
685 using Teuchos::TimeMonitor;
686 Teuchos::RCP<TimeMonitor> MM;
687#endif
688
689 // Check if the diagonal entries exist in debug mode
690 const bool debug = Tpetra::Details::Behavior::debug();
691 if(debug) {
692
693 auto rowMap = Aview.origMatrix->getRowMap();
694 Tpetra::Vector<Scalar> diags(rowMap);
695 Aview.origMatrix->getLocalDiagCopy(diags);
696 size_t diagLength = rowMap->getLocalNumElements();
697 Teuchos::Array<Scalar> diagonal(diagLength);
698 diags.get1dCopy(diagonal());
699
700 for(size_t i = 0; i < diagLength; ++i) {
701 TEUCHOS_TEST_FOR_EXCEPTION(diagonal[i] == Teuchos::ScalarTraits<Scalar>::zero(),
702 std::runtime_error,
703 "Matrix A has a zero/missing diagonal: " << diagonal[i] << std::endl <<
704 "KokkosKernels Jacobi-fused SpGEMM requires nonzero diagonal entries in A" << std::endl);
705 }
706 }
707
708 // Usings
709 using device_t = typename Kokkos::Compat::KokkosHIPWrapperNode::device_type;
711 using graph_t = typename matrix_t::StaticCrsGraphType;
712 using lno_view_t = typename graph_t::row_map_type::non_const_type;
713 using c_lno_view_t = typename graph_t::row_map_type::const_type;
714 using lno_nnz_view_t = typename graph_t::entries_type::non_const_type;
715 using scalar_view_t = typename matrix_t::values_type::non_const_type;
716
717 // KokkosKernels handle
718 using handle_t = typename KokkosKernels::Experimental::KokkosKernelsHandle<
719 typename lno_view_t::const_value_type,typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
720 typename device_t::execution_space, typename device_t::memory_space,typename device_t::memory_space >;
721
722 // Get the rowPtr, colInd and vals of importMatrix
723 c_lno_view_t Irowptr;
724 lno_nnz_view_t Icolind;
725 scalar_view_t Ivals;
726 if(!Bview.importMatrix.is_null()) {
727 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
728 Irowptr = lclB.graph.row_map;
729 Icolind = lclB.graph.entries;
730 Ivals = lclB.values;
731 }
732
733 // Merge the B and Bimport matrices
734 const matrix_t Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getLocalNumElements());
735
736 // Get the properties and arrays of input matrices
737 const matrix_t & Amat = Aview.origMatrix->getLocalMatrixDevice();
738 const matrix_t & Bmat = Bview.origMatrix->getLocalMatrixDevice();
739
740 typename handle_t::nnz_lno_t AnumRows = Amat.numRows();
741 typename handle_t::nnz_lno_t BnumRows = Bmerged.numRows();
742 typename handle_t::nnz_lno_t BnumCols = Bmerged.numCols();
743
744 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmerged.graph.row_map;
745 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmerged.graph.entries;
746 const scalar_view_t Avals = Amat.values, Bvals = Bmerged.values;
747
748 // Arrays of the output matrix
749 lno_view_t row_mapC (Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), AnumRows + 1);
750 lno_nnz_view_t entriesC;
751 scalar_view_t valuesC;
752
753 // Options
754 int team_work_size = 16;
755 std::string myalg("SPGEMM_KK_MEMORY");
756 if(!params.is_null()) {
757 if(params->isParameter("hip: algorithm"))
758 myalg = params->get("hip: algorithm",myalg);
759 if(params->isParameter("hip: team work size"))
760 team_work_size = params->get("hip: team work size",team_work_size);
761 }
762
763 // Get the algorithm mode
764 std::string nodename("HIP");
765 std::string alg = nodename + std::string(" algorithm");
766 if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
767 KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
768
769
770 // KokkosKernels call
771 handle_t kh;
772 kh.create_spgemm_handle(alg_enum);
773 kh.set_team_work_size(team_work_size);
774
775 KokkosSparse::Experimental::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols,
776 Arowptr, Acolind, false,
777 Browptr, Bcolind, false,
778 row_mapC);
779
780 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
781 if (c_nnz_size){
782 entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
783 valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
784 }
785
786 KokkosSparse::Experimental::spgemm_jacobi(&kh, AnumRows, BnumRows, BnumCols,
787 Arowptr, Acolind, Avals, false,
788 Browptr, Bcolind, Bvals, false,
789 row_mapC, entriesC, valuesC,
790 omega, Dinv.getLocalViewDevice(Access::ReadOnly));
791 kh.destroy_spgemm_handle();
792
793#ifdef HAVE_TPETRA_MMM_TIMINGS
794 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix HIPSort"))));
795#endif
796
797 // Sort & set values
798 if (params.is_null() || params->get("sort entries",true))
799 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
800 C.setAllValues(row_mapC,entriesC,valuesC);
801
802#ifdef HAVE_TPETRA_MMM_TIMINGS
803 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix HIPESFC"))));
804#endif
805
806 // Final Fillcomplete
807 Teuchos::RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
808 labelList->set("Timer Label",label);
809 if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
810 Teuchos::RCP<const Export<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> > dummyExport;
811 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
812}
813
814 }//MMdetails
815}//Tpetra
816
817#endif//HIP
818
819#endif
Struct that holds views of the contents of a CrsMatrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
static bool debug()
Whether Tpetra is in debug mode.
Namespace Tpetra contains the class and methods constituting the Tpetra library.