Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
TpetraExt_MatrixMatrix_Cuda.hpp
1// @HEADER
2// ***********************************************************************
3//
4// Tpetra: Templated Linear Algebra Services Package
5// Copyright (2008) Sandia Corporation
6//
7// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8// the U.S. Government retains certain rights in this software.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38//
39// ************************************************************************
40// @HEADER
41
42#ifndef TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
43#define TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
44
45#ifdef HAVE_TPETRA_INST_CUDA
46namespace Tpetra {
47namespace MMdetails {
48
49/*********************************************************************************************************/
50// MMM KernelWrappers for Partial Specialization to CUDA
51template<class Scalar,
52 class LocalOrdinal,
53 class GlobalOrdinal,
54 class LocalOrdinalViewType>
55struct KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType> {
56 static inline void mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
57 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
58 const LocalOrdinalViewType & Acol2Brow,
59 const LocalOrdinalViewType & Acol2Irow,
60 const LocalOrdinalViewType & Bcol2Ccol,
61 const LocalOrdinalViewType & Icol2Ccol,
62 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
63 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
64 const std::string& label = std::string(),
65 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
66
67
68
69 static inline void mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
70 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
71 const LocalOrdinalViewType & Acol2Brow,
72 const LocalOrdinalViewType & Acol2Irow,
73 const LocalOrdinalViewType & Bcol2Ccol,
74 const LocalOrdinalViewType & Icol2Ccol,
75 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
76 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
77 const std::string& label = std::string(),
78 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
79
80};
81
82// Jacobi KernelWrappers for Partial Specialization to Cuda
83template<class Scalar,
84 class LocalOrdinal,
85 class GlobalOrdinal, class LocalOrdinalViewType>
86struct KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType> {
87 static inline void jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
88 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
89 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
90 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
91 const LocalOrdinalViewType & Acol2Brow,
92 const LocalOrdinalViewType & Acol2Irow,
93 const LocalOrdinalViewType & Bcol2Ccol,
94 const LocalOrdinalViewType & Icol2Ccol,
95 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
96 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
97 const std::string& label = std::string(),
98 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
99
100 static inline void jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
101 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
102 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
103 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
104 const LocalOrdinalViewType & Acol2Brow,
105 const LocalOrdinalViewType & Acol2Irow,
106 const LocalOrdinalViewType & Bcol2Ccol,
107 const LocalOrdinalViewType & Icol2Ccol,
108 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
109 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
110 const std::string& label = std::string(),
111 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
112
113 static inline void jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
114 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
115 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
116 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
117 const LocalOrdinalViewType & Acol2Brow,
118 const LocalOrdinalViewType & Acol2Irow,
119 const LocalOrdinalViewType & Bcol2Ccol,
120 const LocalOrdinalViewType & Icol2Ccol,
121 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
122 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
123 const std::string& label = std::string(),
124 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
125};
126
127
128/*********************************************************************************************************/
129// AB NewMatrix Kernel wrappers (KokkosKernels/CUDA Version)
130template<class Scalar,
131 class LocalOrdinal,
132 class GlobalOrdinal,
133 class LocalOrdinalViewType>
134void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
135 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
136 const LocalOrdinalViewType & Acol2Brow,
137 const LocalOrdinalViewType & Acol2Irow,
138 const LocalOrdinalViewType & Bcol2Ccol,
139 const LocalOrdinalViewType & Icol2Ccol,
140 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
141 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
142 const std::string& label,
143 const Teuchos::RCP<Teuchos::ParameterList>& params) {
144
145
146#ifdef HAVE_TPETRA_MMM_TIMINGS
147 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
148 using Teuchos::TimeMonitor;
149 Teuchos::RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix CudaWrapper")))));
150#endif
151 // Node-specific code
152 typedef Kokkos::Compat::KokkosCudaWrapperNode Node;
153 std::string nodename("Cuda");
154
155 // Lots and lots of typedefs
156 using Teuchos::RCP;
158 typedef typename KCRS::device_type device_t;
159 typedef typename KCRS::StaticCrsGraphType graph_t;
160 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
161 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
162 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
163 typedef typename KCRS::values_type::non_const_type scalar_view_t;
164 //typedef typename graph_t::row_map_type::const_type lno_view_t_const;
165
166 // Options
167 int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer
168 std::string myalg("SPGEMM_KK_MEMORY");
169 if(!params.is_null()) {
170 if(params->isParameter("cuda: algorithm"))
171 myalg = params->get("cuda: algorithm",myalg);
172 if(params->isParameter("cuda: team work size"))
173 team_work_size = params->get("cuda: team work size",team_work_size);
174 }
175
176 // KokkosKernelsHandle
177 typedef KokkosKernels::Experimental::KokkosKernelsHandle<
178 typename lno_view_t::const_value_type,typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
179 typename device_t::execution_space, typename device_t::memory_space,typename device_t::memory_space > KernelHandle;
180
181 // Grab the Kokkos::SparseCrsMatrices
182 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
183 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
184
185 c_lno_view_t Arowptr = Amat.graph.row_map,
186 Browptr = Bmat.graph.row_map;
187 const lno_nnz_view_t Acolind = Amat.graph.entries,
188 Bcolind = Bmat.graph.entries;
189 const scalar_view_t Avals = Amat.values,
190 Bvals = Bmat.values;
191
192 c_lno_view_t Irowptr;
193 lno_nnz_view_t Icolind;
194 scalar_view_t Ivals;
195 if(!Bview.importMatrix.is_null()) {
196 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
197 Irowptr = lclB.graph.row_map;
198 Icolind = lclB.graph.entries;
199 Ivals = lclB.values;
200 }
201
202
203 // Get the algorithm mode
204 std::string alg = nodename+std::string(" algorithm");
205 // printf("DEBUG: Using kernel: %s\n",myalg.c_str());
206 if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
207 KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
208
209 // Merge the B and Bimport matrices
210 const KCRS Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getLocalNumElements());
211
212#ifdef HAVE_TPETRA_MMM_TIMINGS
213 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix CudaCore"))));
214#endif
215
216 // Do the multiply on whatever we've got
217 typename KernelHandle::nnz_lno_t AnumRows = Amat.numRows();
218 typename KernelHandle::nnz_lno_t BnumRows = Bmerged.numRows();
219 typename KernelHandle::nnz_lno_t BnumCols = Bmerged.numCols();
220
221 lno_view_t row_mapC (Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), AnumRows + 1);
222 lno_nnz_view_t entriesC;
223 scalar_view_t valuesC;
224 KernelHandle kh;
225 kh.create_spgemm_handle(alg_enum);
226 kh.set_team_work_size(team_work_size);
227
228 KokkosSparse::Experimental::spgemm_symbolic(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,false,Bmerged.graph.row_map,Bmerged.graph.entries,false,row_mapC);
229
230 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
231 if (c_nnz_size){
232 entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
233 valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
234 }
235 KokkosSparse::Experimental::spgemm_numeric(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,Amat.values,false,Bmerged.graph.row_map,Bmerged.graph.entries,Bmerged.values,false,row_mapC,entriesC,valuesC);
236 kh.destroy_spgemm_handle();
237
238#ifdef HAVE_TPETRA_MMM_TIMINGS
239 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix CudaSort"))));
240#endif
241
242 // Sort & set values
243 if (params.is_null() || params->get("sort entries",true))
244 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
245 C.setAllValues(row_mapC,entriesC,valuesC);
246
247#ifdef HAVE_TPETRA_MMM_TIMINGS
248 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix CudaESFC"))));
249#endif
250
251 // Final Fillcomplete
252 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
253 labelList->set("Timer Label",label);
254 if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
255 RCP<const Export<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > dummyExport;
256 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
257}
258
259
260/*********************************************************************************************************/
261template<class Scalar,
262 class LocalOrdinal,
263 class GlobalOrdinal,
264 class LocalOrdinalViewType>
265void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(
266 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
267 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
268 const LocalOrdinalViewType & targetMapToOrigRow_dev,
269 const LocalOrdinalViewType & targetMapToImportRow_dev,
270 const LocalOrdinalViewType & Bcol2Ccol_dev,
271 const LocalOrdinalViewType & Icol2Ccol_dev,
272 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
273 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
274 const std::string& label,
275 const Teuchos::RCP<Teuchos::ParameterList>& params) {
276
277 // FIXME: Right now, this is a cut-and-paste of the serial kernel
278 typedef Kokkos::Compat::KokkosCudaWrapperNode Node;
279
280#ifdef HAVE_TPETRA_MMM_TIMINGS
281 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
282 using Teuchos::TimeMonitor;
283 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse SerialCore"))));
284 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
285#endif
286 using Teuchos::RCP;
287 using Teuchos::rcp;
288
289
290 // Lots and lots of typedefs
291 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
292 typedef typename KCRS::StaticCrsGraphType graph_t;
293 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
294 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
295 typedef typename KCRS::values_type::non_const_type scalar_view_t;
296
297 typedef Scalar SC;
298 typedef LocalOrdinal LO;
299 typedef GlobalOrdinal GO;
300 typedef Node NO;
301 typedef Map<LO,GO,NO> map_type;
302 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
303 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
304 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
305
306 // Since this is being run on Cuda, we need to fence because the below code will use UVM
307 // typename graph_t::execution_space().fence();
308
309 // KDDKDD UVM Without UVM, need to copy targetMap arrays to host.
310 // KDDKDD UVM Ideally, this function would run on device and use
311 // KDDKDD UVM KokkosKernels instead of this host implementation.
312 auto targetMapToOrigRow =
313 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
314 targetMapToOrigRow_dev);
315 auto targetMapToImportRow =
316 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
317 targetMapToImportRow_dev);
318 auto Bcol2Ccol =
319 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
320 Bcol2Ccol_dev);
321 auto Icol2Ccol =
322 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
323 Icol2Ccol_dev);
324
325 // Sizes
326 RCP<const map_type> Ccolmap = C.getColMap();
327 size_t m = Aview.origMatrix->getLocalNumRows();
328 size_t n = Ccolmap->getLocalNumElements();
329
330 // Grab the Kokkos::SparseCrsMatrices & inner stuff
331 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
332 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
333 const KCRS & Cmat = C.getLocalMatrixHost();
334
335 c_lno_view_t Arowptr = Amat.graph.row_map,
336 Browptr = Bmat.graph.row_map,
337 Crowptr = Cmat.graph.row_map;
338 const lno_nnz_view_t Acolind = Amat.graph.entries,
339 Bcolind = Bmat.graph.entries,
340 Ccolind = Cmat.graph.entries;
341 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
342 scalar_view_t Cvals = Cmat.values;
343
344 c_lno_view_t Irowptr;
345 lno_nnz_view_t Icolind;
346 scalar_view_t Ivals;
347 if(!Bview.importMatrix.is_null()) {
348 auto lclB = Bview.importMatrix->getLocalMatrixHost();
349 Irowptr = lclB.graph.row_map;
350 Icolind = lclB.graph.entries;
351 Ivals = lclB.values;
352 }
353
354#ifdef HAVE_TPETRA_MMM_TIMINGS
355 MM2 = Teuchos::null; MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
356#endif
357
358 // Classic csr assembly (low memory edition)
359 // mfh 27 Sep 2016: The c_status array is an implementation detail
360 // of the local sparse matrix-matrix multiply routine.
361
362 // The status array will contain the index into colind where this entry was last deposited.
363 // c_status[i] < CSR_ip - not in the row yet
364 // c_status[i] >= CSR_ip - this is the entry where you can find the data
365 // We start with this filled with INVALID's indicating that there are no entries yet.
366 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
367 std::vector<size_t> c_status(n, ST_INVALID);
368
369 // For each row of A/C
370 size_t CSR_ip = 0, OLD_ip = 0;
371 for (size_t i = 0; i < m; i++) {
372 // First fill the c_status array w/ locations where we're allowed to
373 // generate nonzeros for this row
374 OLD_ip = Crowptr[i];
375 CSR_ip = Crowptr[i+1];
376 for (size_t k = OLD_ip; k < CSR_ip; k++) {
377 c_status[Ccolind[k]] = k;
378
379 // Reset values in the row of C
380 Cvals[k] = SC_ZERO;
381 }
382
383 for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
384 LO Aik = Acolind[k];
385 const SC Aval = Avals[k];
386 if (Aval == SC_ZERO)
387 continue;
388
389 if (targetMapToOrigRow[Aik] != LO_INVALID) {
390 // Local matrix
391 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
392
393 for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
394 LO Bkj = Bcolind[j];
395 LO Cij = Bcol2Ccol[Bkj];
396
397 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
398 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
399 "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
400
401 Cvals[c_status[Cij]] += Aval * Bvals[j];
402 }
403
404 } else {
405 // Remote matrix
406 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
407 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
408 LO Ikj = Icolind[j];
409 LO Cij = Icol2Ccol[Ikj];
410
411 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
412 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
413 "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
414
415 Cvals[c_status[Cij]] += Aval * Ivals[j];
416 }
417 }
418 }
419 }
420
421 C.fillComplete(C.getDomainMap(), C.getRangeMap());
422}
423
424/*********************************************************************************************************/
425template<class Scalar,
426 class LocalOrdinal,
427 class GlobalOrdinal,
428 class LocalOrdinalViewType>
429void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
430 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
431 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
432 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
433 const LocalOrdinalViewType & Acol2Brow,
434 const LocalOrdinalViewType & Acol2Irow,
435 const LocalOrdinalViewType & Bcol2Ccol,
436 const LocalOrdinalViewType & Icol2Ccol,
437 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
438 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
439 const std::string& label,
440 const Teuchos::RCP<Teuchos::ParameterList>& params) {
441
442#ifdef HAVE_TPETRA_MMM_TIMINGS
443 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
444 using Teuchos::TimeMonitor;
445 Teuchos::RCP<TimeMonitor> MM;
446#endif
447
448 // Node-specific code
449 using Teuchos::RCP;
450
451 // Options
452 //int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer // unreferenced
453 std::string myalg("KK");
454 if(!params.is_null()) {
455 if(params->isParameter("cuda: jacobi algorithm"))
456 myalg = params->get("cuda: jacobi algorithm",myalg);
457 }
458
459 if(myalg == "MSAK") {
460 ::Tpetra::MatrixMatrix::ExtraKernels::jacobi_A_B_newmatrix_MultiplyScaleAddKernel(omega,Dinv,Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
461 }
462 else if(myalg == "KK") {
463 jacobi_A_B_newmatrix_KokkosKernels(omega,Dinv,Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
464 }
465 else {
466 throw std::runtime_error("Tpetra::MatrixMatrix::Jacobi newmatrix unknown kernel");
467 }
468
469#ifdef HAVE_TPETRA_MMM_TIMINGS
470 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix CudaESFC"))));
471#endif
472
473 // Final Fillcomplete
474 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
475 labelList->set("Timer Label",label);
476 if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
477
478 // NOTE: MSAK already fillCompletes, so we have to check here
479 if(!C.isFillComplete()) {
480 RCP<const Export<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > dummyExport;
481 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
482 }
483
484}
485
486
487
488/*********************************************************************************************************/
489template<class Scalar,
490 class LocalOrdinal,
491 class GlobalOrdinal,
492 class LocalOrdinalViewType>
493void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
494 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
495 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
496 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
497 const LocalOrdinalViewType & targetMapToOrigRow_dev,
498 const LocalOrdinalViewType & targetMapToImportRow_dev,
499 const LocalOrdinalViewType & Bcol2Ccol_dev,
500 const LocalOrdinalViewType & Icol2Ccol_dev,
501 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
502 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
503 const std::string& label,
504 const Teuchos::RCP<Teuchos::ParameterList>& params) {
505
506 // FIXME: Right now, this is a cut-and-paste of the serial kernel
507 typedef Kokkos::Compat::KokkosCudaWrapperNode Node;
508
509#ifdef HAVE_TPETRA_MMM_TIMINGS
510 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
511 using Teuchos::TimeMonitor;
512 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse CudaCore"))));
513 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
514#endif
515 using Teuchos::RCP;
516 using Teuchos::rcp;
517
518 // Lots and lots of typedefs
519 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
520 typedef typename KCRS::StaticCrsGraphType graph_t;
521 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
522 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
523 typedef typename KCRS::values_type::non_const_type scalar_view_t;
524 typedef typename scalar_view_t::memory_space scalar_memory_space;
525
526 typedef Scalar SC;
527 typedef LocalOrdinal LO;
528 typedef GlobalOrdinal GO;
529 typedef Node NO;
530 typedef Map<LO,GO,NO> map_type;
531 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
532 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
533 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
534
535 // Since this is being run on Cuda, we need to fence because the below host code will use UVM
536 // KDDKDD typename graph_t::execution_space().fence();
537
538 // KDDKDD UVM Without UVM, need to copy targetMap arrays to host.
539 // KDDKDD UVM Ideally, this function would run on device and use
540 // KDDKDD UVM KokkosKernels instead of this host implementation.
541 auto targetMapToOrigRow =
542 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
543 targetMapToOrigRow_dev);
544 auto targetMapToImportRow =
545 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
546 targetMapToImportRow_dev);
547 auto Bcol2Ccol =
548 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
549 Bcol2Ccol_dev);
550 auto Icol2Ccol =
551 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
552 Icol2Ccol_dev);
553
554
555 // Sizes
556 RCP<const map_type> Ccolmap = C.getColMap();
557 size_t m = Aview.origMatrix->getLocalNumRows();
558 size_t n = Ccolmap->getLocalNumElements();
559
560 // Grab the Kokkos::SparseCrsMatrices & inner stuff
561 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
562 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
563 const KCRS & Cmat = C.getLocalMatrixHost();
564
565 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
566 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
567 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
568 scalar_view_t Cvals = Cmat.values;
569
570 c_lno_view_t Irowptr;
571 lno_nnz_view_t Icolind;
572 scalar_view_t Ivals;
573 if(!Bview.importMatrix.is_null()) {
574 auto lclB = Bview.importMatrix->getLocalMatrixHost();
575 Irowptr = lclB.graph.row_map;
576 Icolind = lclB.graph.entries;
577 Ivals = lclB.values;
578 }
579
580 // Jacobi-specific inner stuff
581 auto Dvals =
582 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
583
584#ifdef HAVE_TPETRA_MMM_TIMINGS
585 MM2 = Teuchos::null; MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse CudaCore - Compare"))));
586#endif
587
588 // The status array will contain the index into colind where this entry was last deposited.
589 // c_status[i] < CSR_ip - not in the row yet
590 // c_status[i] >= CSR_ip - this is the entry where you can find the data
591 // We start with this filled with INVALID's indicating that there are no entries yet.
592 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
593 std::vector<size_t> c_status(n, ST_INVALID);
594
595 // For each row of A/C
596 size_t CSR_ip = 0, OLD_ip = 0;
597 for (size_t i = 0; i < m; i++) {
598
599 // First fill the c_status array w/ locations where we're allowed to
600 // generate nonzeros for this row
601 OLD_ip = Crowptr[i];
602 CSR_ip = Crowptr[i+1];
603 for (size_t k = OLD_ip; k < CSR_ip; k++) {
604 c_status[Ccolind[k]] = k;
605
606 // Reset values in the row of C
607 Cvals[k] = SC_ZERO;
608 }
609
610 SC minusOmegaDval = -omega*Dvals(i,0);
611
612 // Entries of B
613 for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
614 Scalar Bval = Bvals[j];
615 if (Bval == SC_ZERO)
616 continue;
617 LO Bij = Bcolind[j];
618 LO Cij = Bcol2Ccol[Bij];
619
620 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
621 std::runtime_error, "Trying to insert a new entry into a static graph");
622
623 Cvals[c_status[Cij]] = Bvals[j];
624 }
625
626 // Entries of -omega * Dinv * A * B
627 for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
628 LO Aik = Acolind[k];
629 const SC Aval = Avals[k];
630 if (Aval == SC_ZERO)
631 continue;
632
633 if (targetMapToOrigRow[Aik] != LO_INVALID) {
634 // Local matrix
635 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
636
637 for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
638 LO Bkj = Bcolind[j];
639 LO Cij = Bcol2Ccol[Bkj];
640
641 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
642 std::runtime_error, "Trying to insert a new entry into a static graph");
643
644 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
645 }
646
647 } else {
648 // Remote matrix
649 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
650 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
651 LO Ikj = Icolind[j];
652 LO Cij = Icol2Ccol[Ikj];
653
654 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
655 std::runtime_error, "Trying to insert a new entry into a static graph");
656
657 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
658 }
659 }
660 }
661 }
662
663#ifdef HAVE_TPETRA_MMM_TIMINGS
664 MM2= Teuchos::null;
665 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse ESFC"))));
666#endif
667
668 C.fillComplete(C.getDomainMap(), C.getRangeMap());
669
670}
671
672/*********************************************************************************************************/
673template<class Scalar,
674 class LocalOrdinal,
675 class GlobalOrdinal,
676 class LocalOrdinalViewType>
677void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
678 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
679 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
680 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
681 const LocalOrdinalViewType & Acol2Brow,
682 const LocalOrdinalViewType & Acol2Irow,
683 const LocalOrdinalViewType & Bcol2Ccol,
684 const LocalOrdinalViewType & Icol2Ccol,
685 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
686 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
687 const std::string& label,
688 const Teuchos::RCP<Teuchos::ParameterList>& params) {
689
690#ifdef HAVE_TPETRA_MMM_TIMINGS
691 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
692 using Teuchos::TimeMonitor;
693 Teuchos::RCP<TimeMonitor> MM;
694#endif
695
696 // Check if the diagonal entries exist in debug mode
697 const bool debug = Tpetra::Details::Behavior::debug();
698 if(debug) {
699
700 auto rowMap = Aview.origMatrix->getRowMap();
701 Tpetra::Vector<Scalar> diags(rowMap);
702 Aview.origMatrix->getLocalDiagCopy(diags);
703 size_t diagLength = rowMap->getLocalNumElements();
704 Teuchos::Array<Scalar> diagonal(diagLength);
705 diags.get1dCopy(diagonal());
706
707 for(size_t i = 0; i < diagLength; ++i) {
708 TEUCHOS_TEST_FOR_EXCEPTION(diagonal[i] == Teuchos::ScalarTraits<Scalar>::zero(),
709 std::runtime_error,
710 "Matrix A has a zero/missing diagonal: " << diagonal[i] << std::endl <<
711 "KokkosKernels Jacobi-fused SpGEMM requires nonzero diagonal entries in A" << std::endl);
712 }
713 }
714
715 // Usings
716 using device_t = typename Kokkos::Compat::KokkosCudaWrapperNode::device_type;
718 using graph_t = typename matrix_t::StaticCrsGraphType;
719 using lno_view_t = typename graph_t::row_map_type::non_const_type;
720 using c_lno_view_t = typename graph_t::row_map_type::const_type;
721 using lno_nnz_view_t = typename graph_t::entries_type::non_const_type;
722 using scalar_view_t = typename matrix_t::values_type::non_const_type;
723
724 // KokkosKernels handle
725 using handle_t = typename KokkosKernels::Experimental::KokkosKernelsHandle<
726 typename lno_view_t::const_value_type,typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
727 typename device_t::execution_space, typename device_t::memory_space,typename device_t::memory_space >;
728
729 // Get the rowPtr, colInd and vals of importMatrix
730 c_lno_view_t Irowptr;
731 lno_nnz_view_t Icolind;
732 scalar_view_t Ivals;
733 if(!Bview.importMatrix.is_null()) {
734 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
735 Irowptr = lclB.graph.row_map;
736 Icolind = lclB.graph.entries;
737 Ivals = lclB.values;
738 }
739
740 // Merge the B and Bimport matrices
741 const matrix_t Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getLocalNumElements());
742
743 // Get the properties and arrays of input matrices
744 const matrix_t & Amat = Aview.origMatrix->getLocalMatrixDevice();
745 const matrix_t & Bmat = Bview.origMatrix->getLocalMatrixDevice();
746
747 typename handle_t::nnz_lno_t AnumRows = Amat.numRows();
748 typename handle_t::nnz_lno_t BnumRows = Bmerged.numRows();
749 typename handle_t::nnz_lno_t BnumCols = Bmerged.numCols();
750
751 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmerged.graph.row_map;
752 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmerged.graph.entries;
753 const scalar_view_t Avals = Amat.values, Bvals = Bmerged.values;
754
755 // Arrays of the output matrix
756 lno_view_t row_mapC (Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), AnumRows + 1);
757 lno_nnz_view_t entriesC;
758 scalar_view_t valuesC;
759
760 // Options
761 int team_work_size = 16;
762 std::string myalg("SPGEMM_KK_MEMORY");
763 if(!params.is_null()) {
764 if(params->isParameter("cuda: algorithm"))
765 myalg = params->get("cuda: algorithm",myalg);
766 if(params->isParameter("cuda: team work size"))
767 team_work_size = params->get("cuda: team work size",team_work_size);
768 }
769
770 // Get the algorithm mode
771 std::string nodename("Cuda");
772 std::string alg = nodename + std::string(" algorithm");
773 if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
774 KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
775
776
777 // KokkosKernels call
778 handle_t kh;
779 kh.create_spgemm_handle(alg_enum);
780 kh.set_team_work_size(team_work_size);
781
782 KokkosSparse::Experimental::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols,
783 Arowptr, Acolind, false,
784 Browptr, Bcolind, false,
785 row_mapC);
786
787 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
788 if (c_nnz_size){
789 entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
790 valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
791 }
792
793 KokkosSparse::Experimental::spgemm_jacobi(&kh, AnumRows, BnumRows, BnumCols,
794 Arowptr, Acolind, Avals, false,
795 Browptr, Bcolind, Bvals, false,
796 row_mapC, entriesC, valuesC,
797 omega, Dinv.getLocalViewDevice(Access::ReadOnly));
798 kh.destroy_spgemm_handle();
799
800#ifdef HAVE_TPETRA_MMM_TIMINGS
801 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix CudaSort"))));
802#endif
803
804 // Sort & set values
805 if (params.is_null() || params->get("sort entries",true))
806 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
807 C.setAllValues(row_mapC,entriesC,valuesC);
808
809#ifdef HAVE_TPETRA_MMM_TIMINGS
810 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix CudaESFC"))));
811#endif
812
813 // Final Fillcomplete
814 Teuchos::RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
815 labelList->set("Timer Label",label);
816 if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
817 Teuchos::RCP<const Export<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > dummyExport;
818 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
819}
820
821 }//MMdetails
822}//Tpetra
823
824#endif//CUDA
825
826#endif
Struct that holds views of the contents of a CrsMatrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
static bool debug()
Whether Tpetra is in debug mode.
Namespace Tpetra contains the class and methods constituting the Tpetra library.