Stokhos_Cuda_CrsProductTensor.hpp

// @HEADER
// ***********************************************************************
//
// Stokhos Package
// Copyright (2009) Sandia Corporation
//
// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
// license for use of this work by or on behalf of the U.S. Government.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
//
// ***********************************************************************
// @HEADER

#ifndef STOKHOS_CUDA_CRS_PRODUCT_TENSOR_HPP
#define STOKHOS_CUDA_CRS_PRODUCT_TENSOR_HPP

#include <iostream>

#include "Kokkos_Core.hpp"

#include "Stokhos_Multiply.hpp"
#include "Stokhos_BlockCrsMatrix.hpp"
#include "Stokhos_CrsProductTensor.hpp"

#include "Stokhos_Cuda_DeviceProp.hpp"
#include "Stokhos_Cuda_WarpShuffle.hpp"

#include "Teuchos_TestForException.hpp"

#include "cuda_profiler_api.h"

namespace Stokhos {

//----------------------------------------------------------------------------

// Matrix-vector product specialization for CrsProductTensor layout
// To do:
// * Incorporate texture/read-only cache
// * Change tensor layout to allow coalesced reads with smaller
//   threads/row (will probably only help smaller problem sizes)
// * Get average FEM entries/row from FEM graph (hardcoded to 27)
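//
// Kernel strategy, as implemented below: each CUDA block owns one row of the
// deterministic (FEM) block system. A few FEM columns ("block_size" of them)
// of the input vector x and of the matrix values A are staged in shared
// memory at a time, and each group of blockDim.x threads contracts the sparse
// stochastic product tensor against those staged columns to accumulate
// entries of y for that FEM row.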
template< typename TensorScalar,
          typename MatrixScalar,
          typename VectorScalar >
class Multiply<
  BlockCrsMatrix< CrsProductTensor< TensorScalar, Kokkos::Cuda >,
                  MatrixScalar, Kokkos::Cuda >,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
{
public:

  typedef Kokkos::Cuda execution_space;
  typedef execution_space::size_type size_type;

  typedef CrsProductTensor< TensorScalar, execution_space > tensor_type;
  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type;
  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type;

#define USE_LDG 0
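// USE_LDG selects between two kernel variants: 0 stages both x and A in
// shared memory; 1 stages only x and reads A through the read-only data
// cache via __ldg() (see the #elif branch below).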

#if USE_LDG == 0

  // The multiply kernel
  class MultiplyKernel {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;
    const size_type BlockSize;

    MultiplyKernel( const matrix_type & A,
                    const vector_type & x,
                    const vector_type & y,
                    const size_type block_size )
      : m_A( A ), m_x( x ), m_y( y ), BlockSize(block_size) {}

    __device__
    void operator()(void) const
    {
      // Number of bases in the stochastic system:
      const size_type dim = m_A.block.dimension();

      // Get shared memory for loading x, A, and y
      volatile VectorScalar * const sh_x =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      volatile MatrixScalar * const sh_A = sh_x + BlockSize*dim;
      volatile VectorScalar * const sh_y = sh_A + BlockSize*dim;
#if !HAVE_CUDA_SHUFFLE
      volatile VectorScalar * const sh_t = sh_y + dim;
#endif
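      // Dynamic shared memory layout (sized in apply() below):
      //   sh_x : BlockSize*dim staged entries of x, interleaved by column
      //   sh_A : BlockSize*dim staged entries of A, interleaved by column
      //   sh_y : dim partial results for this FEM row
      //   sh_t : per-thread reduction scratch (only when shuffles are unavailable)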

      const size_type nid = blockDim.x * blockDim.y;
      const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;

      // Mask used for shuffle/warp-sync operations
      const int mask = blockDim.x == 32 ? 0xffffffff :
        ((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x;
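      // When blockDim.x < 32 a warp spans 32/blockDim.x row groups; the mask
      // selects just the blockDim.x lanes cooperating on this thread's row so
      // that shuffles and warp syncs stay within that lane group.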

      // Zero y
      for ( size_type i = tid; i < dim; i += nid ) {
        sh_y[i] = 0.0;
      }

      // Loop over columns in the discrete (finite element) system.
      // blockIdx.x == row in the deterministic (finite element) system
      const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
      const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
      for (size_type iBlockEntry=iBlockEntryBeg; iBlockEntry<iBlockEntryEnd;
           iBlockEntry += BlockSize) {
        const size_type block_size =
          (iBlockEntryEnd-iBlockEntry < BlockSize) ?
          iBlockEntryEnd-iBlockEntry : BlockSize;

        // Wait for X and A to be used in the previous iteration
        // before reading new values.
        __syncthreads();

        // Coalesced read blocks of X and A into shared memory
        for ( size_type col = 0; col < block_size; ++col ) {

          const size_type iBlockColumn = m_A.graph.entries( iBlockEntry + col );
          const VectorScalar * const x = & m_x( 0, iBlockColumn );
          const MatrixScalar * const A = & m_A.values( 0, iBlockEntry + col );

          // Coalesced read by the whole block from global memory:
          for ( size_type i = tid; i < dim; i += nid ) {
            sh_x[col + i * BlockSize] = x[i]; // m_x( i, iBlockColumn );
            sh_A[col + i * BlockSize] = A[i]; // m_A.values( i, iBlockEntry );
          }

        }

        __syncthreads(); // wait for X and A to be read before using them

        // This cuda block is responsible for computing all values of 'y'
        for ( size_type i = threadIdx.y; i < dim; i += blockDim.y ) {
          VectorScalar y = 0;

          // Product tensor entries which this warp will iterate:
          const size_type lBeg = m_A.block.entry_begin( i );
          const size_type lEnd = m_A.block.entry_end( i );

          // Loop through sparse tensor contributions with coalesced reads.
          for ( size_type l = lBeg+threadIdx.x; l < lEnd; l += blockDim.x ) {

            // Read 'blockDim.x' entries from the tensor (coalesced)
            const size_type kj = m_A.block.coord( l );
            const TensorScalar v = m_A.block.value( l );
            const size_type j = ( kj & 0x0ffff ) * BlockSize ;
            const size_type k = ( kj >> 16 ) * BlockSize ;
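
            // The tensor packs both stochastic coordinates into one word:
            // low 16 bits = j, high 16 bits = k. Pre-scaling by BlockSize
            // turns them into offsets into the column-interleaved sh_A/sh_x.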

            for ( size_type col = 0; col < block_size; ++col ) {
              y += v * ( sh_A[col+j] * sh_x[col+k] +
                         sh_A[col+k] * sh_x[col+j] );
            }

          }

          // Reduction of 'y' within 'blockDim.x'
#if HAVE_CUDA_SHUFFLE
          if (blockDim.x >= 2) y += shfl_down(y, 1, blockDim.x, mask);
          if (blockDim.x >= 4) y += shfl_down(y, 2, blockDim.x, mask);
          if (blockDim.x >= 8) y += shfl_down(y, 4, blockDim.x, mask);
          if (blockDim.x >= 16) y += shfl_down(y, 8, blockDim.x, mask);
          if (blockDim.x >= 32) y += shfl_down(y, 16, blockDim.x, mask);
          if ( threadIdx.x == 0 ) sh_y[i] += y;
#else
          sh_t[ tid ] = y;
          if (threadIdx.x+16 < blockDim.x) sh_t[tid] += sh_t[tid+16];
          sync_warp(mask);
          if (threadIdx.x+ 8 < blockDim.x) sh_t[tid] += sh_t[tid+ 8];
          sync_warp(mask);
          if (threadIdx.x+ 4 < blockDim.x) sh_t[tid] += sh_t[tid+ 4];
          sync_warp(mask);
          if (threadIdx.x+ 2 < blockDim.x) sh_t[tid] += sh_t[tid+ 2];
          sync_warp(mask);
          if (threadIdx.x+ 1 < blockDim.x) sh_t[tid] += sh_t[tid+ 1];
          sync_warp(mask);
          if (threadIdx.x == 0) sh_y[i] += sh_t[tid];
#endif
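
          // Either path reduces the blockDim.x partial sums for row 'i' down
          // to lane 0: warp shuffles when available, otherwise a shared-memory
          // tree in sh_t with warp-level syncs between stages.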

        }

      }

      // Wait for all contributions of y to be completed
      __syncthreads();

      // Store result back in global memory
      for ( size_type i = tid; i < dim; i += nid ) {
        m_y( i, blockIdx.x ) = sh_y[ i ];
      }
    }
  };

#elif USE_LDG == 1

  // The multiply kernel -- using the read-only data cache for A.
  // Currently slower than the kernel above.
  class MultiplyKernel {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;
    const size_type BlockSize;

    MultiplyKernel( const matrix_type & A,
                    const vector_type & x,
                    const vector_type & y,
                    const size_type block_size )
      : m_A( A ), m_x( x ), m_y( y ), BlockSize(block_size) {}

    __device__
    void operator()(void) const
    {
      // Number of bases in the stochastic system:
      const size_type dim = m_A.block.dimension();

      volatile VectorScalar * const sh_x =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      volatile VectorScalar * const sh_y = sh_x + BlockSize*dim;
#if !HAVE_CUDA_SHUFFLE
      volatile VectorScalar * const sh_t = sh_y + dim;
#endif

      const size_type nid = blockDim.x * blockDim.y;
      const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;

      // Mask used for shuffle/warp-sync operations
      const int mask = blockDim.x == 32 ? 0xffffffff :
        ((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x;

      // Zero y
      for ( size_type i = tid; i < dim; i += nid ) {
        sh_y[i] = 0.0;
      }

      // Loop over columns in the discrete (finite element) system.
      // blockIdx.x == row in the deterministic (finite element) system
      const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
      const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
      for (size_type iBlockEntry=iBlockEntryBeg; iBlockEntry<iBlockEntryEnd;
           iBlockEntry += BlockSize) {
        const size_type block_size =
          (iBlockEntryEnd-iBlockEntry < BlockSize) ?
          iBlockEntryEnd-iBlockEntry : BlockSize;

        // Wait for X and A to be used in the previous iteration
        // before reading new values.
        __syncthreads();

        // Coalesced read blocks of X into shared memory
        for ( size_type col = 0; col < block_size; ++col ) {

          const size_type iBlockColumn = m_A.graph.entries( iBlockEntry + col );
          const VectorScalar * const x = & m_x( 0, iBlockColumn );

          // Coalesced read by the whole block from global memory:
          for ( size_type i = tid; i < dim; i += nid ) {
            sh_x[col + i * BlockSize] = x[i]; // m_x( i, iBlockColumn );
          }

        }

        __syncthreads(); // wait for X to be read before using it

        // This cuda block is responsible for computing all values of 'y'
        for ( size_type i = threadIdx.y; i < dim; i += blockDim.y ) {
          VectorScalar y = 0;

          // Product tensor entries which this warp will iterate:
          const size_type lBeg = m_A.block.entry_begin( i );
          const size_type lEnd = m_A.block.entry_end( i );

          // Loop through sparse tensor contributions with coalesced reads.
          for ( size_type l = lBeg+threadIdx.x; l < lEnd; l += blockDim.x ) {

            // Read 'blockDim.x' entries from the tensor (coalesced)
            const size_type kj = m_A.block.coord( l );
            const TensorScalar v = m_A.block.value( l );
            const size_type j = ( kj & 0x0ffff ) ;
            const size_type k = ( kj >> 16 ) ;

            for ( size_type col = 0; col < block_size; ++col ) {
              const size_type bCol = iBlockEntry + col;
#if (__CUDA_ARCH__ >= 350)
              y += v * ( __ldg(&m_A.values(j,bCol)) * sh_x[col+k*BlockSize] +
                         __ldg(&m_A.values(k,bCol)) * sh_x[col+j*BlockSize] );
#else
              y += v * ( m_A.values(j,bCol) * sh_x[col+k*BlockSize] +
                         m_A.values(k,bCol) * sh_x[col+j*BlockSize] );
#endif
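              // __ldg() (compute capability >= 3.5) routes the load of A
              // through the read-only data cache, replacing the shared-memory
              // staging of A used by the USE_LDG == 0 kernel.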
            }

          }

          // Reduction of 'y' within 'blockDim.x'
#if HAVE_CUDA_SHUFFLE
          if (blockDim.x >= 2) y += shfl_down(y, 1, blockDim.x, mask);
          if (blockDim.x >= 4) y += shfl_down(y, 2, blockDim.x, mask);
          if (blockDim.x >= 8) y += shfl_down(y, 4, blockDim.x, mask);
          if (blockDim.x >= 16) y += shfl_down(y, 8, blockDim.x, mask);
          if (blockDim.x >= 32) y += shfl_down(y, 16, blockDim.x, mask);
          if ( threadIdx.x == 0 ) sh_y[i] += y;
#else
          sh_t[ tid ] = y;
          if (threadIdx.x+16 < blockDim.x) sh_t[tid] += sh_t[tid+16];
          sync_warp(mask);
          if (threadIdx.x+ 8 < blockDim.x) sh_t[tid] += sh_t[tid+ 8];
          sync_warp(mask);
          if (threadIdx.x+ 4 < blockDim.x) sh_t[tid] += sh_t[tid+ 4];
          sync_warp(mask);
          if (threadIdx.x+ 2 < blockDim.x) sh_t[tid] += sh_t[tid+ 2];
          sync_warp(mask);
          if (threadIdx.x+ 1 < blockDim.x) sh_t[tid] += sh_t[tid+ 1];
          sync_warp(mask);
          if (threadIdx.x == 0) sh_y[i] += sh_t[tid];
#endif

        }

      }

      // Wait for all contributions of y to be completed
      __syncthreads();

      // Store result back in global memory
      for ( size_type i = tid; i < dim; i += nid ) {
        m_y( i, blockIdx.x ) = sh_y[ i ];
      }
    }
  };

#endif

  //------------------------------------

  struct TensorReadEntry {
    size_type block_size, shmem, num_blocks, num_warp;
    double reads;
  };

  static void apply( const matrix_type & A,
                     const vector_type & x,
                     const vector_type & y )
  {
    const size_type row_count = A.graph.row_map.extent(0) - 1;
    const size_type tensor_dimension = A.block.dimension();
    const size_type tensor_align = tensor_dimension;
    const size_type avg_tensor_entries_per_row = A.block.avg_entries_per_row();

    // Should compute this from FEM graph
    const size_type fem_nnz_per_row = 27;

    // Get device properties we need for whatever device is currently selected
    DeviceProp device_prop;
    const size_type shcap = device_prop.shared_memory_capacity;
    const size_type sh_granularity = device_prop.shared_memory_granularity;
    const size_type max_shmem_per_block = device_prop.max_shmem_per_block;
    const size_type max_blocks_per_sm = device_prop.max_blocks_per_sm;
    const size_type warp_size = device_prop.warp_size;
    const size_type warp_granularity = device_prop.warp_granularity;
    const size_type max_warps_per_block =
      std::min(device_prop.max_threads_per_block / warp_size,
               device_prop.max_warps_per_sm);
    const size_type min_warps_per_block = 1;
    const size_type max_regs_per_sm = device_prop.max_regs_per_sm;
    const size_type max_regs_per_block = device_prop.max_regs_per_block;
    const size_type reg_bank_size = device_prop.reg_bank_size;

    // Compute number of warps we can fit on each SM based on register limits
    // Use Cuda introspection to determine number of registers per thread
    //const size_type regs_per_thread = 46;
    const size_type regs_per_thread =
      device_prop.get_kernel_registers(
        Kokkos::Impl::cuda_parallel_launch_local_memory<MultiplyKernel>);
    const size_type regs_per_warp =
      (warp_size*regs_per_thread + reg_bank_size-1) & ~(reg_bank_size-1);
    const size_type warps_per_sm =
      (max_regs_per_sm/regs_per_warp) & ~(warp_granularity-1);
    const size_type warps_per_block =
      (max_regs_per_block/regs_per_warp) & ~(warp_granularity-1);

    // Compute number of threads per stochastic row based on number of
    // nonzero entries per row.
    // For double, 16 threads/row is still coalesced, but not for float.
    // We should reorder the tensor values for a given vector width to
    // maintain coalesced reads. This would help smaller problems too by
    // allowing fewer threads/row.
    const size_type threads_per_row =
      avg_tensor_entries_per_row >= 88 ? 32 : 16;
    const size_type rows_per_warp = warp_size / threads_per_row;
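
    // With 16 threads per stochastic row a warp covers two rows at a time;
    // with 32 it covers one. The 88-entries-per-row threshold appears to be
    // an empirical cutoff for when a row has enough tensor entries to keep a
    // full warp of 32 lanes busy.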

    const size_type vec_scalar_size = sizeof(VectorScalar);
#if USE_LDG == 0
    const size_type mat_scalar_size = sizeof(MatrixScalar);
#endif

#define USE_FIXED_BLOCKSIZE 0

#if USE_FIXED_BLOCKSIZE

    const size_type num_blocks = 3;
    size_type nw = warps_per_sm / num_blocks;
    while (nw > 1 && num_blocks*nw % warp_granularity) --nw;
    const size_type num_warp = nw;
    const size_type sh_per_block = shcap / num_blocks;
    const size_type sr =
      device_prop.has_shuffle ? 0 : vec_scalar_size*warp_size*num_warp;
#if USE_LDG == 1
    size_type bs = ((sh_per_block - sr) / tensor_align - vec_scalar_size) /
      vec_scalar_size;
#else
    size_type bs = ((sh_per_block - sr) / tensor_align - vec_scalar_size) /
      (vec_scalar_size+mat_scalar_size);
#endif
    if (bs % 2 == 0) --bs;
    const size_type block_size_max = 31;
    const size_type block_size = std::min(bs, block_size_max);
    //const size_type block_size = 7;
#if USE_LDG == 1
    const size_type shmem =
      ( (vec_scalar_size*block_size+vec_scalar_size)*tensor_align + sr + sh_granularity-1 ) & ~(sh_granularity-1);
#else
    const size_type shmem =
      ( ((vec_scalar_size+mat_scalar_size)*block_size+vec_scalar_size)*tensor_align + sr + sh_granularity-1 ) & ~(sh_granularity-1);
#endif

#else

    // We want to maximize the number of blocks per SM (to maximize throughput)
    // as well as the block_size (to minimize tensor reads), subject to
    // shared memory and register constraints. Here we try to do this by
    // computing the number of tensor reads per block per thread for each
    // choice of block_size, and then choosing the configuration with the
    // smallest value. This isn't perfect, but seems to generally work OK.
    // It could be improved with a better model of:
    //   * Number of blocks versus warps per block (to minimize synchronization)
    //   * Thread efficiency for small numbers of rows per thread
    const size_type block_size_min = 3;
    const size_type half_nnz_per_row = fem_nnz_per_row / 2 + 1;
    const size_type block_size_max =
      half_nnz_per_row % 2 ? half_nnz_per_row + 1 : half_nnz_per_row;
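
    // For example, with the hardcoded fem_nnz_per_row = 27:
    //   half_nnz_per_row = 27/2 + 1 = 14, which is even, so block_size_max = 14
    //   and the loop below tries odd block sizes bs = 3, 5, 7, 9, 11, 13.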
    Teuchos::Array<TensorReadEntry> reads_per_thread;
    for (size_type bs = block_size_min; bs<=block_size_max; bs+=2) {
      // We don't know the number of warps yet, so we just have to bound
      // sr by the maximum number possible (which is all warps in 1 block)
      const size_type sr =
        device_prop.has_shuffle ? 0 : vec_scalar_size*warp_size*warps_per_block;
#if USE_LDG == 1
      size_type shmem =
        (vec_scalar_size*bs+vec_scalar_size)*tensor_align+sr;
#else
      size_type shmem =
        ((vec_scalar_size+mat_scalar_size)*bs+vec_scalar_size)*tensor_align+sr;
#endif
      shmem = (shmem + sh_granularity-1) & ~(sh_granularity-1);
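      // Shared memory per block = bs staged columns of x (plus A when not
      // using __ldg), each of length tensor_align, plus one y vector of the
      // same length and the reduction scratch sr, rounded up to the
      // shared-memory allocation granularity.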
      if (shmem <= max_shmem_per_block) {
        size_type num_blocks = std::min(shcap / shmem, max_blocks_per_sm);
        size_type tensor_reads = (fem_nnz_per_row+bs-1) / bs;
        size_type num_warp =
          std::min(std::max(std::min(warps_per_sm/num_blocks, warps_per_block),
                            min_warps_per_block),
                   max_warps_per_block);
        while (num_warp > 1 && num_blocks*num_warp % warp_granularity)
          --num_warp;
        TensorReadEntry entry;
        entry.block_size = bs;
        entry.shmem = shmem;
        entry.num_blocks = num_blocks;
        entry.num_warp = num_warp;

        // Prefer at least 3 blocks
        size_type factor = std::min(num_blocks,3u);
        entry.reads = (static_cast<double>(tensor_reads) /
                       static_cast<double>(factor*num_blocks*num_warp));
        reads_per_thread.push_back(entry);
      }
    }
    TEUCHOS_TEST_FOR_EXCEPTION(
      reads_per_thread.size() == 0, std::logic_error,
      "Stochastic problem dimension is too large to fit in shared memory");
    size_type idx = 0;
    double min_reads = reads_per_thread[0].reads;
    for (int i=1; i<reads_per_thread.size(); ++i) {
      if (reads_per_thread[i].reads < min_reads) {
        idx = i;
        min_reads = reads_per_thread[i].reads;
      }
    }

    const size_type block_size = reads_per_thread[idx].block_size;
    const size_type shmem = reads_per_thread[idx].shmem;
    const size_type num_blocks = reads_per_thread[idx].num_blocks;
    const size_type num_warp = reads_per_thread[idx].num_warp;

#endif

    // Setup thread blocks and grid
    const dim3 dBlock( threads_per_row , rows_per_warp*num_warp , 1 );
    const dim3 dGrid( row_count, 1, 1 );
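
    // One CUDA block per FEM row: each block is threads_per_row lanes wide
    // and rows_per_warp*num_warp deep, i.e. num_warp warps, each of which
    // covers rows_per_warp stochastic rows per pass of the row loop.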

#if 0
    std::cout << "block_size = " << block_size
              << " tensor reads = " << (fem_nnz_per_row+block_size-1)/block_size
              << " regs_per_thread = " << regs_per_thread
              << " num blocks = " << num_blocks
              << " num warps = " << num_warp
              << " num rows = " << tensor_dimension
              << " rows/warp = " << tensor_dimension / (num_warp*rows_per_warp)
              << " avg entries/row = " << avg_tensor_entries_per_row
              << std::endl;
#endif

    // Finally launch our kernel
    //cudaProfilerStart();
    Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid, dBlock, shmem >>>
      ( MultiplyKernel( A, x, y, block_size ) );
    //cudaProfilerStop();
  }

};

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

} // namespace Stokhos

#endif /* #ifndef STOKHOS_CUDA_CRS_PRODUCT_TENSOR_HPP */