const size_type row_count = A.graph.row_map.extent(0) - 1;
const size_type tensor_dimension = A.block.dimension();
const size_type tensor_align = tensor_dimension;
const size_type avg_tensor_entries_per_row = A.block.avg_entries_per_row();
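
// This should be computed from the FEM mesh graph; a value typical of a
// 3-D 27-point stencil is assumed here
const size_type fem_nnz_per_row = 27;

// Query the limits of the currently selected device (DeviceProp is the
// device-introspection helper assumed to be defined earlier in this file)
DeviceProp device_prop;
const size_type shcap = device_prop.shared_memory_capacity;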
const size_type sh_granularity = device_prop.shared_memory_granularity;
const size_type max_shmem_per_block = device_prop.max_shmem_per_block;
const size_type max_blocks_per_sm = device_prop.max_blocks_per_sm;
const size_type warp_size = device_prop.warp_size;
const size_type warp_granularity = device_prop.warp_granularity;
const size_type max_warps_per_block =
  std::min(device_prop.max_threads_per_block / warp_size,
           device_prop.max_warps_per_sm);
const size_type min_warps_per_block = 1;
const size_type max_regs_per_sm = device_prop.max_regs_per_sm;
const size_type max_regs_per_block = device_prop.max_regs_per_block;
const size_type reg_bank_size = device_prop.reg_bank_size;
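
// Determine how many warps fit on each SM, and in each block, given the
// kernel's register usage: a warp's registers are rounded up to the
// register bank size, and warp counts are rounded down to the warp
// allocation granularity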
const size_type regs_per_thread =
  device_prop.get_kernel_registers(
    Kokkos::Impl::cuda_parallel_launch_local_memory<MultiplyKernel>);
const size_type regs_per_warp =
  (warp_size*regs_per_thread + reg_bank_size-1) & ~(reg_bank_size-1);
const size_type warps_per_sm =
  (max_regs_per_sm/regs_per_warp) & ~(warp_granularity-1);
const size_type warps_per_block =
  (max_regs_per_block/regs_per_warp) & ~(warp_granularity-1);
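
// Choose how many threads cooperate on each stochastic row based on the
// average row density; the 88-entry cutoff is an empirical tuning choice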
const size_type threads_per_row =
  avg_tensor_entries_per_row >= 88 ? 32 : 16;
const size_type rows_per_warp = warp_size / threads_per_row;

const size_type vec_scalar_size = sizeof(VectorScalar);
const size_type mat_scalar_size = sizeof(MatrixScalar);
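
// Two strategies for the launch configuration: a hard-wired block count
// (USE_FIXED_BLOCKSIZE) or a search over odd block sizes that minimizes
// the number of tensor reads per thread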
#define USE_FIXED_BLOCKSIZE 0

#if USE_FIXED_BLOCKSIZE
const size_type num_blocks = 3;
size_type nw = warps_per_sm / num_blocks;
while (nw > 1 && num_blocks*nw % warp_granularity) --nw;
const size_type num_warp = nw;
const size_type sh_per_block = shcap / num_blocks;
const size_type sr =
  device_prop.has_shuffle ? 0 : vec_scalar_size*warp_size*num_warp;
// With read-only-cache (LDG) loads the matrix block is not staged in shared
// memory (the guard macro name is reconstructed from the duplicated variants)
#if USE_LDG == 1
size_type bs = ((sh_per_block - sr) / tensor_align - vec_scalar_size) /
  vec_scalar_size;
#else
size_type bs = ((sh_per_block - sr) / tensor_align - vec_scalar_size) /
  (vec_scalar_size+mat_scalar_size);
#endif
// The block size must be odd (even sizes are rounded down)
if (bs % 2 == 0) --bs;
const size_type block_size_max = 31;
const size_type block_size = std::min(bs, block_size_max);
#if USE_LDG == 1
const size_type shmem =
  ( (vec_scalar_size*block_size+vec_scalar_size)*tensor_align + sr + sh_granularity-1 ) & ~(sh_granularity-1);
#else
const size_type shmem =
  ( ((vec_scalar_size+mat_scalar_size)*block_size+vec_scalar_size)*tensor_align + sr + sh_granularity-1 ) & ~(sh_granularity-1);
#endif

#else

// Maximize the number of blocks per SM (to hide latency) and the block size
// (to minimize tensor reads) subject to the shared-memory and register
// limits, using the number of tensor reads per thread as the figure of merit
const size_type block_size_min = 3;
const size_type half_nnz_per_row = fem_nnz_per_row / 2 + 1;
const size_type block_size_max =
  half_nnz_per_row % 2 ? half_nnz_per_row + 1 : half_nnz_per_row;
Teuchos::Array<TensorReadEntry> reads_per_thread;
for (size_type bs = block_size_min; bs <= block_size_max; bs += 2) {
  // num_warp isn't known yet, so bound the shuffle-fallback scratch area
  // by the maximum possible number of warps in one block
  const size_type sr =
    device_prop.has_shuffle ? 0 : vec_scalar_size*warp_size*warps_per_block;
#if USE_LDG == 1
  size_type shmem =
    (vec_scalar_size*bs+vec_scalar_size)*tensor_align+sr;
#else
  size_type shmem =
    ((vec_scalar_size+mat_scalar_size)*bs+vec_scalar_size)*tensor_align+sr;
#endif
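  // Round the request up to the shared-memory allocation granularity and
  // keep only candidates that fit within a block's shared-memory limit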
  shmem = (shmem + sh_granularity-1) & ~(sh_granularity-1);
  if (shmem <= max_shmem_per_block) {
    size_type num_blocks = std::min(shcap / shmem, max_blocks_per_sm);
    size_type tensor_reads = (fem_nnz_per_row+bs-1) / bs;
    size_type num_warp =
      std::min(std::max(std::min(warps_per_sm/num_blocks, warps_per_block),
                        min_warps_per_block),
               max_warps_per_block);
    // Trim warps until the resident warp count is a multiple of the
    // warp allocation granularity
    while (num_warp > 1 && num_blocks*num_warp % warp_granularity)
      --num_warp;
    TensorReadEntry entry;
    entry.block_size = bs;
    entry.shmem = shmem;
    entry.num_blocks = num_blocks;
    entry.num_warp = num_warp;
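
    // Figure of merit: effective tensor reads per thread, weighted to
    // prefer running at least 3 blocks per SM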
    size_type factor = std::min(num_blocks, 3u);
    entry.reads = (static_cast<double>(tensor_reads) /
                   static_cast<double>(factor*num_blocks*num_warp));
    reads_per_thread.push_back(entry);
  }
}
TEUCHOS_TEST_FOR_EXCEPTION(
  reads_per_thread.size() == 0, std::logic_error,
  "Stochastic problem dimension is too large to fit in shared memory");
size_type idx = 0;
double min_reads = reads_per_thread[0].reads;
for (int i=1; i<reads_per_thread.size(); ++i) {
  if (reads_per_thread[i].reads < min_reads) {
    idx = i;
    min_reads = reads_per_thread[i].reads;
  }
}
const size_type block_size = reads_per_thread[idx].block_size;
const size_type shmem = reads_per_thread[idx].shmem;
const size_type num_blocks = reads_per_thread[idx].num_blocks;
const size_type num_warp = reads_per_thread[idx].num_warp;

#endif
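
// Setup thread blocks and grid: each block is threads_per_row wide by
// rows_per_warp*num_warp tall, and the grid has one block per FEM row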
const dim3 dBlock( threads_per_row, rows_per_warp*num_warp, 1 );
const dim3 dGrid( row_count, 1, 1 );

// Debugging output (disabled)
#if 0
std::cout << "block_size = " << block_size
          << " tensor reads = " << (fem_nnz_per_row+block_size-1)/block_size
          << " regs_per_thread = " << regs_per_thread
          << " num blocks = " << num_blocks
          << " num warps = " << num_warp
          << " num rows = " << tensor_dimension
          << " rows/warp = " << tensor_dimension / (num_warp*rows_per_warp)
          << " avg entries/row = " << avg_tensor_entries_per_row
          << std::endl;
#endif

// Finally launch the kernel
Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid, dBlock, shmem >>>
  ( MultiplyKernel( A, x, y, block_size ) );