68 const int d = block.dimension();
69 const int warp_size = Kokkos::Impl::CudaTraits::WarpSize;
70 const int y = ( Kokkos::Impl::cuda_internal_maximum_warp_count() * warp_size ) / d ;
73 throw std::runtime_error( std::string(
"Stokhos::Multiply< SymmetricDiagonalSpec<Cuda> > ERROR: block too large") );
77 return dim3( d , std::min( y , ( 1 + d ) / 2 ) , 1 );
99 const MatrixValue *
const a ,
100 const VectorValue *
const x )
102 const int dimension = block.dimension();
103 const int dim_half = ( dimension + 1 ) >> 1 ;
105 VectorValue *
const shX = kokkos_impl_cuda_shared_memory<VectorValue>();
112 if ( 0 == threadIdx.y ) {
115 shX[ threadIdx.x ] = x[ threadIdx.x ];
117 y = shX[ threadIdx.x ] * a[ threadIdx.x ];
122 if ( 0 == threadIdx.y && ! ( dimension & 01 ) ) {
128 ia = threadIdx.x + dim_half * dimension ;
130 if ( threadIdx.x < dim_half ) {
137 y += shX[ ix ] * a[ ia ];
142 const int A_stride = blockDim.y * dimension ;
144 int d = 1 + threadIdx.y ;
146 const MatrixValue * A = a + d * dimension ;
148 for ( ; d < dim_half ; d += blockDim.y , A += A_stride ) {
150 ix = threadIdx.x + d ;
if ( dimension <= ix ) ix -= dimension ;
151 ia = threadIdx.x - d ;
if ( ia < 0 ) ia += dimension ;
158 y += shX[ ix ] * A[ threadIdx.x ] +
162 if ( 0 < threadIdx.y ) {
163 shX[ threadIdx.x + threadIdx.y * dimension ] = y ;
168 for ( ix = 1 ; ix < blockDim.y ; ++ix ) {
169 y += shX[ threadIdx.x + ix * dimension ];