const size_type dim = m_A.block.dimension();
const size_type num_entries = m_A.block.entry_count();
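// Thread indexing within the CUDA block: nid is the number of threads per
// block, tid is this thread's flat rank within the block.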
const size_type nid = blockDim.x * blockDim.y;
const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;
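// Partition the dynamically allocated shared memory: cached blocks of x and A
// (m_block_size columns of length dim each), the accumulated output block
// sh_y, and per-thread scratch arrays vals and rows for the segmented
// reduction of tensor contributions.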
VectorScalar * const sh_x =
  kokkos_impl_cuda_shared_memory<VectorScalar>();
VectorScalar * const sh_A = sh_x + m_block_size*dim;
VectorScalar * const sh_y = sh_A + m_block_size*dim;
volatile VectorScalar * const vals = sh_y + dim;
volatile rows_type * const rows =
  reinterpret_cast<volatile rows_type*>(vals + nid);
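// Each warp owns a contiguous range [lBeg, lEnd) of tensor entries and walks
// it blockDim.x entries at a time, one entry per lane.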
const size_type entries_per_warp = blockDim.x * m_entries_per_thread;
const size_type lBeg = threadIdx.y * entries_per_warp + threadIdx.x;
const size_type lEnd = ( lBeg + entries_per_warp < num_entries ?
                         lBeg + entries_per_warp : num_entries );
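// blockIdx.x selects the FEM row handled by this CUDA block; femBeg..femEnd
// is that row's range of entries in the CRS graph.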
const size_type femBeg = m_A.graph.row_map[ blockIdx.x ];
const size_type femEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
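// Zero the output accumulator and mark the per-thread reduction scratch as
// empty before any contributions are accumulated.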
for (size_type l = tid; l < dim; l += nid) {
  sh_y[l] = 0;
}

rows[tid] = invalid_row;
vals[tid] = 0;  // assumed: partial sums start at zero
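// Loop over this row's FEM columns, m_block_size columns at a time.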
for (size_type femEntry=femBeg; femEntry<femEnd; femEntry+=m_block_size) {
  const size_type block_size =
    femEntry + m_block_size < femEnd ? m_block_size : femEnd - femEntry;
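  // Cooperative, coalesced read of the blocks of x and A for these FEM
  // columns into shared memory, stored interleaved by column.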
  for (size_type col = 0; col < block_size; ++col) {

    const size_type femColumn = m_A.graph.entries( femEntry + col );
    const VectorScalar * const x = & m_x( 0, femColumn );
    const MatrixScalar * const A = & m_A.values( 0, femEntry + col );

    for (size_type l = tid; l < dim; l += nid) {
      sh_x[col + l * m_block_size] = x[l];
      sh_A[col + l * m_block_size] = A[l];
    }
  }

  __syncthreads();  // sh_x and sh_A must be fully loaded before they are read
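  // Contract the cached blocks with this warp's range of sparse tensor
  // entries; consecutive lanes read consecutive entries, so the coord and
  // value accesses are coalesced.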
  for (size_type l = lBeg; l < lEnd; l += blockDim.x) {

    size_type i, j, k;
    m_A.block.coord(l,i,j,k);
    const TensorScalar v = m_A.block.value( l );

    // Scale j and k by the block size to index the column-interleaved
    // sh_A / sh_x layout used above.
    j *= m_block_size;
    k *= m_block_size;

    // Partial sum for row i, accumulated in a register.
    VectorScalar y = 0;
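    // Carry from the previous tensor entry: lane 0 checks whether lane 31 of
    // this warp was working on the same row i; if so, fold its partial sum
    // into y, otherwise flush it to sh_y.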
    if (threadIdx.x == 0) {
      if (i == rows[tid+31])
        y += vals[tid+31];
      else if (rows[tid+31] != invalid_row)  // assumed guard for the initial invalid_row
        sh_y[rows[tid+31]] += vals[tid+31];
    }
    for (size_type col = 0; col < block_size; ++col) {
      y += v * ( sh_A[col+j] * sh_x[col+k] + sh_A[col+k] * sh_x[col+j] );
    }

    // Stage this lane's row index and partial sum for the warp-level reduction.
    rows[tid] = i;
    vals[tid] = y;
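    // Segmented scan across the warp: each lane accumulates the partial sums
    // of lower lanes that share the same row index i.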
    if (threadIdx.x >= 1 && i == rows[tid- 1]) vals[tid] += vals[tid- 1];
    if (threadIdx.x >= 2 && i == rows[tid- 2]) vals[tid] += vals[tid- 2];
    if (threadIdx.x >= 4 && i == rows[tid- 4]) vals[tid] += vals[tid- 4];
    if (threadIdx.x >= 8 && i == rows[tid- 8]) vals[tid] += vals[tid- 8];
    if (threadIdx.x >= 16 && i == rows[tid-16]) vals[tid] += vals[tid-16];

    // The last lane holding a given row flushes its total; lane 31 keeps its
    // value as a possible carry into the next tensor entry.
    if (threadIdx.x < 31 && i != rows[tid + 1])
      sh_y[i] += vals[tid];
  }
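  // Each warp's last lane still holds an unflushed partial sum; stage these
  // at the front of rows / vals so the first warp can combine them.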
  __syncthreads();

  if (threadIdx.x == 31) {
    rows[threadIdx.y] = rows[tid];
    vals[threadIdx.y] = vals[tid];
  }

  __syncthreads();
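  // The first warp reduces the staged per-warp partials, again segmented by
  // row index, and adds each completed row total to sh_y.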
  if (threadIdx.y == 0 && threadIdx.x < blockDim.y) {
    const size_type i = rows[tid];

    if (threadIdx.x >= 1 && i == rows[tid- 1]) vals[tid] += vals[tid- 1];
    if (threadIdx.x >= 2 && i == rows[tid- 2]) vals[tid] += vals[tid- 2];
    if (threadIdx.x >= 4 && i == rows[tid- 4]) vals[tid] += vals[tid- 4];
    if (threadIdx.x >= 8 && i == rows[tid- 8]) vals[tid] += vals[tid- 8];
    if (threadIdx.x >= 16 && i == rows[tid-16]) vals[tid] += vals[tid-16];

    if ((threadIdx.x == blockDim.y-1) ||
        (threadIdx.x < blockDim.y-1 && i != rows[tid+1]))
      sh_y[i] += vals[tid];
  }
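  // Reset the reduction scratch before the next block of FEM columns.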
  __syncthreads();
  rows[tid] = invalid_row;
  vals[tid] = 0;  // assumed: mirror the initialization above
}

__syncthreads();
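// All contributions for this FEM row are now in sh_y; write the result block
// back to global memory.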
for (size_type l = tid; l < dim; l += nid) {
  m_y( l, blockIdx.x ) = sh_y[ l ];
}