// Type aliases shared by every CUSP container declared below.
// IndexType: integer type used for the matrix index template argument, the
// right-hand-side counts and the loop counters.
57 typedef int IndexType;
// ValueType: scalar type of the sparse matrix entries and the dense arrays.
58 typedef double ValueType;
// MemorySpace: CUSP memory-space tag handed to the matrix and to each array2d.
59 typedef cusp::device_memory MemorySpace;
// Benchmark driver body: registers command-line options, selects a CUDA
// device, builds a sparse test matrix, then sweeps the number of right-hand
// sides and prints one comma-separated result line per sweep step.
// NOTE(review): the enclosing function header and the declarations of n, nits,
// device_id, verbose and success are not visible in this excerpt, and neither
// is any call that actually parses the command line.
67 Teuchos::CommandLineProcessor
CLP;
// NOTE(review): the doc string reads oddly (This test performance ...);
// it is a runtime string, so it is left untouched here.
68 CLP.setDocString(
"This test performance of block multiply routines.\n");
// Option n: the same value is later passed for all three size arguments of
// poisson27pt.
70 CLP.setOption(
"n", &n,
"Number of mesh points in the each direction");
// Sweep bounds: nrhs runs from nrhs_begin to nrhs_end (inclusive) in steps of
// nrhs_step; the defaults are 32, 512 and 32.
71 IndexType nrhs_begin = 32;
// NOTE(review): help text typo, Staring should be Starting.
72 CLP.setOption(
"begin", &nrhs_begin,
73 "Staring number of right-hand-sides");
74 IndexType nrhs_end = 512;
75 CLP.setOption(
"end", &nrhs_end,
76 "Ending number of right-hand-sides");
77 IndexType nrhs_step = 32;
78 CLP.setOption(
"step", &nrhs_step,
79 "Increment in number of right-hand-sides");
// Option nits: bound of both timing loops and the divisor used to turn the
// elapsed time into a per-iteration average.
81 CLP.setOption(
"nits", &nits,
82 "Number of multiply iterations");
// Option device: forwarded to cudaSetDevice below.
84 CLP.setOption(
"device", &device_id,
"CUDA device ID");
// Select the requested device and ask for eight-byte shared-memory banks
// (ValueType is double).
// NOTE(review): the cudaError_t results of both calls are discarded, so an
// invalid device ID goes unnoticed at this point.
88 cudaSetDevice(device_id);
89 cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
// Build the CSR test matrix with the CUSP gallery 27-point Poisson generator,
// passing n for each of the three size arguments.
92 cusp::csr_matrix<IndexType, ValueType, MemorySpace> A;
93 cusp::gallery::poisson27pt(A, n, n, n);
// Column header for the result table; the data line printed at the end of
// each sweep step lists its values in this same order.
95 std::cout <<
"nrhs , num_rows , num_entries , row_time , row_gflops , "
96 <<
"col_time , col_gflops" << std::endl;
// One sweep step per right-hand-side count.
// NOTE(review): a zero or negative nrhs_step never terminates this loop when
// nrhs_begin <= nrhs_end.
98 for (IndexType nrhs = nrhs_begin; nrhs <= nrhs_end; nrhs += nrhs_step) {
// Flop count per multiply: 2 * num_entries * nrhs, computed in double.
// NOTE(review): the left-hand side of this expression is not visible in this
// excerpt; presumably it initializes the variable flops read below - confirm.
101 2.0 *
static_cast<double>(A.num_entries) *
static_cast<double>(nrhs);
// Row-major operands: x_row and y_row are A.num_rows by nrhs, constructed
// with the values 1 and 0 respectively as the third argument.
104 cusp::array2d<ValueType, MemorySpace, cusp::row_major> x_row(
105 A.num_rows, nrhs, 1);
106 cusp::array2d<ValueType, MemorySpace, cusp::row_major> y_row(
107 A.num_rows, nrhs, 0);
// Timer for the row-major case; presumably starts on construction - confirm
// against cusp::detail::timer.
109 cusp::detail::timer row_timer;
// NOTE(review): the loop body (the timed operation) and the closing brace are
// not visible in this excerpt.
111 for (IndexType iter=0; iter<nits; ++iter) {
// Block until outstanding device work has finished before reading the timer.
// NOTE(review): the return code is discarded.
114 cudaDeviceSynchronize();
// Average seconds per iteration, then GFLOP/s = 1e-9 * flops / seconds.
115 double row_time = row_timer.seconds_elapsed() / nits;
116 double row_gflops = 1.0e-9 * flops / row_time;
// Column-major operands: same dimensions and constructor values as the
// row-major pair, only the layout tag differs.
119 cusp::array2d<ValueType, MemorySpace, cusp::column_major> x_col(
120 A.num_rows, nrhs, 1);
121 cusp::array2d<ValueType, MemorySpace, cusp::column_major> y_col(
122 A.num_rows, nrhs, 0);
// Timer for the column-major case; same measurement scheme as above.
124 cusp::detail::timer col_timer;
// NOTE(review): loop body and closing brace are not visible in this excerpt.
126 for (IndexType iter=0; iter<nits; ++iter) {
// Block until outstanding device work has finished before reading the timer.
129 cudaDeviceSynchronize();
130 double col_time = col_timer.seconds_elapsed() / nits;
131 double col_gflops = 1.0e-9 * flops / col_time;
// Result line for this nrhs, in the column order announced by the header.
// NOTE(review): the end of this statement and the closing brace of the sweep
// loop are not visible in this excerpt.
133 std::cout << nrhs <<
" , "
134 << A.num_rows <<
" , " << A.num_entries <<
" , "
135 << row_time <<
" , " << row_gflops <<
" , "
136 << col_time <<
" , " << col_gflops
// Teuchos catch macro taking verbose, std::cerr and success; presumably it
// closes a try block opened outside this excerpt - confirm.
142 TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success);