// Headers for Kokkos, the Teuchos command-line processor and the Teuchos
// standard catch macros; the CUDA runtime API header follows the
// KOKKOS_ENABLE_CUDA guard.
//
// The end of the line below opens the forward declaration of mainHost, the
// driver called further down with the Kokkos::Threads and Kokkos::OpenMP
// devices.
//   Storage    - storage type the driver is instantiated with
//   nGrid      - number of mesh points in each direction ("n" option)
//   nIter      - number of multiply iterations ("ni" option)
//   dev_config - KokkosSparse::DeviceConfig built by the caller for the device
// NOTE(review): the definition is not visible in this excerpt.
48 #include "Kokkos_Core.hpp" 51 #include "Teuchos_CommandLineProcessor.hpp" 52 #include "Teuchos_StandardCatchMacros.hpp" 53 #ifdef KOKKOS_ENABLE_CUDA 54 #include "cuda_runtime_api.h" 57 template <
typename Storage>
58 void mainHost(
int nGrid,
int nIter, KokkosSparse::DeviceConfig dev_config);
// Forward declaration of mainCuda, the driver called further down with the
// Kokkos::Cuda device.
//   Storage    - storage type the driver is instantiated with
//   nGrid      - number of mesh points in each direction ("n" option)
//   nIter      - number of multiply iterations ("ni" option)
//   dev_config - KokkosSparse::DeviceConfig built by the caller from the
//                cuda_threads_per_vector / cuda_block_size options
// NOTE(review): the definition is not visible in this excerpt.
59 template <
typename Storage>
60 void mainCuda(
int nGrid,
int nIter, KokkosSparse::DeviceConfig dev_config);
68 const size_t num_sockets = Kokkos::hwloc::get_available_numa_count();
69 const size_t num_cores_per_socket =
70 Kokkos::hwloc::get_available_cores_per_numa();
71 const size_t num_threads_per_core =
72 Kokkos::hwloc::get_available_threads_per_core();
75 Teuchos::CommandLineProcessor
CLP;
77 "This test performance of MP::Vector multiply routines.\n");
79 CLP.setOption(
"n", &nGrid,
"Number of mesh points in the each direction");
81 CLP.setOption(
"ni", &nIter,
"Number of multiply iterations");
82 int num_cores = num_cores_per_socket * num_sockets;
84 "Number of CPU cores to use (defaults to all)");
87 "Number of hyper threads per core to use (defaults to all)");
88 int threads_per_vector = 1;
89 CLP.setOption(
"threads_per_vector", &threads_per_vector,
90 "Number of threads to use within each vector");
91 #ifdef KOKKOS_ENABLE_THREADS 93 CLP.setOption(
"threads",
"no-threads", &threads,
"Enable Threads device");
95 #ifdef KOKKOS_ENABLE_OPENMP 97 CLP.setOption(
"openmp",
"no-openmp", &openmp,
"Enable OpenMP device");
99 #ifdef KOKKOS_ENABLE_CUDA 101 CLP.setOption(
"cuda",
"no-cuda", &cuda,
"Enable Cuda device");
102 int cuda_threads_per_vector = 16;
103 CLP.setOption(
"cuda_threads_per_vector", &cuda_threads_per_vector,
104 "Number of Cuda threads to use within each vector");
105 int cuda_block_size = 0;
106 CLP.setOption(
"cuda_block_size", &cuda_block_size,
107 "Cuda block size (0 implies the default choice)");
108 int num_cuda_blocks = 0;
109 CLP.setOption(
"num_cuda_blocks", &num_cuda_blocks,
110 "Number of Cuda blocks (0 implies the default choice)");
112 CLP.setOption(
"device", &device_id,
"CUDA device ID");
119 #ifdef KOKKOS_ENABLE_THREADS 121 typedef Kokkos::Threads Device;
124 Kokkos::InitArguments init_args;
126 Kokkos::initialize( init_args );
128 std::cout << std::endl
130 <<
" threads:" << std::endl;
132 KokkosSparse::DeviceConfig dev_config(
num_cores,
136 mainHost<Storage>(nGrid, nIter, dev_config);
142 #ifdef KOKKOS_ENABLE_OPENMP 144 typedef Kokkos::OpenMP Device;
147 Kokkos::InitArguments init_args;
149 Kokkos::initialize( init_args );
151 std::cout << std::endl
153 <<
" threads:" << std::endl;
155 KokkosSparse::DeviceConfig dev_config(
num_cores,
159 mainHost<Storage>(nGrid, nIter, dev_config);
165 #ifdef KOKKOS_ENABLE_CUDA 167 typedef Kokkos::Cuda Device;
170 Kokkos::InitArguments init_args;
171 init_args.device_id = device_id;
172 Kokkos::initialize( init_args );
174 cudaDeviceProp deviceProp;
175 cudaGetDeviceProperties(&deviceProp, device_id);
176 std::cout << std::endl
177 <<
"CUDA performance for device " << device_id <<
" (" 178 << deviceProp.name <<
"):" 181 KokkosSparse::DeviceConfig dev_config(
183 cuda_threads_per_vector,
184 cuda_threads_per_vector == 0 ? 0 : cuda_block_size / cuda_threads_per_vector);
186 mainCuda<Storage>(nGrid,nIter,dev_config);
193 TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success);
Stokhos::StandardStorage< int, double > Storage
Statically allocated storage class.
void mainCuda(int nGrid, int nIter, KokkosSparse::DeviceConfig dev_config)
void mainHost(int nGrid, int nIter, KokkosSparse::DeviceConfig dev_config)
int main(int argc, char *argv[])