45 #include "Kokkos_Core.hpp" 48 #include "Teuchos_CommandLineProcessor.hpp" 49 #include "Teuchos_StandardCatchMacros.hpp" 50 #ifdef KOKKOS_ENABLE_CUDA 51 #include "cuda_runtime_api.h" 54 template <
typename Scalar,
typename Ordinal,
typename Device>
67 #ifdef KOKKOS_ENABLE_THREADS 68 const size_t num_sockets = Kokkos::hwloc::get_available_numa_count();
69 const size_t num_cores_per_socket =
70 Kokkos::hwloc::get_available_cores_per_numa();
71 const size_t num_threads_per_core =
72 Kokkos::hwloc::get_available_threads_per_core();
76 Teuchos::CommandLineProcessor
CLP;
78 "This test performance of MP::Vector multiply routines.\n");
80 CLP.setOption(
"n", &nGrid,
"Number of mesh points in the each direction");
82 CLP.setOption(
"ni", &nIter,
"Number of multiply iterations");
84 CLP.setOption(
"emin", &ensemble_min,
"Staring ensemble size");
85 int ensemble_max = 24;
86 CLP.setOption(
"emax", &ensemble_max,
"Stoping ensemble size");
87 int ensemble_step = 4;
88 CLP.setOption(
"estep", &ensemble_step,
"Ensemble increment");
89 #ifdef KOKKOS_ENABLE_THREADS 91 CLP.setOption(
"threads",
"no-threads", &threads,
"Enable Threads device");
92 int num_cores = num_cores_per_socket * num_sockets;
94 "Number of CPU cores to use (defaults to all)");
97 "Number of hyper threads per core to use (defaults to all)");
99 #ifdef KOKKOS_ENABLE_CUDA 101 CLP.setOption(
"cuda",
"no-cuda", &cuda,
"Enable Cuda device");
103 CLP.setOption(
"device", &device_id,
"CUDA device ID");
110 #ifdef KOKKOS_ENABLE_THREADS 112 typedef Kokkos::Threads Device;
114 Kokkos::InitArguments init_args;
116 Kokkos::initialize( init_args );
118 std::cout << std::endl
120 <<
" threads:" << std::endl;
122 performance_test_driver<Scalar,Ordinal,Device>(
123 nGrid, nIter, ensemble_min, ensemble_max, ensemble_step);
129 #ifdef KOKKOS_ENABLE_CUDA 131 typedef Kokkos::Cuda Device;
133 Kokkos::InitArguments init_args;
134 init_args.device_id = device_id;
135 Kokkos::initialize( init_args );
137 cudaDeviceProp deviceProp;
138 cudaGetDeviceProperties(&deviceProp, device_id);
139 std::cout << std::endl
140 <<
"CUDA performance for device " << device_id <<
" (" 141 << deviceProp.name <<
"):" 144 performance_test_driver<Scalar,Ordinal,Device>(
145 nGrid, nIter, ensemble_min, ensemble_max, ensemble_step);
152 TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success);
void performance_test_driver(const Ordinal nGrid, const Ordinal nIter, const Ordinal ensemble_min, const Ordinal ensemble_max, const Ordinal ensemble_step)
int main(int argc, char *argv[])