Sacado Package Browser (Single Doxygen Collection) Version of the Day
Loading...
Searching...
No Matches
fad_kokkos_mat_vec_perf.cpp
Go to the documentation of this file.
1// @HEADER
2// ***********************************************************************
3//
4// Sacado Package
5// Copyright (2006) Sandia Corporation
6//
7// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8// the U.S. Government retains certain rights in this software.
9//
10// This library is free software; you can redistribute it and/or modify
11// it under the terms of the GNU Lesser General Public License as
12// published by the Free Software Foundation; either version 2.1 of the
13// License, or (at your option) any later version.
14//
15// This library is distributed in the hope that it will be useful, but
16// WITHOUT ANY WARRANTY; without even the implied warranty of
17// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18// Lesser General Public License for more details.
19//
20// You should have received a copy of the GNU Lesser General Public
21// License along with this library; if not, write to the Free Software
22// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
23// USA
24// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
25// (etphipp@sandia.gov).
26//
27// ***********************************************************************
28// @HEADER
29
30#define SACADO_VIEW_CUDA_HIERARCHICAL 1
31//#define SACADO_VIEW_CUDA_HIERARCHICAL_DFAD 1
32//#define SACADO_KOKKOS_USE_MEMORY_POOL 1
33#define SACADO_ALIGN_SFAD 1
34
35//#define SACADO_DISABLE_FAD_VIEW_SPEC
36#include "Sacado.hpp"
37
38#include "Teuchos_CommandLineProcessor.hpp"
39#include "Teuchos_StandardCatchMacros.hpp"
40#include "Teuchos_Time.hpp"
41
42#include "Kokkos_Timer.hpp"
43
44// For vtune
45#include <sys/types.h>
46#include <unistd.h>
47#include <algorithm>
48
49// A performance test that computes the derivative of a simple Kokkos kernel
50// using various Fad classes
51
52template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
53void run_mat_vec(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) {
54 typedef typename ViewTypeC::value_type scalar_type;
55 typedef typename ViewTypeC::execution_space execution_space;
56
57 const int m = A.extent(0);
58 const int n = A.extent(1);
59 Kokkos::parallel_for(
60 Kokkos::RangePolicy<execution_space>( 0,m ),
61 KOKKOS_LAMBDA (const int i) {
62 scalar_type t = 0.0;
63 for (int j=0; j<n; ++j)
64 t += A(i,j)*b(j);
65 c(i) = t;
66 }
67 );
68}
69
70#if defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD)
71
72template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
73void run_mat_vec_hierarchical(const ViewTypeA& A, const ViewTypeB& b,
74 const ViewTypeC& c) {
75 typedef typename ViewTypeC::value_type scalar_type;
76 typedef typename ViewTypeC::execution_space execution_space;
77
78#if defined (KOKKOS_ENABLE_CUDA)
79 const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
80#else
81 const bool is_cuda = false;
82#endif
83 const unsigned vector_size = is_cuda ? 32 : 1;
84 const unsigned team_size = is_cuda ? 128 / vector_size : 1;
85
86 const int m = A.extent(0);
87 const int n = A.extent(1);
88 const int range = (m+team_size-1)/team_size;
89
90 typedef Kokkos::TeamPolicy<execution_space> Policy;
91 Kokkos::parallel_for(
92 Policy( range,team_size,vector_size ),
93 KOKKOS_LAMBDA (const typename Policy::member_type& team) {
94 const int i = team.league_rank()*team.team_size() + team.team_rank();
95 if (i >= m)
96 return;
97
98 scalar_type t = 0.0;
99 for (int j=0; j<n; ++j)
100 t += A(i,j)*b(j);
101 c(i) = t;
102 }
103 );
104}
105
106#elif defined(SACADO_VIEW_CUDA_HIERARCHICAL)
107
108template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
109void run_mat_vec_hierarchical(const ViewTypeA& A, const ViewTypeB& b,
110 const ViewTypeC& c) {
111 typedef typename Kokkos::ThreadLocalScalarType<ViewTypeC>::type scalar_type;
112 typedef typename ViewTypeC::execution_space execution_space;
113
114#if defined (KOKKOS_ENABLE_CUDA)
115 const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
116#else
117 const bool is_cuda = false;
118#endif
119 const unsigned vector_size = is_cuda ? 32 : 1;
120 const unsigned team_size = is_cuda ? 128 / vector_size : 1;
121
122 const int m = A.extent(0);
123 const int n = A.extent(1);
124 const int range = (m+team_size-1)/team_size;
125
126 typedef Kokkos::TeamPolicy<execution_space> Policy;
127 Kokkos::parallel_for(
128 Policy( range,team_size,vector_size ),
129 KOKKOS_LAMBDA (const typename Policy::member_type& team) {
130 const int i = team.league_rank()*team.team_size() + team.team_rank();
131 if (i >= m)
132 return;
133
134 scalar_type t = 0.0;
135 for (int j=0; j<n; ++j)
136 t += A(i,j)*b(j);
137 c(i) = t;
138 }
139 );
140}
141
142#else
143
144template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
145void run_mat_vec_hierarchical(const ViewTypeA& A, const ViewTypeB& b,
146 const ViewTypeC& c) {
147 typedef typename ViewTypeC::value_type scalar_type;
148 typedef typename ViewTypeC::execution_space execution_space;
149
150#if defined (KOKKOS_ENABLE_CUDA)
151 const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
152#else
153 const bool is_cuda = false;
154#endif
155 const unsigned vector_size = 1;
156 const unsigned team_size = is_cuda ? 128 / vector_size : 1;
157
158 const int m = A.extent(0);
159 const int n = A.extent(1);
160 const int range = (m+team_size-1)/team_size;
161
162 typedef Kokkos::TeamPolicy<execution_space> Policy;
163 Kokkos::parallel_for(
164 Policy( range,team_size,vector_size ),
165 KOKKOS_LAMBDA (const typename Policy::member_type& team) {
166 const int i = team.league_rank()*team.team_size() + team.team_rank();
167 if (i >= m)
168 return;
169
170 scalar_type t = 0.0;
171 for (int j=0; j<n; ++j)
172 t += A(i,j)*b(j);
173 c(i) = t;
174 }
175 );
176}
177
178#endif
179
180template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
181void
182check_val(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
183{
184 const double tol = 1.0e-14;
185 typedef typename ViewTypeC::value_type value_type;
186 typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
187 Kokkos::deep_copy(h_c, c);
188 const size_t m = A.extent(0);
189 const size_t n = A.extent(1);
190 for (size_t i=0; i<m; ++i) {
191 value_type t = n;
192 if (std::abs(h_c(i)- t) > tol) {
193 std::cout << "Comparison failed! " << i << " : " << h_c(i) << " , " << t
194 << std::endl;
195 }
196 }
197}
198
199template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
200void
201check_deriv(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
202{
203 const double tol = 1.0e-14;
204 typedef typename ViewTypeC::value_type value_type;
205 typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
206 Kokkos::deep_copy(h_c, c);
207 const size_t m = A.extent(0);
208 const size_t n = A.extent(1);
209 const size_t p = Kokkos::dimension_scalar(A);
210 for (size_t i=0; i<m; ++i) {
211 for (size_t j=0; j<p; ++j) {
212 value_type t = (j == p-1 ? n : 2*n);
213 if (std::abs(h_c(i).fastAccessDx(j)- t) > tol) {
214 std::cout << "Comparison failed! " << i << "," << j << " : "
215 << h_c(i).fastAccessDx(j) << " , " << t << std::endl;
216 }
217 }
218 }
219}
220
221struct Perf {
222 double time;
223 double flops;
224 double throughput;
225};
226
227template <typename FadType, typename ... ViewArgs>
228Perf
229do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop,
230 const bool check)
231{
232 typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
233 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
234 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
235 typedef typename ViewTypeA::execution_space execution_space;
236
237 ViewTypeA A("A",m,n,p+1);
238 ViewTypeB b("B",n,p+1);
239 ViewTypeC c("c",m,p+1);
240
241 FadType a(p, 1.0);
242 for (size_t k=0; k<p; ++k)
243 a.fastAccessDx(k) = 1.0;
244 Kokkos::deep_copy(A, a);
245 Kokkos::deep_copy(b, a);
246
247 Kokkos::Timer wall_clock;
248 Perf perf;
249
250 // Execute the kernel once to warm up
251 run_mat_vec( A, b, c );
252 execution_space().fence();
253
254 wall_clock.reset();
255 for (size_t l=0; l<nloop; l++) {
256 run_mat_vec( A, b, c );
257 }
258 execution_space().fence();
259
260 perf.time = wall_clock.seconds() / nloop;
261 perf.flops = m*n*(2+4*p);
262 perf.throughput = perf.flops / perf.time / 1.0e9;
263
264 if (check) {
265 check_deriv(A, b, c);
266 }
267
268 return perf;
269}
270
271template <typename FadType, typename ... ViewArgs>
272Perf
273do_time_fad_hierarchical(const size_t m, const size_t n, const size_t p,
274 const size_t nloop, const bool check)
275{
276 typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
277 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
278 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
279 typedef typename ViewTypeA::execution_space execution_space;
280
281#if defined (KOKKOS_ENABLE_CUDA)
282 const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
283#else
284 const bool is_cuda = false;
285#endif
286
287#if defined(SACADO_VIEW_CUDA_HIERARCHICAL)
288 const int FadStride = is_cuda ? 32 : 1;
289#if defined(SACADO_ALIGN_SFAD)
291 const int Nalign = ((N+FadStride-1)/FadStride)*FadStride;
292 const size_t pa = N > 0 ? ((p+FadStride-1)/FadStride)*FadStride : p;
293 typedef typename FadType::template apply_N<Nalign>::type AlignedFadType;
294#else
295 typedef FadType AlignedFadType;
296 const size_t pa = p;
297#endif
298#else
299 const int FadStride = 1;
300 typedef FadType AlignedFadType;
301 const size_t pa = p;
302#endif
303
304#if defined(SACADO_VIEW_CUDA_HIERARCHICAL) || defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD)
308#else
309 typedef typename ViewTypeA::array_layout ConLayoutA;
310 typedef typename ViewTypeB::array_layout ConLayoutB;
311 typedef typename ViewTypeC::array_layout ConLayoutC;
312 (void) FadStride;
313#endif
314
315
316 typedef Kokkos::View<AlignedFadType**, ConLayoutA, execution_space> ConViewTypeA;
317 typedef Kokkos::View<AlignedFadType*, ConLayoutB, execution_space> ConViewTypeB;
318 typedef Kokkos::View<AlignedFadType*, ConLayoutC, execution_space> ConViewTypeC;
319
320 ConViewTypeA A("A",m,n,pa+1);
321 ConViewTypeB b("B",n,pa+1);
322 ConViewTypeC c("c",m,pa+1);
323
324 AlignedFadType a(pa, 1.0);
325 for (size_t k=0; k<pa; ++k)
326 a.fastAccessDx(k) = 1.0;
327 Kokkos::deep_copy(A, a);
328 Kokkos::deep_copy(b, a);
329
330 Kokkos::Timer wall_clock;
331 Perf perf;
332
333#if defined(SACADO_KOKKOS_USE_MEMORY_POOL)
334 const size_t concurrency = execution_space().concurrency();
335 const size_t warp_dim = is_cuda ? 32 : 1;
336 const size_t block_size = pa*sizeof(double);
337 const size_t nkernels = concurrency / warp_dim;
338 const size_t mem_pool_size =
339 static_cast<size_t>(1.2*nkernels*block_size);
340 const size_t superblock_size = std::max<size_t>(nkernels / 100, 1) * block_size;
341 execution_space space;
342 Sacado::createGlobalMemoryPool(space, mem_pool_size,
343 block_size,
344 block_size,
345 superblock_size
346 );
347#endif
348
349 // Execute the kernel once to warm up
351 execution_space().fence();
352
353 wall_clock.reset();
354 for (size_t l=0; l<nloop; l++) {
356 }
357 execution_space().fence();
358
359 perf.time = wall_clock.seconds() / nloop;
360 perf.flops = m*n*(2+4*p);
361 perf.throughput = perf.flops / perf.time / 1.0e9;
362
363 if (check) {
364 check_deriv(A, b, c);
365 }
366
367#if defined(SACADO_KOKKOS_USE_MEMORY_POOL)
369#endif
370
371 return perf;
372}
373
374template <typename ... ViewArgs>
375Perf
376do_time_val(const size_t m, const size_t n, const size_t nloop,
377 const bool check)
378{
379 typedef Kokkos::View<double**, ViewArgs...> ViewTypeA;
380 typedef Kokkos::View<double*, ViewArgs...> ViewTypeB;
381 typedef Kokkos::View<double*, ViewArgs...> ViewTypeC;
382 typedef typename ViewTypeA::execution_space execution_space;
383
384 ViewTypeA A("A",m,n);
385 ViewTypeB b("B",n);
386 ViewTypeC c("c",m);
387
388 Kokkos::deep_copy(A, 1.0);
389 Kokkos::deep_copy(b, 1.0);
390
391 Kokkos::Timer wall_clock;
392 Perf perf;
393
394 // Execute the kernel once to warm up
395 run_mat_vec( A, b, c );
396 execution_space().fence();
397
398 wall_clock.reset();
399 for (size_t l=0; l<nloop; l++) {
400 run_mat_vec( A, b, c );
401 }
402 execution_space().fence();
403
404 perf.time = wall_clock.seconds() / nloop;
405 perf.flops = m*n*2;
406 perf.throughput = perf.flops / perf.time / 1.0e9;
407
408 if (check)
409 check_val(A,b,c);
410
411 return perf;
412}
413
414void
415print_perf(const Perf& perf, const Perf& perf_base, const size_t p,
416 const std::string& name)
417{
418 std::cout << name << "\t "
419 << perf.time << "\t "
420 << perf.throughput << "\t "
421 << perf.time / (perf_base.time*p)
422 << std::endl;
423}
424
425template <int SFadSize, int SLFadSize, int HierSFadSize, int HierSLFadSize,
426 typename ... ViewArgs>
427void
428do_times(const size_t m,
429 const size_t n,
430 const size_t p,
431 const size_t ph,
432 const size_t nloop,
433 const bool value,
434 const bool sfad,
435 const bool slfad,
436 const bool dfad,
437 const bool hierarchical,
438 const bool check)
439{
440 Perf perf_value;
441 perf_value.time = 1.0;
442
443 // Run value
444 if (value) {
445 Perf perf = do_time_val<ViewArgs...>(m,n,nloop,check);
446 perf_value = perf;
447 print_perf(perf, perf_value, p, "Value ");
448 }
449
450 // Run SFad
451 if (sfad && p == SFadSize) {
452 Perf perf =
453 do_time_fad<Sacado::Fad::SFad<double,SFadSize>, ViewArgs...>(m,n,p,nloop,check);
454 print_perf(perf, perf_value, p, "SFad ");
455 }
456
457 // Run SLFad
458 if (slfad && p <= SLFadSize) {
459 Perf perf =
460 do_time_fad<Sacado::Fad::SLFad<double,SLFadSize>, ViewArgs...>(m,n,p,nloop,check);
461 print_perf(perf, perf_value, p, "SLFad ");
462 }
463
464 // Run DFad
465 if (dfad) {
466 Perf perf =
467 do_time_fad<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,check);
468 print_perf(perf, perf_value, p, "DFad ");
469 }
470
471 // Run hierarchical
472 if (hierarchical) {
473 if (sfad && ph == HierSFadSize) {
474 Perf perf =
475 do_time_fad_hierarchical<Sacado::Fad::SFad<double,HierSFadSize>, ViewArgs...>(m,n,ph,nloop,check);
476 print_perf(perf, perf_value, ph, "Hier SFad ");
477 }
478 if (slfad && ph <= HierSLFadSize) {
479 Perf perf =
480 do_time_fad_hierarchical<Sacado::Fad::SLFad<double,HierSLFadSize>, ViewArgs...>(m,n,ph,nloop,check);
481 print_perf(perf, perf_value, ph, "Hier SLFad");
482 }
483 if (dfad) {
484 Perf perf =
485 do_time_fad_hierarchical<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,ph,nloop,check);
486 print_perf(perf, perf_value, ph, "Hier DFad ");
487 }
488 }
489
490}
491
497const int num_layout_types = 3;
500const char *layout_names[] = { "left", "right", "default" };
501
502template <int SFadSize, int SLFadSize, int HierSFadSize, int HierSLFadSize,
503 typename Device>
504void
505do_times_layout(const size_t m,
506 const size_t n,
507 const size_t p,
508 const size_t ph,
509 const size_t nloop,
510 const bool value,
511 const bool sfad,
512 const bool slfad,
513 const bool dfad,
514 const bool hierarchical,
515 const bool check,
516 const LayoutType& layout,
517 const std::string& device)
518{
519 int prec = 2;
520 std::cout.setf(std::ios::scientific);
521 std::cout.precision(prec);
522 std::cout << std::endl
523 << device
524 << " performance for layout "
525 << layout_names[layout]
526 << " m = " << m << " n = " << n << " p = " << p << " ph = " << ph
527 << std::endl << std::endl;
528 std::cout << "Computation \t Time \t Throughput \t Ratio" << std::endl;
529
530 if (layout == LAYOUT_LEFT)
531 do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutLeft,Device>(
532 m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check);
533 else if (layout == LAYOUT_RIGHT)
534 do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutRight,Device>(
535 m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check);
536 else
537 do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Device>
538 (m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check);
539}
540
541// Connect executable to vtune for profiling
543 std::stringstream cmd;
544 pid_t my_os_pid=getpid();
545 const std::string vtune_loc =
546 "amplxe-cl";
547 const std::string output_dir = "./vtune";
548 cmd << vtune_loc
549 << " -collect hotspots -result-dir " << output_dir
550 << " -target-pid " << my_os_pid << " &";
551 std::cout << cmd.str() << std::endl;
552 system(cmd.str().c_str());
553 system("sleep 10");
554}
555
556//const int SFadSize = 8;
557const int SFadSize = 32;
558const int SLFadSize = SFadSize;
559//const int HierSFadSize = 50;
560const int HierSFadSize = 32;
562
563int main(int argc, char* argv[]) {
564 bool success = true;
565 try {
566
567 // Set up command line options
568 Teuchos::CommandLineProcessor clp(false);
569 clp.setDocString("This program tests the speed of various forward mode AD implementations for simple Kokkos kernel");
570 int m = 100000;
571 clp.setOption("m", &m, "Number of matrix rows");
572 int n = 100;
573 clp.setOption("n", &n, "Number of matrix columns");
574 int p = SFadSize;
575 clp.setOption("p", &p, "Number of derivative components");
576 int ph = HierSFadSize;
577 clp.setOption("ph", &ph, "Number of derivative components for hierarchical");
578 int nloop = 10;
579 clp.setOption("nloop", &nloop, "Number of loops");
580#ifdef KOKKOS_ENABLE_SERIAL
581 bool serial = 0;
582 clp.setOption("serial", "no-serial", &serial, "Whether to run Serial");
583#endif
584#ifdef KOKKOS_ENABLE_OPENMP
585 int openmp = 0;
586 clp.setOption("openmp", &openmp, "Number of OpenMP threads");
587#endif
588#ifdef KOKKOS_ENABLE_THREADS
589 int threads = 0;
590 clp.setOption("threads", &threads, "Number of pThreads threads");
591#endif
592#ifdef KOKKOS_ENABLE_CUDA
593 bool cuda = 0;
594 clp.setOption("cuda", "no-cuda", &cuda, "Whether to run CUDA");
595#endif
596 int numa = 0;
597 clp.setOption("numa", &numa,
598 "Number of NUMA domains to use (set to 0 to use all NUMAs");
599 int cores_per_numa = 0;
600 clp.setOption("cores-per-numa", &cores_per_numa,
601 "Number of CPU cores per NUMA to use (set to 0 to use all cores)");
602 bool print_config = false;
603 clp.setOption("print-config", "no-print-config", &print_config,
604 "Whether to print Kokkos device configuration");
605 LayoutType layout = LAYOUT_DEFAULT;
606 clp.setOption("layout", &layout, num_layout_types, layout_values,
607 layout_names, "View layout");
608 bool vtune = false;
609 clp.setOption("vtune", "no-vtune", &vtune, "Profile with vtune");
610 bool value = true;
611 clp.setOption("value", "no-value", &value, "Run value calculation");
612 bool sfad = true;
613 clp.setOption("sfad", "no-sfad", &sfad, "Run SFad derivative calculation");
614 bool slfad = true;
615 clp.setOption("slfad", "no-slfad", &slfad, "Run SLFad derivative calculation");
616 bool dfad = true;
617 clp.setOption("dfad", "no-dfad", &dfad, "Run DFad derivative calculation");
618 bool hierarchical = true;
619 clp.setOption("hierarchical", "no-hierarchical", &hierarchical, "Run hierarchical Fad derivative calculation");
620 bool check = false;
621 clp.setOption("check", "no-check", &check, "Check calculations are correct");
622
623 // Parse options
624 switch (clp.parse(argc, argv)) {
625 case Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED:
626 return 0;
627 case Teuchos::CommandLineProcessor::PARSE_ERROR:
628 case Teuchos::CommandLineProcessor::PARSE_UNRECOGNIZED_OPTION:
629 return 1;
630 case Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL:
631 break;
632 }
633
634 if (vtune)
636
637 Kokkos::InitializationSettings init_args;
638 init_args.set_num_threads(cores_per_numa);
639
640 Kokkos::initialize(init_args);
641
642 if (print_config)
643 Kokkos::print_configuration(std::cout, true);
644
645#ifdef KOKKOS_ENABLE_SERIAL
646 if (serial) {
647 do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Serial>(
648 m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"Serial");
649 }
650#endif
651
652#ifdef KOKKOS_ENABLE_OPENMP
653 if (openmp) {
654 do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::OpenMP>(
655 m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"OpenMP");
656 }
657#endif
658
659#ifdef KOKKOS_ENABLE_THREADS
660 if (threads) {
661 do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Threads>(
662 m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"Threads");
663 }
664#endif
665
666#ifdef KOKKOS_ENABLE_CUDA
667 if (cuda) {
668 do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Cuda>(
669 m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"Cuda");
670 }
671#endif
672
673 Kokkos::finalize();
674
675 }
676 TEUCHOS_STANDARD_CATCH_STATEMENTS(true, std::cerr, success);
677
678 return !success;
679}
expr expr expr fastAccessDx(i)) FAD_UNARYOP_MACRO(exp
expr expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c *expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 c
#define A
const int N
int main()
Sacado::Fad::DFad< double > FadType
int check(Epetra_CrsGraph &A, int NumMyRows1, int NumGlobalRows1, int NumMyNonzeros1, int NumGlobalNonzeros1, int *MyGlobalElements, bool verbose)
const int HierSFadSize
Perf do_time_val(const size_t m, const size_t n, const size_t nloop, const bool check)
void do_times(const size_t m, const size_t n, const size_t p, const size_t ph, const size_t nloop, const bool value, const bool sfad, const bool slfad, const bool dfad, const bool hierarchical, const bool check)
const LayoutType layout_values[]
const char * layout_names[]
void do_times_layout(const size_t m, const size_t n, const size_t p, const size_t ph, const size_t nloop, const bool value, const bool sfad, const bool slfad, const bool dfad, const bool hierarchical, const bool check, const LayoutType &layout, const std::string &device)
void run_mat_vec(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void check_val(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void run_mat_vec_hierarchical(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Perf do_time_fad_hierarchical(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
void check_deriv(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void connect_vtune()
const int num_layout_types
const int SFadSize
Perf do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
void print_perf(const Perf &perf, const Perf &perf_base, const size_t p, const std::string &name)
const int SLFadSize
const int HierSLFadSize
const char * p
void createGlobalMemoryPool(const ExecSpace &space, const size_t min_total_alloc_size, const uint32_t min_block_alloc_size, const uint32_t max_block_alloc_size, const uint32_t min_superblock_size)
void destroyGlobalMemoryPool(const ExecSpace &space)
double time
double flops
double throughput
Base template specification for static size.
const double tol