Sacado Package Browser (Single Doxygen Collection) Version of the Day
Loading...
Searching...
No Matches
advection_const_basis/advection_hierarchical_dfad.cpp
Go to the documentation of this file.
1// @HEADER
2// ***********************************************************************
3//
4// Sacado Package
5// Copyright (2006) Sandia Corporation
6//
7// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8// the U.S. Government retains certain rights in this software.
9//
10// This library is free software; you can redistribute it and/or modify
11// it under the terms of the GNU Lesser General Public License as
12// published by the Free Software Foundation; either version 2.1 of the
13// License, or (at your option) any later version.
14//
15// This library is distributed in the hope that it will be useful, but
16// WITHOUT ANY WARRANTY; without even the implied warranty of
17// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18// Lesser General Public License for more details.
19//
20// You should have received a copy of the GNU Lesser General Public
21// License along with this library; if not, write to the Free Software
22// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
23// USA
24// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
25// (etphipp@sandia.gov).
26//
27// ***********************************************************************
28// @HEADER
29
30#define SACADO_VIEW_CUDA_HIERARCHICAL_DFAD 1
31#define SACADO_KOKKOS_USE_MEMORY_POOL 1
32
33#include "Sacado.hpp"
35#include "common.hpp"
36
37#include "Kokkos_Timer.hpp"
38
39template<typename FluxView, typename WgbView, typename SrcView,
40 typename WbsView, typename ResidualView>
41void run_dfad_hierarchical_team(const FluxView& flux, const WgbView& wgb,
42 const SrcView& src, const WbsView& wbs,
43 const ResidualView& residual)
44{
45 typedef typename ResidualView::execution_space execution_space;
46 typedef typename ResidualView::non_const_value_type scalar_type;
47 typedef Kokkos::TeamPolicy<execution_space> policy_type;
48 typedef typename policy_type::member_type team_member;
49
50 const size_t num_cells = wgb.extent(0);
51 const int num_basis = wgb.extent(1);
52 const int num_points = wgb.extent(2);
53 const int num_dim = wgb.extent(3);
54
55 const bool is_cuda = is_cuda_space<execution_space>::value;
56 const int vector_size = is_cuda ? 32 : 1;
57 const int team_size = is_cuda ? 256/vector_size : 1;
58
59 policy_type policy(num_cells,team_size,vector_size);
60 Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const team_member& team)
61 {
62 const int team_rank = team.team_rank();
63 const size_t cell = team.league_rank();
64 scalar_type value, value2;
65 for (int basis=team_rank; basis<num_basis; basis+=team_size) {
66 value = 0.0;
67 value2 = 0.0;
68 for (int qp=0; qp<num_points; ++qp) {
69 for (int dim=0; dim<num_dim; ++dim)
70 value += flux(cell,qp,dim)*wgb(cell,basis,qp,dim);
71 value2 += src(cell,qp)*wbs(cell,basis,qp);
72 }
73 residual(cell,basis) = value+value2;
74 }
75 });
76}
77
78template<typename FluxView, typename WgbView, typename SrcView,
79 typename WbsView, typename ResidualView>
81 const FluxView& flux, const WgbView& wgb,
82 const SrcView& src, const WbsView& wbs,
83 const ResidualView& residual)
84{
85 typedef typename ResidualView::execution_space execution_space;
86 typedef typename ResidualView::non_const_value_type scalar_type;
87 typedef Kokkos::TeamPolicy<execution_space> policy_type;
88 typedef typename policy_type::member_type team_member;
89 typedef Kokkos::View<scalar_type*, typename execution_space::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > tmp_scratch_type;
90
91 const size_t num_cells = wgb.extent(0);
92 const int num_basis = wgb.extent(1);
93 const int num_points = wgb.extent(2);
94 const int num_dim = wgb.extent(3);
95
96 const bool is_cuda = is_cuda_space<execution_space>::value;
97 const int vector_size = is_cuda ? 32 : 1;
98 const int team_size = is_cuda ? 256/vector_size : 1;
99 const int fad_size = Kokkos::dimension_scalar(residual);
100 const size_t bytes = 2*tmp_scratch_type::shmem_size(team_size,fad_size);
101 policy_type policy(num_cells,team_size,vector_size);
102
103 Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerTeam(bytes)),
104 KOKKOS_LAMBDA (const team_member& team)
105 {
106 const int team_rank = team.team_rank();
107 const size_t cell = team.league_rank();
108 tmp_scratch_type value(team.team_scratch(0), team_size, fad_size);
109 tmp_scratch_type value2(team.team_scratch(0), team_size, fad_size);
110 for (int basis=team_rank; basis<num_basis; basis+=team_size) {
111 value(team_rank) = 0.0;
112 value2(team_rank) = 0.0;
113 for (int qp=0; qp<num_points; ++qp) {
114 for (int dim=0; dim<num_dim; ++dim)
115 value(team_rank) += flux(cell,qp,dim)*wgb(cell,basis,qp,dim);
116 value2(team_rank) += src(cell,qp)*wbs(cell,basis,qp);
117 }
118 residual(cell,basis) = value(team_rank)+value2(team_rank);
119 }
120 });
121}
122
123template <int N, typename ExecSpace>
124double time_dfad_hierarchical_team(int ncells, int num_basis, int num_points,
125 int ndim, int ntrial, bool check)
126{
128
129 typedef typename ExecSpace::array_layout DefaultLayout;
131 typedef Kokkos::View<double****,ExecSpace> t_4DView_d;
132 typedef Kokkos::View<double***,ExecSpace> t_3DView_d;
133 typedef Kokkos::View<FadType***,ContLayout,ExecSpace> t_3DView;
134 typedef Kokkos::View<FadType**,ContLayout,ExecSpace> t_2DView;
135
136 t_4DView_d wgb("",ncells,num_basis,num_points,ndim);
137 t_3DView_d wbs("",ncells,num_basis,num_points);
138 t_3DView flux("",ncells,num_points,ndim,N+1);
139 t_2DView src("",ncells,num_points,N+1);
140 t_2DView residual("",ncells,num_basis,N+1);
141 init_fad(wgb, wbs, flux, src, residual);
142
143 // Create memory pool for DFad
144 // The kernel allocates 2*N double's per warp on Cuda. Approximate
145 // the maximum number of warps as the maximum concurrency / 32.
146 // Include a fudge factor of 1.2 since memory pool treats a block as full
147 // once it reaches 80% capacity
148 const size_t block_size = N*sizeof(double);
149 size_t nkernels = ExecSpace().concurrency()*2;
151 nkernels /= 32;
152 const size_t mem_pool_size = static_cast<size_t>(1.2*nkernels*block_size);
153 const size_t superblock_size =
154 std::max<size_t>(nkernels / 100, 1) * block_size;
155 ExecSpace exec_space;
156 Sacado::createGlobalMemoryPool(exec_space, mem_pool_size,
157 block_size, block_size, superblock_size);
158
159 // Run once to warm up, complete any UVM transfers
160 run_dfad_hierarchical_team(flux, wgb, src, wbs, residual);
161
162 // Time execution
163 Kokkos::fence();
164 Kokkos::Timer timer;
165 for (int i=0; i<ntrial; ++i)
166 run_dfad_hierarchical_team(flux, wgb, src, wbs, residual);
167 Kokkos::fence();
168 double time = timer.seconds() / ntrial / ncells;
169
170 // Check result
171 if (check)
172 check_residual(flux, wgb, src, wbs, residual);
173
174 // Destroy memory pool
176
177 return time;
178}
179
180template <int N, typename ExecSpace>
182 int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check)
183{
185
186 typedef typename ExecSpace::array_layout DefaultLayout;
188 typedef Kokkos::View<double****,ExecSpace> t_4DView_d;
189 typedef Kokkos::View<double***,ExecSpace> t_3DView_d;
190 typedef Kokkos::View<FadType***,ContLayout,ExecSpace> t_3DView;
191 typedef Kokkos::View<FadType**,ContLayout,ExecSpace> t_2DView;
192
193 t_4DView_d wgb("",ncells,num_basis,num_points,ndim);
194 t_3DView_d wbs("",ncells,num_basis,num_points);
195 t_3DView flux("",ncells,num_points,ndim,N+1);
196 t_2DView src("",ncells,num_points,N+1);
197 t_2DView residual("",ncells,num_basis,N+1);
198 init_fad(wgb, wbs, flux, src, residual);
199
200 // Run once to warm up, complete any UVM transfers
201 run_dfad_hierarchical_team_scratch(flux, wgb, src, wbs, residual);
202
203 // Time execution
204 Kokkos::fence();
205 Kokkos::Timer timer;
206 for (int i=0; i<ntrial; ++i)
207 run_dfad_hierarchical_team_scratch(flux, wgb, src, wbs, residual);
208 Kokkos::fence();
209 double time = timer.seconds() / ntrial / ncells;
210
211 // Check result
212 if (check)
213 check_residual(flux, wgb, src, wbs, residual);
214
215 return time;
216}
217
218#define INST_FUNC_N_DEV(N,DEV) \
219 template double time_dfad_hierarchical_team< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \
220 template double time_dfad_hierarchical_team_scratch< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check);
221
222#define INST_FUNC_DEV(DEV) \
223 INST_FUNC_N_DEV( fad_dim, DEV )
224
225#ifdef KOKKOS_ENABLE_SERIAL
226INST_FUNC_DEV(Kokkos::Serial)
227#endif
228
229#ifdef KOKKOS_ENABLE_OPENMP
230INST_FUNC_DEV(Kokkos::OpenMP)
231#endif
232
233#ifdef KOKKOS_ENABLE_THREADS
234INST_FUNC_DEV(Kokkos::Threads)
235#endif
236
237#ifdef KOKKOS_ENABLE_CUDA
238INST_FUNC_DEV(Kokkos::Cuda)
239#endif
const int N
void init_fad(const V1 &v1, const V2 &v2, const V3 &v3, const V4 &v4, const V5 &v5)
void check_residual(const FluxView &flux, const WgbView &wgb, const SrcView &src, const WbsView &wbs, const ResidualView &residual)
double time_dfad_hierarchical_team_scratch(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check)
void run_dfad_hierarchical_team(const FluxView &flux, const WgbView &wgb, const SrcView &src, const WbsView &wbs, const ResidualView &residual)
double time_dfad_hierarchical_team(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check)
void run_dfad_hierarchical_team_scratch(const FluxView &flux, const WgbView &wgb, const SrcView &src, const WbsView &wbs, const ResidualView &residual)
Sacado::Fad::DFad< double > FadType
Fad specializations for Teuchos::BLAS wrappers.
int check(Epetra_CrsGraph &A, int NumMyRows1, int NumGlobalRows1, int NumMyNonzeros1, int NumGlobalNonzeros1, int *MyGlobalElements, bool verbose)
void createGlobalMemoryPool(const ExecSpace &space, const size_t min_total_alloc_size, const uint32_t min_block_alloc_size, const uint32_t max_block_alloc_size, const uint32_t min_superblock_size)
void destroyGlobalMemoryPool(const ExecSpace &space)