Zoltan2
Loading...
Searching...
No Matches
Zoltan2_AlgHybridD1.hpp
Go to the documentation of this file.
1#ifndef _ZOLTAN2_ALGHYBRIDD1_HPP_
2#define _ZOLTAN2_ALGHYBRIDD1_HPP_
3
4#include <vector>
5#include <unordered_map>
6#include <iostream>
7#include <fstream>
8#include <queue>
9#ifdef _WIN32
10#include <time.h>
11#else
12#include <sys/time.h>
13#endif
14#include <algorithm>
15
16#include "Zoltan2_Algorithm.hpp"
19#include "Zoltan2_Util.hpp"
20#include "Zoltan2_TPLTraits.hpp"
21#include "Zoltan2_AlltoAll.hpp"
22
23#include "Tpetra_Core.hpp"
24#include "Teuchos_RCP.hpp"
25#include "Tpetra_Import.hpp"
26#include "Tpetra_FEMultiVector.hpp"
27
28#include "KokkosKernels_Handle.hpp"
29#include "KokkosKernels_IOUtils.hpp"
30#include "KokkosGraph_Distance1Color.hpp"
31#include "KokkosGraph_Distance1ColorHandle.hpp"
32#include <stdlib.h>
33
37
38namespace Zoltan2 {
39
40template <typename Adapter>
41class AlgDistance1 : public Algorithm<Adapter>
42{
43 public:
44
45 using lno_t = typename Adapter::lno_t;
46 using gno_t = typename Adapter::gno_t;
47 using offset_t = typename Adapter::offset_t;
48 using scalar_t = typename Adapter::scalar_t;
49 using base_adapter_t = typename Adapter::base_adapter_t;
50 using map_t = Tpetra::Map<lno_t, gno_t>;
51 using femv_scalar_t = int;
52 using femv_t = Tpetra::FEMultiVector<femv_scalar_t, lno_t, gno_t>;
53 using device_type = typename femv_t::device_type;
54 using execution_space = typename device_type::execution_space;
55 using memory_space = typename device_type::memory_space;
56 using host_exec = typename femv_t::host_view_type::device_type::execution_space;
57 using host_mem = typename femv_t::host_view_type::device_type::memory_space;
58 double timer() {
59 struct timeval tp;
60 gettimeofday(&tp, NULL);
61 return ((double) (tp.tv_sec) + 1e-6 * tp.tv_usec);
62 }
63
64 private:
65
66 void buildModel(modelFlag_t &flags);
67
68 //function to invoke KokkosKernels distance-1 coloring
69 //
70 //OUTPUT ARG:
71 //
72 // femv: FEMultiVector containing dual-view of colors.
73 // After this call each vertex will have a locally-valid color.
74 //
75 //INPUT ARGS:
76 //
77 // nVtx: number of local owned vertices
78 //
79 // adjs_view: CSR adjacencies for the local graph
80 //
81 // offset_view: CSR offsets for indexing adjs_view
82 //
83 // vertex_list: list of local IDs of vertices that
84 // need to be recolored
85 //
86 // vertex_list_size: 0 means no vertex list given,
87 // otherwise it simply is the size
88 // of the vertex_list
89 //
90 // recolor: switches between VB_BIT and EB KokkosKernels
91 // algorithms
92 //
93 template <class ExecutionSpace, typename MemorySpace>
94 void colorInterior(const size_t nVtx,
95 Kokkos::View<lno_t*, Kokkos::Device<ExecutionSpace,MemorySpace> > adjs_view,
96 Kokkos::View<offset_t*, Kokkos::Device<ExecutionSpace, MemorySpace> > offset_view,
97 Teuchos::RCP<femv_t> femv,
98 Kokkos::View<lno_t*, Kokkos::Device<ExecutionSpace, MemorySpace> > vertex_list,
99 size_t vertex_list_size = 0,
100 bool recolor=false){
101
102 //set up types to be used by KokkosKernels
103 using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle
104 <offset_t, lno_t, lno_t, ExecutionSpace, MemorySpace,
105 MemorySpace>;
106 using lno_row_view_t = Kokkos::View<offset_t*, Kokkos::Device<ExecutionSpace, MemorySpace>>;
107 using lno_nnz_view_t = Kokkos::View<lno_t*, Kokkos::Device<ExecutionSpace, MemorySpace>>;
108
109 KernelHandle kh;
110
111 //pick which KokkosKernels algorithm to use.
112 //VBBIT is more efficient for inputs with max degree < ~6000
113 //EB is more efficient for inputs with max degree > ~6000
114 if(recolor){
115 kh.create_graph_coloring_handle(KokkosGraph::COLORING_VBBIT);
116 } else {
117 kh.create_graph_coloring_handle(KokkosGraph::COLORING_EB);
118 }
119
120 //vertex_list_size indicates whether we have provided a list of vertices to recolor.
121 //Only makes a difference if the algorithm to be used is VBBIT.
122 if(vertex_list_size != 0){
123 kh.get_graph_coloring_handle()->set_vertex_list(vertex_list,vertex_list_size);
124 }
125
126 kh.set_verbose(verbose);
127
128 //set the initial coloring of the kh.get_graph_coloring_handle() to be
129 //the data view from the femv.
130 auto femvColors = femv->template getLocalView<Kokkos::Device<ExecutionSpace,MemorySpace> >(Tpetra::Access::ReadWrite);
131 auto sv = subview(femvColors, Kokkos::ALL, 0);
132 kh.get_graph_coloring_handle()->set_vertex_colors(sv);
133 kh.get_graph_coloring_handle()->set_tictoc(verbose);
134 KokkosGraph::Experimental::graph_color_symbolic<KernelHandle, lno_row_view_t, lno_nnz_view_t>
135 (&kh, nVtx, nVtx, offset_view, adjs_view);
136 //numColors = kh.get_graph_coloring_handle()->get_num_colors();
137
138 if(verbose){
139 std::cout<<"\nKokkosKernels Coloring: "<<kh.get_graph_coloring_handle()->get_overall_coloring_time()<<" iterations: "<<kh.get_graph_coloring_handle()->get_num_phases()<<"\n\n";
140 }
141 }
142
143 public:
144 //contains all distance-1 conflict detection logic
145 //
146 //OUTPUT ARGS:
147 //
148 // femv_colors: This function uncolors vertices that have
149 // distance-1 conflicts, so these colors will
150 // change if there are any conflicts present
151 //
152 //INPUT ARGS:
153 //
154 // n_local: number of locally owned vertices
155 //
156 // n_ghosts: number of ghosts on this process
157 //
158 // dist_offsets: CSR offsets of the local graph
159 //
160 // dist_adjs: CSR adjacencies of the local graph
161 //
162 // verts_to_send_view: Used to construct a list of verts to send on device.
163 //
164 // verts_to_send_size_atomic: atomic version of the verts_to_send_size view
165 // Used to construct a list on device,
166 // the atomic is necessary for correctness.
167 //
168 // recoloringSize: holds the total amount of recoloring that will be done
169 // locally. Does not need to be atomic.
170 //
171 // rand: view that holds the tie-breaking random numbers indexed by LID.
172 //
173 // gid: view that holds GIDs, for tie breaking in the case that rand
174 // numbers are the same for two vertices.
175 //
176 // ghost_degrees: view that holds degrees only for ghost vertices.
177 // LID n_local corresponds to ghost_degrees(0).
178 //
179 // recolor_degrees: if true, we factor degrees into the conflict detection
180 // if false, we resolve using only consistent random numbers.
181 //
182 template <class ExecutionSpace, typename MemorySpace>
183 void detectConflicts(const size_t n_local, const size_t n_ghosts,
184 Kokkos::View<offset_t*, Kokkos::Device<ExecutionSpace, MemorySpace>> dist_offsets,
185 Kokkos::View<lno_t*, Kokkos::Device<ExecutionSpace, MemorySpace>> dist_adjs,
186 Kokkos::View<int*, Kokkos::Device<ExecutionSpace, MemorySpace>> femv_colors,
187 Kokkos::View<lno_t*,
188 Kokkos::Device<ExecutionSpace, MemorySpace>> verts_to_send_view,
189 Kokkos::View<size_t*,
190 Kokkos::Device<ExecutionSpace, MemorySpace>,
191 Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_send_size_atomic,
192 Kokkos::View<gno_t*, Kokkos::Device<ExecutionSpace, MemorySpace>> recoloringSize,
193 Kokkos::View<int*,
194 Kokkos::Device<ExecutionSpace, MemorySpace>> rand,
195 Kokkos::View<gno_t*,
196 Kokkos::Device<ExecutionSpace, MemorySpace>> gid,
197 Kokkos::View<gno_t*,
198 Kokkos::Device<ExecutionSpace, MemorySpace>> ghost_degrees,
199 bool recolor_degrees){
200 gno_t local_recoloring_size;
201 Kokkos::parallel_reduce("Conflict Detection",
202 Kokkos::RangePolicy<ExecutionSpace>(0,n_ghosts),
203 KOKKOS_LAMBDA(const int& i,
204 gno_t& recoloring_size){
205 lno_t lid = i+n_local;
206 int currColor = femv_colors(lid);
207 int currDegree = ghost_degrees(i);
208 for(offset_t j = dist_offsets(lid); j < dist_offsets(lid+1); j++){
209 int nborColor = femv_colors(dist_adjs(j));
210 int nborDegree = 0;
211 if((size_t)dist_adjs(j) < n_local) nborDegree = dist_offsets(dist_adjs(j)+1) - dist_offsets(dist_adjs(j));
212 else nborDegree = ghost_degrees(dist_adjs(j) - n_local);
213 if(currColor == nborColor ){
214 if(currDegree < nborDegree && recolor_degrees){
215 femv_colors(lid) = 0;
216 recoloring_size++;
217 }else if (nborDegree < currDegree && recolor_degrees){
218 femv_colors(dist_adjs(j)) = 0;
219 recoloring_size++;
220 }else if(rand(lid) > rand(dist_adjs(j))){
221 femv_colors(lid) = 0;
222 recoloring_size++;
223 break;
224 }else if (rand(dist_adjs(j)) > rand(lid)){
225 femv_colors(dist_adjs(j)) = 0;
226 recoloring_size++;
227 } else {
228 if (gid(lid) >= gid(dist_adjs(j))){
229 femv_colors(lid) = 0;
230 recoloring_size++;
231 break;
232 } else {
233 femv_colors(dist_adjs(j)) = 0;
234 recoloring_size++;
235 }
236 }
237 }
238 }
239 },local_recoloring_size);
240 Kokkos::deep_copy(recoloringSize, local_recoloring_size);
241 Kokkos::fence();
242 Kokkos::parallel_for("Rebuild verts_to_send_view",
243 Kokkos::RangePolicy<ExecutionSpace>(0,n_local),
244 KOKKOS_LAMBDA(const int& i){
245 if(femv_colors(i) == 0){
246 verts_to_send_view(verts_to_send_size_atomic(0)++) = i;
247 }
248 });
249 Kokkos::fence();
250 }
251
252 private:
253 //Communicates owned vertex colors to ghost copies.
254 //
255 //RETURN VALUE:
256 //
257 // returns the time it took for the communication call to complete
258 //
259 //OUTPUT ARGS:
260 //
261 // femv: FEMultivector that holds the dual-view of the colors
262 // After this call, ghost colors will be up-to-date.
263 //
264 // recv: returns the size of the recv buffer
265 //
266 // send: returns the size of the send buffer
267 //
268 //INPUT ARGS:
269 //
270 // mapOwnedPlusGhosts: a Tpetra map that translates between
271 // LID and GID for any vertex on this process.
272 //
273 // nVtx: the number of locally owned vertices.
274 //
275 // verts_to_send: hostmirror of verts to send. This function sends
276 // all vertices in this list to their ghost copies.
277 //
278 // verts_to_send_size: hostmirror of verts_to_send_size, holds the
279 // size of verts_to_send.
280 //
281 // procs_to_send: map that translates LID into a list of processes that
282 // have a ghost copy of the vertex.
283 //
284 double doOwnedToGhosts(RCP<const map_t> mapOwnedPlusGhosts,
285 size_t nVtx,
286 typename Kokkos::View<lno_t*, device_type>::HostMirror verts_to_send,
287 typename Kokkos::View<size_t*, device_type>::HostMirror& verts_to_send_size,
288 Teuchos::RCP<femv_t> femv,
289 const std::unordered_map<lno_t, std::vector<int>>& procs_to_send,
290 gno_t& recv, gno_t& send){
291 auto femvColors = femv->getLocalViewHost(Tpetra::Access::ReadWrite);
292 auto colors = subview(femvColors, Kokkos::ALL, 0);
293 int nprocs = comm->getSize();
294
295 std::vector<int> sendcnts(comm->getSize(), 0);
296 std::vector<gno_t> sdispls(comm->getSize()+1, 0);
297 for(size_t i = 0; i < verts_to_send_size(0); i++){
298 for(size_t j = 0; j < procs_to_send.at(verts_to_send(i)).size(); j++){
299 sendcnts[procs_to_send.at(verts_to_send(i))[j]] += 2;
300 }
301 }
302
303 sdispls[0] = 0;
304 gno_t sendsize = 0;
305 std::vector<int> sentcount(nprocs, 0);
306
307 for(int i = 1; i < comm->getSize()+1; i++){
308 sdispls[i] = sdispls[i-1] + sendcnts[i-1];
309 sendsize += sendcnts[i-1];
310 }
311 send = sendsize;
312 std::vector<gno_t> sendbuf(sendsize,0);
313
314 for(size_t i = 0; i < verts_to_send_size(0); i++){
315 std::vector<int> procs = procs_to_send.at(verts_to_send(i));
316 for(size_t j = 0; j < procs.size(); j++){
317 size_t idx = sdispls[procs[j]] + sentcount[procs[j]];
318 sentcount[procs[j]] += 2;
319 sendbuf[idx++] = mapOwnedPlusGhosts->getGlobalElement(verts_to_send(i));
320 sendbuf[idx] = colors(verts_to_send(i));
321 }
322 }
323
324 Teuchos::ArrayView<int> sendcnts_view = Teuchos::arrayViewFromVector(sendcnts);
325 Teuchos::ArrayView<gno_t> sendbuf_view = Teuchos::arrayViewFromVector(sendbuf);
326 Teuchos::ArrayRCP<gno_t> recvbuf;
327 std::vector<int> recvcnts(comm->getSize(), 0);
328 Teuchos::ArrayView<int> recvcnts_view = Teuchos::arrayViewFromVector(recvcnts);
329
330 //if we're reporting timings, remove the computation imbalance from the comm timer.
331 if(timing) comm->barrier();
332 double comm_total = 0.0;
333 double comm_temp = timer();
334
335 AlltoAllv<gno_t>(*comm, *env, sendbuf_view, sendcnts_view, recvbuf, recvcnts_view);
336 comm_total += timer() - comm_temp;
337
338 gno_t recvsize = 0;
339
340 for(int i = 0; i < recvcnts_view.size(); i++){
341 recvsize += recvcnts_view[i];
342 }
343 recv = recvsize;
344 for(int i = 0; i < recvsize; i+=2){
345 size_t lid = mapOwnedPlusGhosts->getLocalElement(recvbuf[i]);
346 if(lid<nVtx && verbose) std::cout<<comm->getRank()<<": received a locally owned vertex, somehow\n";
347 colors(lid) = recvbuf[i+1];
348 }
349
350 return comm_total;
351 }
352
353 RCP<const base_adapter_t> adapter;
354 RCP<GraphModel<base_adapter_t> > model;
355 RCP<Teuchos::ParameterList> pl;
356 RCP<Environment> env;
357 RCP<const Teuchos::Comm<int> > comm;
358 bool verbose;
359 bool timing;
360 public:
361 //constructor for the hybrid distributed distance-1 algorithm
363 const RCP<const base_adapter_t> &adapter_,
364 const RCP<Teuchos::ParameterList> &pl_,
365 const RCP<Environment> &env_,
366 const RCP<const Teuchos::Comm<int> > &comm_)
367 : adapter(adapter_), pl(pl_), env(env_), comm(comm_) {
368 verbose = pl->get<bool>("verbose",false);
369 timing = pl->get<bool>("timing", false);
370 if(verbose) std::cout<<comm->getRank()<<": inside coloring constructor\n";
371 modelFlag_t flags;
372 flags.reset();
373 buildModel(flags);
374 if(verbose) std::cout<<comm->getRank()<<": done constructing coloring class\n";
375 }
376
377
378 //Main entry point for graph coloring
379 void color( const RCP<ColoringSolution<Adapter> > &solution ) {
380 if(verbose) std::cout<<comm->getRank()<<": inside coloring function\n";
381 //this will color the global graph in a manner similar to Zoltan
382
383 //get vertex GIDs in a locally indexed array
384 if(verbose) std::cout<<comm->getRank()<<": getting owned vtxIDs\n";
385 ArrayView<const gno_t> vtxIDs;
386 ArrayView<StridedData<lno_t, scalar_t> > vwgts;
387 size_t nVtx = model->getVertexList(vtxIDs, vwgts);
388 //we do not use weights at this point
389 if(verbose) std::cout<<comm->getRank()<<": getting edge list\n";
390 //get edge information from the model
391 ArrayView<const gno_t> adjs;
392 ArrayView<const offset_t> offsets;
393 ArrayView<StridedData<lno_t, scalar_t> > ewgts;
394 model->getEdgeList(adjs, offsets, ewgts);
395 //again, weights are not used
396
397 RCP<const map_t> mapOwned;
398 RCP<const map_t> mapWithCopies;
399
400 std::vector<gno_t> finalGIDs;
401 std::vector<offset_t> finalOffset_vec;
402 std::vector<lno_t> finalAdjs_vec;
403
404 std::vector<lno_t> reorderToLocal;
405 for(size_t i = 0; i< nVtx; i++) reorderToLocal.push_back(i);
406 if(verbose) std::cout<<comm->getRank()<<": Setting up local datastructures\n";
407 //Set up a typical local mapping here.
408 std::unordered_map<gno_t,lno_t> globalToLocal;
409 std::vector<gno_t> ownedPlusGhosts;
410 for (gno_t i = 0; i < vtxIDs.size(); i++){
411 if(vtxIDs[i] < 0 && verbose) std::cout<<comm->getRank()<<": found a negative GID\n";
412 globalToLocal[vtxIDs[i]] = i;
413 ownedPlusGhosts.push_back(vtxIDs[i]);
414 }
415 gno_t nghosts = 0;
416 for (int i = 0; i < adjs.size(); i++){
417 if(globalToLocal.count(adjs[i]) == 0){
418 //new unique ghost found
419 if(adjs[i] < 0 && verbose) std::cout<<comm->getRank()<<": found a negative adjacency\n";
420 ownedPlusGhosts.push_back(adjs[i]);
421 globalToLocal[adjs[i]] = vtxIDs.size() + nghosts;
422 nghosts++;
423
424 }
425 }
426 if(verbose) std::cout<<comm->getRank()<<": vec.max_size() = "<<finalAdjs_vec.max_size()<<", adjs.size() = "<<adjs.size()<<"\n";
427 finalAdjs_vec.resize(adjs.size());
428 for(size_t i = 0; i < finalAdjs_vec.size();i++){
429 finalAdjs_vec[i] = globalToLocal[adjs[i]];
430 }
431 for(int i = 0; i < offsets.size(); i++) finalOffset_vec.push_back(offsets[i]);
432 finalGIDs = ownedPlusGhosts;
433
434
435 if(verbose) std::cout<<comm->getRank()<<": creating Tpetra Maps\n";
436 Tpetra::global_size_t dummy = Teuchos::OrdinalTraits
437 <Tpetra::global_size_t>::invalid();
438 mapOwned = rcp(new map_t(dummy, vtxIDs, 0, comm));
439
440 dummy = Teuchos::OrdinalTraits <Tpetra::global_size_t>::invalid();
441 mapWithCopies = rcp(new map_t(dummy,
442 Teuchos::arrayViewFromVector(ownedPlusGhosts),
443 0, comm));
444
445 //create the FEMultiVector for the distributed communication.
446 //We also use the views from this datastructure as arguments to
447 //KokkosKernels coloring functions.
448 if(verbose)std::cout<<comm->getRank()<<": creating FEMultiVector\n";
449 typedef Tpetra::Import<lno_t, gno_t> import_t;
450 Teuchos::RCP<import_t> importer = rcp(new import_t(mapOwned,
451 mapWithCopies));
452 Teuchos::RCP<femv_t> femv = rcp(new femv_t(mapOwned,
453 importer, 1, true));
454 //Get color array to fill
455 ArrayRCP<int> colors = solution->getColorsRCP();
456 for(size_t i=0; i<nVtx; i++){
457 colors[i] = 0;
458 }
459
460 //Create random numbers seeded on global IDs so that we don't
461 //need to communicate for consistency. These numbers determine
462 //which vertex gets recolored in the event of a conflict.
463 //taken directly from the Zoltan coloring implementation
464 std::vector<int> rand(finalGIDs.size());
465 for(size_t i = 0; i < finalGIDs.size(); i++){
466 std::srand(finalGIDs[i]);
467 rand[i] = std::rand();
468 }
469
470 //find out who owns the ghosts on this process
471 std::vector<int> ghostOwners(finalGIDs.size() - nVtx);
472 std::vector<gno_t> ghosts(finalGIDs.size() - nVtx);
473 for(size_t i = nVtx; i < finalGIDs.size(); i++) ghosts[i-nVtx] = finalGIDs[i];
474 ArrayView<int> owners = Teuchos::arrayViewFromVector(ghostOwners);
475 ArrayView<const gno_t> ghostGIDs = Teuchos::arrayViewFromVector(ghosts);
476 mapOwned->getRemoteIndexList(ghostGIDs, owners);
477
478 //create const views of the CSR representation of the local graph
479 ArrayView<const offset_t> finalOffsets = Teuchos::arrayViewFromVector(finalOffset_vec);
480 ArrayView<const lno_t> finalAdjs = Teuchos::arrayViewFromVector(finalAdjs_vec);
481 ArrayView<const int> rand_view = Teuchos::arrayViewFromVector(rand);
482 ArrayView<const gno_t> gid_view = Teuchos::arrayViewFromVector(finalGIDs);
483
484 //find out which remote processes have a ghost copy of a local owned vertex.
485 Teuchos::ArrayView<const lno_t> exportLIDs = importer->getExportLIDs();
486 Teuchos::ArrayView<const int> exportPIDs = importer->getExportPIDs();
487
488 //create a quick lookup from LID -> remote processes with a ghost copy
489 std::unordered_map<lno_t, std::vector<int>> procs_to_send;
490 for(int i = 0; i < exportLIDs.size(); i++){
491 procs_to_send[exportLIDs[i]].push_back(exportPIDs[i]);
492 }
493
494 // call coloring function
495 hybridGMB(nVtx, finalAdjs, finalOffsets,femv,gid_view,rand_view,owners,mapWithCopies, procs_to_send);
496
497 //copy colors to the output array.
498 auto femvdata = femv->getData(0);
499 for(int i = 0; i < colors.size(); i++){
500 colors[reorderToLocal[i]] = femvdata[i];
501 }
502
503 }
504
505 //This function contains all coloring logic.
506 //
507 //OUTPUT ARGS:
508 //
509 // femv: FEMultiVector with a dual-view of the vertex colors.
510 // The colors will be a valid distance-1 coloring after this
511 // function returns.
512 //
513 //INPUT ARGS:
514 //
515 // nVtx: number of locally owned vertices
516 //
517 // adjs: CSR adjacencies for the local graph
518 //
519 // offsets: CSR offsets into adjs for the local graph
520 //
521 // gids: global IDs for every vertex on this process.
522 // indexed by local ID
523 //
524 // rand: random number generated for every vertex on
525 // this process. Indexed by local ID
526 //
527 // owners: owners of each ghost vertex.
528 // owners[i] = the owning process for vertex with GID gids[i+nVtx]
529 //
530 // mapOwnedPlusGhosts: Tpetra map that translates between LIDs and GIDs
531 //
532 // procs_to_send: maps from LIDs to Process IDs.
533 // procs_to_send[LID] gives a list of processes that have
534 // a ghost copy of LID.
535 //
536 void hybridGMB(const size_t nVtx,
537 const Teuchos::ArrayView<const lno_t>& adjs,
538 const Teuchos::ArrayView<const offset_t>& offsets,
539 const Teuchos::RCP<femv_t>& femv,
540 const Teuchos::ArrayView<const gno_t>& gids,
541 const Teuchos::ArrayView<const int>& rand,
542 const Teuchos::ArrayView<const int>& owners,
543 RCP<const map_t> mapOwnedPlusGhosts,
544 const std::unordered_map<lno_t, std::vector<int>>& procs_to_send){
545 if(verbose) std::cout<<comm->getRank()<<": inside coloring algorithm\n";
546
547 //initialize timers
548 double total_time = 0.0;
549 double interior_time = 0.0;
550 double comm_time = 0.0;
551 double comp_time = 0.0;
552 double recoloring_time = 0.0;
553 double conflict_detection = 0.0;
554
555 const int numStatisticRecordingRounds = 100;
556
557 //Put together local and remote degrees
558 //1. Communicate remote GIDs to owning processes
559 std::vector<int> deg_send_cnts(comm->getSize(),0);
560 std::vector<gno_t> deg_sdispls(comm->getSize()+1,0);
561 for(int i = 0; i < owners.size(); i++){
562 deg_send_cnts[owners[i]]++;
563 }
564 deg_sdispls[0] = 0;
565 gno_t deg_sendsize = 0;
566 std::vector<int> deg_sentcount(comm->getSize(),0);
567 for(int i = 1; i < comm->getSize()+1; i++){
568 deg_sdispls[i] = deg_sdispls[i-1] + deg_send_cnts[i-1];
569 deg_sendsize += deg_send_cnts[i-1];
570 }
571 std::vector<gno_t> deg_sendbuf(deg_sendsize,0);
572 for(int i = 0; i < owners.size(); i++){
573 size_t idx = deg_sdispls[owners[i]] + deg_sentcount[owners[i]];
574 deg_sentcount[owners[i]]++;
575 deg_sendbuf[idx] = mapOwnedPlusGhosts->getGlobalElement(i+nVtx);
576 }
577 Teuchos::ArrayView<int> deg_send_cnts_view = Teuchos::arrayViewFromVector(deg_send_cnts);
578 Teuchos::ArrayView<gno_t> deg_sendbuf_view = Teuchos::arrayViewFromVector(deg_sendbuf);
579 Teuchos::ArrayRCP<gno_t> deg_recvbuf;
580 std::vector<int> deg_recvcnts(comm->getSize(),0);
581 Teuchos::ArrayView<int> deg_recvcnts_view = Teuchos::arrayViewFromVector(deg_recvcnts);
582 AlltoAllv<gno_t>(*comm, *env, deg_sendbuf_view, deg_send_cnts_view, deg_recvbuf, deg_recvcnts_view);
583
584 //2. replace GID with local degree
585 for(int i = 0; i < deg_recvbuf.size(); i++){
586 lno_t lid = mapOwnedPlusGhosts->getLocalElement(deg_recvbuf[i]);
587 deg_recvbuf[i] = offsets[lid+1] - offsets[lid];
588 }
589 //3. send modified buffer back
590 ArrayRCP<gno_t> ghost_degrees;
591 AlltoAllv<gno_t>(*comm, *env, deg_recvbuf(), deg_recvcnts_view, ghost_degrees, deg_send_cnts_view);
592 //determine max degree locally and globally
593 Kokkos::View<gno_t*, device_type> ghost_degrees_dev("ghost degree view",ghost_degrees.size());
594 typename Kokkos::View<gno_t*, device_type>::HostMirror ghost_degrees_host = Kokkos::create_mirror(ghost_degrees_dev);
595 for(int i = 0; i < ghost_degrees.size(); i++){
596 lno_t lid = mapOwnedPlusGhosts->getLocalElement(deg_sendbuf[i]);
597 ghost_degrees_host(lid-nVtx) = ghost_degrees[i];
598 }
599 Kokkos::deep_copy(ghost_degrees_dev, ghost_degrees_host);
600
601 offset_t local_max_degree = 0;
602 offset_t global_max_degree = 0;
603 for(size_t i = 0; i < nVtx; i++){
604 offset_t curr_degree = offsets[i+1] - offsets[i];
605 if(curr_degree > local_max_degree){
606 local_max_degree = curr_degree;
607 }
608 }
609 Teuchos::reduceAll<int, offset_t>(*comm,Teuchos::REDUCE_MAX,1, &local_max_degree, &global_max_degree);
610 if(comm->getRank() == 0 && verbose) std::cout<<"Input has max degree "<<global_max_degree<<"\n";
611 if(verbose)std::cout<<comm->getRank()<<": creating Kokkos Views\n";
612
613 Kokkos::View<offset_t*, device_type> dist_degrees("Owned+Ghost degree view",rand.size());
614 typename Kokkos::View<offset_t*, device_type>::HostMirror dist_degrees_host = Kokkos::create_mirror(dist_degrees);
615 //set degree counts for ghosts
616 for(int i = 0; i < adjs.size(); i++){
617 if((size_t)adjs[i] < nVtx) continue;
618 dist_degrees_host(adjs[i])++;
619 }
620 //set degree counts for owned verts
621 for(int i = 0; i < offsets.size()-1; i++){
622 dist_degrees_host(i) = offsets[i+1] - offsets[i];
623 }
624
625 Kokkos::View<offset_t*, device_type> dist_offsets("Owned+Ghost Offset view", rand.size()+1);
626 typename Kokkos::View<offset_t*, device_type>::HostMirror dist_offsets_host = Kokkos::create_mirror(dist_offsets);
627
628 //set offsets and total # of adjacencies
629 dist_offsets_host(0) = 0;
630 uint64_t total_adjs = 0;
631 for(Teuchos_Ordinal i = 1; i < rand.size()+1; i++){
632 dist_offsets_host(i) = dist_degrees_host(i-1) + dist_offsets_host(i-1);
633 total_adjs+= dist_degrees_host(i-1);
634 }
635
636 Kokkos::View<lno_t*, device_type> dist_adjs("Owned+Ghost adjacency view", total_adjs);
637 typename Kokkos::View<lno_t*, device_type>::HostMirror dist_adjs_host = Kokkos::create_mirror(dist_adjs);
638 //now, use the degree view as a counter
639 for(Teuchos_Ordinal i = 0; i < rand.size(); i++){
640 dist_degrees_host(i) = 0;
641 }
642 for(int i = 0; i < adjs.size(); i++) dist_adjs_host(i) = adjs[i];
643 if(comm->getSize() > 1){
644 for(size_t i = 0; i < nVtx; i++){
645 for(offset_t j = offsets[i]; j < offsets[i+1]; j++){
646 //if the adjacency is a ghost
647 if( (size_t)adjs[j] >= nVtx){
648 //add the symmetric edge to its adjacency list (already accounted for by offsets)
649 dist_adjs_host(dist_offsets_host(adjs[j]) + dist_degrees_host(adjs[j])) = i;
650 dist_degrees_host(adjs[j])++;
651 }
652 }
653 }
654 }
655
656 if(verbose) std::cout<<comm->getRank()<<": copying host mirrors to device views\n";
657 //copy the host data to device memory
658 Kokkos::deep_copy(dist_degrees, dist_degrees_host); //may be unnecessary
659 Kokkos::deep_copy(dist_offsets, dist_offsets_host);
660 Kokkos::deep_copy(dist_adjs, dist_adjs_host);
661 if(verbose) std::cout<<comm->getRank()<<": done copying to device\n";
662
663 //counter in UVM memory for how many vertices need recoloring.
664 Kokkos::View<gno_t*, device_type> recoloringSize("Recoloring Queue Size",1);
665 typename Kokkos::View<gno_t*, device_type>::HostMirror recoloringSize_host = Kokkos::create_mirror(recoloringSize);
666 recoloringSize_host(0) = 0;
667 Kokkos::deep_copy(recoloringSize, recoloringSize_host);
668
669 //device copy of the random tie-breakers
670 Kokkos::View<int*,device_type> rand_dev("randVec",rand.size());
671 typename Kokkos::View<int*, device_type>::HostMirror rand_host = Kokkos::create_mirror(rand_dev);
672 for(Teuchos_Ordinal i = 0; i < rand.size(); i++){
673 rand_host(i) = rand[i];
674 }
675
676 //device copy of global IDs
677 Kokkos::View<gno_t*, device_type> gid_dev("GIDs",gids.size());
678 typename Kokkos::View<gno_t*,device_type>::HostMirror gid_host = Kokkos::create_mirror(gid_dev);
679 for(Teuchos_Ordinal i = 0; i < gids.size(); i++){
680 gid_host(i) = gids[i];
681 }
682
683 //copy host views to device memory
684 Kokkos::deep_copy(rand_dev,rand_host);
685 Kokkos::deep_copy(gid_dev, gid_host);
686
687 if(verbose)std::cout<<comm->getRank()<<": done creating recoloring datastructures\n";
688 //count boundary size to allocate list of vertices to recolor.
689 offset_t boundary_size = 0;
690 for(size_t i = 0; i < nVtx; i++){
691 for(offset_t j = offsets[i]; j < offsets[i+1]; j++){
692 if((size_t)adjs[j] >= nVtx) {
693 boundary_size++;
694 break;
695 }
696 }
697 }
698 if(verbose)std::cout<<comm->getRank()<<": creating send views\n";
699
700 //list of vertices to send to remote processes
701 Kokkos::View<lno_t*, device_type> verts_to_send_view("verts to send",boundary_size);
702 Kokkos::parallel_for("init verts_to_send_view",
703 Kokkos::RangePolicy<execution_space, int>(0,boundary_size),
704 KOKKOS_LAMBDA(const int& i){
705 verts_to_send_view(i) = -1;
706 });
707
708 //size information for the list of vertices to send. Also includes an atomic copy
709 Kokkos::View<size_t*, device_type> verts_to_send_size("verts to send size",1);
710 Kokkos::View<size_t*, device_type, Kokkos::MemoryTraits<Kokkos::Atomic> > verts_to_send_size_atomic = verts_to_send_size;
711 typename Kokkos::View<lno_t*, device_type>::HostMirror verts_to_send_host = create_mirror(verts_to_send_view);
712 typename Kokkos::View<size_t*,device_type>::HostMirror verts_to_send_size_host = create_mirror(verts_to_send_size);
713 //initialize the device view with a value of zero
714 verts_to_send_size_host(0) = 0;
715 deep_copy(verts_to_send_size, verts_to_send_size_host);
716
717 if(verbose)std::cout<<comm->getRank()<<": Done creating send views, initializing...\n";
718 if(verbose)std::cout<<comm->getRank()<<": boundary_size = "<<boundary_size<<" verts_to_send_size_atomic(0) = "<<verts_to_send_size_atomic(0)<<"\n";
719 //initially the verts to send include all boundary vertices.
720 Kokkos::parallel_for("Initialize verts_to_send",
721 Kokkos::RangePolicy<execution_space, int>(0,nVtx),
722 KOKKOS_LAMBDA(const int&i){
723 for(offset_t j = dist_offsets(i); j < dist_offsets(i+1); j++){
724 if((size_t)dist_adjs(j) >= nVtx){
725 verts_to_send_view(verts_to_send_size_atomic(0)++) = i;
726 break;
727 }
728 }
729 });
730 Kokkos::fence();
731
732
733 //used to replace the incorrectly recolored ghosts
734 Kokkos::View<int*, device_type> ghost_colors("ghost color backups", rand.size()-nVtx);
735 if(verbose)std::cout<<comm->getRank()<<": Done initializing\n";
736 gno_t sentPerRound[numStatisticRecordingRounds];
737 gno_t recvPerRound[numStatisticRecordingRounds];
738
739 if(verbose) std::cout<<comm->getRank()<<": Coloring interior\n";
740 //initialize interior and total timers, barrier to prevent any imbalance from setup.
741 //Only use a barrier if timing is happening.
742 if(timing) comm->barrier();
743 interior_time = timer();
744 total_time = timer();
745 //call the KokkosKernels coloring function with the Tpetra default spaces.
746 bool use_vbbit = (global_max_degree < 6000);
747 this->colorInterior<execution_space,memory_space>
748 (nVtx, dist_adjs, dist_offsets, femv,dist_adjs,0,use_vbbit);
749 if(timing){
750 interior_time = timer() - interior_time;
751 comp_time = interior_time;
752 }
753 if(verbose) std::cout<<comm->getRank()<<": Going to recolor\n";
754 bool recolor_degrees = this->pl->template get<bool>("recolor_degrees", true);
755
756 //if there is more than a single process, check distributed conflicts and recolor
757 if(comm->getSize() > 1){
758
759 if(verbose)std::cout<<comm->getRank()<<": going to communicate\n";
760
761 //communicate
762 Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
763 Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
764 gno_t recv, sent;
765 comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,
766 nVtx,
767 verts_to_send_host,
768 verts_to_send_size_host,
769 femv,
770 procs_to_send,
771 recv,
772 sent);
773 sentPerRound[0] = sent;
774 recvPerRound[0] = recv;
775 if(verbose) std::cout<<comm->getRank()<<": done communicating\n";
776 verts_to_send_size_host(0) = 0;
777 deep_copy(verts_to_send_size, verts_to_send_size_host);
778 //set the old ghost colors
779 //get the colors from the femv
780 Kokkos::View<int**, Kokkos::LayoutLeft, device_type> femvColors =
781 femv->template getLocalView<device_type>(Tpetra::Access::ReadWrite); // Partial write
782 Kokkos::View<int*, device_type> femv_colors = subview(femvColors, Kokkos::ALL, 0);
783 Kokkos::parallel_for("get colors from femv",
784 Kokkos::RangePolicy<execution_space, int>(0,rand.size()-nVtx),
785 KOKKOS_LAMBDA(const int& i){
786 ghost_colors(i) = femv_colors(i+nVtx);
787 });
788 Kokkos::fence();
789 //detect conflicts on the device, uncolor conflicts consistently.
790 double temp = timer();
791 detectConflicts<execution_space, memory_space>(nVtx,
792 rand.size()-nVtx,
793 dist_offsets,
794 dist_adjs,
795 femv_colors,
796 verts_to_send_view,
797 verts_to_send_size_atomic,
798 recoloringSize,
799 rand_dev,
800 gid_dev,
801 ghost_degrees_dev,
802 recolor_degrees);
803 deep_copy(recoloringSize_host, recoloringSize);
804 conflict_detection += timer() - temp;
805 comp_time += conflict_detection;
806 }
807 //end the initial recoloring
808 if(verbose)std::cout<<comm->getRank()<<": done initial recoloring, begin recoloring loop\n";
809 double totalPerRound[numStatisticRecordingRounds];
810 double commPerRound[numStatisticRecordingRounds];
811 double compPerRound[numStatisticRecordingRounds];
812 double recoloringPerRound[numStatisticRecordingRounds];
813 double conflictDetectionPerRound[numStatisticRecordingRounds];
814 double serialRecoloringPerRound[numStatisticRecordingRounds];
815 int vertsPerRound[numStatisticRecordingRounds];
816 bool done = false; //We're only done when all processors are done
817 if(comm->getSize() == 1) done = true;
818 totalPerRound[0] = interior_time + comm_time + conflict_detection;
819 recoloringPerRound[0] = 0;
820 commPerRound[0] = comm_time;
821 compPerRound[0] = interior_time + conflict_detection;
822 conflictDetectionPerRound[0] = conflict_detection;
823 recoloringPerRound[0] = 0;
824 vertsPerRound[0] = 0;
825 int distributedRounds = 1; //this is the same across all processors
826 int serial_threshold = this->pl->template get<int>("serial_threshold",0);
827
828 Kokkos::View<lno_t*, device_type> verts_to_recolor("verts_to_recolor", boundary_size);
829 typename Kokkos::View<int*, device_type>::HostMirror ghost_colors_host;
830 //while the queue is not empty
831 while(recoloringSize_host(0) > 0 || !done){
832 if(recoloringSize_host(0) < serial_threshold) break;
833 //get a subview of the colors:
834 auto femvColors = femv->getLocalViewDevice(Tpetra::Access::ReadWrite);
835 auto femv_colors = subview(femvColors, Kokkos::ALL, 0);
836 //color everything in the recoloring queue, put everything on conflict queue
837 if(distributedRounds < numStatisticRecordingRounds) {
838 vertsPerRound[distributedRounds] = recoloringSize_host(0);
839 }
840
841 //copying the send view to the recolor view is necessary because
842 //KokkosKernels can change the view passed in, and we need the send view
843 //intact for communication.
844 Kokkos::deep_copy(verts_to_recolor, verts_to_send_view);
845
846 double recolor_temp = timer();
847 //use KokkosKernels to recolor the conflicting vertices.
848 deep_copy(verts_to_send_size_host, verts_to_send_size);
849 if(verts_to_send_size_host(0) > 0){
850 this->colorInterior<execution_space,
851 memory_space>(femv_colors.size(),
852 dist_adjs,dist_offsets,
853 femv,
854 verts_to_recolor,
855 verts_to_send_size_host(0),
856 use_vbbit);
857 }
858
859 if(distributedRounds < numStatisticRecordingRounds){
860 recoloringPerRound[distributedRounds] = timer() - recolor_temp;
861 recoloring_time += recoloringPerRound[distributedRounds];
862 comp_time += recoloringPerRound[distributedRounds];
863 compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
864 totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
865 } else if(timing) {
866 double recolor_round_time = timer() - recolor_temp;
867 recoloring_time += recolor_round_time;
868 comp_time += recolor_round_time;
869 }
870
871 //reset the recoloringSize device host and device views
872 //to zero
873 recoloringSize_host(0) = 0;
874 Kokkos::deep_copy(recoloringSize,recoloringSize_host);
875
876 Kokkos::parallel_for("set femv colors",
877 Kokkos::RangePolicy<execution_space, int>(0,rand.size()-nVtx),
878 KOKKOS_LAMBDA(const int& i){
879 femv_colors(i+nVtx) = ghost_colors(i);
880 });
881 Kokkos::fence();
882 //communicate
883 Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
884 Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
885 gno_t sent,recv;
886 // Reset device views
887 femvColors = decltype(femvColors)();
888 femv_colors = decltype(femv_colors)();
889
890 double curr_comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,
891 nVtx,
892 verts_to_send_host,
893 verts_to_send_size_host,
894 femv,
895 procs_to_send,
896 recv,
897 sent);
898 comm_time += curr_comm_time;
899 if(distributedRounds < numStatisticRecordingRounds){
900 commPerRound[distributedRounds] = curr_comm_time;
901 sentPerRound[distributedRounds] = sent;
902 recvPerRound[distributedRounds] = recv;
903 totalPerRound[distributedRounds] += commPerRound[distributedRounds];
904 }
905 //detect conflicts in parallel. For a detected conflict,
906 //reset the vertex-to-be-recolored's color to 0, in order to
907 //allow KokkosKernels to recolor correctly.
908
909 femvColors = femv->getLocalViewDevice(Tpetra::Access::ReadWrite);
910 femv_colors = subview(femvColors, Kokkos::ALL, 0);
911 Kokkos::parallel_for("get femv colors 2",
912 Kokkos::RangePolicy<execution_space, int>(0,rand.size()-nVtx),
913 KOKKOS_LAMBDA(const int& i){
914 ghost_colors(i) = femv_colors(i+nVtx);
915 });
916 Kokkos::fence();
917 verts_to_send_size_host(0) = 0;
918 deep_copy(verts_to_send_size, verts_to_send_size_host);
919 double detection_temp = timer();
920 detectConflicts<execution_space, memory_space>(nVtx,
921 rand.size()-nVtx,
922 dist_offsets,
923 dist_adjs,
924 femv_colors,
925 verts_to_send_view,
926 verts_to_send_size_atomic,
927 recoloringSize,
928 rand_dev,
929 gid_dev,
930 ghost_degrees_dev,
931 recolor_degrees);
932
933 Kokkos::deep_copy(recoloringSize_host, recoloringSize);
934
935 if(distributedRounds < numStatisticRecordingRounds){
936 conflictDetectionPerRound[distributedRounds] = timer() - detection_temp;
937 conflict_detection += conflictDetectionPerRound[distributedRounds];
938 compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
939 totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
940 comp_time += conflictDetectionPerRound[distributedRounds];
941 } else if(timing){
942 double conflict_detection_round_time = timer()- detection_temp;
943 conflict_detection += conflict_detection_round_time;
944 comp_time += conflict_detection_round_time;
945 }
946 //do a reduction to determine if we're done
947 int globalDone = 0;
948 int localDone = recoloringSize_host(0);
949 Teuchos::reduceAll<int, int>(*comm,Teuchos::REDUCE_SUM,1, &localDone, &globalDone);
950 //We're only allowed to stop once everyone has no work to do.
951 //collectives will hang if one process exits.
952 distributedRounds++;
953 done = !globalDone;
954 }
955
956 if(recoloringSize_host(0) > 0 || !done){
957 ghost_colors_host = Kokkos::create_mirror_view(ghost_colors);
958 deep_copy(ghost_colors_host, ghost_colors);
959 deep_copy(verts_to_send_host, verts_to_send_view);
960 deep_copy(verts_to_send_size_host, verts_to_send_size);
961 }
962
963
964 //finish the local coloring in serial on the host
965 while(recoloringSize_host(0) > 0 || !done){
966 //Use non-templated call to get the Host view
967 auto femvColors = femv->getLocalViewHost(Tpetra::Access::ReadWrite);
968 auto femv_colors = subview(femvColors, Kokkos::ALL, 0);
969 /*Kokkos::View<int*, Kokkos::Device<Kokkos::Serial, Kokkos::HostSpace>>femv_colors_host = create_mirror(femv_colors);
970 Kokkos::deep_copy(femv_colors_host, femv_colors);*/
971 if(distributedRounds < 100){
972 vertsPerRound[distributedRounds] = recoloringSize_host(0);
973 }
974
975 double recolor_temp = timer();
976 //use KokkosKernels to recolor the conflicting vertices
977 if(verts_to_send_size_host(0) > 0){
978 this->colorInterior<host_exec,
979 host_mem>
980 (femv_colors.size(), dist_adjs_host, dist_offsets_host, femv, verts_to_send_host, verts_to_send_size_host(0), true);
981 }
982
983 if(distributedRounds < numStatisticRecordingRounds){
984 recoloringPerRound[distributedRounds] = timer() - recolor_temp;
985 recoloring_time += recoloringPerRound[distributedRounds];
986 comp_time += recoloringPerRound[distributedRounds];
987 compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
988 totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
989 } else if(timing){
990 double recolor_serial_round_time = timer() - recolor_temp;
991 recoloring_time += recolor_serial_round_time;
992 comp_time += recolor_serial_round_time;
993 }
994
995 recoloringSize_host(0) = 0;
996
997 for(size_t i = 0; i < rand.size() -nVtx; i++){
998 femv_colors(i+nVtx) = ghost_colors_host(i);
999 }
1000
1001 gno_t sent,recv;
1002 double curr_comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,
1003 nVtx,
1004 verts_to_send_host,
1005 verts_to_send_size_host,
1006 femv,
1007 procs_to_send,
1008 recv,
1009 sent);
1010 comm_time += curr_comm_time;
1011
1012 if(distributedRounds < numStatisticRecordingRounds){
1013 commPerRound[distributedRounds] = curr_comm_time;
1014 sentPerRound[distributedRounds] = sent;
1015 recvPerRound[distributedRounds] = recv;
1016 totalPerRound[distributedRounds] += commPerRound[distributedRounds];
1017 }
1018 for(size_t i = 0; i < rand.size()-nVtx; i++){
1019 ghost_colors_host(i) = femv_colors(i+nVtx);
1020 }
1021
1022 verts_to_send_size_host(0) = 0;
1023 double detection_temp = timer();
1024 detectConflicts<host_exec, host_mem>(nVtx,
1025 rand.size()-nVtx,
1026 dist_offsets_host,
1027 dist_adjs_host,
1028 femv_colors,
1029 verts_to_send_host,
1030 verts_to_send_size_host,
1031 recoloringSize_host,
1032 rand_host,
1033 gid_host,
1034 ghost_degrees_host,
1035 recolor_degrees);
1036 if(distributedRounds < numStatisticRecordingRounds){
1037 conflictDetectionPerRound[distributedRounds] = timer() - detection_temp;
1038 conflict_detection += conflictDetectionPerRound[distributedRounds];
1039 compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1040 totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1041 comp_time += conflictDetectionPerRound[distributedRounds];
1042 } else if(timing){
1043 double conflict_detection_serial_round_time = timer() - detection_temp;
1044 conflict_detection += conflict_detection_serial_round_time;
1045 comp_time += conflict_detection_serial_round_time;
1046 }
1047 //do a reduction to determine if we're done
1048 int globalDone = 0;
1049 int localDone = recoloringSize_host(0);
1050 Teuchos::reduceAll<int, int>(*comm, Teuchos::REDUCE_SUM,1, &localDone, &globalDone);
1051 distributedRounds++;
1052 done = !globalDone;
1053 }
1054 total_time = timer() - total_time;
1055
1056 //only take time to compute statistics if the user wants them
1057 if(verbose){
1058 std::cout<<comm->getRank()<<": done recoloring loop, computing statistics\n";
1059 int localBoundaryVertices = 0;
1060 for(size_t i = 0; i < nVtx; i++){
1061 for(offset_t j = offsets[i]; j < offsets[i+1]; j++){
1062 if((size_t)adjs[j] >= nVtx){
1063 localBoundaryVertices++;
1064 break;
1065 }
1066 }
1067 }
1068 //print how many rounds of speculating/correcting happened (this should be the same for all ranks):
1069 //if(comm->getRank()==0) printf("did %d rounds of distributed coloring\n", distributedRounds);
1070 int totalBoundarySize = 0;
1071 int totalVertsPerRound[numStatisticRecordingRounds];
1072 double finalTotalPerRound[numStatisticRecordingRounds];
1073 double maxRecoloringPerRound[numStatisticRecordingRounds];
1074 double finalSerialRecoloringPerRound[numStatisticRecordingRounds];
1075 double minRecoloringPerRound[numStatisticRecordingRounds];
1076 double finalCommPerRound[numStatisticRecordingRounds];
1077 double finalCompPerRound[numStatisticRecordingRounds];
1078 double finalConflictDetectionPerRound[numStatisticRecordingRounds];
1079 gno_t finalRecvPerRound[numStatisticRecordingRounds];
1080 gno_t finalSentPerRound[numStatisticRecordingRounds];
1081 for(int i = 0; i < numStatisticRecordingRounds; i++) {
1082 totalVertsPerRound[i] = 0;
1083 finalTotalPerRound[i] = 0.0;
1084 maxRecoloringPerRound[i] = 0.0;
1085 minRecoloringPerRound[i] = 0.0;
1086 finalCommPerRound[i] = 0.0;
1087 finalCompPerRound[i] = 0.0;
1088 finalConflictDetectionPerRound[i] = 0.0;
1089 finalSentPerRound[i] = 0;
1090 finalRecvPerRound[i] = 0;
1091 }
1092 Teuchos::reduceAll<int,int>(*comm, Teuchos::REDUCE_SUM,1, &localBoundaryVertices,&totalBoundarySize);
1093 Teuchos::reduceAll<int,int>(*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,vertsPerRound,totalVertsPerRound);
1094 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,totalPerRound,finalTotalPerRound);
1095 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,recoloringPerRound,maxRecoloringPerRound);
1096 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MIN,numStatisticRecordingRounds,recoloringPerRound,minRecoloringPerRound);
1097 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,serialRecoloringPerRound,finalSerialRecoloringPerRound);
1098 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,commPerRound,finalCommPerRound);
1099 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,compPerRound,finalCompPerRound);
1100 Teuchos::reduceAll<int,double>(*comm,
1101 Teuchos::REDUCE_MAX,numStatisticRecordingRounds,conflictDetectionPerRound,finalConflictDetectionPerRound);
1102 Teuchos::reduceAll<int,gno_t> (*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,recvPerRound, finalRecvPerRound);
1103 Teuchos::reduceAll<int,gno_t> (*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,sentPerRound, finalSentPerRound);
1104
1105 std::cout << "Rank " << comm->getRank()
1106 << ": boundary size: " << localBoundaryVertices << std::endl;
1107 if(comm->getRank()==0)
1108 std::cout << "Total boundary size: " << totalBoundarySize << std::endl;
1109 for(int i = 0; i < std::min(distributedRounds,numStatisticRecordingRounds); i++){
1110 std::cout << "Rank " << comm->getRank()
1111 << ": recolor " << vertsPerRound[i]
1112 << " vertices in round " << i << std::endl;
1113 if(comm->getRank()==0) {
1114 std::cout << "recolored " << totalVertsPerRound[i]
1115 << " vertices in round " << i << std::endl;
1116 std::cout << "total time in round " << i
1117 << ": " << finalTotalPerRound[i] << std::endl;;
1118 std::cout << "recoloring time in round " << i
1119 << ": " << maxRecoloringPerRound[i] << std::endl;
1120 std::cout << "serial recoloring time in round " << i
1121 << ": " << finalSerialRecoloringPerRound[i] << std::endl;
1122 std::cout << "min recoloring time in round " << i
1123 << ": " << minRecoloringPerRound[i] << std::endl;
1124 std::cout << "conflict detection time in round " << i
1125 << ": " << finalConflictDetectionPerRound[i] << std::endl;
1126 std::cout << "comm time in round " << i
1127 << ": " << finalCommPerRound[i] << std::endl;
1128 std::cout << "total sent in round " << i
1129 << ": " << finalSentPerRound[i] << std::endl;
1130 std::cout << "total recv in round " << i
1131 << ": " << finalRecvPerRound[i] << std::endl;
1132 std::cout << "comp time in round " << i
1133 << ": " << finalCompPerRound[i] << std::endl;
1134 }
1135 }
1136 } else if(timing){
1137 double global_total_time = 0.0;
1138 double global_recoloring_time=0.0;
1139 double global_min_recoloring_time=0.0;
1140 double global_conflict_detection=0.0;
1141 double global_comm_time=0.0;
1142 double global_comp_time=0.0;
1143 double global_interior_time = 0.0;
1144 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&total_time,&global_total_time);
1145 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&recoloring_time,&global_recoloring_time);
1146 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MIN,1,&recoloring_time,&global_min_recoloring_time);
1147 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&conflict_detection,&global_conflict_detection);
1148 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&comm_time,&global_comm_time);
1149 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&comp_time,&global_comp_time);
1150 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&interior_time,&global_interior_time);
1151 comm->barrier();
1152 fflush(stdout);
1153 if(comm->getRank()==0){
1154 std::cout << "Total Time: " << global_total_time << std::endl;
1155 std::cout << "Interior Time: " << global_interior_time << std::endl;
1156 std::cout << "Recoloring Time: " << global_recoloring_time << std::endl;
1157 std::cout << "Min Recoloring Time: " << global_min_recoloring_time << std::endl;
1158 std::cout << "Conflict Detection Time: " << global_conflict_detection << std::endl;
1159 std::cout << "Comm Time: " << global_comm_time << std::endl;
1160 std::cout << "Comp Time: " << global_comp_time << std::endl;
1161 }
1162 }
1163 if(verbose) std::cout<<comm->getRank()<<": exiting coloring\n";
1164 }
1165};
1166
1167template <typename Adapter>
1168void AlgDistance1<Adapter>::buildModel(modelFlag_t &flags){
1169 flags.set(REMOVE_SELF_EDGES);
1170
1171 this->env->debug(DETAILED_STATUS, " building graph model");
1172 if(verbose) std::cout<<comm->getRank()<<": starting to construct graph model\n";
1173 this->model = rcp(new GraphModel<base_adapter_t>(this->adapter, this->env,
1174 this->comm, flags));
1175 if(verbose) std::cout<<comm->getRank()<<": done constructing graph model\n";
1176 this->env->debug(DETAILED_STATUS, " graph model built");
1177}
1178
1179
1180}//end namespace Zoltan2
1181#endif
AlltoAll communication methods.
Defines the ColoringSolution class.
Defines the GraphModel interface.
Traits class to handle conversions between gno_t/lno_t and TPL data types (e.g., ParMETIS's idx_t,...
A gathering of useful namespace methods.
Tpetra::FEMultiVector< femv_scalar_t, lno_t, gno_t > femv_t
typename femv_t::device_type device_type
Tpetra::Map< lno_t, gno_t > map_t
typename femv_t::host_view_type::device_type::memory_space host_mem
typename Adapter::gno_t gno_t
typename Adapter::scalar_t scalar_t
void detectConflicts(const size_t n_local, const size_t n_ghosts, Kokkos::View< offset_t *, Kokkos::Device< ExecutionSpace, MemorySpace > > dist_offsets, Kokkos::View< lno_t *, Kokkos::Device< ExecutionSpace, MemorySpace > > dist_adjs, Kokkos::View< int *, Kokkos::Device< ExecutionSpace, MemorySpace > > femv_colors, Kokkos::View< lno_t *, Kokkos::Device< ExecutionSpace, MemorySpace > > verts_to_send_view, Kokkos::View< size_t *, Kokkos::Device< ExecutionSpace, MemorySpace >, Kokkos::MemoryTraits< Kokkos::Atomic > > verts_to_send_size_atomic, Kokkos::View< gno_t *, Kokkos::Device< ExecutionSpace, MemorySpace > > recoloringSize, Kokkos::View< int *, Kokkos::Device< ExecutionSpace, MemorySpace > > rand, Kokkos::View< gno_t *, Kokkos::Device< ExecutionSpace, MemorySpace > > gid, Kokkos::View< gno_t *, Kokkos::Device< ExecutionSpace, MemorySpace > > ghost_degrees, bool recolor_degrees)
typename Adapter::base_adapter_t base_adapter_t
typename Adapter::lno_t lno_t
typename device_type::execution_space execution_space
typename femv_t::host_view_type::device_type::execution_space host_exec
void color(const RCP< ColoringSolution< Adapter > > &solution)
Coloring method.
typename device_type::memory_space memory_space
void hybridGMB(const size_t nVtx, const Teuchos::ArrayView< const lno_t > &adjs, const Teuchos::ArrayView< const offset_t > &offsets, const Teuchos::RCP< femv_t > &femv, const Teuchos::ArrayView< const gno_t > &gids, const Teuchos::ArrayView< const int > &rand, const Teuchos::ArrayView< const int > &owners, RCP< const map_t > mapOwnedPlusGhosts, const std::unordered_map< lno_t, std::vector< int > > &procs_to_send)
AlgDistance1(const RCP< const base_adapter_t > &adapter_, const RCP< Teuchos::ParameterList > &pl_, const RCP< Environment > &env_, const RCP< const Teuchos::Comm< int > > &comm_)
typename Adapter::offset_t offset_t
Algorithm defines the base class for all algorithms.
map_t::local_ordinal_type lno_t
map_t::global_ordinal_type gno_t
Created by mbenlioglu on Aug 31, 2020.
std::bitset< NUM_MODEL_FLAGS > modelFlag_t
@ REMOVE_SELF_EDGES
algorithm requires no self edges