Zoltan2
Loading...
Searching...
No Matches
Zoltan2_AlgHybrid2GL.hpp
Go to the documentation of this file.
1#ifndef _ZOLTAN2_2GHOSTLAYER_HPP_
2#define _ZOLTAN2_2GHOSTLAYER_HPP_
3
4#include <vector>
5#include <unordered_map>
6#include <iostream>
7#include <queue>
8#ifdef _WIN32
9#include <time.h>
10#else
11#include <sys/time.h>
12#endif
13
14#include "Zoltan2_Algorithm.hpp"
17#include "Zoltan2_Util.hpp"
18#include "Zoltan2_TPLTraits.hpp"
19#include "Zoltan2_AlltoAll.hpp"
20
21
22#include "Tpetra_Core.hpp"
23#include "Teuchos_RCP.hpp"
24#include "Tpetra_Import.hpp"
25#include "Tpetra_FEMultiVector.hpp"
26
27#include "KokkosKernels_Handle.hpp"
28#include "KokkosKernels_IOUtils.hpp"
29#include "KokkosGraph_Distance1Color.hpp"
30#include "KokkosGraph_Distance1ColorHandle.hpp"
31
35// for methods that use two layers of ghosts.
36
37
38namespace Zoltan2{
39
40template <typename Adapter>
41class AlgTwoGhostLayer : public Algorithm<Adapter> {
42
43 public:
44
45 using lno_t = typename Adapter::lno_t;
46 using gno_t = typename Adapter::gno_t;
47 using offset_t = typename Adapter::offset_t;
48 using scalar_t = typename Adapter::scalar_t;
49 using base_adapter_t = typename Adapter::base_adapter_t;
50 using map_t = Tpetra::Map<lno_t,gno_t>;
51 using femv_scalar_t = int;
52 using femv_t = Tpetra::FEMultiVector<femv_scalar_t, lno_t, gno_t>;
53 using device_type = typename femv_t::device_type;
54 using execution_space = typename device_type::execution_space;
55 using memory_space = typename device_type::memory_space;
56 using host_exec = typename femv_t::host_view_type::device_type::execution_space;
57 using host_mem = typename femv_t::host_view_type::device_type::memory_space;
58
59 double timer(){
60 struct timeval tp;
61 gettimeofday(&tp, NULL);
62 return ((double) (tp.tv_sec) + 1e-6 * tp.tv_usec);
63 }
64 private:
65
66 //Entry point for parallel local coloring
67 // nVtx is the number of vertices owned by the current process
68 //
69 // adjs_view is the adjacencies, indexed by offset_view
70 //
71 // offset_view is the CSR row map, used to index the adjs_view
72 //
73 // femv is the FEMultiVector that holds the colors for the vertices
74 // the colors will change in this function.
75 //
76 // vertex_list is a list of vertices to recolor
77 //
78 // vertex_list_size is the size of the list of vertices to recolor
79 // vertex_list_size = 0 means recolor all uncolored vertices
80 //
81 // recolor decides which KokkosKernels algorithm to use.
82 //
83 virtual void colorInterior(const size_t nVtx,
84 Kokkos::View<lno_t*, device_type > adjs_view,
85 Kokkos::View<offset_t*,device_type > offset_view,
86 Teuchos::RCP<femv_t> femv,
87 Kokkos::View<lno_t*, device_type> vertex_list,
88 size_t vertex_list_size = 0,
89 bool recolor=false) = 0;
90
91 //Entry point for serial local coloring
92 virtual void colorInterior_serial(const size_t nVtx,
93 typename Kokkos::View<lno_t*, device_type >::HostMirror adjs_view,
94 typename Kokkos::View<offset_t*,device_type >::HostMirror offset_view,
95 Teuchos::RCP<femv_t> femv,
96 typename Kokkos::View<lno_t*, device_type>::HostMirror vertex_list,
97 size_t vertex_list_size = 0,
98 bool recolor=false) = 0;
99 public:
100 //Entry point for parallel conflict detection
101 //INPUT ARGS
102 // n_local is the number of vertices owned by the current process
103 //
104 // dist_offsets_dev is the device view that holds the CSR offsets
105 // It holds offsets for all vertices, owned and ghosts.
106 // It is organized with owned first, and then ghost layers:
107 // -----------------------------------------------
108 // | owned | first | second |
109 // | vtx | ghost | ghost |
110 // | offsets | layer | layer |
111 // -----------------------------------------------
112 // The allocated length is equal to the sum of owned
113 // and ghost vertices + 1. Any vertex with LID >= n_local
114 // is a ghost.
115 //
116 // dist_adjs_dev is the device view that holds CSR adjacencies
117 //
118 // boundary_verts_view holds the local IDs of vertices in the boundary.
119 // By definition, boundary vertices are owned vertices
120 // that have a ghost in their neighborhood.
121 // (distance-1 coloring = 1-hop neighborhood)
122 // (distance-2 coloring = 2-hop neighborhood)
123 // Note: for D1-2GL we do not use this argument.
124 // It is possible to detect all D1 conflicts by only
125 // checking the ghost vertices, without constructing
126 // the boundary.
127 //
128 // rand is a device view that holds random numbers generated from GIDs,
129 // they are consistently generated across processes
130 //
131 // gid is a device view that holds the GIDs of each vertex on this process.
132 // It is indexable by local ID. It stores both owned and ghost
133 // vertices, and LIDs are ordered so that the structure looks like:
134 // -----------------------------------------
135 // | | first | second | owned LIDs are smallest
136 // | owned | ghost | ghost | ghost LIDs increase based
137 // | vtx | layer | layer | on layer (1 < 2)
138 // -----------------------------------------
139 // The allocated size is dist_offsets_dev.extent(0)-1.
140 //
141 //
142 // ghost_degrees is a device view that holds the degrees of ghost vertices only.
143 // A ghost with local ID n_local will be the first entry in this view.
144 // So, ghost_degrees[i] is the degree of the vtx with
145 // GID = gid[n_local+i].
146 //
147 // recolor_degrees is a boolean that determines whether or not we factor in vertex
148 // degrees on recoloring
149 //
150 //OUTPUT ARGS
151 // femv_colors is the device color view.
152 // After this function call, conflicts between vertices will
153 // be resolved via setting a vertex's color to 0. The vertex
154 // to be uncolored is determined by rules that are consistent
155 // across multiple processes.
156 //
157 // verts_to_recolor_view is a device view that holds the list
158 // of vertices to recolor. Any vertex
159 // uncolored by this function will appear in this
160 // view after the function returns.
161 //
162 // verts_to_recolor_size_atomic is an atomic device view that holds the
163 // size of verts_to_recolor_view.
164 // Effectively it represents how many
165 // vertices need to be recolored after
166 // conflict detection.
167 // It differs in size from the allocated size of
168 // verts_to_recolor_view.
169 // Atomicity is required for building
170 // verts_to_recolor_view in parallel, which
171 // is the reason this is a view instead of
172 // just an integer type.
173 //
174 // verts_to_send_view is a device view that holds the list of vertices
175 // that will need to be sent to remotes after recoloring
176 // Note: Only locally owned vertices should ever be in
177 // this view. A process cannot color ghosts correctly.
178 //
179 // verts_to_send_size_atomic is an atomic device view that holds the size of
180 // verts_to_send_view. It differs in size from the
181 // allocated size of verts_to_send_view.
182 // Atomicity is required for building
183 // verts_to_send_view in parallel,
184 // which is the reason this is a view instead of
185 // just an integer type.
186 //
187 // recoloringSize is a device view that stores the total number of
188 // vertices that were uncolored by the conflict detection.
189 //
190 virtual void detectConflicts(const size_t n_local,
191 Kokkos::View<offset_t*, device_type > dist_offsets_dev,
192 Kokkos::View<lno_t*, device_type > dist_adjs_dev,
193 Kokkos::View<int*,device_type > femv_colors,
194 Kokkos::View<lno_t*, device_type > boundary_verts_view,
195 Kokkos::View<lno_t*,
196 device_type > verts_to_recolor_view,
197 Kokkos::View<int*,
199 Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_recolor_size_atomic,
200 Kokkos::View<lno_t*,
201 device_type > verts_to_send_view,
202 Kokkos::View<size_t*,
204 Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_send_size_atomic,
205 Kokkos::View<size_t*, device_type> recoloringSize,
206 Kokkos::View<int*,
207 device_type> rand,
208 Kokkos::View<gno_t*,
209 device_type> gid,
210 Kokkos::View<gno_t*,
211 device_type> ghost_degrees,
212 bool recolor_degrees) = 0;
213
214 //Entry point for serial conflict detection
215 virtual void detectConflicts_serial(const size_t n_local,
216 typename Kokkos::View<offset_t*, device_type >::HostMirror dist_offsets_host,
217 typename Kokkos::View<lno_t*, device_type >::HostMirror dist_adjs_host,
218 typename Kokkos::View<int*,device_type >::HostMirror femv_colors,
219 typename Kokkos::View<lno_t*, device_type >::HostMirror boundary_verts_view,
220 typename Kokkos::View<lno_t*,device_type>::HostMirror verts_to_recolor_view,
221 typename Kokkos::View<int*,device_type>::HostMirror verts_to_recolor_size_atomic,
222 typename Kokkos::View<lno_t*,device_type>::HostMirror verts_to_send_view,
223 typename Kokkos::View<size_t*,device_type>::HostMirror verts_to_send_size_atomic,
224 typename Kokkos::View<size_t*, device_type>::HostMirror recoloringSize,
225 typename Kokkos::View<int*, device_type>::HostMirror rand,
226 typename Kokkos::View<gno_t*,device_type>::HostMirror gid,
227 typename Kokkos::View<gno_t*,device_type>::HostMirror ghost_degrees,
228 bool recolor_degrees) = 0;
229 //Entry point for the construction of the boundary
230 //INPUT ARGS
231 // n_local is the number of vertices owned by the current process
232 //
233 // dist_offsets_dev is the device view that holds the CSR offsets
234 //
235 // dist_adjs_dev is the device view that holds CSR adjacencies
236 //
237 // dist_offsets_host is the hostmirror that holds the CSR offsets
238 //
239 // dist_adjs_host is the hostmirror that holds the CSR adjacencies
240 //
241 //OUTPUT ARGS
242 // boundary_verts is an unallocated device view that will hold the
243 // list of boundary vertices.
244 //
245 // verts_to_send_view will hold the list of vertices to send
246 //
247 // verts_to_send_size_atomic will hold the number of vertices to
248 // send currently held in verts_to_send_view.
249 // verts_to_send_size_atomic differs
250 // from the allocated size of verts_to_send_view
251 // Atomicity is required for building
252 // verts_to_send_view in parallel, which is
253 // the reason this is a view instead of an
254 // integer type.
255 //
256 virtual void constructBoundary(const size_t n_local,
257 Kokkos::View<offset_t*, device_type> dist_offsets_dev,
258 Kokkos::View<lno_t*, device_type> dist_adjs_dev,
259 typename Kokkos::View<offset_t*, device_type>::HostMirror dist_offsets_host,
260 typename Kokkos::View<lno_t*, device_type>::HostMirror dist_adjs_host,
261 Kokkos::View<lno_t*, device_type>& boundary_verts,
262 Kokkos::View<lno_t*,
263 device_type > verts_to_send_view,
264 Kokkos::View<size_t*,
266 Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_send_size_atomic) = 0;
267
268 protected:
269 RCP<const base_adapter_t> adapter;
270 RCP<Teuchos::ParameterList> pl;
271 RCP<Environment> env;
272 RCP<const Teuchos::Comm<int> > comm;
274 bool timing;
275
276 private:
277 //This function constructs a CSR with complete adjacency information for
278 //first-layer ghost vertices. This CSR can contain edges from:
279 // first-layer ghosts to owned;
280 // first-layer ghosts to first-layer ghosts;
281 // first-layer ghosts to second-layer ghosts;
282 //1) Each process sends a list of ghost GIDs back to their owning process
283 // to request full adjacency information for that vertex.
284 //
285 //2) Initially each process sends back degree information to the requesting
286 // process.
287 //
288 //3) Then, each process can construct send buffers and receive schedules for
289 // the adjacency information for each requested GID it received,
290 // in addition to building the 2GL offsets and relabeling ghost LIDs
291 // to make the final CSR construction easier.
292 //
293 // 3a) We can construct the 2GL offsets here because each process has
294 // already received the degree information for each first-layer
295 // ghost vertex.
296 //
297 // 3b) The original ghost LIDs may not correspond to the order in which
298 // we will receive the adjacency information. Instead of reordering
299 // the received adjacencies, we relabel the GIDs of first-layer
300 // ghosts with new LIDs that correspond to the order in which we
301 // receive the adjacency information. Only first-layer ghost LIDs
302 // are changed.
303 //
304 //4) Due to limitations on the size of an MPI message, we split sending and
305 // receiving into rounds to accommodate arbitrarily large graphs.
306 //
307 //5) As send rounds progress, we construct the 2GL adjacency structure.
308 // Because we relabel ghost LIDs, we do not need to reorder the
309 // data after we receive it.
310 //
311 //
312 //
313 //OUTPUT ARGS
314 // ownedPlusGhosts Initially holds the list of GIDs for owned and ghost
315 // vertices. The only hard constraint on the ordering is
316 // that owned vertices come before ghost vertices.
317 // This function can re-assign the LIDs of ghost vertices
318 // in order to make the final construction of the 2GL
319 // CSR easier. After this function call, some ghost
320 // LIDs may be changed, but they will still be greater
321 // than owned LIDs. No owned LIDs will be changed.
322 //
323 // adjs_2GL holds the second ghost layer CSR adjacencies. The 2GL CSR
324 // contains complete adjacency information for the first-layer
325 // ghosts. These adjacencies can hold both owned vertices and
326 // second-layer ghosts, as well as first-layer ghosts.
327 //
328 // offsets_2GL holds CSR offsets for vertices in the first ghost layer only.
329 // That is, a vertex with GID = gid[n_local] will be the first
330 // vertex with information in this offset structure.
331 //
332 //
333 //INPUT ARGS
334 // owners holds the owning proc for a given vertex, indexed by local ID.
335 //
336 // adjs is the CSR adjacencies of the local graph with a single ghost layer.
337 //
338 // offsets is the CSR offsets of the local graph with a single ghost layer.
339 //
340 // mapOwned translates from Owned GID to LID. We only need this translation
341 // for owned vertices.
342 //
343 //TODO: This function uses many vectors of size comm->getSize();
344 // consider changes to increase memory scalability.
345 void constructSecondGhostLayer(std::vector<gno_t>& ownedPlusGhosts,
346 const std::vector<int>& owners,
347 ArrayView<const gno_t> adjs,
348 ArrayView<const offset_t> offsets,
349 RCP<const map_t> mapOwned,
350 std::vector< gno_t>& adjs_2GL,
351 std::vector< offset_t>& offsets_2GL) {
352
353 //first, we send ghost GIDs back to owners to receive the
354 //degrees of each first-layer ghost.
355 std::vector<int> sendcounts(comm->getSize(),0);
356 std::vector<size_t> sdispls(comm->getSize()+1,0);
357 //loop through owners, count how many vertices we'll send to each processor
358 //We send each ghost GID back to its owning process.
359 if(verbose) std::cout<<comm->getRank()<<": building sendcounts\n";
360 for(size_t i = 0; i < owners.size(); i++){
361 if(owners[i] != comm->getRank()&& owners[i] !=-1) sendcounts[owners[i]]++;
362 }
363 //construct sdispls (for building sendbuf), and sum the total sendcount
364 if(verbose) std::cout<<comm->getRank()<<": building sdispls\n";
365 size_t sendcount = 0;
366 for(int i = 1; i < comm->getSize()+1; i++){
367 sdispls[i] = sdispls[i-1] + sendcounts[i-1];
368 sendcount += sendcounts[i-1];
369 }
370
371 if(verbose) std::cout<<comm->getRank()<<": building idx\n";
372 std::vector<gno_t> idx(comm->getSize(),0);
373 for(int i = 0; i < comm->getSize(); i++){
374 idx[i]=sdispls[i];
375 }
376 //construct sendbuf to send GIDs to owning processes
377 if(verbose) std::cout<<comm->getRank()<<": building sendbuf\n";
378
379 std::vector<gno_t> sendbuf(sendcount,0);
380 for(size_t i = offsets.size()-1; i < owners.size(); i++){
381 if(owners[i] != comm->getRank() && owners[i] != -1){
382 sendbuf[idx[owners[i]]++] = ownedPlusGhosts[i];
383 }
384 }
385
386 //communicate GIDs to owners
387 if(verbose) std::cout<<comm->getRank()<<": requesting GIDs from owners\n";
388 Teuchos::ArrayView<int> sendcounts_view = Teuchos::arrayViewFromVector(sendcounts);
389 Teuchos::ArrayView<gno_t> sendbuf_view = Teuchos::arrayViewFromVector(sendbuf);
390 Teuchos::ArrayRCP<gno_t> recvbuf;
391 std::vector<int> recvcounts(comm->getSize(),0);
392 Teuchos::ArrayView<int> recvcounts_view = Teuchos::arrayViewFromVector(recvcounts);
393 Zoltan2::AlltoAllv<gno_t>(*comm, *env, sendbuf_view, sendcounts_view, recvbuf, recvcounts_view);
394
395 if(verbose) std::cout<<comm->getRank()<<": done communicating\n";
396 //replace entries in recvGIDs with their degrees
397
398 if(verbose) std::cout<<comm->getRank()<<": building rdispls\n";
399 gno_t recvcounttotal = 0;
400 std::vector<int> rdispls(comm->getSize()+1,0);
401 for(size_t i = 1; i<recvcounts.size()+1; i++){
402 rdispls[i] = rdispls[i-1] + recvcounts[i-1];
403 recvcounttotal += recvcounts[i-1];
404 }
405 //send back the degrees to the requesting processes,
406 //build the adjacency counts
407 std::vector<offset_t> sendDegrees(recvcounttotal,0);
408 gno_t adj_len = 0;
409 std::vector<int> adjsendcounts(comm->getSize(),0);
410 if(verbose) std::cout<<comm->getRank()<<": building adjacency counts\n";
411 for(int i = 0; i < comm->getSize(); i++){
412 adjsendcounts[i] = 0;
413 for(int j = rdispls[i]; j < rdispls[i+1]; j++){
414 lno_t lid = mapOwned->getLocalElement(recvbuf[j]);
415 offset_t degree = offsets[lid+1] - offsets[lid];
416 sendDegrees[j] = degree;
417 adj_len +=degree;
418 adjsendcounts[i] += degree;
419 }
420 }
421 //communicate the degrees back to the requesting processes
422 if(verbose) std::cout<<comm->getRank()<<": sending degrees back to requestors\n";
423 Teuchos::ArrayView<offset_t> sendDegrees_view = Teuchos::arrayViewFromVector(sendDegrees);
424 Teuchos::ArrayRCP<offset_t> recvDegrees;
425 std::vector<int> recvDegreesCount(comm->getSize(),0);
426 Teuchos::ArrayView<int> recvDegreesCount_view = Teuchos::arrayViewFromVector(recvDegreesCount);
427 Zoltan2::AlltoAllv<offset_t>(*comm, *env, sendDegrees_view, recvcounts_view, recvDegrees, recvDegreesCount_view);
428
429 //calculate number of rounds of AlltoAllv's that are necessary on this process
430
431 if(verbose) std::cout<<comm->getRank()<<": determining number of rounds necessary\n";
432 int rounds = 1;
433 for(int i = 0; i < comm->getSize(); i++){
434 if(adjsendcounts[i]*sizeof(gno_t)/ INT_MAX > (size_t)rounds){
435 rounds = (adjsendcounts[i]*sizeof(gno_t)/INT_MAX)+1;
436 }
437 }
438
439 //see what the max number of rounds will be globally
440 int max_rounds = 0;
441 Teuchos::reduceAll<int>(*comm, Teuchos::REDUCE_MAX, 1, &rounds, &max_rounds);
442
443 if(verbose) std::cout<<comm->getRank()<<": building per_proc sums\n";
444 //compute send and receive schedules to and from each process
445 std::vector<std::vector<uint64_t>> per_proc_round_adj_sums(max_rounds+1,std::vector<uint64_t>(comm->getSize(),0));
446 std::vector<std::vector<uint64_t>> per_proc_round_vtx_sums(max_rounds+1,std::vector<uint64_t>(comm->getSize(),0));
447
448 if(verbose) std::cout<<comm->getRank()<<": filling per_proc sums\n";
449 //fill out the send schedules
450 for(int proc_to_send = 0; proc_to_send < comm->getSize(); proc_to_send++){
451 int curr_round = 0;
452 for(size_t j = sdispls[proc_to_send]; j < sdispls[proc_to_send+1]; j++){
453 if((per_proc_round_adj_sums[curr_round][proc_to_send] + recvDegrees[j])*sizeof(gno_t) > INT_MAX){
454 curr_round++;
455 }
456 per_proc_round_adj_sums[curr_round][proc_to_send] += recvDegrees[j];
457 per_proc_round_vtx_sums[curr_round][proc_to_send]++;
458 }
459 }
460
461 if(verbose) std::cout<<comm->getRank()<<": building recv GID schedule\n";
462
463 //A 3D vector to hold the GIDs so we can see how this process will receive
464 //the vertices in each round, from each process. This way we can reorder things
465 //locally so that the data we receive will automatically construct the CSR correctly
466 //without any other computation. We do the reordering before we start receiving.
467 std::vector<std::vector<std::vector<gno_t>>> recv_GID_per_proc_per_round(
468 max_rounds+1,std::vector<std::vector<gno_t>>(
469 comm->getSize(),std::vector<gno_t>(0)));
470 for(int i = 0; i < max_rounds; i++){
471 for(int j = 0; j < comm->getSize(); j++){
472 recv_GID_per_proc_per_round[i][j] = std::vector<gno_t>(sendcounts[j],0);
473 }
474 }
475
476 if(verbose) std::cout<<comm->getRank()<<": filling out recv GID schedule\n";
477 for(int i = 0; i < comm->getSize(); i++){
478 int curr_round = 0;
479 size_t curr_idx = 0;
480 for(size_t j = sdispls[i]; j < sdispls[i+1]; j++){
481 if(curr_idx > per_proc_round_vtx_sums[curr_round][i]){
482 curr_round++;
483 curr_idx = 0;
484 }
485 recv_GID_per_proc_per_round[curr_round][i][curr_idx++] = j;
486 }
487 }
488
489 if(verbose) std::cout<<comm->getRank()<<": reordering gids and degrees in the order they'll be received\n";
490 //reorder the GIDs and degrees locally:
491 // - The way we previously stored GIDs and degrees had no relation
492 // to the order that we'll be receiving the adjacency data.
493 // For the CSR to be complete (and usable), we reorder the LIDs of
494 // ghosts so that they are in the order we will receive the adjacency
495 // data.
496 //
497 // - For graphs that are not large enough to trigger sending in multiple
498 // rounds of communication, the LIDs of the ghosts will be ordered
499 // by owning process. If multiple rounds are used, the LIDs of
500 // ghosts will be ordered by rounds in addition to owning process.
501 //
502 // - final_gid_vec and final_degree_vec hold the reorganized gids and
503 // degrees, the final re-mapping happens further down
504 std::vector<gno_t> final_gid_vec(sendcount, 0);
505 std::vector<offset_t> final_degree_vec(sendcount,0);
506 gno_t reorder_idx = 0;
507 for(int i = 0; i < max_rounds; i++){
508 for(int j = 0; j < comm->getSize(); j++){
509 for(size_t k = 0; k < per_proc_round_vtx_sums[i][j]; k++){
510 final_gid_vec[reorder_idx] = sendbuf[recv_GID_per_proc_per_round[i][j][k]];
511 final_degree_vec[reorder_idx++] = recvDegrees[recv_GID_per_proc_per_round[i][j][k]];
512 }
513 }
514 }
515
516 //check to see if the reorganization has happened correctly
517 if(verbose){
518 //do a quick check to see if we ended up reorganizing anything
519 bool reorganized = false;
520 for(size_t i = 0; i < sendcount; i++){
521 if(final_gid_vec[i] != sendbuf[i]) reorganized = true;
522 }
523
524 //if we have more than a single round of communication, we need to reorganize.
525 //this alerts of unnecessary reorganization, and a failure to perform necessary
526 //reorganization.
527 if(!reorganized && (max_rounds > 1)) std::cout<<comm->getRank()<<": did not reorgainze GIDs, but probably should have\n";
528 if(reorganized && (max_rounds == 1)) std::cout<<comm->getRank()<<": reorganized GIDs, but probably should not have\n";
529 }
530 //remap the GIDs so we receive the adjacencies in the same order as the current processes LIDs
531 // originally, the GID->LID mapping has no relevance to how we'll be receiving adj data from
532 // remote processes. Here we make it so that the GID->LID mapping does correspond to the
533 // order we have received degree info and will receive adjacencies. ( LID n_local
534 // corresponds to the first GID whose adjacency info we will receive, LID n_local+1 the
535 // second, etc.)
536 // That way, we don't need to reorder the adjacencies after we receive them.
537 //
538 // This next loop is the final mapping step.
539
540 for (size_t i = 0; i < sendcount; i++){
541 ownedPlusGhosts[i+offsets.size()-1] = final_gid_vec[i];
542 }
543
544 //status report
545 if(verbose) {
546 std::cout<<comm->getRank()<<": done remapping\n";
547 std::cout<<comm->getRank()<<": building ghost offsets\n";
548 }
549 //start building the second ghost layer's offsets
550 std::vector<offset_t> ghost_offsets(sendcount+1,0);
551 for(size_t i = 1; i < sendcount+1; i++){
552 ghost_offsets[i] = ghost_offsets[i-1] + final_degree_vec[i-1];
553 }
554
555
556 if(verbose) std::cout<<comm->getRank()<<": going through the sending rounds\n";
557 //set up counters to keep track of where we are in the sending order
558 std::vector<uint64_t> curr_idx_per_proc(comm->getSize(),0);
559 for(int i = 0; i < comm->getSize(); i++) curr_idx_per_proc[i] = rdispls[i];
560 for(int round = 0; round < max_rounds; round++){
561 //send buffers
562 std::vector<gno_t> send_adj;
563 std::vector<int> send_adj_counts(comm->getSize(),0);
564 if(verbose) std::cout<<comm->getRank()<<": round "<<round<<", constructing send_adj\n";
565 //construct the adjacencies to send for this round
566 for(int curr_proc = 0; curr_proc < comm->getSize(); curr_proc++){
567 uint64_t curr_adj_sum = 0;
568 //keep going through adjacencies to send to this process until we're done
569 while( curr_idx_per_proc[curr_proc] < (size_t)rdispls[curr_proc+1]){
570 lno_t lid = mapOwned->getLocalElement(recvbuf[curr_idx_per_proc[curr_proc]++]);
571
572 //if the next adjacency would push us over the MPI message size max,
573 //stop for this round
574 if((curr_adj_sum + (offsets[lid+1]-offsets[lid]))*sizeof(gno_t) >= INT_MAX){
575 break;
576 }
577
578 //add the adjacencies to the send buffer
579 curr_adj_sum += (offsets[lid+1] - offsets[lid]);
580 for(offset_t j = offsets[lid]; j < offsets[lid+1]; j++){
581 send_adj.push_back(adjs[j]);
582 }
583 }
584 //update the send counts for this round
585 send_adj_counts[curr_proc] = curr_adj_sum;
586 }
587 if(verbose) std::cout<<comm->getRank()<<": round "<<round<<", sending...\n";
588 //do the sending...
589 Teuchos::ArrayView<gno_t> send_adjs_view = Teuchos::arrayViewFromVector(send_adj);
590 Teuchos::ArrayView<int> adjsendcounts_view = Teuchos::arrayViewFromVector(send_adj_counts);
591 Teuchos::ArrayRCP<gno_t> ghost_adjs;
592 std::vector<int> adjrecvcounts(comm->getSize(),0);
593 Teuchos::ArrayView<int> adjsrecvcounts_view = Teuchos::arrayViewFromVector(adjrecvcounts);
594 Zoltan2::AlltoAllv<gno_t>(*comm, *env, send_adjs_view, adjsendcounts_view, ghost_adjs, adjsrecvcounts_view);
595 //Because of the previous reordering, these adjacencies are
596 //in the correct order as they arrive on the process.
597 for(offset_t i = 0; i< (offset_t)ghost_adjs.size(); i++){
598 adjs_2GL.push_back(ghost_adjs[i]);
599 }
600 }
601 if(verbose) std::cout<<comm->getRank()<<": constructing offsets\n";
602 //put the offsets we computed into the output argument.
603 for(size_t i = 0; i < sendcount+1; i++){
604 offsets_2GL.push_back(ghost_offsets[i]);
605 }
606 if(verbose) std::cout<<comm->getRank()<<": done building 2nd ghost layer\n";
607 }
608
609 //Communicates owned vertex colors to remote ghost copies.
610 //
611 //returns: the amount of time the communication call took.
612 //
613 //OUTPUT ARGS:
614 // colors: the owned vertices' colors are not changed,
615 // ghost vertex colors are updated from their owners.
616 //
617 // total_sent: reports the total size of the send buffer
618 //
619 // total_recv: reports the total size of the recv buffer
620 //
621 //INPUT ARGS:
622 //
623 // mapOwnedPlusGhosts: maps global IDs to local IDs and vice-versa.
624 //
625 // nVtx: the number of owned vertices on this process
626 //
627 // verts_to_send: hostmirror of the list of vertices to send.
628 // This list will only contain local vertices
629 // that are ghosted on a remote process.
630 //
631 // verts_to_send_size: hostmirror of the size of verts_to_send
632 //
633 // procs_to_send: map that takes a local ID and gives a vector of
634 // process IDs which have a ghost copy of that vertex.
635 //
636 double doOwnedToGhosts(RCP<const map_t> mapOwnedPlusGhosts,
637 size_t nVtx,
638 typename Kokkos::View<lno_t*,device_type>::HostMirror verts_to_send,
639 typename Kokkos::View<size_t*,device_type>::HostMirror verts_to_send_size,
640 Teuchos::RCP<femv_t> femv,
641 const std::unordered_map<lno_t, std::vector<int>>& procs_to_send,
642 gno_t& total_sent, gno_t& total_recvd){
643
644 auto femvColors = femv->getLocalViewHost(Tpetra::Access::ReadWrite);
645 auto colors = subview(femvColors, Kokkos::ALL, 0);
646 //create vectors to hold send information
647 int nprocs = comm->getSize();
648 std::vector<int> sendcnts(comm->getSize(), 0);
649 std::vector<gno_t> sdispls(comm->getSize()+1, 0);
650
651 //calculate how much data we're sending to each process
652 for(size_t i = 0; i < verts_to_send_size(0); i++){
653 for(size_t j = 0; j < procs_to_send.at(verts_to_send(i)).size(); j++){
654 sendcnts[procs_to_send.at(verts_to_send(i))[j]] += 2;
655 }
656 }
657 //calculate sendsize and sdispls
658 sdispls[0] = 0;
659 gno_t sendsize = 0;
660 std::vector<int> sentcount(nprocs, 0);
661
662 for(int i = 1; i < comm->getSize()+1; i++){
663 sdispls[i] = sdispls[i-1] + sendcnts[i-1];
664 sendsize += sendcnts[i-1];
665 }
666 total_sent = sendsize;
667 std::vector<gno_t> sendbuf(sendsize,0);
668 //construct sendbuf, send each owned vertex's GID
669 //and its color to the processes that have a
670 //ghost copy of that vertex. If a vertex is not ghosted,
671 //it does not get sent anywhere.
672 for(size_t i = 0; i < verts_to_send_size(0); i++){
673 std::vector<int> procs = procs_to_send.at(verts_to_send(i));
674 for(size_t j = 0; j < procs.size(); j++){
675 size_t idx = sdispls[procs[j]] + sentcount[procs[j]];
676 sentcount[procs[j]] += 2;
677 sendbuf[idx++] = mapOwnedPlusGhosts->getGlobalElement(verts_to_send(i));
678 sendbuf[idx] = colors(verts_to_send(i));
679 }
680 }
681
682 Teuchos::ArrayView<int> sendcnts_view = Teuchos::arrayViewFromVector(sendcnts);
683 Teuchos::ArrayView<gno_t> sendbuf_view = Teuchos::arrayViewFromVector(sendbuf);
684 Teuchos::ArrayRCP<gno_t> recvbuf;
685 std::vector<int> recvcnts(comm->getSize(), 0);
686 Teuchos::ArrayView<int> recvcnts_view = Teuchos::arrayViewFromVector(recvcnts);
687
688 //if we're reporting times, remove the computation imbalance from the comm timer
689 if(timing) comm->barrier();
690 double comm_total = 0.0;
691 double comm_temp = timer();
692
693 Zoltan2::AlltoAllv<gno_t>(*comm, *env, sendbuf_view, sendcnts_view, recvbuf, recvcnts_view);
694 comm_total += timer() - comm_temp;
695
696 //compute total recvsize for updating local ghost colors
697 gno_t recvsize = 0;
698 for(int i = 0; i < recvcnts_view.size(); i++){
699 recvsize += recvcnts_view[i];
700 }
701 total_recvd = recvsize;
702 //update the local ghost copies with the color we just received.
703 for(int i = 0; i < recvsize; i+=2){
704 size_t lid = mapOwnedPlusGhosts->getLocalElement(recvbuf[i]);
705 colors(lid) = recvbuf[i+1];
706 }
707
708 return comm_total;
709 }
710
711 public:
712 //constructor
714 const RCP<const base_adapter_t> &adapter_,
715 const RCP<Teuchos::ParameterList> &pl_,
716 const RCP<Environment> &env_,
717 const RCP<const Teuchos::Comm<int> > &comm_)
718 : adapter(adapter_), pl(pl_), env(env_), comm(comm_){
719 verbose = pl->get<bool>("verbose",false);
720 timing = pl->get<bool>("timing", false);
721 }
722 //Main entry point for graph coloring
723 void color( const RCP<ColoringSolution<Adapter> > &solution){
724 //convert from global graph to local graph
725 ArrayView<const gno_t> vtxIDs;
726 ArrayView<StridedData<lno_t, scalar_t> > vwgts;
727
728 modelFlag_t flags;
729 flags.set(REMOVE_SELF_EDGES);
730 const auto model = rcp(new GraphModel<base_adapter_t>(this->adapter, this->env,
731 this->comm, flags));
732 size_t nVtx = model->getVertexList(vtxIDs, vwgts);
733 // the weights are not used at this point.
734
735 //get the adjacencies in a view that holds GIDs
736 ArrayView<const gno_t> adjs;
737 //get the CSR offsets
738 ArrayView<const offset_t> offsets;
739 ArrayView<StridedData<lno_t, scalar_t> > ewgts;
740 model->getEdgeList(adjs, offsets, ewgts);
741
742 //This map is used to map GID to LID initially
743 std::unordered_map<gno_t,lno_t> globalToLocal;
744 //this vector holds GIDs for owned and ghost vertices
745 //The final order of the gids is:
746 // -----------------------------------
747 // | | first | second |
748 // | owned | layer | layer |
749 // | | ghosts | ghosts |
750 // -----------------------------------
751 //
752 // constructSecondGhostLayer reorders the first layer ghost
753 // GIDs in the particular order that the communication requires.
754
755 std::vector<gno_t> ownedPlusGhosts;
756
757 //this vector holds the owning process for
758 //owned vertices and the first ghost layer.
759 //owners[i] gives the processor owning ownedPlusGhosts[i],
760 //for owned vertices and first layer ghosts.
761 //
762 //This should only be used BEFORE the call to constructSecondGhostLayer
763 std::vector<int> owners;
764
765 //fill these structures using owned vertices
766 for(int i = 0; i < vtxIDs.size(); i++){
767 globalToLocal[vtxIDs[i]] = i;
768 ownedPlusGhosts.push_back(vtxIDs[i]);
769 owners.push_back(comm->getRank());
770 }
771
772 //count the initial number of ghosts,
773 //map them to a local ID.
774 //The globalToLocal map has first layer ghosts
775 //from here on.
776 int nGhosts = 0;
777 std::vector<lno_t> local_adjs;
778 for(int i = 0; i < adjs.size(); i++){
779 if(globalToLocal.count(adjs[i])==0){
780 ownedPlusGhosts.push_back(adjs[i]);
781 globalToLocal[adjs[i]] = vtxIDs.size()+nGhosts;
782 nGhosts++;
783 }
784 local_adjs.push_back(globalToLocal[adjs[i]]);
785 }
786
787 //construct a Tpetra map for the FEMultiVector
788 Tpetra::global_size_t dummy = Teuchos::OrdinalTraits
789 <Tpetra::global_size_t>::invalid();
790 RCP<const map_t> mapOwned = rcp(new map_t(dummy, vtxIDs, 0, comm));
791
792 //GID and owner lookup for ghost vertices
793 std::vector<gno_t> ghosts;
794 std::vector<int> ghostowners;
795 for(size_t i = nVtx; i < nVtx+nGhosts; i++){
796 ghosts.push_back(ownedPlusGhosts[i]);
797 ghostowners.push_back(-1);
798 }
799
800 //get the owners of the ghost vertices
801 ArrayView<int> owningProcs = Teuchos::arrayViewFromVector(ghostowners);
802 ArrayView<const gno_t> gids = Teuchos::arrayViewFromVector(ghosts);
803 mapOwned->getRemoteIndexList(gids, owningProcs);
804
805 //add ghost owners to the total owner vector
806 for(size_t i = 0; i < ghostowners.size(); i++){
807 owners.push_back(ghostowners[i]);
808 }
809
810 //construct the second ghost layer.
811 //NOTE: this may reorder the LIDs of the ghosts.
812 //
813 //these vectors hold the CSR representation of the
814 // first ghost layer. This CSR contains:
815 // - edges from first layer ghosts to:
816 // - owned vertices
817 // - first layer ghosts
818 // - second layer ghosts
819 //
820 // - first_layer_ghost_adjs uses GIDs that need
821 // translated to LIDs.
822 //
823 // - first_layer_ghost_offsets are indexed by LID,
824 // first_layer_ghost_offsets[i] corresponds to
825 // the vertex with GID = ownedPlusGhosts[i+nVtx].
826 // first_layer_ghost_offsets.size() = #first layer ghosts + 1
827 //
828 std::vector< gno_t> first_layer_ghost_adjs;
829 std::vector< offset_t> first_layer_ghost_offsets;
830 constructSecondGhostLayer(ownedPlusGhosts,owners, adjs, offsets, mapOwned,
831 first_layer_ghost_adjs, first_layer_ghost_offsets);
832
833 //we potentially reordered the local IDs of the ghost vertices, so we need
834 //to re-insert the GIDs into the global to local ID mapping.
835 globalToLocal.clear();
836 for(size_t i = 0; i < ownedPlusGhosts.size(); i++){
837 globalToLocal[ownedPlusGhosts[i]] = i;
838 }
839
840 //use updated globalToLocal map to translate
841 //adjacency GIDs to LIDs
842 for(int i = 0 ; i < adjs.size(); i++){
843 local_adjs[i] = globalToLocal[adjs[i]];
844 }
845
846 //at this point, we have ownedPlusGhosts with 1layer ghosts' GIDs.
847 //Need to map the second ghost layer GIDs to new local IDs,
848 //and add them to the map, as well as translating 2layer ghost
849 //adjacencies to use those new LIDs.
850 size_t n2Ghosts = 0;
851
852 //local_ghost_adjs is the LID translation of first_layer_ghost_adjs.
853 std::vector<lno_t> local_ghost_adjs;
854 for(size_t i = 0; i< first_layer_ghost_adjs.size(); i++ ){
855 if(globalToLocal.count(first_layer_ghost_adjs[i]) == 0){
856 ownedPlusGhosts.push_back(first_layer_ghost_adjs[i]);
857 globalToLocal[first_layer_ghost_adjs[i]] = vtxIDs.size() + nGhosts + n2Ghosts;
858 n2Ghosts++;
859 }
860 local_ghost_adjs.push_back(globalToLocal[first_layer_ghost_adjs[i]]);
861 }
862
863 //construct data structures necessary for FEMultiVector
864 if(verbose) std::cout<<comm->getRank()<<": constructing Tpetra map with copies\n";
865 dummy = Teuchos::OrdinalTraits <Tpetra::global_size_t>::invalid();
866 RCP<const map_t> mapWithCopies = rcp(new map_t(dummy,
867 Teuchos::arrayViewFromVector(ownedPlusGhosts),
868 0, comm));
869 if(verbose) std::cout<<comm->getRank()<<": done constructing map with copies\n";
870
871 using import_t = Tpetra::Import<lno_t, gno_t>;
872 Teuchos::RCP<import_t> importer = rcp(new import_t(mapOwned,
873 mapWithCopies));
874 if(verbose) std::cout<<comm->getRank()<<": done constructing importer\n";
875 Teuchos::RCP<femv_t> femv = rcp(new femv_t(mapOwned,
876 importer, 1, true));
877 if(verbose) std::cout<<comm->getRank()<<": done constructing femv\n";
878
879 //Create random numbers seeded on global IDs to resolve conflicts
880 //consistently between processes.
881 std::vector<int> rand(ownedPlusGhosts.size());
882 for(size_t i = 0; i < rand.size(); i++){
883 std::srand(ownedPlusGhosts[i]);
884 rand[i] = std::rand();
885 }
886
887 // get up-to-date owners for all ghosts.
888 std::vector<int> ghostOwners2(ownedPlusGhosts.size() -nVtx);
889 std::vector<gno_t> ghosts2(ownedPlusGhosts.size() - nVtx);
890 for(size_t i = nVtx; i < ownedPlusGhosts.size(); i++) ghosts2[i-nVtx] = ownedPlusGhosts[i];
891 Teuchos::ArrayView<int> owners2 = Teuchos::arrayViewFromVector(ghostOwners2);
892 Teuchos::ArrayView<const gno_t> ghostGIDs = Teuchos::arrayViewFromVector(ghosts2);
893 mapOwned->getRemoteIndexList(ghostGIDs,owners2);
894 if(verbose) std::cout<<comm->getRank()<<": done getting ghost owners\n";
895
896 //calculations for how many 2GL verts are in the boundary of another process, only
897 //do this if it's going to be displayed.
898 if(verbose) {
899 std::cout<<comm->getRank()<<": calculating 2GL stats...\n";
900
901 std::vector<int> sendcounts(comm->getSize(),0);
902 std::vector<gno_t> sdispls(comm->getSize()+1,0);
903 //loop through owners, count how many vertices we'll send to each processor
904 for(int i = nGhosts; i < ghostGIDs.size(); i++){
905 if(owners2[i] != comm->getRank()&& owners2[i] !=-1) sendcounts[owners2[i]]++;
906 }
907 //construct sdispls (for building sendbuf), and sum the total sendcount
908 gno_t sendcount = 0;
909 for(int i = 1; i < comm->getSize()+1; i++){
910 sdispls[i] = sdispls[i-1] + sendcounts[i-1];
911 sendcount += sendcounts[i-1];
912 }
913 std::vector<gno_t> idx(comm->getSize(),0);
914 for(int i = 0; i < comm->getSize(); i++){
915 idx[i]=sdispls[i];
916 }
917 //construct sendbuf to send GIDs to owning processes
918 std::vector<gno_t> sendbuf(sendcount,0);
919 for(size_t i = nGhosts; i < (size_t)ghostGIDs.size(); i++){
920 if(owners2[i] != comm->getRank() && owners2[i] != -1){
921 sendbuf[idx[owners2[i]]++] = ghostGIDs[i];
922 }
923 }
924 //do the communication
925 Teuchos::ArrayView<int> sendcounts_view = Teuchos::arrayViewFromVector(sendcounts);
926 Teuchos::ArrayView<gno_t> sendbuf_view = Teuchos::arrayViewFromVector(sendbuf);
927 Teuchos::ArrayRCP<gno_t> recvbuf;
928 std::vector<int> recvcounts(comm->getSize(),0);
929 Teuchos::ArrayView<int> recvcounts_view = Teuchos::arrayViewFromVector(recvcounts);
930 Zoltan2::AlltoAllv<gno_t>(*comm, *env, sendbuf_view, sendcounts_view, recvbuf, recvcounts_view);
931 std::vector<int> is_bndry_send(recvbuf.size(),0);
932
933 //send back how many received vertices are in the boundary
934 for(int i = 0; i < recvbuf.size(); i++){
935 size_t lid = mapWithCopies->getLocalElement(recvbuf[i]);
936 is_bndry_send[i] = 0;
937 if(lid < nVtx){
938 for(offset_t j = offsets[lid]; j < offsets[lid+1]; j++){
939 if((size_t)local_adjs[j] >= nVtx) is_bndry_send[i] = 1;
940 }
941 } else{
942 for(offset_t j = first_layer_ghost_offsets[lid]; j < first_layer_ghost_offsets[lid+1]; j++){
943 if((size_t)local_ghost_adjs[j] >= nVtx) is_bndry_send[i] = 1;
944 }
945 }
946 }
947
948 //send back the boundary flags
949 Teuchos::ArrayView<int> is_bndry_send_view = Teuchos::arrayViewFromVector(is_bndry_send);
950 Teuchos::ArrayRCP<int> is_bndry_recv;
951 std::vector<int> bndry_recvcounts(comm->getSize(),0);
952 Teuchos::ArrayView<int> bndry_recvcounts_view = Teuchos::arrayViewFromVector(bndry_recvcounts);
953 Zoltan2::AlltoAllv<int> (*comm, *env, is_bndry_send_view, recvcounts_view, is_bndry_recv, bndry_recvcounts_view);
954
955 //add together the flags to compute how many boundary vertices are in the second ghost layer
956 int boundaryverts = 0;
957 for(int i = 0; i < is_bndry_recv.size(); i++){
958 boundaryverts += is_bndry_recv[i];
959 }
960 //this cout is guarded by a verbose check.
961 std::cout<<comm->getRank()<<": "<<boundaryverts<<" boundary verts out of "<<n2Ghosts<<" verts in 2GL\n";
962 }
963
964
965 //local CSR representation for the owned vertices:
966 Teuchos::ArrayView<const lno_t> local_adjs_view = Teuchos::arrayViewFromVector(local_adjs);
967 //NOTE: the offset ArrayView was constructed earlier, and is up-to-date.
968 //
969 //first ghost layer CSR representation:
970 Teuchos::ArrayView<const offset_t> ghost_offsets = Teuchos::arrayViewFromVector(first_layer_ghost_offsets);
971 Teuchos::ArrayView<const lno_t> ghost_adjacencies = Teuchos::arrayViewFromVector(local_ghost_adjs);
972 Teuchos::ArrayView<const int> rand_view = Teuchos::arrayViewFromVector(rand);
973 Teuchos::ArrayView<const gno_t> gid_view = Teuchos::arrayViewFromVector(ownedPlusGhosts);
974 //these ArrayViews contain LIDs that are ghosted remotely,
975 //and the Process IDs that have those ghost copies.
976 //An LID may appear in the exportLIDs more than once.
977 Teuchos::ArrayView<const lno_t> exportLIDs = importer->getExportLIDs();
978 Teuchos::ArrayView<const int> exportPIDs = importer->getExportPIDs();
979
980 //construct a quick lookup datastructure to map from LID to a
981 //list of PIDs we need to send data to.
982 std::unordered_map<lno_t, std::vector<int>> procs_to_send;
983 for(int i = 0; i < exportLIDs.size(); i++){
984 procs_to_send[exportLIDs[i]].push_back(exportPIDs[i]);
985 }
986
987 //call the coloring algorithm
988 twoGhostLayer(nVtx, nVtx+nGhosts, local_adjs_view, offsets, ghost_adjacencies, ghost_offsets,
989 femv, gid_view, rand_view, owners2, mapWithCopies, procs_to_send);
990
991 //copy colors to the output array
992 ArrayRCP<int> colors = solution->getColorsRCP();
993 auto femvdata = femv->getData(0);
994 for(size_t i=0; i<nVtx; i++){
995 colors[i] = femvdata[i];
996 }
997
998 }
999// private:
1000
1001 //This is the coloring logic for all 2GL-based methods.
1002 //
1003 //INPUT ARGS:
1004 //
1005 // n_local: the number of local owned vertices
1006 //
1007 // n_total: the number of local owned vertices plus first layer ghosts
1008 //
1009 // adjs: the CSR adjacencies for the graph, only including owned vertices
1010 // and first layer ghosts
1011 //
1012 // offsets: the CSR offsets for the graph, has owned offsets into adjacencies
1013 //
1014 // ghost_adjs: the adjacency information for first-layer ghost vertices.
1015 // Includes all neighbors (owned, first-layer ghost,
1016 // second-layer ghost) for each first-layer ghost.
1017 //
1018 //
1019 // ghost_offsets: the offsets into ghost_adjs, first layer ghost LIDs
1020 // minus n_local are used to index this
1021 // datastructure. I.E. ghost_offsets(0)
1022 // corresponds to the ghost with LID n_local
1023 //
1024 // gids: a vector that holds GIDs for all vertices on this process
1025 // (includes owned, and all ghosts, indexable by LID)
1026 // The GIDs are ordered like this:
1027 // ----------------------------------
1028 // | | first | second |
1029 // | owned | layer | layer |
1030 // | | ghosts | ghosts |
1031 // ----------------------------------
1032 // ^ ^
1033 // n_local n_total
1034 //
1035 // rand: a vector that holds random numbers generated on GID for tie
1036 // breaking. Indexed by LID.
1037 //
1038 // ghost_owners: contains the process ID for the owner of each remote vertex.
1039 // Indexable by LID. owners[i] = the owning process for vertex
1040 // with GID = gids[i+n_local]
1041 //
1042 // mapOwnedPlusGhosts: map that can translate between GID and LID
1043 //
1044 // procs_to_send: for each vertex that is copied on a remote process,
1045 // this map contains the list of processes that have
1046 // a copy of a given vertex. Input LID, get the list
1047 // of PIDs that have a ghost copy of that LID.
1048 //
1049 //OUTPUT ARGS:
1050 //
1051 // femv: an FEMultiVector that holds a dual view of the colors.
1052 // After this call, femv contains updated colors.
1053 //
1054 void twoGhostLayer(const size_t n_local, const size_t n_total,
1055 const Teuchos::ArrayView<const lno_t>& adjs,
1056 const Teuchos::ArrayView<const offset_t>& offsets,
1057 const Teuchos::ArrayView<const lno_t>& ghost_adjs,
1058 const Teuchos::ArrayView<const offset_t>& ghost_offsets,
1059 const Teuchos::RCP<femv_t>& femv,
1060 const Teuchos::ArrayView<const gno_t>& gids,
1061 const Teuchos::ArrayView<const int>& rand,
1062 const Teuchos::ArrayView<const int>& ghost_owners,
1063 RCP<const map_t> mapOwnedPlusGhosts,
1064 const std::unordered_map<lno_t, std::vector<int>>& procs_to_send){
1065 //initialize timing variables
1066 double total_time = 0.0;
1067 double interior_time = 0.0;
1068 double comm_time = 0.0;
1069 double comp_time = 0.0;
1070 double recoloring_time=0.0;
1071 double conflict_detection = 0.0;
1072
1073 //Number of rounds we are saving statistics for
1074 //100 is a decent default. Reporting requires --verbose argument.
1075 const int numStatisticRecordingRounds = 100;
1076
1077 //includes all ghosts, including the second layer.
1078 const size_t n_ghosts = rand.size() - n_local;
1079
1080
1081 //Get the degrees of all ghost vertices
1082 //This is necessary for recoloring based on degrees,
1083 //we need ghost vertex degrees for consistency.
1084 std::vector<int> deg_send_cnts(comm->getSize(),0);
1085 std::vector<gno_t> deg_sdispls(comm->getSize()+1,0);
1086 for(int i = 0; i < ghost_owners.size(); i++){
1087 deg_send_cnts[ghost_owners[i]]++;
1088 }
1089 deg_sdispls[0] = 0;
1090 gno_t deg_sendsize = 0;
1091 std::vector<int> deg_sentcount(comm->getSize(),0);
1092 for(int i = 1; i < comm->getSize()+1; i++){
1093 deg_sdispls[i] = deg_sdispls[i-1] + deg_send_cnts[i-1];
1094 deg_sendsize += deg_send_cnts[i-1];
1095 }
1096 std::vector<gno_t> deg_sendbuf(deg_sendsize,0);
1097 for(int i = 0; i < ghost_owners.size(); i++){
1098 size_t idx = deg_sdispls[ghost_owners[i]] + deg_sentcount[ghost_owners[i]];
1099 deg_sentcount[ghost_owners[i]]++;
1100 deg_sendbuf[idx] = mapOwnedPlusGhosts->getGlobalElement(i+n_local);
1101 }
1102 Teuchos::ArrayView<int> deg_send_cnts_view = Teuchos::arrayViewFromVector(deg_send_cnts);
1103 Teuchos::ArrayView<gno_t> deg_sendbuf_view = Teuchos::arrayViewFromVector(deg_sendbuf);
1104 Teuchos::ArrayRCP<gno_t> deg_recvbuf;
1105 std::vector<int> deg_recvcnts(comm->getSize(),0);
1106 Teuchos::ArrayView<int> deg_recvcnts_view = Teuchos::arrayViewFromVector(deg_recvcnts);
1107 Zoltan2::AlltoAllv<gno_t>(*comm, *env, deg_sendbuf_view, deg_send_cnts_view, deg_recvbuf, deg_recvcnts_view);
1108
1109 //replace GID with the degree the owning process has for this vertex.
1110 //(this will include all edges present for this vertex globally)
1111 //This is safe to do, since we sent ghosts to their owners.
1112 for(int i = 0; i < deg_recvbuf.size(); i++){
1113 lno_t lid = mapOwnedPlusGhosts->getLocalElement(deg_recvbuf[i]);
1114 deg_recvbuf[i] = offsets[lid+1] - offsets[lid];
1115 }
1116 //send modified buffer back
1117 ArrayRCP<gno_t> ghost_degrees;
1118 Zoltan2::AlltoAllv<gno_t>(*comm, *env, deg_recvbuf(), deg_recvcnts_view, ghost_degrees, deg_send_cnts_view);
1119
1120 //create the ghost degree views, for use on host and device.
1121 Kokkos::View<gno_t*, device_type> ghost_degrees_dev("ghost degree view",ghost_degrees.size());
1122 typename Kokkos::View<gno_t*, device_type>::HostMirror ghost_degrees_host = Kokkos::create_mirror(ghost_degrees_dev);
1123 for(int i = 0; i < ghost_degrees.size(); i++){
1124 lno_t lid = mapOwnedPlusGhosts->getLocalElement(deg_sendbuf[i]);
1125 ghost_degrees_host(lid-n_local) = ghost_degrees[i];
1126 }
1127 Kokkos::deep_copy(ghost_degrees_dev, ghost_degrees_host);
1128
1129 //track the size of sends and receives through coloring rounds.
1130 gno_t recvPerRound[numStatisticRecordingRounds];
1131 gno_t sentPerRound[numStatisticRecordingRounds];
1132
1133 //find global max degree to determine which
1134 //coloring algorithm will be the most efficient
1135 //(For D1-2GL this is important, D2 and PD2 should always use NBBIT
1136 offset_t local_max_degree = 0;
1137 offset_t global_max_degree = 0;
1138 for(size_t i = 0; i < n_local; i++){
1139 offset_t curr_degree = offsets[i+1] - offsets[i];
1140 if(curr_degree > local_max_degree){
1141 local_max_degree = curr_degree;
1142 }
1143 }
1144 Teuchos::reduceAll<int, offset_t>(*comm, Teuchos::REDUCE_MAX,1, &local_max_degree, &global_max_degree);
1145 if(comm->getRank() == 0 && verbose) std::cout<<"Input has max degree "<<global_max_degree<<"\n";
1146
1147 if(verbose) std::cout<<comm->getRank()<<": constructing Kokkos Views for initial coloring\n";
1148
1149 //the initial coloring is able to use a standard CSR representation, so we construct that here.
1150 //This is a direct translation of the offsets and adjs arguments into Kokkos::Views.
1151 Kokkos::View<offset_t*, device_type> offsets_dev("Host Offset View", offsets.size());
1152 typename Kokkos::View<offset_t*, device_type>::HostMirror offsets_host = Kokkos::create_mirror(offsets_dev);
1153 Kokkos::View<lno_t*, device_type> adjs_dev("Host Adjacencies View", adjs.size());
1154 typename Kokkos::View<lno_t*, device_type>::HostMirror adjs_host = Kokkos::create_mirror(adjs_dev);
1155 for(Teuchos_Ordinal i = 0; i < offsets.size(); i++) offsets_host(i) = offsets[i];
1156 for(Teuchos_Ordinal i = 0; i < adjs.size(); i++) adjs_host(i) = adjs[i];
1157 Kokkos::deep_copy(offsets_dev,offsets_host);
1158 Kokkos::deep_copy(adjs_dev, adjs_host);
1159
1160
1161 //create the graph structures which allow KokkosKernels to recolor the conflicting vertices
1162 if(verbose) std::cout<<comm->getRank()<<": constructing Kokkos Views for recoloring\n";
1163
1164 //in order to correctly recolor, all ghost vertices on this process need an entry in the CSR offsets.
1165 //Otherwise, the color of the ghosts will be ignored, and the coloring will not converge.
1166 Kokkos::View<offset_t*, device_type> dist_degrees_dev("Owned+Ghost degree view",rand.size());
1167 typename Kokkos::View<offset_t*, device_type>::HostMirror dist_degrees_host = Kokkos::create_mirror(dist_degrees_dev);
1168
1169 //calculate the local degrees for the owned, first layer ghosts, and second layer ghosts
1170 //to be used to construct the CSR representation of the local graph.
1171 //owned
1172 for(Teuchos_Ordinal i = 0; i < offsets.size()-1; i++) dist_degrees_host(i) = offsets[i+1] - offsets[i];
1173 //first layer ghosts
1174 for(Teuchos_Ordinal i = 0; i < ghost_offsets.size()-1; i++) dist_degrees_host(i+n_local) = ghost_offsets[i+1] - ghost_offsets[i];
1175 //second layer ghosts
1176 for(Teuchos_Ordinal i = 0; i < ghost_adjs.size(); i++){
1177 //second layer ghosts have LID >= n_total
1178 if((size_t)ghost_adjs[i] >= n_total ){
1179 dist_degrees_host(ghost_adjs[i])++;
1180 }
1181 }
1182
1183 //The structure of the final CSR constructed here looks like:
1184 //
1185 // Index by LID--v
1186 // --------------------------------------------
1187 // | | first |second |
1188 // dist_offsets_dev/host |owned verts | layer |layer |
1189 // | | ghosts |ghosts |
1190 // --------------------------------------------
1191 // |indexes
1192 // v
1193 // --------------------------------------------
1194 // | |first |second |
1195 // dist_adjs_dev/host |owned adjs |layer |layer |
1196 // | |ghost adjs |ghost adjs |
1197 // --------------------------------------------
1198 //
1199 // We end up with a CSR that has adjacency information for all the vertices on
1200 // this process. We include only edges on the process, so ghosts have only partial
1201 // adjacency information.
1202 //
1203 // We symmetrize the local graph representation as well, for
1204 // KokkosKernels to behave as we need it to. This means that edges to
1205 // second layer ghosts must be symmetrized, so we end up with offsets
1206 // that correspond to second layer ghosts.
1207 Kokkos::View<offset_t*, device_type> dist_offsets_dev("Owned+Ghost Offset view", rand.size()+1);
1208 typename Kokkos::View<offset_t*, device_type>::HostMirror dist_offsets_host = Kokkos::create_mirror(dist_offsets_dev);
1209
1210 //we can construct the entire offsets using the degrees we computed above
1211 dist_offsets_host(0) = 0;
1212 offset_t total_adjs = 0;
1213 for(Teuchos_Ordinal i = 1; i < rand.size()+1; i++){
1214 dist_offsets_host(i) = dist_degrees_host(i-1) + dist_offsets_host(i-1);
1215 total_adjs += dist_degrees_host(i-1);
1216 }
1217 Kokkos::View<lno_t*, device_type> dist_adjs_dev("Owned+Ghost adjacency view", total_adjs);
1218 typename Kokkos::View<lno_t*, device_type>::HostMirror dist_adjs_host = Kokkos::create_mirror(dist_adjs_dev);
1219
1220 //zero out the degrees to use it as a counter.
1221 //The offsets now allow us to compute degrees,
1222 //so we aren't losing anything.
1223 for(Teuchos_Ordinal i = 0; i < rand.size(); i++){
1224 dist_degrees_host(i) = 0;
1225 }
1226 //We can just copy the adjacencies for the owned verts and first layer ghosts
1227 for(Teuchos_Ordinal i = 0; i < adjs.size(); i++) dist_adjs_host(i) = adjs[i];
1228 for(Teuchos_Ordinal i = adjs.size(); i < adjs.size() + ghost_adjs.size(); i++) dist_adjs_host(i) = ghost_adjs[i-adjs.size()];
1229
1230 //We have to symmetrize edges that involve a second layer ghost.
1231 //Add edges from the second layer ghosts to their neighbors.
1232 for(Teuchos_Ordinal i = 0; i < ghost_offsets.size()-1; i++){
1233 //loop through each first layer ghost
1234 for(offset_t j = ghost_offsets[i]; j < ghost_offsets[i+1]; j++){
1235 //if an adjacency is a second layer ghost
1236 if((size_t)ghost_adjs[j] >= n_total){
1237 //compute the offset to place the symmetric edge, and place it.
1238 dist_adjs_host(dist_offsets_host(ghost_adjs[j]) + dist_degrees_host(ghost_adjs[j])) = i + n_local;
1239 //increment the counter to keep track of how many adjacencies
1240 //have been placed.
1241 dist_degrees_host(ghost_adjs[j])++;
1242 }
1243 }
1244 }
1245 //copy the host views back to the device views
1246 Kokkos::deep_copy(dist_degrees_dev,dist_degrees_host);
1247 Kokkos::deep_copy(dist_offsets_dev,dist_offsets_host);
1248 Kokkos::deep_copy(dist_adjs_dev, dist_adjs_host);
1249
1250 //this view represents how many conflicts were found
1251 Kokkos::View<size_t*, device_type> recoloringSize("Recoloring Queue Size",1);
1252 typename Kokkos::View<size_t*, device_type>::HostMirror recoloringSize_host = Kokkos::create_mirror(recoloringSize);
1253 recoloringSize_host(0) = 0;
1254 Kokkos::deep_copy(recoloringSize, recoloringSize_host);
1255
1256 //create a view for the tie-breaking random numbers.
1257 if(verbose) std::cout<<comm->getRank()<<": constructing rand and GIDs views\n";
1258 Kokkos::View<int*, device_type> rand_dev("Random View", rand.size());
1259 typename Kokkos::View<int*, device_type>::HostMirror rand_host = Kokkos::create_mirror(rand_dev);
1260 for(Teuchos_Ordinal i = 0; i < rand.size(); i ++) rand_host(i) = rand[i];
1261 Kokkos::deep_copy(rand_dev,rand_host);
1262
1263 //create a view for the global IDs of each vertex.
1264 Kokkos::View<gno_t*, device_type> gid_dev("GIDs", gids.size());
1265 typename Kokkos::View<gno_t*, device_type>::HostMirror gid_host = Kokkos::create_mirror(gid_dev);
1266 for(Teuchos_Ordinal i = 0; i < gids.size(); i++) gid_host(i) = gids[i];
1267 Kokkos::deep_copy(gid_dev,gid_host);
1268
1269 //These variables will be initialized by constructBoundary()
1270 //
1271 //boundary_verts_dev holds all owned vertices that could possibly conflict
1272 //with a remote vertex. Stores LIDs.
1273 Kokkos::View<lno_t*, device_type> boundary_verts_dev;
1274 //this is the number of vertices in the boundary.
1275 if(verbose) std::cout<<comm->getRank()<<": constructing communication and recoloring lists\n";
1276
1277 //We keep track of the vertices that need to get recolored
1278 //this list can contain ghost vertices, so it needs to be initialized
1279 //to the number of all vertices on the local process.
1280 //rand has an entry for each vertex, so its size corresponds to what is needed.
1281 Kokkos::View<lno_t*, device_type> verts_to_recolor_view("verts to recolor", rand.size());
1282 typename Kokkos::View<lno_t*, device_type>::HostMirror verts_to_recolor_host = create_mirror(verts_to_recolor_view);
1283
1284 //This view keeps track of the size of the list of vertices to recolor.
1285 Kokkos::View<int*, device_type> verts_to_recolor_size("verts to recolor size",1);
1286 Kokkos::View<int*, device_type, Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_recolor_size_atomic = verts_to_recolor_size;
1287 typename Kokkos::View<int*, device_type>::HostMirror verts_to_recolor_size_host = create_mirror(verts_to_recolor_size);
1288
1289 //initialize the host view
1290 verts_to_recolor_size_host(0) = 0;
1291 //copy to device
1292 Kokkos::deep_copy(verts_to_recolor_size, verts_to_recolor_size_host);
1293
1294 //verts to send can only include locally owned vertices,
1295 //so we can safely allocate the view to size n_local.
1296 //
1297 //This view is a list of local vertices that are going to be
1298 //recolored, and need to be sent to their remote copies.
1299 Kokkos::View<lno_t*, device_type> verts_to_send_view("verts to send", n_local);
1300 typename Kokkos::View<lno_t*, device_type>::HostMirror verts_to_send_host = create_mirror(verts_to_send_view);
1301
1302 //this view keeps track of the size of verts_to_send.
1303 Kokkos::View<size_t*, device_type> verts_to_send_size("verts to send size",1);
1304 Kokkos::View<size_t*, device_type, Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_send_size_atomic = verts_to_send_size;
1305 typename Kokkos::View<size_t*, device_type>::HostMirror verts_to_send_size_host = create_mirror(verts_to_send_size);
1306
1307 verts_to_send_size_host(0) = 0;
1308 Kokkos::deep_copy(verts_to_send_size, verts_to_send_size_host);
1309
1310 if(verbose) std::cout<<comm->getRank()<<": Constructing the boundary\n";
1311
1312 //this function allocates and initializes the boundary_verts_dev view,
1313 //sets the boundary_size accordingly, and adds vertices to the
1314 //verts_to_send_atomic view, updating the size view as well.
1315 constructBoundary(n_local, dist_offsets_dev, dist_adjs_dev, dist_offsets_host, dist_adjs_host, boundary_verts_dev,
1316 verts_to_send_view, verts_to_send_size_atomic);
1317
1318 //this boolean chooses which KokkosKernels algorithm to use.
1319 //It was experimentally chosen for distance-1 coloring.
1320 //Should be disregarded for distance-2 colorings.
1321 bool use_vbbit = (global_max_degree < 6000);
1322 //Done initializing, start coloring!
1323
1324 //use a barrier if we are reporting timing info
1325 if(timing) comm->barrier();
1326 interior_time = timer();
1327 total_time = timer();
1328 //give the entire local graph to KokkosKernels to color
1329 this->colorInterior(n_local, adjs_dev, offsets_dev, femv,adjs_dev,0,use_vbbit);
1330 interior_time = timer() - interior_time;
1331 comp_time = interior_time;
1332
1333 //ghost_colors holds the colors of only ghost vertices.
1334 //ghost_colors(0) holds the color of a vertex with LID n_local.
1335 //To index this with LIDs, subtract n_local. If an LID is < n_local,
1336 //it will not have a color stored in this view.
1337 Kokkos::View<int*,device_type> ghost_colors("ghost color backups", n_ghosts);
1338
1339 //communicate the initial coloring.
1340 if(verbose) std::cout<<comm->getRank()<<": communicating\n";
1341
1342 //update the host views for the verts to send,
1343 //doOwnedToGhosts needs host memory views, but doesn't
1344 //change the values in them, so no need to copy afterwards
1345 Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
1346 Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
1347 gno_t recv,sent;
1348 comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,n_local,verts_to_send_host,verts_to_send_size_host,femv,procs_to_send,sent,recv);
1349 sentPerRound[0] = sent;
1350 recvPerRound[0] = recv;
1351
1352 //store ghost colors so we can restore them after recoloring.
1353 //the local process can't color ghosts correctly, so we
1354 //reset the colors to avoid consistency issues.
1355 //get the color view from the FEMultiVector
1356 auto femvColors = femv->getLocalViewDevice(Tpetra::Access::ReadWrite);
1357 auto femv_colors = subview(femvColors, Kokkos::ALL, 0);
1358 Kokkos::parallel_for("get femv colors",
1359 Kokkos::RangePolicy<execution_space, int>(0,n_ghosts),
1360 KOKKOS_LAMBDA(const int& i){
1361 ghost_colors(i) = femv_colors(i+n_local);
1362 });
1363 Kokkos::fence();
1364
1365 double temp = timer();
1366 //detect conflicts only for ghost vertices
1367 bool recolor_degrees = this->pl->template get<bool>("recolor_degrees",false);
1368 if(verbose) std::cout<<comm->getRank()<<": detecting conflicts\n";
1369
1370 //these sizes will be updated by detectConflicts, zero them out beforehand
1371 verts_to_send_size_host(0) = 0;
1372 verts_to_recolor_size_host(0) = 0;
1373 recoloringSize_host(0) = 0;
1374 Kokkos::deep_copy(verts_to_send_size, verts_to_send_size_host);
1375 Kokkos::deep_copy(verts_to_recolor_size, verts_to_recolor_size_host);
1376 Kokkos::deep_copy(recoloringSize, recoloringSize_host);
1377
1378 detectConflicts(n_local, dist_offsets_dev, dist_adjs_dev, femv_colors, boundary_verts_dev,
1379 verts_to_recolor_view, verts_to_recolor_size_atomic, verts_to_send_view, verts_to_send_size_atomic,
1380 recoloringSize, rand_dev, gid_dev, ghost_degrees_dev, recolor_degrees);
1381
1382 //these sizes were updated by detectConflicts,
1383 //copy the device views back into host memory
1384 Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
1385 Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
1386 Kokkos::deep_copy(recoloringSize_host, recoloringSize);
1387 Kokkos::deep_copy(verts_to_recolor_size_host, verts_to_recolor_size);
1388
1389 if(comm->getSize() > 1){
1390 conflict_detection = timer() - temp;
1391 comp_time += conflict_detection;
1392 }
1393 //all conflicts detected!
1394 if(verbose) std::cout<<comm->getRank()<<": starting to recolor\n";
1395 //variables for statistics
1396 double totalPerRound[numStatisticRecordingRounds];
1397 double commPerRound[numStatisticRecordingRounds];
1398 double compPerRound[numStatisticRecordingRounds];
1399 double recoloringPerRound[numStatisticRecordingRounds];
1400 double conflictDetectionPerRound[numStatisticRecordingRounds];
1401 uint64_t vertsPerRound[numStatisticRecordingRounds];
1402 uint64_t incorrectGhostsPerRound[numStatisticRecordingRounds];
1403 int distributedRounds = 1;
1404 totalPerRound[0] = interior_time + comm_time + conflict_detection;
1405 recoloringPerRound[0] = 0;
1406 commPerRound[0] = comm_time;
1407 compPerRound[0] = interior_time + conflict_detection;
1408 conflictDetectionPerRound[0] = conflict_detection;
1409 recoloringPerRound[0] = 0;
1410 vertsPerRound[0] = 0;
1411 incorrectGhostsPerRound[0]=0;
1412 typename Kokkos::View<int*, device_type>::HostMirror ghost_colors_host;
1413 typename Kokkos::View<lno_t*, device_type>::HostMirror boundary_verts_host;
1414 size_t serial_threshold = this->pl->template get<int>("serial_threshold",0);
1415 //see if recoloring is necessary.
1416 size_t totalConflicts = 0;
1417 size_t localConflicts = recoloringSize_host(0);
1418 Teuchos::reduceAll<int,size_t>(*comm, Teuchos::REDUCE_SUM, 1, &localConflicts, &totalConflicts);
1419 bool done = !totalConflicts;
1420 if(comm->getSize()==1) done = true;
1421
1422 //recolor until no conflicts are left
1423 while(!done){
1424 //if the number of vertices left to recolor is less than
1425 //the serial threshold set by the user, finish the coloring
1426 //only using host views in a serial execution space.
1427 if(recoloringSize_host(0) < serial_threshold) break;
1428 if(distributedRounds < numStatisticRecordingRounds) {
1429 vertsPerRound[distributedRounds] = verts_to_recolor_size_host(0);
1430 }
1431
1432 if(timing) comm->barrier();
1433 double recolor_temp = timer();
1434 //recolor using KokkosKernels' coloring function
1435 if(verts_to_recolor_size_host(0) > 0){
1436 this->colorInterior(femv_colors.size(), dist_adjs_dev, dist_offsets_dev,femv,verts_to_recolor_view,verts_to_recolor_size_host(0),use_vbbit);
1437 }
1438
1439 if(distributedRounds < numStatisticRecordingRounds){
1440 recoloringPerRound[distributedRounds] = timer() - recolor_temp;
1441 recoloring_time += recoloringPerRound[distributedRounds];
1442 comp_time += recoloringPerRound[distributedRounds];
1443 compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
1444 totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
1445 } else if(timing){
1446 double recoloring_round_time = timer() - recolor_temp;
1447 recoloring_time += recoloring_round_time;
1448 comp_time += recoloring_round_time;
1449 }
1450
1451 //reset the ghost colors to what they were before recoloring
1452 //the recoloring does not have enough information to color
1453 //ghosts correctly, so we set the colors to what they were before
1454 //to avoid consistency issues.
1455 Kokkos::parallel_for("set femv colors",
1456 Kokkos::RangePolicy<execution_space, int>(0,n_ghosts),
1457 KOKKOS_LAMBDA(const int& i){
1458 femv_colors(i+n_local) = ghost_colors(i);
1459 });
1460 Kokkos::fence();
1461
1462 //send views are up-to-date, they were copied after conflict detection.
1463 //communicate the new colors
1464
1465 // Reset device views
1466 femvColors = decltype(femvColors)();
1467 femv_colors = decltype(femv_colors)();
1468 double curr_comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,n_local,verts_to_send_host,verts_to_send_size_host,femv,procs_to_send,sent,recv);
1469 comm_time += curr_comm_time;
1470
1471 if(distributedRounds < numStatisticRecordingRounds){
1472 commPerRound[distributedRounds] = curr_comm_time;
1473 recvPerRound[distributedRounds] = recv;
1474 sentPerRound[distributedRounds] = sent;
1475 totalPerRound[distributedRounds] += commPerRound[distributedRounds];
1476 }
1477
1478 //store ghost colors before we uncolor and recolor them.
1479 //this process doesn't have enough info to correctly color
1480 //ghosts, so we set them back to what they were before to
1481 //remove consistency issues.
1482 femvColors = femv->getLocalViewDevice(Tpetra::Access::ReadWrite);
1483 femv_colors = subview(femvColors, Kokkos::ALL, 0);
1484 Kokkos::parallel_for("get femv colors 2",
1485 Kokkos::RangePolicy<execution_space, int>(0,n_ghosts),
1486 KOKKOS_LAMBDA(const int& i){
1487 ghost_colors(i) = femv_colors(i+n_local);
1488 });
1489 Kokkos::fence();
1490
1491
1492 //these views will change in detectConflicts, they need
1493 //to be zeroed out beforehand
1494 verts_to_send_size_host(0) = 0;
1495 verts_to_recolor_size_host(0) = 0;
1496 recoloringSize_host(0) = 0;
1497 Kokkos::deep_copy(verts_to_send_size, verts_to_send_size_host);
1498 Kokkos::deep_copy(verts_to_recolor_size, verts_to_recolor_size_host);
1499 Kokkos::deep_copy(recoloringSize, recoloringSize_host);
1500
1501 //check for further conflicts
1502 double detection_temp = timer();
1503
1504 detectConflicts(n_local, dist_offsets_dev, dist_adjs_dev,femv_colors, boundary_verts_dev,
1505 verts_to_recolor_view, verts_to_recolor_size_atomic, verts_to_send_view, verts_to_send_size_atomic,
1506 recoloringSize, rand_dev, gid_dev, ghost_degrees_dev, recolor_degrees);
1507
1508 //copy the updated device views back into host memory where necessary
1509 Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
1510 Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
1511 //we only use the list of verts to recolor on device, no need to copy to host.
1512 Kokkos::deep_copy(verts_to_recolor_size_host, verts_to_recolor_size);
1513 Kokkos::deep_copy(recoloringSize_host, recoloringSize);
1514
1515 if(distributedRounds < numStatisticRecordingRounds){
1516 conflictDetectionPerRound[distributedRounds] = timer() - detection_temp;
1517 conflict_detection += conflictDetectionPerRound[distributedRounds];
1518 compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1519 totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1520 comp_time += conflictDetectionPerRound[distributedRounds];
1521 } else if(timing){
1522 double conflict_detection_round_time = timer() - detection_temp;
1523 conflict_detection += conflict_detection_round_time;
1524 comp_time += conflict_detection_round_time;
1525 }
1526
1527 distributedRounds++;
1528 size_t localDone = recoloringSize_host(0);
1529 size_t globalDone = 0;
1530 Teuchos::reduceAll<int,size_t>(*comm, Teuchos::REDUCE_SUM, 1, &localDone, &globalDone);
1531 done = !globalDone;
1532
1533 }//end device coloring
1534
1535
1536 //If we reach this point and still have vertices to color,
1537 //the rest of the coloring is happening in serial on the host.
1538 //
1539 //First, we need to copy device-only views to host mirrors
1540 if(recoloringSize_host(0) > 0 || !done){
1541 ghost_colors_host = Kokkos::create_mirror_view_and_copy(host_mem(),ghost_colors,"ghost_colors host mirror");
1542 boundary_verts_host = Kokkos::create_mirror_view_and_copy(host_mem(),boundary_verts_dev,"boundary_verts host mirror");
1543 }
1544
1545 //Now we do a similar coloring loop to before,
1546 //but using only host views in a serial execution space.
1547 // Reset device views
1548 femvColors = decltype(femvColors)();
1549 femv_colors = decltype(femv_colors)();
1550 while(recoloringSize_host(0) > 0 || !done){
1551 auto femvColors_host = femv->getLocalViewHost(Tpetra::Access::ReadWrite);
1552 auto colors_host = subview(femvColors_host, Kokkos::ALL, 0);
1553 if(distributedRounds < numStatisticRecordingRounds){
1554 vertsPerRound[distributedRounds] = recoloringSize_host(0);
1555 }
1556 if(verbose) std::cout<<comm->getRank()<<": starting to recolor, serial\n";
1557 if(timing) comm->barrier();
1558
1559 double recolor_temp = timer();
1560 if(verts_to_recolor_size_host(0) > 0){
1561 this->colorInterior_serial(colors_host.size(), dist_adjs_host, dist_offsets_host, femv,
1562 verts_to_recolor_host, verts_to_recolor_size_host(0), true);
1563 }
1564 if(distributedRounds < numStatisticRecordingRounds){
1565 recoloringPerRound[distributedRounds] = timer() - recolor_temp;
1566 recoloring_time += recoloringPerRound[distributedRounds];
1567 comp_time += recoloringPerRound[distributedRounds];
1568 compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
1569 totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
1570 } else if(timing){
1571 double recoloring_serial_round_time = timer() - recolor_temp;
1572 recoloring_time += recoloring_serial_round_time;
1573 comp_time += recoloring_serial_round_time;
1574 }
1575
1576 //reset the ghost colors to their previous values to avoid
1577 //consistency issues.
1578 for(size_t i = 0; i < n_ghosts; i++){
1579 colors_host(i+n_local) = ghost_colors_host(i);
1580 }
1581
1582 double curr_comm_time = doOwnedToGhosts(mapOwnedPlusGhosts, n_local,verts_to_send_host, verts_to_send_size_host, femv, procs_to_send, sent,recv);
1583 comm_time += curr_comm_time;
1584
1585 if(distributedRounds < numStatisticRecordingRounds){
1586 commPerRound[distributedRounds] = curr_comm_time;
1587 recvPerRound[distributedRounds] = recv;
1588 sentPerRound[distributedRounds] = sent;
1589 totalPerRound[distributedRounds] += commPerRound[distributedRounds];
1590 }
1591
1592 //store updated ghost colors we received from their owners
1593 //before conflict detection and recoloring changes them locally.
1594 for(size_t i = 0; i < n_ghosts; i++){
1595 ghost_colors_host(i) = colors_host(i+n_local);
1596 }
1597
1598 if(timing) comm->barrier();
1599 double detection_temp = timer();
1600
1601 //zero these out, they'll be updated by detectConflicts_serial
1602 verts_to_recolor_size_host(0) = 0;
1603 verts_to_send_size_host(0) = 0;
1604 recoloringSize_host(0) = 0;
1605
1606 detectConflicts_serial(n_local,dist_offsets_host, dist_adjs_host, colors_host, boundary_verts_host,
1607 verts_to_recolor_host, verts_to_recolor_size_host, verts_to_send_host, verts_to_send_size_host,
1608 recoloringSize_host, rand_host, gid_host, ghost_degrees_host, recolor_degrees);
1609
1610 //no need to update the host views, we're only using host views now.
1611
1612 if(distributedRounds < numStatisticRecordingRounds){
1613 conflictDetectionPerRound[distributedRounds] = timer() - detection_temp;
1614 conflict_detection += conflictDetectionPerRound[distributedRounds];
1615 compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1616 totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1617 comp_time += conflictDetectionPerRound[distributedRounds];
1618 } else if(timing){
1619 double conflict_detection_serial_round_time = timer() - detection_temp;
1620 conflict_detection += conflict_detection_serial_round_time;
1621 comp_time += conflict_detection_serial_round_time;
1622 }
1623
1624 size_t globalDone = 0;
1625 size_t localDone = recoloringSize_host(0);
1626 Teuchos::reduceAll<int,size_t>(*comm, Teuchos::REDUCE_SUM, 1, &localDone, &globalDone);
1627 distributedRounds++;
1628 done = !globalDone;
1629 }
1630
1631 total_time = timer() - total_time;
1632 //only compute stats if they will be displayed
1633 if(verbose){
1634 uint64_t localBoundaryVertices = 0;
1635 for(size_t i = 0; i < n_local; i++){
1636 for(offset_t j = offsets[i]; j < offsets[i+1]; j++){
1637 if((size_t)adjs[j] >= n_local){
1638 localBoundaryVertices++;
1639 break;
1640 }
1641 }
1642 }
1643
1644 //if(comm->getRank() == 0) printf("did %d rounds of distributed coloring\n", distributedRounds);
1645 uint64_t totalVertsPerRound[numStatisticRecordingRounds];
1646 uint64_t totalBoundarySize = 0;
1647 uint64_t totalIncorrectGhostsPerRound[numStatisticRecordingRounds];
1648 double finalTotalPerRound[numStatisticRecordingRounds];
1649 double maxRecoloringPerRound[numStatisticRecordingRounds];
1650 double minRecoloringPerRound[numStatisticRecordingRounds];
1651 double finalCommPerRound[numStatisticRecordingRounds];
1652 double finalCompPerRound[numStatisticRecordingRounds];
1653 double finalConflictDetectionPerRound[numStatisticRecordingRounds];
1654 gno_t finalRecvPerRound[numStatisticRecordingRounds];
1655 gno_t finalSentPerRound[numStatisticRecordingRounds];
1656 for(int i = 0; i < numStatisticRecordingRounds; i++){
1657 totalVertsPerRound[i] = 0;
1658 finalTotalPerRound[i] = 0.0;
1659 maxRecoloringPerRound[i] = 0.0;
1660 minRecoloringPerRound[i] = 0.0;
1661 finalCommPerRound[i] = 0.0;
1662 finalCompPerRound[i] = 0.0;
1663 finalConflictDetectionPerRound[i] = 0.0;
1664 finalRecvPerRound[i] = 0;
1665 finalSentPerRound[i] = 0;
1666 }
1667 Teuchos::reduceAll<int,uint64_t>(*comm, Teuchos::REDUCE_SUM,1,&localBoundaryVertices, &totalBoundarySize);
1668 Teuchos::reduceAll<int,uint64_t>(*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,vertsPerRound,totalVertsPerRound);
1669 Teuchos::reduceAll<int,uint64_t>(*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,incorrectGhostsPerRound,totalIncorrectGhostsPerRound);
1670 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,totalPerRound, finalTotalPerRound);
1671 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,recoloringPerRound,maxRecoloringPerRound);
1672 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MIN,numStatisticRecordingRounds,recoloringPerRound,minRecoloringPerRound);
1673 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,commPerRound,finalCommPerRound);
1674 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,compPerRound,finalCompPerRound);
1675 Teuchos::reduceAll<int,double>(*comm,
1676 Teuchos::REDUCE_MAX,numStatisticRecordingRounds,conflictDetectionPerRound,finalConflictDetectionPerRound);
1677 Teuchos::reduceAll<int,gno_t> (*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,recvPerRound,finalRecvPerRound);
1678 Teuchos::reduceAll<int,gno_t> (*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,sentPerRound,finalSentPerRound);
1679 std::cout << "Rank " << comm->getRank()
1680 << ": boundary size: " << localBoundaryVertices << std::endl;
1681 if(comm->getRank() == 0)
1682 std::cout << "Total boundary size: " << totalBoundarySize << std::endl;
1683 for(int i = 0; i < std::min((int)distributedRounds,numStatisticRecordingRounds); i++){
1684 std::cout << "Rank " << comm->getRank()
1685 << ": recolor " << vertsPerRound[i]
1686 << " vertices in round " << i << std::endl;
1687 std::cout << "Rank " << comm->getRank()
1688 << " sentbuf had " << sentPerRound[i]
1689 << " entries in round " << i << std::endl;
1690 if(comm->getRank()==0){
1691 std::cout << "recolored " << totalVertsPerRound[i]
1692 << " vertices in round " << i << std::endl;
1693 std::cout << totalIncorrectGhostsPerRound[i]
1694 << " inconsistent ghosts in round " << i << std::endl;
1695 std::cout << "total time in round " << i
1696 << ": " << finalTotalPerRound[i] << std::endl;
1697 std::cout << "recoloring time in round " << i
1698 << ": " << maxRecoloringPerRound[i] << std::endl;
1699 std::cout << "min recoloring time in round " << i
1700 << ": " << minRecoloringPerRound[i] << std::endl;
1701 std::cout << "conflict detection time in round " << i
1702 << ": " << finalConflictDetectionPerRound[i] << std::endl;
1703 std::cout << "comm time in round " << i
1704 << ": " << finalCommPerRound[i] << std::endl;
1705 std::cout << "recvbuf size in round " << i
1706 << ": " << finalRecvPerRound[i] << std::endl;
1707 std::cout << "sendbuf size in round " << i
1708 << ": " << finalSentPerRound[i] << std::endl;
1709 std::cout << "comp time in round " << i
1710 << ": " << finalCompPerRound[i] << std::endl;
1711 }
1712 }
1713 } else if (timing){
1714 double global_total_time = 0.0;
1715 double global_recoloring_time = 0.0;
1716 double global_min_recoloring_time = 0.0;
1717 double global_conflict_detection=0.0;
1718 double global_comm_time=0.0;
1719 double global_comp_time=0.0;
1720 double global_interior_time=0.0;
1721 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&total_time,&global_total_time);
1722 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&recoloring_time,&global_recoloring_time);
1723 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MIN,1,&recoloring_time,&global_min_recoloring_time);
1724 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&conflict_detection,&global_conflict_detection);
1725 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&comm_time,&global_comm_time);
1726 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&comp_time,&global_comp_time);
1727 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&interior_time,&global_interior_time);
1728 comm->barrier();
1729 fflush(stdout);
1730 if(comm->getRank()==0){
1731 std::cout << "Total Time: " << global_total_time << std::endl;
1732 std::cout << "Interior Time: " << global_interior_time << std::endl;
1733 std::cout << "Recoloring Time: " << global_recoloring_time << std::endl;
1734 std::cout << "Min Recoloring Time: " << global_min_recoloring_time << std::endl;
1735 std::cout << "Conflict Detection Time: " << global_conflict_detection << std::endl;
1736 std::cout << "Comm Time: " << global_comm_time << std::endl;
1737 std::cout << "Comp Time: " << global_comp_time << std::endl;
1738 }
1739 }
1740 }
1741}; //end class
1742
1743}//end namespace Zoltan2
1744
1745#endif
AlltoAll communication methods.
Defines the ColoringSolution class.
Defines the GraphModel interface.
Traits class to handle conversions between gno_t/lno_t and TPL data types (e.g., ParMETIS's idx_t,...
A gathering of useful namespace methods.
typename femv_t::device_type device_type
virtual void constructBoundary(const size_t n_local, Kokkos::View< offset_t *, device_type > dist_offsets_dev, Kokkos::View< lno_t *, device_type > dist_adjs_dev, typename Kokkos::View< offset_t *, device_type >::HostMirror dist_offsets_host, typename Kokkos::View< lno_t *, device_type >::HostMirror dist_adjs_host, Kokkos::View< lno_t *, device_type > &boundary_verts, Kokkos::View< lno_t *, device_type > verts_to_send_view, Kokkos::View< size_t *, device_type, Kokkos::MemoryTraits< Kokkos::Atomic > > verts_to_send_size_atomic)=0
typename Adapter::base_adapter_t base_adapter_t
void twoGhostLayer(const size_t n_local, const size_t n_total, const Teuchos::ArrayView< const lno_t > &adjs, const Teuchos::ArrayView< const offset_t > &offsets, const Teuchos::ArrayView< const lno_t > &ghost_adjs, const Teuchos::ArrayView< const offset_t > &ghost_offsets, const Teuchos::RCP< femv_t > &femv, const Teuchos::ArrayView< const gno_t > &gids, const Teuchos::ArrayView< const int > &rand, const Teuchos::ArrayView< const int > &ghost_owners, RCP< const map_t > mapOwnedPlusGhosts, const std::unordered_map< lno_t, std::vector< int > > &procs_to_send)
typename Adapter::gno_t gno_t
typename Adapter::scalar_t scalar_t
typename Adapter::offset_t offset_t
Tpetra::FEMultiVector< femv_scalar_t, lno_t, gno_t > femv_t
typename femv_t::host_view_type::device_type::execution_space host_exec
typename device_type::execution_space execution_space
void color(const RCP< ColoringSolution< Adapter > > &solution)
Coloring method.
virtual void detectConflicts(const size_t n_local, Kokkos::View< offset_t *, device_type > dist_offsets_dev, Kokkos::View< lno_t *, device_type > dist_adjs_dev, Kokkos::View< int *, device_type > femv_colors, Kokkos::View< lno_t *, device_type > boundary_verts_view, Kokkos::View< lno_t *, device_type > verts_to_recolor_view, Kokkos::View< int *, device_type, Kokkos::MemoryTraits< Kokkos::Atomic > > verts_to_recolor_size_atomic, Kokkos::View< lno_t *, device_type > verts_to_send_view, Kokkos::View< size_t *, device_type, Kokkos::MemoryTraits< Kokkos::Atomic > > verts_to_send_size_atomic, Kokkos::View< size_t *, device_type > recoloringSize, Kokkos::View< int *, device_type > rand, Kokkos::View< gno_t *, device_type > gid, Kokkos::View< gno_t *, device_type > ghost_degrees, bool recolor_degrees)=0
virtual void detectConflicts_serial(const size_t n_local, typename Kokkos::View< offset_t *, device_type >::HostMirror dist_offsets_host, typename Kokkos::View< lno_t *, device_type >::HostMirror dist_adjs_host, typename Kokkos::View< int *, device_type >::HostMirror femv_colors, typename Kokkos::View< lno_t *, device_type >::HostMirror boundary_verts_view, typename Kokkos::View< lno_t *, device_type >::HostMirror verts_to_recolor_view, typename Kokkos::View< int *, device_type >::HostMirror verts_to_recolor_size_atomic, typename Kokkos::View< lno_t *, device_type >::HostMirror verts_to_send_view, typename Kokkos::View< size_t *, device_type >::HostMirror verts_to_send_size_atomic, typename Kokkos::View< size_t *, device_type >::HostMirror recoloringSize, typename Kokkos::View< int *, device_type >::HostMirror rand, typename Kokkos::View< gno_t *, device_type >::HostMirror gid, typename Kokkos::View< gno_t *, device_type >::HostMirror ghost_degrees, bool recolor_degrees)=0
typename femv_t::host_view_type::device_type::memory_space host_mem
RCP< Teuchos::ParameterList > pl
RCP< const base_adapter_t > adapter
AlgTwoGhostLayer(const RCP< const base_adapter_t > &adapter_, const RCP< Teuchos::ParameterList > &pl_, const RCP< Environment > &env_, const RCP< const Teuchos::Comm< int > > &comm_)
typename device_type::memory_space memory_space
typename Adapter::lno_t lno_t
Tpetra::Map< lno_t, gno_t > map_t
RCP< const Teuchos::Comm< int > > comm
Algorithm defines the base class for all algorithms.
map_t::local_ordinal_type lno_t
map_t::global_ordinal_type gno_t
Created by mbenlioglu on Aug 31, 2020.
std::bitset< NUM_MODEL_FLAGS > modelFlag_t
@ REMOVE_SELF_EDGES
algorithm requires no self edges