Zoltan2_AlgMultiJagged.hpp
// @HEADER
//
// ***********************************************************************
//
//   Zoltan2: A package of combinatorial algorithms for scientific computing
//                  Copyright 2012 Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Karen Devine      (kddevin@sandia.gov)
//                    Erik Boman        (egboman@sandia.gov)
//                    Siva Rajamanickam (srajama@sandia.gov)
//
// ***********************************************************************
//
// @HEADER
#ifndef _ZOLTAN2_ALGMultiJagged_HPP_
#define _ZOLTAN2_ALGMultiJagged_HPP_

#include <Zoltan2_Algorithm.hpp>
#include <Zoltan2_Util.hpp>
#include <Tpetra_Distributor.hpp>
#include <Teuchos_StandardParameterEntryValidators.hpp>
#include <Teuchos_ParameterList.hpp>
#include <Kokkos_Sort.hpp>

#include <algorithm> // std::sort
#include <vector>
#include <unordered_map>

#ifdef ZOLTAN2_USEZOLTANCOMM
#ifdef HAVE_ZOLTAN2_MPI
#define ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
#include "zoltan_comm_cpp.h"
#include "zoltan_types.h" // for error codes
#endif
#endif
namespace Teuchos{

/*! \brief Zoltan2_BoxBoundaries is a reduction operation
 *  used to all-reduce box boundary values across processes.
 */
template <typename Ordinal, typename T>
class Zoltan2_BoxBoundaries : public ValueTypeReductionOp<Ordinal,T>
{
private:
  Ordinal size;
  T epsilon;

public:
  /*! \brief Default constructor.
   */
  Zoltan2_BoxBoundaries(): size(0),
    epsilon(std::numeric_limits<T>::epsilon()) {}

  /*! \brief Constructor.
   *  \param s_ the size of the reduction buffer.
   */
  Zoltan2_BoxBoundaries(Ordinal s_):
    size(s_), epsilon(std::numeric_limits<T>::epsilon()) {}

  /*! \brief Implement the Teuchos::ValueTypeReductionOp interface:
   *  keep the incoming value whenever it is nonzero (up to epsilon).
   */
  void reduce( const Ordinal count, const T inBuffer[], T inoutBuffer[]) const {
    for(Ordinal i = 0; i < count; i++) {
      if(Z2_ABS(inBuffer[i]) > epsilon) {
        inoutBuffer[i] = inBuffer[i];
      }
    }
  }
};

} // namespace Teuchos
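
// A minimal usage sketch (illustrative only; `dim` and `comm` are assumed
// names, and the mins-then-maxs buffer layout is an assumption for the
// example): a ValueTypeReductionOp such as Zoltan2_BoxBoundaries can be
// handed to Teuchos::reduceAll.
//
// \code
// Teuchos::Zoltan2_BoxBoundaries<int, double> boxOp(2 * dim);
// std::vector<double> localBounds(2 * dim);   // local mins, then maxs
// std::vector<double> globalBounds(2 * dim);
// Teuchos::reduceAll<int, double>(*comm, boxOp, 2 * dim,
//                                 &localBounds[0], &globalBounds[0]);
// \endcode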

namespace Zoltan2{

/*! \brief Item to sort with multiple key values.
 *  Stores an index and a pointer to `count` values that are compared
 *  lexicographically; ties are broken by index.
 */
template <typename IT, typename CT, typename WT>
class uMultiSortItem
{
public:
  // TODO: Why volatile?
  // no idea, another intel compiler failure.
  volatile IT index;
  volatile CT count;
  volatile WT *val;
  volatile WT epsilon;

  uMultiSortItem() {
    this->index = 0;
    this->count = 0;
    this->val = NULL;
    this->epsilon = std::numeric_limits<WT>::epsilon() * 100;
  }

  // TODO: Document these methods?
  uMultiSortItem(IT index_, CT count_, WT *vals_) {
    this->index = index_;
    this->count = count_;
    this->val = vals_;
    this->epsilon = std::numeric_limits<WT>::epsilon() * 100;
  }

  void set(IT index_, CT count_, WT *vals_) {
    this->index = index_;
    this->count = count_;
    this->val = vals_;
  }

  bool operator<(const uMultiSortItem<IT,CT,WT>& other) const {
    assert(this->count == other.count);
    for(CT i = 0; i < this->count; ++i) {
      // if the values are equal, go to the next one.
      if(std::abs(this->val[i] - other.val[i]) < this->epsilon) {
        continue;
      }
      // if this value is smaller, return true;
      if(this->val[i] < other.val[i]) {
        return true;
      }
      // if this value is bigger, return false.
      else {
        return false;
      }
    }
    // if they are entirely equal, break the tie by index.
    return this->index < other.index;
  }
};
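
// A minimal usage sketch (illustrative only; the key arrays are assumed
// data for the example): since uMultiSortItem defines operator<, a vector
// of items can be ordered with std::sort.
//
// \code
// double keyA[2] = {1.0, 2.0};
// double keyB[2] = {1.0, 1.0};
// std::vector<uMultiSortItem<int, int, double> > items;
// items.push_back(uMultiSortItem<int, int, double>(0, 2, keyA));
// items.push_back(uMultiSortItem<int, int, double>(1, 2, keyB));
// std::sort(items.begin(), items.end());
// // items[0].index == 1, since (1.0, 1.0) < (1.0, 2.0) lexicographically.
// \endcode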

/*! \brief Sort item to be used by the quick sort function below.
 */
template <class IT, class WT>
struct uSortItem
{
  IT id;
  WT val;
};

/*! \brief Quick sort function.
 *  Sorts the arr of uSortItems in place, with respect to increasing vals.
 *  Non-recursive quicksort with an insertion sort for short ranges.
 */
template <class IT, class WT>
void uqsort(IT n, uSortItem<IT, WT> * arr) {
  const int NSTACK = 50;
  int M = 7;
  IT i, ir=n, j, k, l=1;
  IT jstack=0, istack[NSTACK];
  WT aval;
  uSortItem<IT,WT> a;

  --arr;
  for(;;) {
    if(ir-l < M) {
      // insertion sort for the small subarray
      for(j=l+1;j<=ir;j++) {
        a=arr[j];
        aval = a.val;
        for(i=j-1;i>=1;i--) {
          if(arr[i].val <= aval)
            break;
          arr[i+1] = arr[i];
        }
        arr[i+1]=a;
      }
      if(jstack == 0)
        break;
      ir=istack[jstack--];
      l=istack[jstack--];
    }
    else {
      // median-of-three pivot selection, then partition
      k=(l+ir) >> 1;
      std::swap(arr[k],arr[l+1]);
      if(arr[l+1].val > arr[ir].val) {
        std::swap(arr[l+1],arr[ir]);
      }
      if(arr[l].val > arr[ir].val) {
        std::swap(arr[l],arr[ir]);
      }
      if(arr[l+1].val > arr[l].val) {
        std::swap(arr[l+1],arr[l]);
      }
      i=l+1;
      j=ir;
      a=arr[l];
      aval = a.val;
      for(;;) {
        do i++; while (arr[i].val < aval);
        do j--; while (arr[j].val > aval);
        if(j < i) break;
        std::swap(arr[i],arr[j]);
      }
      arr[l]=arr[j];
      arr[j]=a;
      jstack += 2;
      if(jstack > NSTACK) {
        std::cout << "uqsort: NSTACK too small in sort." << std::endl;
        std::terminate();
      }
      // push the larger subarray on the stack, iterate into the smaller
      if(ir-i+1 >= j-l) {
        istack[jstack]=ir;
        istack[jstack-1]=i;
        ir=j-1;
      }
      else {
        istack[jstack]=j-1;
        istack[jstack-1]=l;
        l=i;
      }
    }
  }
}
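
// A minimal usage sketch (illustrative only): sorting ids by value.
//
// \code
// uSortItem<int, double> items[3];
// items[0].id = 10; items[0].val = 3.5;
// items[1].id = 11; items[1].val = 1.5;
// items[2].id = 12; items[2].val = 2.5;
// uqsort<int, double>(3, items);
// // items are now ordered by increasing val: ids 11, 12, 10.
// \endcode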

/*! \brief Sort item with a sign bit, so that a signed value can be stored
 *  as a magnitude plus sign and ordered accordingly.
 */
template <class IT, class WT, class SIGN>
struct uSignedSortItem
{
  IT id;
  WT val;
  SIGN signbit; // 1 means positive, 0 means negative.
  bool operator<(const uSignedSortItem<IT, WT, SIGN>& rhs) const {
    /* if I am negative, the other is positive */
    if(this->signbit < rhs.signbit) {
      return true;
    }
    /* if both have the same sign */
    else if(this->signbit == rhs.signbit) {
      if(this->val < rhs.val) { // if my value is smaller,
        return this->signbit;   // then if we both are positive return true.
                                // if we both are negative, return false.
      }
      else if(this->val > rhs.val) { // if my value is larger,
        return !this->signbit; // then if we both are positive return false.
                               // if we both are negative, return true.
      }
      else { // if both are equal.
        return false;
      }
    }
    else {
      /* if I am positive, the other is negative */
      return false;
    }
  }

  bool operator<=(const uSignedSortItem<IT, WT, SIGN>& rhs) const {
    return (this->val == rhs.val && this->signbit == rhs.signbit) || (*this < rhs);
  }
};

/*! \brief Quick sort function.
 *  Sorts the arr of uSignedSortItems in place, in increasing order with
 *  respect to the signed comparison defined by operator<.
 */
template <class IT, class WT, class SIGN>
void uqSignsort(IT n, uSignedSortItem<IT, WT, SIGN> * arr) {
  const IT NSTACK = 50;
  IT M = 7;
  IT i, ir=n, j, k, l=1;
  IT jstack=0, istack[NSTACK];
  uSignedSortItem<IT,WT,SIGN> a;

  --arr;
  for(;;) {
    if(ir < M + l) {
      // insertion sort for the small subarray
      for(j=l+1;j<=ir;j++) {
        a=arr[j];
        for(i=j-1;i>=1;i--) {
          if(arr[i] <= a) {
            break;
          }
          arr[i+1] = arr[i];
        }
        arr[i+1]=a;
      }
      if(jstack == 0) {
        break;
      }
      ir=istack[jstack--];
      l=istack[jstack--];
    }
    else {
      // median-of-three pivot selection, then partition
      k=(l+ir) >> 1;
      std::swap(arr[k],arr[l+1]);
      if(arr[ir] < arr[l+1]) {
        std::swap(arr[l+1],arr[ir]);
      }
      if(arr[ir] < arr[l] ) {
        std::swap(arr[l],arr[ir]);
      }
      if(arr[l] < arr[l+1]) {
        std::swap(arr[l+1],arr[l]);
      }
      i=l+1;
      j=ir;
      a=arr[l];
      for(;;) {
        do i++; while (arr[i] < a);
        do j--; while (a < arr[j]);
        if(j < i) break;
        std::swap(arr[i],arr[j]);
      }
      arr[l]=arr[j];
      arr[j]=a;
      jstack += 2;
      if(jstack > NSTACK) {
        std::cout << "uqSignsort: NSTACK too small in sort." << std::endl;
        std::terminate();
      }
      // push the larger subarray on the stack, iterate into the smaller
      if(ir+l+1 >= j+i) {
        istack[jstack]=ir;
        istack[jstack-1]=i;
        ir=j-1;
      }
      else {
        istack[jstack]=j-1;
        istack[jstack-1]=l;
        l=i;
      }
    }
  }
}
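
// A minimal usage sketch (illustrative only): with val stored as a
// magnitude, {id, val, signbit} encodes the signed value
// (signbit ? +val : -val), so uqSignsort orders -3.0 < -1.0 < +2.0 below.
//
// \code
// uSignedSortItem<int, double, char> items[3] =
//   {{0, 3.0, 0}, {1, 1.0, 0}, {2, 2.0, 1}};
// uqSignsort<int, double, char>(3, items);
// // resulting id order: 0 (-3.0), 1 (-1.0), 2 (+2.0).
// \endcode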

// This exists only so we can track how many times the MJ algorithm is
// called and put each of those calls into different timer names.
// Currently MultiJaggedTest.cpp will actually call it twice: first with
// data from a Tpetra MultiVector, and then a second time using a
// BasicVectorAdapter, which allows us to turn UVM off for some tests. The
// results of the two runs are compared, which helps to catch a lot of bugs.
// For profiling I'm mostly just interested in the UVM-off case and need it
// to be in separate timers. Passing a value through would mess up the API.
// Possibly we could check the Adapter and use that. The statics have to be
// outside the templated class, as the two called instances will have
// different template parameters. Another complication is that
// MultiJagged.cpp will call through the Zoltan2_AlgMJ class and we want to
// time things in both classes. However, TaskMapper will directly call
// AlgMJ, so I made two counters for the two classes to make sure the count
// was always correct. This does not impact any behavior and has the sole
// purpose of generating unique timer names. If you run an MJ test you'll
// see MJ(0) and MJ(1) in the names to distinguish the 1st and 2nd run.
// Right now only MultiJaggedTest.cpp cares about this.
struct Zoltan2_AlgMJ_TrackCallsCounter {
  static int get_counter_AlgMJ() {
    static int counter = 0;
    return counter++;
  }
  static int get_counter_Zoltan2_AlgMJ() {
    static int counter = 0;
    return counter++;
  }
};
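
// A minimal sketch (illustrative only; the exact format string is an
// assumption here) of how such a counter can feed a unique timer prefix:
//
// \code
// int invocation = Zoltan2_AlgMJ_TrackCallsCounter::get_counter_AlgMJ();
// std::string timer_base = "MJ(" + std::to_string(invocation) + ") - ";
// // first call yields "MJ(0) - ", second call "MJ(1) - "
// \endcode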

/*! \brief Multi Jagged coordinate partitioning algorithm.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
  typename mj_part_t, typename mj_node_t>
class AlgMJ
{
private:
  typedef typename mj_node_t::device_type device_t; // for views
  typedef coordinateModelPartBox mj_partBox_t;
  typedef std::vector<mj_partBox_t> mj_partBoxVector_t;

  // if the (last dimension reduce-all count) x (the MPI world size) is
  // estimated to be bigger than this number, then migration will be forced
  // in earlier iterations.
  static constexpr size_t future_reduceall_cutoff = 1500000;

  // if parts right before the last dimension are estimated to have fewer
  // than min_work_last_dim coords, migration will be forced in earlier
  // iterations.
  static constexpr mj_lno_t min_work_last_dim = 1000;

  static constexpr mj_scalar_t least_signifiance = 0.0001;
  static constexpr int significance_mul = 1000;

  std::string mj_timer_base_string; // for convenience making timer names

  RCP<const Environment> mj_env; // the environment object
  RCP<const Comm<int> > mj_problemComm; // initial comm object
  RCP<Comm<int> > comm; // comm object that can be altered during execution
  double imbalance_tolerance; // input imbalance tolerance.
  int recursion_depth; // number of steps in which partitioning will be solved.
  int coord_dim; // coordinate dimension.
  int num_weights_per_coord; // number of weights per coordinate.
  size_t initial_num_loc_coords; // initial number of local coords.
  global_size_t initial_num_glob_coords; // initial number of global coords.
  mj_lno_t num_local_coords; // number of local coords.
  mj_gno_t num_global_coords; // number of global coords.
  mj_scalar_t sEpsilon; // epsilon for mj_scalar_t

  // can distribute points on the same coordinate to different parts.
  bool distribute_points_on_cut_lines;

  // how many parts we can calculate concurrently.
  mj_part_t max_concurrent_part_calculation;

  bool mj_run_as_rcb; // means recursion depth is adjusted to its maximum value.
  int mj_user_recursion_depth; // the recursion depth value provided by user.
  bool mj_keep_part_boxes; // if the boxes need to be kept.

  // whether to migrate=1, avoid migration=2, or leave the decision to MJ=0
  int check_migrate_avoid_migration_option;

  // when doing the migration, 0 will aim for perfect load balance, 1 will
  // aim for a minimized number of messages with possibly bad load balance
  int migration_type;

  // when MJ decides whether to migrate, the minimum imbalance for migration.
  double minimum_migration_imbalance;

  // Nonuniform first-level partitioning
  // (currently available only for sequential_task_partitioning):
  // used for Dragonfly task mapping by partitioning Dragonfly RCA
  // machine coordinates and application coordinates.
  // An optimization that completely partitions the most important machine
  // dimension first (i.e. the Dragonfly group coordinate, or RCA's x
  // coordinate). The standard MJ algorithm follows after the nonuniform
  // first-level partitioning.
  //
  // Example (first-level partitioning): if we have 120 elements,
  // num_first_level_parts = 3, first_level_distribution = [4, 10, 6], then
  // part sizes after the first level will be [24, 60, 36]. Standard uniform
  // MJ continues for all subsequent levels.

  // If used, number of parts requested for the nonuniform
  // first-level partitioning
  mj_part_t num_first_level_parts;

  // If used, the requested distribution of parts for the
  // nonuniform first-level partitioning
  Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;

  mj_part_t total_num_cut; // how many cuts there will be in total
  mj_part_t total_num_part; // how many parts there will be in total

  mj_part_t max_num_part_along_dim; // maximum part count along a dimension.
  mj_part_t max_num_cut_along_dim; // maximum cut count along a dimension.

  // maximum part+cut count along a dimension.
  size_t max_num_total_part_along_dim;

  mj_part_t total_dim_num_reduce_all; // estimate on how many reduceAlls can be done.

  // max number of parts that might occur during the partitioning before the
  // last partitioning dimension.
  mj_part_t last_dim_num_part;

  // input part array specifying the number of parts to divide into along
  // each dimension.
  Kokkos::View<mj_part_t *, Kokkos::HostSpace> part_no_array;

  // two-dimensional coordinate array;
  // coordinates in MJ are LayoutLeft since Tpetra MultiVector gives LayoutLeft
  Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
    mj_coordinates;

  // two-dimensional weight array
  Kokkos::View<mj_scalar_t **, device_t> mj_weights;

  // if the target parts are uniform
  Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_parts;

  // if the coordinates have uniform weights
  Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_weights;

  int mj_num_teams; // the number of teams

  size_t num_global_parts; // the targeted number of parts

  // vector of all boxes for all parts; constructed if mj_keep_part_boxes is true
  RCP<mj_partBoxVector_t> kept_boxes;

  RCP<mj_partBox_t> global_box;

  int myRank; // processor rank
  int myActualRank; // initial rank

  bool divide_to_prime_first;

  // initial global ids of the coordinates.
  Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;

  // current global ids of the coordinates; might change during migration.
  Kokkos::View<mj_gno_t*, device_t> current_mj_gnos;

  // the actual processor owner of the coordinate, to track after migrations.
  Kokkos::View<int*, Kokkos::HostSpace> owner_of_coordinate;

  // permutation of coordinates, for partitioning.
  Kokkos::View<mj_lno_t*, device_t> coordinate_permutations;

  // permutation work array.
  Kokkos::View<mj_lno_t*, device_t> new_coordinate_permutations;

  // the part ids assigned to coordinates.
  Kokkos::View<mj_part_t*, device_t> assigned_part_ids;

  // beginning and end of each part.
  Kokkos::View<mj_lno_t *, device_t> part_xadj;

  // work array for beginning and end of each part.
  Kokkos::View<mj_lno_t *, device_t> new_part_xadj;

  Kokkos::View<mj_scalar_t *, device_t> all_cut_coordinates;

  // how much weight each MPI process should put on the left side of each cutline
  Kokkos::View<mj_scalar_t *, device_t>
    process_cut_line_weight_to_put_left;

  // weight percentage each thread in an MPI process puts on the left side
  // of each cutline
  Kokkos::View<mj_scalar_t *, device_t>
    thread_cut_line_weight_to_put_left;

  // work array to manipulate coordinates of cutlines in different iterations.
  // necessary because the previous cut line information is used for
  // determining the next cutline information; therefore, we cannot update
  // the cut work array until all cutlines are determined.
  Kokkos::View<mj_scalar_t *, device_t> cut_coordinates_work_array;

  // used for swapping the above cut_coordinates_work_array
  Kokkos::View<mj_scalar_t *, device_t> temp_cut_coords;

  // cumulative part weight array.
  Kokkos::View<mj_scalar_t *, device_t> target_part_weights;

  // upper bound coordinate of a cut line
  Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_coordinates;

  // lower bound coordinate of a cut line
  Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_coordinates;

  // lower bound weight of a cut line
  Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_weights;

  // upper bound weight of a cut line
  Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_weights;

  // combined array to exchange the min and max coordinate, and total
  // weight of part.
  Kokkos::View<mj_scalar_t *, device_t>
    process_local_min_max_coord_total_weight;

  // global combined array with the results for min, max and total weight.
  Kokkos::View<mj_scalar_t *, device_t>
    global_min_max_coord_total_weight;

  // is_cut_line_determined is used to determine if a cutline is already
  // determined. If a cut line is already determined, the next iterations
  // will skip this cut line.
  Kokkos::View<bool *, device_t> is_cut_line_determined;

  // incomplete_cut_count holds the number of cutlines that have not
  // been finalized for each part when concurrentPartCount > 1. Using this
  // information, if incomplete_cut_count[x] == 0, then no work is done
  // for this part.
  Kokkos::View<mj_part_t *, device_t> device_incomplete_cut_count;
  typename decltype(device_incomplete_cut_count)::HostMirror
    incomplete_cut_count;

  // Need a quick accessor for this on host
  typename decltype (part_xadj)::HostMirror host_part_xadj;

  // local part weights of each thread.
  Kokkos::View<double *, device_t>
    thread_part_weights;

  // the work manipulation array for part weights.
  Kokkos::View<double *, device_t>
    thread_part_weight_work;

  // thread_cut_left_closest_point holds the closest coordinate
  // to a cutline from the left (for each thread).
  Kokkos::View<mj_scalar_t *, device_t>
    thread_cut_left_closest_point;

  // thread_cut_right_closest_point holds the closest coordinate
  // to a cutline from the right (for each thread).
  Kokkos::View<mj_scalar_t *, device_t>
    thread_cut_right_closest_point;

  // to store how many points in each part a thread has.
  Kokkos::View<mj_lno_t *, device_t>
    thread_point_counts;

  Kokkos::View<mj_scalar_t *, device_t> process_rectilinear_cut_weight;
  Kokkos::View<mj_scalar_t *, device_t> global_rectilinear_cut_weight;

  // for faster communication, a concatenation of:
  // totalPartWeights, sized 2P-1, since there are P parts and P-1 cut lines;
  // leftClosest distances, sized P-1, since there are P-1 cut lines;
  // rightClosest distances, sized P-1, since there are P-1 cut lines.
  Kokkos::View<mj_scalar_t *, device_t>
    total_part_weight_left_right_closests;
  Kokkos::View<mj_scalar_t *, device_t>
    global_total_part_weight_left_right_closests;

  Kokkos::View<mj_part_t*, device_t> device_num_partitioning_in_current_dim;
  typename decltype(device_num_partitioning_in_current_dim)::HostMirror
    host_num_partitioning_in_current_dim; // for quick access on host

  /* \brief helper function to calculate imbalance.
   * \param achieved balance we achieved.
   * \param expected balance expected.
   */
  static
  KOKKOS_INLINE_FUNCTION
  double calculate_imbalance(mj_scalar_t achieved, mj_scalar_t expected) {
    return static_cast<double>(achieved) / static_cast<double>(expected) - 1.0;
  }
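
  // Worked example: achieved = 55 and expected = 50 gives
  // 55/50 - 1 = 0.10, i.e. the part is 10% heavier than its target.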

  /* \brief Either the mj array (part_no_array) or num_global_parts should be
   * provided in the input. part_no_array takes precedence if both are
   * provided. Depending on these parameters, the total cut/part count, the
   * maximum part/cut count along a dimension, the estimated number of
   * reduceAlls, and the number of parts before the last dimension are
   * calculated.
   * */
  void set_part_specifications();

  /* \brief Tries to determine the part number for the current dimension,
   * by trying to make the partitioning as square as possible.
   * \param num_total_future how many more partitionings are required.
   * \param root how much recursion depth is left.
   */
  inline mj_part_t get_part_count(
    mj_part_t num_total_future,
    double root);

  /* \brief For part communication we keep track of the box boundaries.
   * This is performed when either asked specifically, or when geometric
   * mapping is performed afterwards. This function initializes a single box
   * with all global min and max coordinates.
   * \param initial_partitioning_boxes the input and output vector for boxes.
   */
  void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);

  /* \brief Returns how many parts will be obtained after this dimension's
   * partitioning. It sets how many parts each current part will be
   * partitioned into in this dimension in the
   * device_num_partitioning_in_current_dim vector, and sets how many total
   * future parts each obtained part will be partitioned into in the
   * next_future_num_parts_in_parts vector. If part boxes are kept, it
   * initializes output_part_boxes from their ancestors.
   * \param future_num_part_in_parts: input, how many future parts each
   * current part will be partitioned into.
   * \param next_future_num_parts_in_parts: output, how many future parts
   * each obtained part will be partitioned into.
   * \param future_num_parts: output, max number of future parts that will be
   * obtained from a single part.
   * \param current_num_parts: input, how many parts there are currently.
   * \param current_iteration: input, current dimension iteration number.
   * \param input_part_boxes: input, if boxes are kept, current boxes.
   * \param output_part_boxes: output, if boxes are kept, the initial box
   * boundaries for obtained parts.
   * \param atomic_part_count // DOCWORK: Documentation
   */
  mj_part_t update_part_num_arrays(
    std::vector<mj_part_t> *future_num_part_in_parts,
    std::vector<mj_part_t> *next_future_num_parts_in_parts,
    mj_part_t &future_num_parts,
    mj_part_t current_num_parts,
    int current_iteration,
    RCP<mj_partBoxVector_t> input_part_boxes,
    RCP<mj_partBoxVector_t> output_part_boxes,
    mj_part_t atomic_part_count);
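
  // Worked example: with part_no_array = [2, 3, 4] and recursion_depth = 3,
  // the first dimension splits into 2 parts, each of those into 3, and each
  // of those into 4, giving 2*3*4 = 24 parts in total.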

  /* \brief Calculates the new cut position between the given lower and
   * upper bound coordinates, using their weights and the expected weight
   * at the cut.
   */
  static
  KOKKOS_INLINE_FUNCTION
  void mj_calculate_new_cut_position (
    mj_scalar_t cut_upper_bound,
    mj_scalar_t cut_lower_bound,
    mj_scalar_t cut_upper_weight,
    mj_scalar_t cut_lower_weight,
    mj_scalar_t expected_weight,
    mj_scalar_t &new_cut_position,
    mj_scalar_t sEpsilon);

  bool mj_perform_migration(
    mj_part_t in_num_parts, // current number of parts
    mj_part_t &out_num_parts, // output number of parts.
    std::vector<mj_part_t> *next_future_num_parts_in_parts,
    mj_part_t &output_part_begin_index,
    size_t migration_reduce_all_population,
    mj_lno_t num_coords_for_last_dim_part,
    std::string iteration,
    RCP<mj_partBoxVector_t> &input_part_boxes,
    RCP<mj_partBoxVector_t> &output_part_boxes);

  bool mj_check_to_migrate(
    size_t migration_reduce_all_population,
    mj_lno_t num_coords_for_last_dim_part,
    mj_part_t num_procs,
    mj_part_t num_parts,
    mj_gno_t *num_points_in_all_processor_parts);

  void mj_migration_part_proc_assignment(
    mj_gno_t * num_points_in_all_processor_parts,
    mj_part_t num_parts,
    mj_part_t num_procs,
    mj_lno_t *send_count_to_each_proc,
    std::vector<mj_part_t> &processor_ranks_for_subcomm,
    std::vector<mj_part_t> *next_future_num_parts_in_parts,
    mj_part_t &out_num_part,
    std::vector<mj_part_t> &out_part_indices,
    mj_part_t &output_part_numbering_begin_index,
    int *coordinate_destinations);

  void mj_assign_proc_to_parts(
    mj_gno_t * num_points_in_all_processor_parts,
    mj_part_t num_parts,
    mj_part_t num_procs,
    mj_lno_t *send_count_to_each_proc,
    std::vector<mj_part_t> &processor_ranks_for_subcomm,
    std::vector<mj_part_t> *next_future_num_parts_in_parts,
    mj_part_t &out_part_index,
    mj_part_t &output_part_numbering_begin_index,
    int *coordinate_destinations);

  void assign_send_destinations(
    mj_part_t num_parts,
    mj_part_t *part_assignment_proc_begin_indices,
    mj_part_t *processor_chains_in_parts,
    mj_lno_t *send_count_to_each_proc,
    int *coordinate_destinations);

  void assign_send_destinations2(
    mj_part_t num_parts,
    uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
    int *coordinate_destinations,
    mj_part_t &output_part_numbering_begin_index,
    std::vector<mj_part_t> *next_future_num_parts_in_parts);

  void mj_assign_parts_to_procs(
    mj_gno_t * num_points_in_all_processor_parts,
    mj_part_t num_parts,
    mj_part_t num_procs,
    mj_lno_t *send_count_to_each_proc,
    std::vector<mj_part_t> *next_future_num_parts_in_parts,
    mj_part_t &out_num_part,
    std::vector<mj_part_t> &out_part_indices,
    mj_part_t &output_part_numbering_begin_index,
    int *coordinate_destinations);

  void mj_migrate_coords(
    mj_part_t num_procs,
    mj_lno_t &num_new_local_points,
    std::string iteration,
    int *coordinate_destinations,
    mj_part_t num_parts);

  /* \brief Creates a sub-communicator from the given processor ranks.
   */
  void create_sub_communicator(
    std::vector<mj_part_t> &processor_ranks_for_subcomm);

  /* \brief Returns the largest prime factor of num_parts.
   */
  mj_part_t find_largest_prime_factor(mj_part_t num_parts) {
    mj_part_t largest_factor = 1;
    mj_part_t n = num_parts;
    mj_part_t divisor = 2;
    while (n > 1) {
      while (n % divisor == 0) {
        n = n / divisor;
        largest_factor = divisor;
      }
      ++divisor;
      if(divisor * divisor > n) {
        if(n > 1) {
          largest_factor = n;
        }
        break;
      }
    }
    return largest_factor;
  }
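
  // Worked examples: find_largest_prime_factor(12) strips the factor 2
  // (12 -> 6 -> 3) and returns 3; find_largest_prime_factor(20) returns 5;
  // find_largest_prime_factor(7) returns 7.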

public:
  AlgMJ();

  // DOCWORK: Make param documentation use : consistently
  /*! \brief Multi Jagged coordinate partitioning algorithm.
   *  Partitions the given coordinates into num_global_parts parts and
   *  returns the resulting part ids and global ids.
   */
  void multi_jagged_part(
    const RCP<const Environment> &env,
    RCP<const Comm<int> > &problemComm,
    double imbalance_tolerance,
    int num_teams,
    size_t num_global_parts,
    Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array,
    int recursion_depth,
    int coord_dim,
    mj_lno_t num_local_coords,
    mj_gno_t num_global_coords,
    Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos,
    // coordinates in MJ are LayoutLeft since Tpetra MultiVector gives LayoutLeft
    Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates,
    int num_weights_per_coord,
    Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights,
    Kokkos::View<mj_scalar_t**, device_t> & mj_weights,
    Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts,
    Kokkos::View<mj_part_t*, device_t> & result_assigned_part_ids,
    Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos);

  /*! \brief Sets the partitioning parameters.
   *  \param distribute_points_on_cut_lines_ whether points on the same cut
   *  coordinate may be distributed to different parts.
   *  \param max_concurrent_part_calculation_ how many parts can be
   *  calculated concurrently.
   *  \param check_migrate_avoid_migration_option_ whether to migrate=1,
   *  avoid migration=2, or leave the decision to MJ=0.
   *  \param minimum_migration_imbalance_ the minimum imbalance for which
   *  migration is considered.
   *  \param migration_type_ 0 aims for balanced parts, 1 for fewer messages.
   */
  void set_partitioning_parameters(
    bool distribute_points_on_cut_lines_,
    int max_concurrent_part_calculation_,
    int check_migrate_avoid_migration_option_,
    double minimum_migration_imbalance_,
    int migration_type_ = 0);

  /*! \brief Function call, if the part boxes are intended to be kept.
   */
  void set_to_keep_part_boxes();

  /*! \brief Returns the global bounding box: min/max coords of the
   *  global domain.
   */
  RCP<mj_partBox_t> get_global_box() const;

  /*! \brief Returns the boxes of the computed parts, if they were kept.
   */
  RCP<mj_partBoxVector_t> get_kept_boxes() const;

  RCP<mj_partBoxVector_t> compute_global_box_boundaries(
    RCP<mj_partBoxVector_t> &localPartBoxes) const;

  /*! \brief Special function for partitioning coordinates for the
   *  TaskMapper. Partitions the selected coordinates sequentially on the
   *  local process and writes the result in CSR format to output_xadj and
   *  the permutation view.
   */
  void sequential_task_partitioning(
    const RCP<const Environment> &env,
    mj_lno_t num_total_coords,
    mj_lno_t num_selected_coords,
    size_t num_target_part,
    int coord_dim,
    // coordinates in MJ are LayoutLeft since Tpetra MultiVector gives LayoutLeft
    Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
    Kokkos::View<mj_lno_t *, device_t> &
      initial_selected_coords_output_permutation,
    mj_lno_t *output_xadj,
    int recursion_depth_,
    const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array,
    bool partition_along_longest_dim,
    int num_ranks_per_node,
    bool divide_to_prime_first_,
    mj_part_t num_first_level_parts_ = 1,
    const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_
      = Kokkos::View<mj_part_t *, Kokkos::HostSpace>());

#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
  public:
#else
  private:
#endif

  /* \brief Allocates all required memory for the mj partitioning algorithm.
   */
  void allocate_set_work_memory();

  /* \brief compute global bounding box: min/max coords of global domain */
  void compute_global_box();

  // DOCWORK: Inconsistent use of ! for descriptive/brief commenting - decide.
  /* \brief Computes the local min/max coordinates and total weight of the
   * parts that are worked on concurrently.
   */
  void mj_get_local_min_max_coord_totW(
    mj_part_t current_work_part,
    mj_part_t current_concurrent_num_parts,
    Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords);

  /* \brief Reduces the local min/max coordinates and total weights into
   * their global counterparts.
   */
  void mj_get_global_min_max_coord_totW(
    mj_part_t current_concurrent_num_parts,
    Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
    Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total);

  /* \brief Initializes the cut coordinates and the target part weights.
   */
  void mj_get_initial_cut_coords_target_weights(
    mj_scalar_t min_coord,
    mj_scalar_t max_coord,
    mj_part_t num_cuts/*p-1*/ ,
    mj_scalar_t global_weight,
    Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
    Kokkos::View<mj_scalar_t *, device_t> & target_part_weights,
    std::vector <mj_part_t> *future_num_part_in_parts,
    std::vector <mj_part_t> *next_future_num_parts_in_parts,
    mj_part_t concurrent_current_part,
    mj_part_t obtained_part_index,
    mj_part_t num_target_first_level_parts = 1,
    const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist =
      Kokkos::View<mj_part_t *, Kokkos::HostSpace>());

  /* \brief Sets the initial (estimated) part assignments of the coordinates
   * in the given range.
   */
  void set_initial_coordinate_parts(
    mj_scalar_t &max_coordinate,
    mj_scalar_t &min_coordinate,
    mj_lno_t coordinate_begin_index,
    mj_lno_t coordinate_end_index,
    Kokkos::View<mj_lno_t *, device_t> &
      mj_current_coordinate_permutations,
    Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
    Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
    mj_part_t &partition_count);

  /* \brief Determines the cut lines along the current dimension
   * (1D partitioning).
   */
  void mj_1D_part(
    Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
    double imbalanceTolerance,
    mj_part_t current_work_part,
    mj_part_t current_concurrent_num_parts,
    Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
    mj_part_t total_incomplete_cut_count,
    Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
    Kokkos::View<size_t*, device_t> & view_total_reduction_size);

  void mj_1D_part_get_part_weights(
    mj_part_t current_concurrent_num_parts,
    mj_part_t current_work_part,
    Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
    int loop_count);

  void mj_combine_rightleft_and_weights(
    mj_part_t current_work_part,
    mj_part_t current_concurrent_num_parts);

  void mj_create_new_partitions(
    mj_part_t num_parts,
    mj_part_t current_concurrent_work_part,
    Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
    Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
    Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
    Kokkos::View<mj_lno_t *, device_t> & out_part_xadj);

  void mj_get_new_cut_coordinates(
    mj_part_t current_concurrent_num_parts,
    mj_part_t kk,
    const mj_part_t &num_cuts,
    const double &used_imbalance_tolerance,
    Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
    Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
    Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
    Kokkos::View<bool *, device_t> & current_cut_line_determined,
    Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
    Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
    Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
    Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
    Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
    Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
    Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
    Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
    Kokkos::View<mj_scalar_t *, device_t> &
      current_part_cut_line_weight_to_put_left,
    Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count);

  void get_processor_num_points_in_parts(
    mj_part_t num_procs,
    mj_part_t num_parts,
    mj_gno_t *&num_points_in_all_processor_parts);

  void fill_permutation_array(
    mj_part_t output_num_parts,
    mj_part_t num_parts);

  void create_consistent_chunks(
    mj_part_t num_parts,
    Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
    Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
    mj_lno_t coordinate_begin,
    mj_lno_t coordinate_end,
    Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
    Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
    int coordInd,
    bool longest_dim_part,
    uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted);

  void set_final_parts(
    mj_part_t current_num_parts,
    mj_part_t output_part_begin_index,
    RCP<mj_partBoxVector_t> &output_part_boxes,
    bool is_data_ever_migrated);
};

/*! \brief Default constructor.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
  typename mj_part_t, typename mj_node_t>
AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::AlgMJ():
  mj_env(), mj_problemComm(), comm(), imbalance_tolerance(0),
  recursion_depth(0), coord_dim(0),
  num_weights_per_coord(0), initial_num_loc_coords(0),
  initial_num_glob_coords(0),
  num_local_coords(0), num_global_coords(0),
  sEpsilon(std::numeric_limits<mj_scalar_t>::epsilon() * 100),
  distribute_points_on_cut_lines(true),
  max_concurrent_part_calculation(1),
  mj_run_as_rcb(false), mj_user_recursion_depth(0),
  mj_keep_part_boxes(false),
  check_migrate_avoid_migration_option(0), migration_type(0),
  minimum_migration_imbalance(0.30),
  num_first_level_parts(1),
  total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
  max_num_cut_along_dim(0),
  max_num_total_part_along_dim(0),
  total_dim_num_reduce_all(0),
  last_dim_num_part(0),
  mj_num_teams(0),
  num_global_parts(1),
  kept_boxes(), global_box(),
  myRank(0), myActualRank(0),
  divide_to_prime_first(false)
{
}

/*! \brief Special function for partitioning coordinates for the TaskMapper.
 *  Runs the sequential MJ algorithm on the local coordinates.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
  typename mj_part_t, typename mj_node_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
  sequential_task_partitioning(
  const RCP<const Environment> &env,
  mj_lno_t num_total_coords,
  mj_lno_t num_selected_coords,
  size_t num_target_part,
  int coord_dim_,
  // coordinates in MJ are LayoutLeft since Tpetra MultiVector gives LayoutLeft
  Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> &
    mj_coordinates_,
  Kokkos::View<mj_lno_t *, device_t> & initial_adjList_output_adjlist,
  mj_lno_t *output_xadj,
  int recursion_depth_,
  const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array_,
  bool partition_along_longest_dim,
  int num_ranks_per_node,
  bool divide_to_prime_first_,
  mj_part_t num_first_level_parts_,
  const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_)
{
  this->mj_env = env;
  const RCP<Comm<int> > commN;
  this->mj_problemComm = Teuchos::DefaultComm<int>::getDefaultSerialComm(commN);
  this->comm = Teuchos::rcp_const_cast<Comm<int> >(this->mj_problemComm);
  this->myActualRank = this->myRank = 1;

  this->divide_to_prime_first = divide_to_prime_first_;
  // weights are uniform for task mapping

  // parts are uniform for task mapping
  // as input indices.
  this->imbalance_tolerance = 0;
  this->num_global_parts = num_target_part;
  this->part_no_array = part_no_array_;
  this->recursion_depth = recursion_depth_;

  // If nonuniform first-level partitioning, the requested number of parts
  // and the requested distribution of elements for each part
  this->num_first_level_parts = num_first_level_parts_;

  this->first_level_distribution = first_level_distribution_;

  this->coord_dim = coord_dim_;
  this->num_local_coords = num_total_coords;

  this->num_global_coords = num_total_coords;
  this->mj_coordinates = mj_coordinates_;


  this->initial_mj_gnos =
    Kokkos::View<mj_gno_t*, device_t>("gids", this->num_local_coords);

  this->num_weights_per_coord = 0;

  this->mj_uniform_weights = Kokkos::View<bool*, Kokkos::HostSpace>(
    "uniform weights", 1);
  this->mj_uniform_weights(0) = true;

  this->mj_weights = Kokkos::View<mj_scalar_t**, device_t>
    ("weights", 1, 1);

  this->mj_uniform_parts =
    Kokkos::View<bool*, Kokkos::HostSpace>("uniform parts", 1);
  this->mj_uniform_parts(0) = true;

  this->set_part_specifications();

  this->allocate_set_work_memory();

  // Do single init
  auto local_part_xadj = this->part_xadj;
  Kokkos::parallel_for(
    Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
    KOKKOS_LAMBDA (int dummy) {
    local_part_xadj(0) = static_cast<mj_lno_t>(num_selected_coords);
  });

  Kokkos::deep_copy(coordinate_permutations, initial_adjList_output_adjlist);

  mj_part_t current_num_parts = 1;

  Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
    this->all_cut_coordinates;

  mj_part_t future_num_parts = this->total_num_part;

  std::vector<mj_part_t> *future_num_part_in_parts =
    new std::vector<mj_part_t>();
  std::vector<mj_part_t> *next_future_num_parts_in_parts =
    new std::vector<mj_part_t>();
  next_future_num_parts_in_parts->push_back(this->num_global_parts);
  RCP<mj_partBoxVector_t> t1;
  RCP<mj_partBoxVector_t> t2;

  std::vector <uSignedSortItem<int, mj_scalar_t, char>>
    coord_dimension_range_sorted(this->coord_dim);
  uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted =
    &(coord_dimension_range_sorted[0]);
  std::vector <mj_scalar_t> coord_dim_mins(this->coord_dim);
  std::vector <mj_scalar_t> coord_dim_maxs(this->coord_dim);

  // Need a device counter - how best to allocate?
  // Putting this allocation in the loops is very costly so moved out here.
  Kokkos::View<mj_part_t*, device_t>
    view_rectilinear_cut_count("view_rectilinear_cut_count", 1);
  Kokkos::View<size_t*, device_t>
    view_total_reduction_size("view_total_reduction_size", 1);

  for(int rd = 0; rd < this->recursion_depth; ++rd) {
    // next_future_num_parts_in_parts will be as the size of outnumParts,
    // and it will hold how many more parts each output part
    // should be divided into. this array will also be used to determine
    // the weight ratios of the parts.
    // swap the arrays to use them iteratively.
    std::vector<mj_part_t> *tmpPartVect = future_num_part_in_parts;
    future_num_part_in_parts = next_future_num_parts_in_parts;
    next_future_num_parts_in_parts = tmpPartVect;

    // clear the next_future_num_parts_in_parts array as
    // getPartitionArrays expects it to be empty.
    next_future_num_parts_in_parts->clear();

    // returns the total number of output parts for this dimension partitioning.
    mj_part_t output_part_count_in_dimension =
      this->update_part_num_arrays(
        future_num_part_in_parts,
        next_future_num_parts_in_parts,
        future_num_parts,
        current_num_parts,
        rd,
        t1,
        t2, num_ranks_per_node);

    // if the number of obtained parts is equal to the current number of
    // parts, skip this dimension. For example, this happens when 1 is
    // given in the input part array: P=4,5,1,2.
    if(output_part_count_in_dimension == current_num_parts) {
      tmpPartVect = future_num_part_in_parts;
      future_num_part_in_parts = next_future_num_parts_in_parts;
      next_future_num_parts_in_parts = tmpPartVect;
      continue;
    }

    // convert rd to string to be used for debugging purposes.
    std::string istring = std::to_string(rd);

    // alloc memory to point to the indices
    // of the parts in the permutation array.
    this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
      "new part xadj", output_part_count_in_dimension);

    // the index where in the outtotalCounts will be written.

    mj_part_t output_part_index = 0;

    // whatever is written to outTotalCounts will be added with previousEnd
    // so that the points will be shifted.
    mj_part_t output_coordinate_end_index = 0;

    mj_part_t current_work_part = 0;
    mj_part_t current_concurrent_num_parts = 1;

    mj_part_t obtained_part_index = 0;

    // get the coordinate axis along which the partitioning will be done.
    int coordInd = rd % this->coord_dim;

    Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
      Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);

    auto host_process_local_min_max_coord_total_weight =
      Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
    auto host_global_min_max_coord_total_weight =
      Kokkos::create_mirror_view(global_min_max_coord_total_weight);

    // run for all available parts.
    for(; current_work_part < current_num_parts;
      current_work_part += current_concurrent_num_parts) {

      mj_part_t actual_work_part_count = 0;

      // initialization for 1D partitioning.
      // get the min and max coordinates of each part
      // together with the part weights of each part.
      for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
        mj_part_t current_work_part_in_concurrent_parts =
          current_work_part + kk;

        // if this part won't be partitioned any further,
        // don't do any work for this part.
        mj_part_t partition_count = host_num_partitioning_in_current_dim(
          current_work_part_in_concurrent_parts);
        if(partition_count == 1) {
          continue;
        }
        ++actual_work_part_count;
        if(partition_along_longest_dim) {
          auto local_process_local_min_max_coord_total_weight =
            this->process_local_min_max_coord_total_weight;
          for(int coord_traverse_ind = 0;
            coord_traverse_ind < this->coord_dim; ++coord_traverse_ind) {

            Kokkos::View<mj_scalar_t *, device_t> coords =
              Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coord_traverse_ind);

            this->mj_get_local_min_max_coord_totW(
              current_work_part,
              current_concurrent_num_parts,
              coords);

            coord_dimension_range_sorted[coord_traverse_ind].id =
              coord_traverse_ind;
            coord_dimension_range_sorted[coord_traverse_ind].signbit = 1;

            Kokkos::deep_copy(host_process_local_min_max_coord_total_weight,
              process_local_min_max_coord_total_weight);

            coord_dim_mins[coord_traverse_ind] =
              host_process_local_min_max_coord_total_weight(kk);
            coord_dim_maxs[coord_traverse_ind] =
              host_process_local_min_max_coord_total_weight(
                kk + current_concurrent_num_parts);
            coord_dimension_range_sorted[coord_traverse_ind].val =
              host_process_local_min_max_coord_total_weight(
                kk + current_concurrent_num_parts) -
              host_process_local_min_max_coord_total_weight(kk);
          }

          // pick the dimension with the largest range to partition along.
          uqSignsort(this->coord_dim, p_coord_dimension_range_sorted);
          coordInd = p_coord_dimension_range_sorted[this->coord_dim - 1].id;
          auto set_min = coord_dim_mins[coordInd];
          auto set_max = coord_dim_maxs[coordInd];
          Kokkos::parallel_for(
            Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
            (0, 1), KOKKOS_LAMBDA (int dummy) {
            local_process_local_min_max_coord_total_weight(kk) = set_min;
            local_process_local_min_max_coord_total_weight(
              kk + current_concurrent_num_parts) = set_max;
          });

          mj_current_dim_coords =
            Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
        }
        else {
          Kokkos::View<mj_scalar_t *, device_t> coords =
            Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
          this->mj_get_local_min_max_coord_totW(
            current_work_part,
            current_concurrent_num_parts,
            coords);
        }
      }

      // 1D partitioning
      if(actual_work_part_count > 0) {
        // obtain the global min/max of the part.
        this->mj_get_global_min_max_coord_totW(
          current_concurrent_num_parts,
          this->process_local_min_max_coord_total_weight,
          this->global_min_max_coord_total_weight);

        // update host copy
        Kokkos::deep_copy(host_global_min_max_coord_total_weight,
          global_min_max_coord_total_weight);

        // represents the total number of cutlines
        // whose coordinate should be determined.
        mj_part_t total_incomplete_cut_count = 0;

        // Compute weight ratios for parts & cuts:
        // e.g., 0.25  0.25  0.5   0.5   0.75  0.75  1.0
        //       part0 cut0  part1 cut1  part2 cut2  part3
        mj_part_t concurrent_part_cut_shift = 0;
        mj_part_t concurrent_part_part_shift = 0;
        for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
          mj_scalar_t min_coordinate =
            host_global_min_max_coord_total_weight(kk);
          mj_scalar_t max_coordinate = host_global_min_max_coord_total_weight(
            kk + current_concurrent_num_parts);
          mj_scalar_t global_total_weight = host_global_min_max_coord_total_weight(
            kk + 2*current_concurrent_num_parts);

          mj_part_t concurrent_current_part_index = current_work_part + kk;

          mj_part_t partition_count = host_num_partitioning_in_current_dim(
            concurrent_current_part_index);

          Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
            Kokkos::subview(current_cut_coordinates,
              std::pair<mj_lno_t, mj_lno_t>(
                concurrent_part_cut_shift,
                current_cut_coordinates.size()));
          Kokkos::View<mj_scalar_t *, device_t>
            current_target_part_weights =
            Kokkos::subview(target_part_weights,
              std::pair<mj_lno_t, mj_lno_t>(
                concurrent_part_part_shift,
                target_part_weights.size()));

          // shift the usedCutCoordinate array by the number of cuts.
          concurrent_part_cut_shift += partition_count - 1;
          // shift the partRatio array by the number of parts.
          concurrent_part_part_shift += partition_count;
          // calculate only if the part is not empty,
          // and the part will be further partitioned.
          if(partition_count > 1 && min_coordinate <= max_coordinate) {
            // increase the incomplete count by the number of cuts of the
            // current part's cut line number.
            total_incomplete_cut_count += partition_count - 1;

            this->incomplete_cut_count(kk) = partition_count - 1;

            // When num_first_level_parts != 1 we have
            // nonuniform partitioning on the first level, providing
            // the requested number of parts (num_first_level_parts) and
            // the requested distribution in parts (first_level_distribution).

            // Get the target part weights given a desired distribution.
            this->mj_get_initial_cut_coords_target_weights(
              min_coordinate,
              max_coordinate,
              partition_count - 1,
              global_total_weight,
              usedCutCoordinate,
              current_target_part_weights,
              future_num_part_in_parts,
              next_future_num_parts_in_parts,
              concurrent_current_part_index,
              obtained_part_index,
              rd == 0 ? this->num_first_level_parts : 1,
              this->first_level_distribution);

            mj_lno_t coordinate_end_index =
              host_part_xadj(concurrent_current_part_index);
            mj_lno_t coordinate_begin_index =
              (concurrent_current_part_index==0) ? 0 :
                host_part_xadj[concurrent_current_part_index - 1];

            // get the initial estimated part assignments of the coordinates.
            this->set_initial_coordinate_parts(
              max_coordinate,
              min_coordinate,
              coordinate_begin_index, coordinate_end_index,
              this->coordinate_permutations,
              mj_current_dim_coords,
              this->assigned_part_ids,
              partition_count);
          }
          else {
            // e.g., if we have fewer coordinates than parts, we don't need
            // to do the next dimension.
            this->incomplete_cut_count(kk) = 0;
          }
          obtained_part_index += partition_count;
        }

        // used imbalance; it is always 0, as it is difficult
        // to estimate a range.
        double used_imbalance = 0;

        // Determine the cut lines for k parts here.
        this->mj_env->timerStart(MACRO_TIMERS,
          mj_timer_base_string + "mj_1D_part()");

        this->mj_1D_part(
          mj_current_dim_coords,
          used_imbalance,
          current_work_part,
          current_concurrent_num_parts,
          current_cut_coordinates,
          total_incomplete_cut_count,
          view_rectilinear_cut_count,
          view_total_reduction_size);

        this->mj_env->timerStop(MACRO_TIMERS,
          mj_timer_base_string + "mj_1D_part()");
      }
      else {
        obtained_part_index += current_concurrent_num_parts;
      }
      // create part chunks
      {
        mj_part_t output_array_shift = 0;
        mj_part_t cut_shift = 0;
        size_t tlr_shift = 0;
        size_t partweight_array_shift = 0;

        for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
          mj_part_t current_concurrent_work_part = current_work_part + kk;

          mj_part_t num_parts = host_num_partitioning_in_current_dim(
            current_concurrent_work_part);

          // if the part is empty, skip the part.
          int coordinateA_bigger_than_coordinateB =
            host_global_min_max_coord_total_weight(kk) >
            host_global_min_max_coord_total_weight(
              kk + current_concurrent_num_parts);

          if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
            // we still need to write the begin and end point of the empty
            // part. simply set it to zero; the array indices will be
            // shifted later.
            auto local_new_part_xadj = this->new_part_xadj;
            Kokkos::parallel_for(
              Kokkos::RangePolicy<typename mj_node_t::execution_space,
                mj_part_t> (0, num_parts), KOKKOS_LAMBDA(mj_part_t jj) {
              local_new_part_xadj(
                output_part_index + output_array_shift + jj) = 0;
            });

            cut_shift += num_parts - 1;
            tlr_shift += (4 *(num_parts - 1) + 1);
            output_array_shift += num_parts;
            partweight_array_shift += (2 * (num_parts - 1) + 1);
            continue;
          }
          mj_lno_t coordinate_end =
            host_part_xadj(current_concurrent_work_part);
          mj_lno_t coordinate_begin =
            current_concurrent_work_part==0 ? 0 :
              host_part_xadj(current_concurrent_work_part-1);

          Kokkos::View<mj_scalar_t *, device_t>
            current_concurrent_cut_coordinate =
            Kokkos::subview(current_cut_coordinates,
              std::pair<mj_lno_t, mj_lno_t>(
                cut_shift,
                current_cut_coordinates.size()));
          Kokkos::View<mj_scalar_t *, device_t>
            used_local_cut_line_weight_to_left =
            Kokkos::subview(process_cut_line_weight_to_put_left,
              std::pair<mj_lno_t, mj_lno_t>(
                cut_shift,
                process_cut_line_weight_to_put_left.size()));

          this->thread_part_weight_work =
            Kokkos::subview(
              this->thread_part_weights,
              std::pair<mj_lno_t, mj_lno_t>(
                partweight_array_shift,
                this->thread_part_weights.size()));

          if(num_parts > 1) {
            // Rewrite the indices based on the computed cuts.
            Kokkos::View<mj_lno_t *, device_t> subview_new_part_xadj =
              Kokkos::subview(this->new_part_xadj,
                std::pair<mj_lno_t, mj_lno_t>(
                  output_part_index + output_array_shift,
                  this->new_part_xadj.size()));

            this->create_consistent_chunks(
              num_parts,
              mj_current_dim_coords,
              current_concurrent_cut_coordinate,
              coordinate_begin,
              coordinate_end,
              used_local_cut_line_weight_to_left,
              subview_new_part_xadj,
              coordInd,
              partition_along_longest_dim,
              p_coord_dimension_range_sorted);
          }
          else {
            // if this part is partitioned into 1 part, then just copy
            // the old values.
            mj_lno_t part_size = coordinate_end - coordinate_begin;

            auto local_new_part_xadj = this->new_part_xadj;
            Kokkos::parallel_for(
              Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
              (0, 1), KOKKOS_LAMBDA (int dummy) {
              local_new_part_xadj(output_part_index + output_array_shift)
                = part_size;
            });

            auto subview_new_coordinate_permutations =
              Kokkos::subview(this->new_coordinate_permutations,
                std::pair<mj_lno_t, mj_lno_t>(
                  coordinate_begin,
                  coordinate_begin + part_size));
            auto subview_coordinate_permutations =
              Kokkos::subview(this->coordinate_permutations,
                std::pair<mj_lno_t, mj_lno_t>(
                  coordinate_begin,
                  coordinate_begin + part_size));
            Kokkos::deep_copy(subview_new_coordinate_permutations,
              subview_coordinate_permutations);
          }

          cut_shift += num_parts - 1;
          tlr_shift += (4 *(num_parts - 1) + 1);
          output_array_shift += num_parts;
          partweight_array_shift += (2 * (num_parts - 1) + 1);
        }

        // shift cut coordinates so that all cut coordinates are stored.
        // current_cut_coordinates += cutShift;

        // getChunks from coordinates partitioned the parts and
        // wrote the indices as if there were a single part.
        // now we need to shift the beginning indices.
        for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
          mj_part_t num_parts =
            host_num_partitioning_in_current_dim(current_work_part + kk);
          auto local_new_part_xadj = this->new_part_xadj;
          auto local_mj_current_dim_coords = mj_current_dim_coords;
          auto local_new_coordinate_permutations =
            new_coordinate_permutations;
          Kokkos::parallel_for(
            Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t> (
            0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
            // shift it by previousCount
            local_new_part_xadj(output_part_index+ii) +=
              output_coordinate_end_index;

            if(ii % 2 == 1) {
              mj_lno_t coordinate_end =
                local_new_part_xadj(output_part_index+ii);
              mj_lno_t coordinate_begin =
                local_new_part_xadj(output_part_index);

              for(mj_lno_t task_traverse = coordinate_begin;
                task_traverse < coordinate_end; ++task_traverse) {
                mj_lno_t l = local_new_coordinate_permutations(task_traverse);
                // flip the sign of the coordinates in every other part so
                // the next dimension is traversed in the reverse direction,
                // giving a Z/S-like part ordering.
                local_mj_current_dim_coords(l) = -local_mj_current_dim_coords(l);
              }
            }
          });

          // increase the previous count by the current end.
          mj_part_t get_single;
          Kokkos::parallel_reduce("Read new_part_xadj",
            Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
            KOKKOS_LAMBDA(int dummy, mj_part_t & set_single) {
            set_single = local_new_part_xadj(output_part_index + num_parts - 1);
          }, get_single);

          output_coordinate_end_index = get_single;
          // increase the current out.
          output_part_index += num_parts;
        }
      }
    }
2061
2062 // end of this partitioning dimension
2063 // set the current num parts for next dim partitioning
2064 current_num_parts = output_part_count_in_dimension;
2065
2066 //swap the coordinate permutations for the next dimension.
2067 Kokkos::View<mj_lno_t *, device_t> tmp = this->coordinate_permutations;
2068 this->coordinate_permutations = this->new_coordinate_permutations;
2069 this->new_coordinate_permutations = tmp;
2070
2071 this->part_xadj = this->new_part_xadj;
2072 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
2073 Kokkos::deep_copy(host_part_xadj, part_xadj); // keep in sync
2074 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>("empty", 0);
2075 }
2076
2077 Kokkos::deep_copy(initial_adjList_output_adjlist, coordinate_permutations);
2078
2079 // Return output_xadj in CSR format
2080 output_xadj[0] = 0;
2081 for(size_t i = 0; i < this->num_global_parts; ++i) {
2082 output_xadj[i+1] = host_part_xadj(i);
2083 }
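  // Illustrative example of the CSR arrays built above (assuming three
  // final parts holding 4, 2 and 5 coordinates): host_part_xadj = {4, 6, 11},
  // so output_xadj = {0, 4, 6, 11} and part p owns
  // initial_adjList_output_adjlist[output_xadj[p] .. output_xadj[p+1]-1].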
2084
2085 delete future_num_part_in_parts;
2086 delete next_future_num_parts_in_parts;
2087}
2088
2092template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2093 typename mj_part_t, typename mj_node_t>
2094RCP<typename AlgMJ
2095 <mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,mj_node_t>::mj_partBox_t>
2096AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,mj_node_t>::
2097 get_global_box() const
2098{
2099 return this->global_box;
2100}
2101
2104template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2105 typename mj_part_t, typename mj_node_t>
2106void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2107 mj_node_t>::set_to_keep_part_boxes()
2108{
2109 this->mj_keep_part_boxes = true;
2110}
2111
2112/* \brief Either the mj array (part_no_array) or num_global_parts should be
2113 * provided in the input. part_no_array takes
2114 * precedence if both are provided.
2115 * Depending on these parameters, the total cut/part count, the
2116 * maximum part/cut count along a dimension, the estimated number of
2117 * reduceAlls, and the number of parts before the last dimension are calculated.
2118 * */
2119template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2120 typename mj_part_t, typename mj_node_t>
2121void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2122 set_part_specifications()
2123{
2124 this->total_num_cut = 0; // total number of cuts
2125 this->total_num_part = 1; // total number of parts
2126 this->max_num_part_along_dim = 0; // maximum part count along a dimension.
2127 this->total_dim_num_reduce_all = 0; // estimated number of reduceAlls.
2128 this->last_dim_num_part = 1; // max number of parts that might occur
2129 // during the partitioning before the
2130 // last partitioning dimension.
2131 this->max_num_cut_along_dim = 0;
2132 this->max_num_total_part_along_dim = 0;
2133
2134 if(this->part_no_array.size()) {
2135 auto local_recursion_depth = this->recursion_depth;
2136
2137 this->total_dim_num_reduce_all =
2138 this->total_num_part * this->recursion_depth;
2139
2140 this->total_num_part = 1;
2141 for(int i = 0; i < local_recursion_depth; ++i) {
2142 this->total_num_part *= this->part_no_array(i);
2143 }
2144
2145 mj_part_t track_max = 0;
2146 for(int i = 0; i < local_recursion_depth; ++i) {
2147 if(part_no_array(i) > track_max) {
2148 track_max = this->part_no_array(i);
2149 }
2150 }
2151
2152 this->last_dim_num_part = this->total_num_part /
2153 this->part_no_array(local_recursion_depth-1);
2154
2155 this->max_num_part_along_dim = track_max;
2156 this->num_global_parts = this->total_num_part;
2157 } else {
2158 mj_part_t future_num_parts = this->num_global_parts;
2159
2160 // If using nonuniform first level partitioning, the initial value of
2161 // max_num_part_along_dim is num_first_level_parts.
2162 if (this->first_level_distribution.size() != 0 &&
2163 this->num_first_level_parts > 1) {
2164 this->max_num_part_along_dim = this->num_first_level_parts;
2165 }
2166
2167 // we need to calculate the part numbers now, to determine
2168 // the maximum along the dimensions.
2169 for(int rd = 0; rd < this->recursion_depth; ++rd) {
2170 mj_part_t maxNoPartAlongI = 0;
2171 mj_part_t nfutureNumParts = 0;
2172
2173 // Nonuniform first level partitioning sets part specifications for
2174 // rd == 0 only, given requested num of parts and distribution in parts
2175 // for the first level.
2176 if (rd == 0 &&
2177 this->first_level_distribution.size() != 0 &&
2178 this->num_first_level_parts > 1) {
2179
2180 maxNoPartAlongI = this->num_first_level_parts;
2181 this->max_num_part_along_dim = this->num_first_level_parts;
2182
2183 mj_part_t sum_first_level_dist = 0;
2184 mj_part_t max_part = 0;
2185
2186 // Cumulative sum of distribution of parts and size of largest part
2187 for (int i = 0; i < this->num_first_level_parts; ++i) {
2188 sum_first_level_dist += this->first_level_distribution(i);
2189 if (this->first_level_distribution(i) > max_part)
2190 max_part = this->first_level_distribution(i);
2191 }
2192
2193 // Total parts in largest nonuniform superpart from
2194 // first level partitioning
2195 nfutureNumParts =
2196 this->num_global_parts * max_part / sum_first_level_dist;
2197 }
2198 // Standard uniform partitioning this level
2199 else {
2200 maxNoPartAlongI = this->get_part_count(future_num_parts,
2201 1.0f / (this->recursion_depth - rd));
2202 if (maxNoPartAlongI > this->max_num_part_along_dim)
2203 this->max_num_part_along_dim = maxNoPartAlongI;
2204 nfutureNumParts = future_num_parts / maxNoPartAlongI;
2205 if (future_num_parts % maxNoPartAlongI) {
2206 ++nfutureNumParts;
2207 }
2208 }
2209 future_num_parts = nfutureNumParts;
2210 }
2211 this->total_num_part = this->num_global_parts;
2212
2213 if(this->divide_to_prime_first) {
2214 this->total_dim_num_reduce_all = this->num_global_parts * 2;
2215 this->last_dim_num_part = this->num_global_parts;
2216 }
2217 else {
2218 // estimate the reduceAll count here;
2219 // rather than the exact count,
2220 // we compute an upper bound.
2221 size_t p = 1;
2222 for(int i = 0; i < this->recursion_depth; ++i) {
2223 this->total_dim_num_reduce_all += p;
2224 p *= this->max_num_part_along_dim;
2225 }
2226
2227 if(p / this->max_num_part_along_dim > this->num_global_parts) {
2228 this->last_dim_num_part = this->num_global_parts;
2229 }
2230 else {
2231 this->last_dim_num_part = p / this->max_num_part_along_dim;
2232 }
2233 }
2234 }
2235
2236 this->total_num_cut = this->total_num_part - 1;
2237 this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
2238 this->max_num_total_part_along_dim = this->max_num_part_along_dim +
2239 size_t(this->max_num_cut_along_dim);
2240 // maxPartNo is P, maxCutNo = P-1, matTotalPartcount = 2P-1
2241
2242 // refine the concurrent part count, if it is given bigger than the maximum
2243 // possible part count.
2244 if(this->max_concurrent_part_calculation > this->last_dim_num_part) {
2245 if(this->mj_problemComm->getRank() == 0) {
2246 std::cerr << "Warning: Concurrent part count (" <<
2247 this->max_concurrent_part_calculation <<
2248 ") has been set bigger than maximum amount that can be used." <<
2249 " Setting to:" << this->last_dim_num_part << "." << std::endl;
2250 }
2251 this->max_concurrent_part_calculation = this->last_dim_num_part;
2252 }
2253}
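/* Worked example for the uniform branch above (no part_no_array):
 * num_global_parts = 8 with recursion_depth = 3 yields
 * get_part_count(8, 1/3) = 2, then 2 and 2 again, so
 *   max_num_part_along_dim = 2, total_num_part = 8, total_num_cut = 7,
 *   max_num_cut_along_dim = 1, max_num_total_part_along_dim = 3.
 * Without divide_to_prime_first the reduceAll estimate accumulates
 * p = 1, 2, 4 (total_dim_num_reduce_all = 7) and ends with p = 8, so
 * last_dim_num_part = p / max_num_part_along_dim = 4.
 */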
2254
2255/* \brief Tries to determine the part count for the current dimension,
2256 * aiming to make the partitioning as square as possible.
2257 * \param num_total_future how many more parts are still required.
2258 * \param root the root to take (the reciprocal of the remaining recursion depth).
2259 */
2260template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2261 typename mj_part_t, typename mj_node_t>
2262inline mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2263 get_part_count(mj_part_t num_total_future, double root)
2264{
2265 double fp = pow(num_total_future, root);
2266 mj_part_t ip = mj_part_t(fp);
2267 if(fp - ip < std::numeric_limits<float>::epsilon() * 100) {
2268 return ip;
2269 }
2270 else {
2271 return ip + 1;
2272 }
2273}
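/* For example, get_part_count(12, 1.0/2) computes pow(12, 0.5) ~= 3.46 and
 * rounds up to 4 parts, while get_part_count(16, 1.0/2) returns exactly 4.
 * The epsilon guard keeps a pow() result that lands barely above an integer
 * (e.g. 4.0000001 for a nominal 4) from being bumped up to 5.
 */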
2274
2275/* \brief Returns how many parts will be obtained after this dimension's
2276 * partitioning. Writes how many parts each current part will be partitioned
2277 * into in this dimension to the device_num_partitioning_in_current_dim
2278 * view, and writes how many total future parts each obtained part will be
2279 * partitioned into to the next_future_num_parts_in_parts vector. If part
2280 * boxes are kept, initializes each output part box from its ancestor.
2281 * \param future_num_part_in_parts: input, how many future parts each current
2282 * part will be partitioned into.
2283 * \param next_future_num_parts_in_parts: output, how many future parts each
2284 * obtained part will be partitioned into.
2285 * \param future_num_parts: output, max number of future parts that will be
2286 * obtained from a single part.
2287 * \param current_num_parts: input, how many parts are there currently.
2288 * \param current_iteration: input, current dimension iteration number.
2289 * \param input_part_boxes: input, if boxes are kept, current boxes.
2290 * \param output_part_boxes: output, if boxes are kept, the initial box
2291 * boundaries for obtained parts.
2292 * \param atomic_part_count input, granularity to preserve; future part counts are kept as multiples of it.
2293 */
2294template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2295 typename mj_part_t, typename mj_node_t>
2296mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2297 update_part_num_arrays(
2298 std::vector<mj_part_t> *future_num_part_in_parts,
2299 std::vector<mj_part_t> *next_future_num_parts_in_parts,
2300 mj_part_t &future_num_parts,
2301 mj_part_t current_num_parts,
2302 int current_iteration,
2303 RCP<mj_partBoxVector_t> input_part_boxes,
2304 RCP<mj_partBoxVector_t> output_part_boxes,
2305 mj_part_t atomic_part_count)
2306{
2307 std::vector<mj_part_t> num_partitioning_in_current_dim;
2308
2309 // how many parts that will be obtained after this dimension.
2310 mj_part_t output_num_parts = 0;
2311 if(this->part_no_array.size()) {
2312 // when the partNo array is provided as input,
2313 // each current partition will be partitioned into the same number of parts.
2314 // we don't need to use the future_num_part_in_parts vector in this case.
2315 mj_part_t current_part_no_array =
2316 this->part_no_array(current_iteration);
2317
2318 if(current_part_no_array < 1) {
2319 std::cout << "Current recursive iteration: " << current_iteration <<
2320 " part_no_array[" << current_iteration << "] is given as:" <<
2321 current_part_no_array << std::endl;
2322 std::terminate();
2323 }
2324 if(current_part_no_array == 1) {
2325 return current_num_parts;
2326 }
2327
2328 // If using part_no_array, ensure compatibility with num_first_level_parts.
2329 if (this->first_level_distribution.size() != 0 &&
2330 current_iteration == 0 &&
2331 current_part_no_array != this->num_first_level_parts) {
2332 std::cout << "Current recursive iteration: " << current_iteration
2333 << " part_no_array[" << current_iteration << "] is given as: " <<
2334 current_part_no_array << " and contradicts num_first_level_parts: " <<
2335 this->num_first_level_parts << std::endl;
2336 std::terminate();
2337 }
2338
2339 for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
2340 num_partitioning_in_current_dim.push_back(current_part_no_array);
2341 }
2342
2343/*
2344 std::cout << "\n\nme: " << this->myRank << " current_iteration: " <<
2345 current_iteration << " current_num_parts: " <<
2346 current_num_parts << "\n\n";
2347
2348 std::cout << "\n\nnum_partitioning_in_current_dim[0]: " <<
2349 num_partitioning_in_current_dim[0] << "\n\n";
2350
2351 std::cout << "\n\nfuture_num_parts: " << future_num_parts
2352 << " num_partitioning_in_current_dim[0]: " <<
2353 num_partitioning_in_current_dim[0] << " " <<
2354 future_num_parts / num_partitioning_in_current_dim[0] << "\n\n";
2355*/
2356
2357 future_num_parts /= num_partitioning_in_current_dim[0];
2358 output_num_parts = current_num_parts *
2359 num_partitioning_in_current_dim[0];
2360 if(this->mj_keep_part_boxes) {
2361 for(mj_part_t k = 0; k < current_num_parts; ++k) {
2362 //initialized the output boxes as its ancestor.
2363 for(mj_part_t j = 0; j <
2364 num_partitioning_in_current_dim[0]; ++j) {
2365 output_part_boxes->push_back((*input_part_boxes)[k]);
2366 }
2367 }
2368 }
2369
2370 // set how many more parts each part will be divided into.
2371 // this is obvious when the partNo array is provided as input.
2372 // however, fill this so weights will be calculated according to this array.
2373 for(mj_part_t ii = 0; ii < output_num_parts; ++ii) {
2374 next_future_num_parts_in_parts->push_back(future_num_parts);
2375 }
2376 }
2377 else {
2378 // if partNo array is not provided as input, future_num_part_in_parts
2379 // holds how many parts each part should be divided. Initially it holds a
2380 // single number equal to the total number of global parts.
2381
2382 // calculate the future_num_parts from beginning,
2383 // since each part might be divided into different number of parts.
2384 future_num_parts = 1;
2385
2386 // cout << "i:" << i << std::endl;
2387 for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
2388 // get how many parts a part should be divided.
2389 mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];
2390
2391 // get the ideal number of parts, close to the
2392 // (recursion_depth - current_iteration)th root of future_num_parts_of_part_ii.
2393 mj_part_t num_partitions_in_current_dim =
2394 this->get_part_count(future_num_parts_of_part_ii,
2395 1.0 / (this->recursion_depth - current_iteration)
2396 );
2397 if(num_partitions_in_current_dim > this->max_num_part_along_dim) {
2398 std::cerr << "ERROR: maxPartNo calculation is wrong."
2399 " num_partitions_in_current_dim: "
2400 << num_partitions_in_current_dim << " this->max_num_part_along_dim: "
2401 << this->max_num_part_along_dim <<
2402 " this->recursion_depth: " << this->recursion_depth <<
2403 " current_iteration:" << current_iteration <<
2404 " future_num_parts_of_part_ii: " << future_num_parts_of_part_ii <<
2405 " might need to fix max part no calculation for "
2406 "largest_prime_first partitioning." <<
2407 std::endl;
2408 std::terminate();
2409 }
2410 // add this number to vector_num_partitioning_in_current_dim vector.
2411 // num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2412 // mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2413
2414 // Update part num arrays when on current_iteration == 0 and
2415 // using nonuniform first level partitioning
2416 // with requested num parts (num_first_level_parts) and
2417 // a requested distribution in parts (first_level_distribution).
2418 if (current_iteration == 0 &&
2419 this->first_level_distribution.size() != 0 &&
2420 this->num_first_level_parts > 1) {
2421 // Only 1 current part to begin and partitions into
2422 // num_first_level_parts many parts
2423 num_partitioning_in_current_dim.push_back(this->num_first_level_parts);
2424
2425 // The output number of parts from first level partitioning
2426 output_num_parts = this->num_first_level_parts;
2427
2428 // Remaining parts left to partition for all future levels
2429 future_num_parts /= this->num_first_level_parts;
2430
2431 mj_part_t max_part = 0;
2432 mj_part_t sum_first_level_dist = 0;
2433
2434 // Cumulative sum of distribution of first level parts
2435 // and size of largest first level part
2436 for (int i = 0; i < this->num_first_level_parts; ++i) {
2437 sum_first_level_dist += this->first_level_distribution(i);
2438
2439 if (this->first_level_distribution(i) > max_part)
2440 max_part = this->first_level_distribution(i);
2441 }
2442
2443 // Maximum # of remaining parts left to partition for all future levels
2444 future_num_parts = this->num_global_parts * max_part / sum_first_level_dist;
2445
2446 // Number of parts remaining left to partition for each future_part
2447 // The sum must exactly equal global_num_parts
2448 for (int i = 0; i < this->num_first_level_parts; ++i) {
2449 next_future_num_parts_in_parts->push_back(this->first_level_distribution(i) *
2450 this->num_global_parts / sum_first_level_dist);
2451 }
2452 }
2453 else if (this->divide_to_prime_first) {
2454 // Add this number to num_partitioning_in_current_dim vector.
2455 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2456
2457 mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2458
2459 //increase the output number of parts.
2460 output_num_parts += num_partitions_in_current_dim;
2461
2462 if (future_num_parts_of_part_ii == atomic_part_count ||
2463 future_num_parts_of_part_ii % atomic_part_count != 0) {
2464 atomic_part_count = 1;
2465 }
2466
2467 largest_prime_factor =
2468 this->find_largest_prime_factor(future_num_parts_of_part_ii / atomic_part_count);
2469
2470 // We divide into num_partitions_in_current_dim parts, but adjust the
2471 // weights based on the largest prime: if num_partitions_in_current_dim = 2
2472 // and the largest prime = 5, we divide into 2 parts with weights 3x and 2x.
2473 // if the largest prime is less than the part count, we use the part count
2474 // so that we divide uniformly.
2475 if (largest_prime_factor < num_partitions_in_current_dim) {
2476 largest_prime_factor = num_partitions_in_current_dim;
2477 }
2478 //ideal number of future partitions for each part.
2479 mj_part_t ideal_num_future_parts_in_part =
2480 (future_num_parts_of_part_ii / atomic_part_count) / largest_prime_factor;
2481 //if num_partitions_in_current_dim = 2, largest prime = 5 then ideal weight is 2x
2482 mj_part_t ideal_prime_scale = largest_prime_factor / num_partitions_in_current_dim;
2483
2484/*
2485 std::cout << "\ncurrent num part: " << ii
2486 << " largest_prime_factor: " << largest_prime_factor
2487 << " To Partition: " << future_num_parts_of_part_ii << "\n\n";
2488*/
2489
2490 for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2491 //if num_partitions_in_current_dim = 2, largest prime = 5 then ideal weight is 2x
2492 mj_part_t my_ideal_primescale = ideal_prime_scale;
2493 //leftover weights. The left side is adjusted to 3x; the right side stays at 2x.
2494 if (iii < (largest_prime_factor) % num_partitions_in_current_dim) {
2495 ++my_ideal_primescale;
2496 }
2497 //scale by the prime unit 'x'.
2498 mj_part_t num_future_parts_for_part_iii =
2499 ideal_num_future_parts_in_part * my_ideal_primescale;
2500
2501 //if there is a remainder in the part increase the part weight.
2502 if (iii < (future_num_parts_of_part_ii / atomic_part_count) % largest_prime_factor) {
2503 //if not uniform, add 1 for the extra parts.
2504 ++num_future_parts_for_part_iii;
2505 }
2506
2507 next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);
2508
2509 //if part boxes are stored, initialize the box of the parts as the ancestor.
2510 if (this->mj_keep_part_boxes) {
2511 output_part_boxes->push_back((*input_part_boxes)[ii]);
2512 }
2513
2514 //set num future_num_parts to maximum in this part.
2515 if (num_future_parts_for_part_iii > future_num_parts)
2516 future_num_parts = num_future_parts_for_part_iii;
2517
2518 }
2519 }
2520 else {
2521 // Add this number to num_partitioning_in_current_dim vector.
2522 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2523
2524 //increase the output number of parts.
2525 output_num_parts += num_partitions_in_current_dim;
2526
2527 if((future_num_parts_of_part_ii == atomic_part_count) ||
2528 (future_num_parts_of_part_ii % atomic_part_count != 0)) {
2529 atomic_part_count = 1;
2530 }
2531 //ideal number of future partitions for each part.
2532 mj_part_t ideal_num_future_parts_in_part =
2533 (future_num_parts_of_part_ii / atomic_part_count) /
2534 num_partitions_in_current_dim;
2535 for(mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2536 mj_part_t num_future_parts_for_part_iii =
2537 ideal_num_future_parts_in_part;
2538
2539 //if there is a remainder in the part increase the part weight.
2540 if(iii < (future_num_parts_of_part_ii / atomic_part_count) %
2541 num_partitions_in_current_dim) {
2542 // if not uniform, add 1 for the extra parts.
2543 ++num_future_parts_for_part_iii;
2544 }
2545
2546 next_future_num_parts_in_parts->push_back(
2547 num_future_parts_for_part_iii * atomic_part_count);
2548
2549 // if part boxes are stored, initialize the box of the parts as
2550 // the ancestor.
2551 if(this->mj_keep_part_boxes) {
2552 output_part_boxes->push_back((*input_part_boxes)[ii]);
2553 }
2554 //set num future_num_parts to maximum in this part.
2555 if(num_future_parts_for_part_iii > future_num_parts)
2556 future_num_parts = num_future_parts_for_part_iii;
2557 }
2558 }
2559 }
2560 }
2561 // move temp std::vector to host view
2562 device_num_partitioning_in_current_dim = Kokkos::View<
2563 mj_part_t*, device_t>("num_partitioning_in_current_dim", num_partitioning_in_current_dim.size());
2564 host_num_partitioning_in_current_dim =
2565 Kokkos::create_mirror_view(device_num_partitioning_in_current_dim);
2566 for(size_t n = 0; n < num_partitioning_in_current_dim.size(); ++n) {
2567 host_num_partitioning_in_current_dim(n) =
2568 num_partitioning_in_current_dim[n];
2569 }
2570 // setup device equivalent - this data is used on host and device and it's
2571 // more efficient to just setup array on both sides now rather than copy
2572 // values as needed later.
2573 Kokkos::deep_copy(device_num_partitioning_in_current_dim,
2574 host_num_partitioning_in_current_dim);
2575 return output_num_parts;
2576}
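/* Worked example of the divide_to_prime_first branch above: a part with
 * future_num_parts_of_part_ii = 10 (atomic_part_count = 1) split into
 * num_partitions_in_current_dim = 2 parts has largest_prime_factor = 5,
 * so ideal_num_future_parts_in_part = 10 / 5 = 2 and
 * ideal_prime_scale = 5 / 2 = 2. The remainder 5 % 2 = 1 promotes the
 * first part to scale 3, so next_future_num_parts_in_parts receives
 * {2*3, 2*2} = {6, 4}: weights 3x and 2x of the prime unit, summing to 10.
 */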
2577
2578/* \brief Allocates and initializes the work memory that will be used by MJ.
2579 * */
2580template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2581 typename mj_part_t, typename mj_node_t>
2582void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2583 allocate_set_work_memory()
2584{
2585 // Throughout the partitioning execution,
2586 // instead of moving the coordinates, hold a permutation array for parts.
2587 // coordinate_permutations holds the current permutation.
2588 this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2589 Kokkos::ViewAllocateWithoutInitializing("coordinate_permutations"),
2590 this->num_local_coords);
2591 auto local_coordinate_permutations = coordinate_permutations;
2592 Kokkos::parallel_for(
2593 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
2594 0, this->num_local_coords), KOKKOS_LAMBDA (mj_lno_t i) {
2595 local_coordinate_permutations(i) = i;
2596 });
2597
2598 // new_coordinate_permutations holds the permutation built for the next iteration.
2599 this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2600 Kokkos::ViewAllocateWithoutInitializing("new_coordinate_permutations"),
2601 this->num_local_coords);
2602
2603 this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2604 Kokkos::ViewAllocateWithoutInitializing("assigned parts"), 0);
2605 if(this->num_local_coords > 0) {
2606 this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2607 Kokkos::ViewAllocateWithoutInitializing("assigned part ids"),
2608 this->num_local_coords);
2609 }
2610
2611 // the single initial partition starts at index 0 and ends at numLocalCoords.
2612 // the part_xadj array holds the end points in the coordinate_permutations
2613 // array for each partition. Initially sized 1, with its single element
2614 // set to numLocalCoords.
2615 this->part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2616 Kokkos::ViewAllocateWithoutInitializing("part xadj"), 1);
2617 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
2618 host_part_xadj(0) = num_local_coords;
2619 Kokkos::deep_copy(this->part_xadj, host_part_xadj);
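  // As partitioning proceeds, part_xadj grows to one entry per part; e.g.
  // three parts holding 4, 2 and 5 coordinates give part_xadj = {4, 6, 11},
  // and part p owns entries [part_xadj(p-1), part_xadj(p)) of
  // coordinate_permutations, with part 0 starting at index 0.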
2620
2621 // the end points of the output; this is allocated later.
2622 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2623 Kokkos::ViewAllocateWithoutInitializing("empty"), 0);
2624
2625 // only this much storage is needed for the cuts.
2626 this->all_cut_coordinates = Kokkos::View<mj_scalar_t*, device_t>(
2627 Kokkos::ViewAllocateWithoutInitializing("all cut coordinates"),
2628 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2629
2630 // what fraction of each cut line's weight this MPI rank should put on its left side
2631 this->process_cut_line_weight_to_put_left = Kokkos::View<mj_scalar_t*,
2632 device_t>(Kokkos::ViewAllocateWithoutInitializing("empty"), 0);
2633
2634 // what fraction of each cut line's weight each thread in this MPI rank
2635 // should put on the cut line's left side
2636 this->thread_cut_line_weight_to_put_left =
2637 Kokkos::View<mj_scalar_t*, device_t>(
2638 Kokkos::ViewAllocateWithoutInitializing("empty"), 0);
2639
2640 if(this->distribute_points_on_cut_lines) {
2641 this->process_cut_line_weight_to_put_left =
2642 Kokkos::View<mj_scalar_t *, device_t>(
2643 Kokkos::ViewAllocateWithoutInitializing(
2644 "process_cut_line_weight_to_put_left"),
2645 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2646 this->thread_cut_line_weight_to_put_left =
2647 Kokkos::View<mj_scalar_t *, device_t>(
2648 Kokkos::ViewAllocateWithoutInitializing(
2649 "thread_cut_line_weight_to_put_left"),
2650 this->max_num_cut_along_dim);
2651 this->process_rectilinear_cut_weight =
2652 Kokkos::View<mj_scalar_t *, device_t>(
2653 Kokkos::ViewAllocateWithoutInitializing("process_rectilinear_cut_weight"),
2654 this->max_num_cut_along_dim);
2655 this->global_rectilinear_cut_weight =
2656 Kokkos::View<mj_scalar_t *, device_t>(
2657 Kokkos::ViewAllocateWithoutInitializing("global_rectilinear_cut_weight"),
2658 this->max_num_cut_along_dim);
2659 }
2660
2661 // work array to manipulate the coordinates of cut lines in different
2662 // iterations. necessary because the previous cut line information is used
2663 // for determining the next cut line information; therefore, the cut work
2664 // array cannot be updated until all cut lines are determined.
2665 this->cut_coordinates_work_array =
2666 Kokkos::View<mj_scalar_t *, device_t>(
2667 Kokkos::ViewAllocateWithoutInitializing("cut_coordinates_work_array"),
2668 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2669
2670 // cumulative part weight array.
2671 this->target_part_weights = Kokkos::View<mj_scalar_t*, device_t>(
2672 Kokkos::ViewAllocateWithoutInitializing("target_part_weights"),
2673 this->max_num_part_along_dim * this->max_concurrent_part_calculation);
2674
2675 // upper bound coordinate of a cut line
2676 this->cut_upper_bound_coordinates =
2677 Kokkos::View<mj_scalar_t*, device_t>(
2678 Kokkos::ViewAllocateWithoutInitializing("cut_upper_bound_coordinates"),
2679 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2680
2681 // lower bound coordinate of a cut line
2682 this->cut_lower_bound_coordinates =
2683 Kokkos::View<mj_scalar_t*, device_t>(
2684 Kokkos::ViewAllocateWithoutInitializing("cut_lower_bound_coordinates"),
2685 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2686
2687 // lower bound weight of a cut line
2688 this->cut_lower_bound_weights =
2689 Kokkos::View<mj_scalar_t*, device_t>(
2690 Kokkos::ViewAllocateWithoutInitializing("cut_lower_bound_weights"),
2691 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2692
2693 //upper bound weight of a cut line
2694 this->cut_upper_bound_weights =
2695 Kokkos::View<mj_scalar_t*, device_t>(
2696 Kokkos::ViewAllocateWithoutInitializing("cut_upper_bound_weights"),
2697 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2698
2699 // combined array to exchange the min and max coordinate,
2700 // and total weight of part.
2701 this->process_local_min_max_coord_total_weight =
2702 Kokkos::View<mj_scalar_t*, device_t>(
2703 Kokkos::ViewAllocateWithoutInitializing(
2704 "process_local_min_max_coord_total_weight"),
2705 3 * this->max_concurrent_part_calculation);
2706
2707 // global combined array with the results for min, max and total weight.
2708 this->global_min_max_coord_total_weight =
2709 Kokkos::View<mj_scalar_t*, device_t>(
2710 Kokkos::ViewAllocateWithoutInitializing("global_min_max_coord_total_weight"),
2711 3 * this->max_concurrent_part_calculation);
2712
2713 // is_cut_line_determined records whether a cut line has already been
2714 // determined. If a cut line is already determined, the next
2715 // iterations will skip this cut line.
2716 this->is_cut_line_determined = Kokkos::View<bool *, device_t>(
2717 Kokkos::ViewAllocateWithoutInitializing("is_cut_line_determined"),
2718 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2719
2720 // incomplete_cut_count holds the number of cut lines that have not
2721 // been finalized for each part when concurrentPartCount > 1. Using this
2722 // information, if incomplete_cut_count[x] == 0, then no work is done for
2723 // this part.
2724 this->device_incomplete_cut_count = Kokkos::View<mj_part_t *, device_t>(
2725 Kokkos::ViewAllocateWithoutInitializing("device_incomplete_cut_count"),
2726 this->max_concurrent_part_calculation);
2727 this->incomplete_cut_count =
2728 Kokkos::create_mirror_view(device_incomplete_cut_count);
2729
2730 // local part weights of each thread.
2731 this->thread_part_weights = Kokkos::View<double *, device_t>(
2732 Kokkos::ViewAllocateWithoutInitializing("thread_part_weights"),
2733 this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
2734
2735 this->thread_cut_left_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2736 Kokkos::ViewAllocateWithoutInitializing("thread_cut_left_closest_point"),
2737 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2738
2739 // thread_cut_right_closest_point to hold the closest coordinate to a
2740 // cutline from right (for each thread)
2741 this->thread_cut_right_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2742 Kokkos::ViewAllocateWithoutInitializing("thread_cut_right_closest_point"),
2743 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2744
2745 // to store how many points in each part a thread has.
2746 this->thread_point_counts = Kokkos::View<mj_lno_t *, device_t>(
2747 Kokkos::ViewAllocateWithoutInitializing("thread_point_counts"),
2748 this->max_num_part_along_dim);
2749
2750 // for faster communication, a concatenation of
2751 // totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines,
2752 // leftClosest distances sized P-1, since there are P-1 cut lines,
2753 // rightClosest distances sized P-1, since there are P-1 cut lines.
2754 this->total_part_weight_left_right_closests =
2755 Kokkos::View<mj_scalar_t*, device_t>(
2756 Kokkos::ViewAllocateWithoutInitializing(
2757 "total_part_weight_left_right_closests"),
2758 (this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) *
2759 this->max_concurrent_part_calculation);
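  // Sketch of the concatenated layout per concurrent part, for P parts
  // (so 2P-1 interleaved part/cut weights and P-1 cuts):
  //   indices [0, 2P-1)    : part and cut weights
  //   indices [2P-1, 3P-2) : leftClosest distances
  //   indices [3P-2, 4P-3) : rightClosest distances
  // e.g. P = 4 gives 7 + 3 + 3 = 13 scalars, repeated
  // max_concurrent_part_calculation times.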
2760
2761 this->global_total_part_weight_left_right_closests =
2762 Kokkos::View<mj_scalar_t*, device_t>(
2763 Kokkos::ViewAllocateWithoutInitializing(
2764 "global_total_part_weight_left_right_closests"),
2765 (this->max_num_total_part_along_dim +
2766 this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2767
2768 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
2769 Kokkos::ViewAllocateWithoutInitializing("gids"), num_local_coords);
2770
2771 this->owner_of_coordinate = Kokkos::View<int *, Kokkos::HostSpace>(
2772 Kokkos::ViewAllocateWithoutInitializing("owner_of_coordinate"),
2773 num_local_coords);
2774
2775 // owners are kept on the host so we don't access them on the device;
2776 // this improves the migration code but means we have to initialize serially here.
2777 // Note we might allow this to be OpenMP when available, even for CUDA.
2778 Kokkos::deep_copy(owner_of_coordinate, myActualRank);
2779
2780 auto local_current_mj_gnos = current_mj_gnos;
2781 auto local_initial_mj_gnos = initial_mj_gnos;
2782 Kokkos::parallel_for(
2783 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2784 (0, num_local_coords), KOKKOS_LAMBDA (mj_lno_t j) {
2785 local_current_mj_gnos(j) = local_initial_mj_gnos(j);
2786 });
2787}
2788
2789/* \brief compute the global bounding box
2790 */
2791template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2792 typename mj_part_t, typename mj_node_t>
2793void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,
2794 mj_node_t>::compute_global_box()
2795{
2796 //local min coords
2797 mj_scalar_t *mins = new mj_scalar_t[this->coord_dim];
2798 //global min coords
2799 mj_scalar_t *gmins = new mj_scalar_t[this->coord_dim];
2800 //local max coords
2801 mj_scalar_t *maxs = new mj_scalar_t[this->coord_dim];
2802 //global max coords
2803 mj_scalar_t *gmaxs = new mj_scalar_t[this->coord_dim];
2804
2805 auto local_mj_coordinates = this->mj_coordinates;
2806
2807 // If we are only doing 2 parts then we don't need these values
2808 // for y and z. Init them all to 0 first
2809 for(int i = 0; i < this->coord_dim; ++i) {
2810 mins[i] = 0;
2811 maxs[i] = 0;
2812 }
2813
2814 for(int i = 0; i < std::min(this->recursion_depth, this->coord_dim); ++i) {
2815 Kokkos::parallel_reduce("MinReduce",
2816 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2817 (0, this->num_local_coords),
2818 KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_min) {
2819 if(local_mj_coordinates(j,i) < running_min) {
2820 running_min = local_mj_coordinates(j,i);
2821 }
2822 }, Kokkos::Min<mj_scalar_t>(mins[i]));
2823 Kokkos::parallel_reduce("MaxReduce",
2824 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2825 (0, this->num_local_coords),
2826 KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_max) {
2827 if(local_mj_coordinates(j,i) > running_max) {
2828 running_max = local_mj_coordinates(j,i);
2829 }
2830 }, Kokkos::Max<mj_scalar_t>(maxs[i]));
2831 }
2832
2833 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2834 this->coord_dim, mins, gmins
2835 );
2836
2837 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2838 this->coord_dim, maxs, gmaxs
2839 );
2840
2841 //create single box with all areas.
2842 global_box = rcp(new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2843 //coordinateModelPartBox <mj_scalar_t, mj_part_t> tmpBox (0, coordDim);
2844 delete [] mins;
2845 delete [] gmins;
2846 delete [] maxs;
2847 delete [] gmaxs;
2848}
2849
2850/* \brief for part communication we keep track of the box boundaries.
2851 * This is performed when either asked specifically, or when geometric mapping
2852 * is performed afterwards.
2853 * This function initializes a single box with all global min, max coordinates.
2854 * \param initial_partitioning_boxes the input and output vector for boxes.
2855 */
2856template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2857 typename mj_part_t, typename mj_node_t>
2858void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2859 mj_node_t>::init_part_boxes(
2860 RCP<mj_partBoxVector_t> & initial_partitioning_boxes)
2861{
2862 mj_partBox_t tmp_box(*global_box);
2863 initial_partitioning_boxes->push_back(tmp_box);
2864}
2865
2870template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2871 typename mj_part_t,
2872 typename mj_node_t>
2873void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2874 mj_get_local_min_max_coord_totW(
2875 mj_part_t current_work_part,
2876 mj_part_t current_concurrent_num_parts,
2877 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords)
2878{
2879 auto local_coordinate_permutations = this->coordinate_permutations;
2880 auto local_process_local_min_max_coord_total_weight =
2881 this->process_local_min_max_coord_total_weight;
2882 auto local_mj_weights = this->mj_weights;
2883
2884 bool bUniformWeights = mj_uniform_weights(0);
2885
2886 for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
2887
2888 mj_part_t concurrent_current_part = current_work_part + kk;
2889 mj_lno_t coordinate_begin_index = concurrent_current_part == 0 ? 0 :
2890 host_part_xadj(concurrent_current_part - 1);
2891 mj_lno_t coordinate_end_index =
2892 host_part_xadj(concurrent_current_part);
2893
2894 mj_scalar_t my_min_coord = 0;
2895 mj_scalar_t my_max_coord = 0;
2896 mj_scalar_t my_total_weight;
2897 // if the part is empty,
2898 // set the min and max coordinates to reversed extremes.
2899 if(coordinate_begin_index >= coordinate_end_index)
2900 {
2901 my_min_coord = std::numeric_limits<mj_scalar_t>::max();
2902 my_max_coord = -std::numeric_limits<mj_scalar_t>::max();
2903 my_total_weight = 0;
2904 }
2905 else {
2906 // get min
2907 Kokkos::parallel_reduce("get min",
2908 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2909 (coordinate_begin_index, coordinate_end_index),
2910 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_min) {
2911 int i = local_coordinate_permutations(j);
2912 if(mj_current_dim_coords(i) < running_min)
2913 running_min = mj_current_dim_coords(i);
2914 }, Kokkos::Min<mj_scalar_t>(my_min_coord));
2915 // get max
2916 Kokkos::parallel_reduce("get max",
2917 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2918 (coordinate_begin_index, coordinate_end_index),
2919 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_max) {
2920 int i = local_coordinate_permutations(j);
2921 if(mj_current_dim_coords(i) > running_max)
2922 running_max = mj_current_dim_coords(i);
2923 }, Kokkos::Max<mj_scalar_t>(my_max_coord));
2924 if(bUniformWeights) {
2925 my_total_weight = coordinate_end_index - coordinate_begin_index;
2926 }
2927 else {
2928 my_total_weight = 0;
2929 Kokkos::parallel_reduce("get weight",
2930 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2931 (coordinate_begin_index, coordinate_end_index),
2932 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & lsum) {
2933 int i = local_coordinate_permutations(j);
2934 lsum += local_mj_weights(i,0);
2935 }, my_total_weight);
2936 }
2937 }
2938
2939 // single write
2940 Kokkos::parallel_for(
2941 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
2942 (0, 1), KOKKOS_LAMBDA (int dummy) {
2943 local_process_local_min_max_coord_total_weight(kk) =
2944 my_min_coord;
2945 local_process_local_min_max_coord_total_weight(
2946 kk + current_concurrent_num_parts) = my_max_coord;
2947 local_process_local_min_max_coord_total_weight(
2948 kk + 2*current_concurrent_num_parts) = my_total_weight;
2949 });
2950 }
2951}
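  // The packed layout written above, with K = current_concurrent_num_parts:
  //   entries [0, K)   : local minimum coordinate of each part,
  //   entries [K, 2K)  : local maximum coordinate of each part,
  //   entries [2K, 3K) : local total weight of each part;
  // e.g. K = 2 with parts spanning [0,5] (weight 10) and [5,9] (weight 7)
  // yields {0, 5, 5, 9, 10, 7}.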
2952
2965template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2966 typename mj_part_t, typename mj_node_t>
2967void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2968 mj_node_t>::mj_get_global_min_max_coord_totW(
2969 mj_part_t current_concurrent_num_parts,
2970 Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
2971 Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total) {
2972 // reduce min for first current_concurrent_num_parts elements, reduce
2973 // max for next concurrentPartCount elements, reduce sum for the last
2974 // concurrentPartCount elements.
2975 if(this->comm->getSize() > 1) {
2976 // We're using explicit host here as Spectrum MPI would fail
2977 // with the prior HostMirror UVMSpace to UVMSpace setup.
2978 auto host_local_min_max_total =
2979 Kokkos::create_mirror_view(Kokkos::HostSpace(), local_min_max_total);
2980 auto host_global_min_max_total =
2981 Kokkos::create_mirror_view(Kokkos::HostSpace(), global_min_max_total);
2982 Kokkos::deep_copy(host_local_min_max_total, local_min_max_total);
2983 Teuchos::MultiJaggedCombinedMinMaxTotalReductionOp<int, mj_scalar_t>
2984 reductionOp(current_concurrent_num_parts,
2985 current_concurrent_num_parts, current_concurrent_num_parts);
2986 try {
2987 reduceAll<int, mj_scalar_t>(
2988 *(this->comm),
2989 reductionOp,
2990 3 * current_concurrent_num_parts,
2991 host_local_min_max_total.data(),
2992 host_global_min_max_total.data());
2993 }
2994 Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
2995 Kokkos::deep_copy(global_min_max_total, host_global_min_max_total);
2996 }
2997 else {
2998 mj_part_t s = 3 * current_concurrent_num_parts;
2999 Kokkos::parallel_for(
3000 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3001 (0, s), KOKKOS_LAMBDA (mj_part_t i) {
3002 global_min_max_total(i) = local_min_max_total(i);
3003 });
3004 }
3005}
3006
3039template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3040 typename mj_part_t, typename mj_node_t>
3041void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
3042 mj_get_initial_cut_coords_target_weights(
3043 mj_scalar_t min_coord,
3044 mj_scalar_t max_coord,
3045 mj_part_t num_cuts/*p-1*/ ,
3046 mj_scalar_t global_weight,
3047 /*p - 1 sized, coordinate of each cut line*/
3048 Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
3049 /*cumulative weights, at left side of each cut line. p-1 sized*/
3050 Kokkos::View<mj_scalar_t *, device_t> & current_target_part_weights ,
3051 std::vector <mj_part_t> *future_num_part_in_parts, // the vector holding how many future parts each current part will be divided into
3052 std::vector <mj_part_t> *next_future_num_parts_in_parts,
3053 mj_part_t concurrent_current_part,
3054 mj_part_t obtained_part_index,
3055 mj_part_t num_target_first_level_parts,
3056 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist)
3057{
3058 mj_scalar_t coord_range = max_coord - min_coord;
3059
3060 // We decided we could keep some std::vectors around for now. Eventually
3061 // it would be nice to have everything just as views with some being device
3062 // and some host. This particular case needs a bit of work to get setup
3063 // in a cleaner way so not going to mess with it at the moment.
3064
3065 bool bUniformPartsCheck =
3066 num_target_first_level_parts <= 1 && this->mj_uniform_parts(0);
3067
3068 if(!bUniformPartsCheck) {
3069 bool bValidNonUniformTargetWeights =
3070 (num_target_first_level_parts > 1 && target_first_level_dist.size() != 0);
3071 if(!bValidNonUniformTargetWeights) {
3072 std::cerr << "MJ does not support non uniform part weights beyond the first partition" << std::endl;
3073 std::terminate();
3074 }
3075 }
3076
3077 Kokkos::View<mj_scalar_t*, device_t> device_cumulative(
3078 "device_cumulative", num_cuts);
3079 auto host_cumulative = Kokkos::create_mirror_view(device_cumulative);
3080
3081 mj_scalar_t cumulative = 0;
3082
3083 if(bUniformPartsCheck) {
3084 // How many total future parts the part will be partitioned into.
3085 mj_scalar_t total_future_part_count_in_part =
3086 static_cast<mj_scalar_t>((*future_num_part_in_parts)[concurrent_current_part]);
3087
3088 // How much each part should weigh in ideal case.
3089 mj_scalar_t unit_part_weight =
3090 global_weight / total_future_part_count_in_part;
3091
3092 for(mj_part_t i = 0; i < num_cuts; ++i) {
3093 cumulative += unit_part_weight * static_cast<mj_scalar_t>((*next_future_num_parts_in_parts)[i + obtained_part_index]);
3094 host_cumulative(i) = cumulative;
3095 }
3096 }
3097 else {
3098 // Sum of entries in the first level partition distribution vector
3099 mj_scalar_t sum_target_first_level_dist = 0.0;
3100 for (int i = 0; i < num_target_first_level_parts; ++i) {
3101 sum_target_first_level_dist += target_first_level_dist(i);
3102 }
3103
3104 for(mj_part_t i = 0; i < num_cuts; ++i) {
3105 cumulative += global_weight * target_first_level_dist(i) /
3106 sum_target_first_level_dist;
3107 host_cumulative(i) = cumulative;
3108 }
3109 }
3110
3111 Kokkos::deep_copy(device_cumulative, host_cumulative);
3112
3113 Kokkos::parallel_for("Write num in parts",
3114 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3115 (0, num_cuts), KOKKOS_LAMBDA(mj_part_t cut) {
3116 // set target part weight.
3117 current_target_part_weights(cut) = device_cumulative(cut);
3118 initial_cut_coords(cut) = min_coord +
3119 (coord_range * device_cumulative(cut)) / global_weight;
3120 // set redundantly for each cut, but kept here for device handling
3121 current_target_part_weights(num_cuts) = global_weight;
3122 });
3123
3124 // round the target part weights.
3125 // Note: need to discuss the DragonFly commits and determine whether we
3126 // could simply check mj_uniform_weights here instead.
3127 if (!bUniformPartsCheck || this->mj_uniform_weights[0]) {
3128 Kokkos::parallel_for(
3129 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3130 (0, num_cuts + 1),
3131 KOKKOS_LAMBDA (mj_part_t i) {
3132 current_target_part_weights(i) =
3133 long(current_target_part_weights(i) + 0.5);
3134 });
3135 }
3136}
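/* Worked example for the uniform branch above: min_coord = 0,
 * max_coord = 10, global_weight = 100 and four equally weighted future
 * parts give unit_part_weight = 25, cumulative targets {25, 50, 75} for
 * the three cuts, and initial cut coordinates
 *   0 + 10 * 25/100 = 2.5,   5.0,   7.5,
 * with current_target_part_weights(num_cuts) = 100 as the closing entry.
 */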
3137
3154template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3155 typename mj_part_t, typename mj_node_t>
3156void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
3157 set_initial_coordinate_parts(
3158 mj_scalar_t &max_coordinate,
3159 mj_scalar_t &min_coordinate,
3160 mj_lno_t coordinate_begin_index,
3161 mj_lno_t coordinate_end_index,
3162 Kokkos::View<mj_lno_t *, device_t> & mj_current_coordinate_permutations,
3163 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3164 Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
3165 mj_part_t &partition_count)
3166{
3167 mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
3168
3169 // if there is a single point, or if all points are along a line,
3170 // set the initial part to 0 for all.
3171 if(std::abs(coordinate_range) < this->sEpsilon ) {
3172 Kokkos::parallel_for(
3173 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3174 (coordinate_begin_index, coordinate_end_index),
3175 KOKKOS_LAMBDA (mj_lno_t ii) {
3176 mj_part_ids(mj_current_coordinate_permutations[ii]) = 0;
3177 });
3178 }
3179 else {
3180 // otherwise estimate an initial part for each coordinate.
3181 // assuming uniform distribution of points.
3182 mj_scalar_t slice = coordinate_range / partition_count;
3183 Kokkos::parallel_for(
3184 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3185 (coordinate_begin_index, coordinate_end_index),
3186 KOKKOS_LAMBDA (mj_lno_t ii) {
3187 mj_lno_t iii = mj_current_coordinate_permutations[ii];
3188 mj_part_t pp =
3189 mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
3190 if(pp >= partition_count) {
3191 pp = partition_count - 1; // don't want last coord in an invalid part
3192 }
3193 mj_part_ids[iii] = 2 * pp;
3194 });
3195 }
3196}
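/* Sketch of the estimate above: min_coordinate = 0, max_coordinate = 8 and
 * partition_count = 4 give slice = 2, so a coordinate at 5.0 maps to
 * pp = (5 - 0) / 2 = 2 and receives the initial id 2 * pp = 4. Part ids
 * are doubled so that even ids denote part interiors while odd ids remain
 * available for coordinates that land on a cut during the 1D search.
 */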
3197
3212template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3213 typename mj_part_t, typename mj_node_t>
3214void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,mj_node_t>::mj_1D_part(
3215 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3216 double used_imbalance_tolerance,
3217 mj_part_t current_work_part,
3218 mj_part_t current_concurrent_num_parts,
3219 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
3220 mj_part_t total_incomplete_cut_count,
3221 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
3222 Kokkos::View<size_t*, device_t> & view_total_reduction_size)
3223{
3224 this->temp_cut_coords = current_cut_coordinates;
3225
3226 Teuchos::MultiJaggedCombinedReductionOp<mj_part_t, mj_scalar_t>
3227 *reductionOp = NULL;
3228
3229 bool bSingleProcess = (this->comm->getSize() == 1);
3230
3231 std::vector<mj_part_t> temp(host_num_partitioning_in_current_dim.size());
3232 if(!bSingleProcess) {
3233 for(size_t n = 0; n < host_num_partitioning_in_current_dim.size(); ++n) {
3234 temp[n] = host_num_partitioning_in_current_dim(n);
3235 }
3236 reductionOp = new Teuchos::MultiJaggedCombinedReductionOp
3237 <mj_part_t, mj_scalar_t>(
3238 &temp,
3239 current_work_part,
3240 current_concurrent_num_parts);
3241 }
3242
3243 auto local_cut_lower_bound_coordinates =
3244 cut_lower_bound_coordinates;
3245 auto local_cut_upper_bound_coordinates =
3246 cut_upper_bound_coordinates;
3247 auto local_cut_upper_bound_weights = cut_upper_bound_weights;
3248 auto local_cut_lower_bound_weights = cut_lower_bound_weights;
3249 bool local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
3250 auto local_process_cut_line_weight_to_put_left =
3251 process_cut_line_weight_to_put_left;
3252 auto local_temp_cut_coords = temp_cut_coords;
3253 auto local_global_total_part_weight_left_right_closests =
3254 global_total_part_weight_left_right_closests;
3255 auto local_cut_coordinates_work_array =
3256 cut_coordinates_work_array;
3257 auto local_part_xadj = part_xadj;
3258 auto local_global_min_max_coord_total_weight =
3259 global_min_max_coord_total_weight;
3260 auto local_target_part_weights =
3261 target_part_weights;
3262 auto local_global_rectilinear_cut_weight =
3263 global_rectilinear_cut_weight;
3264 auto local_process_rectilinear_cut_weight =
3265 process_rectilinear_cut_weight;
3266
3267 auto local_is_cut_line_determined = this->is_cut_line_determined;
3268 auto local_device_num_partitioning_in_current_dim =
3269 device_num_partitioning_in_current_dim;
3270
3271 Kokkos::parallel_for(
3272 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3273 KOKKOS_LAMBDA (int dummy) {
3274
3275 // these need to be initialized
3276 view_rectilinear_cut_count(0) = 0;
3277 view_total_reduction_size(0) = 0;
3278
3279 // initialize the lower and upper bounds of the cuts.
3280 mj_part_t next = 0;
3281 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3282 mj_part_t num_part_in_dim =
3283 local_device_num_partitioning_in_current_dim(current_work_part + i);
3284 mj_part_t num_cut_in_dim = num_part_in_dim - 1;
3285 view_total_reduction_size(0) += (4 * num_cut_in_dim + 1);
3286
3287 for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii) {
3288 local_is_cut_line_determined(next) = false;
3289 // min coordinate
3290 local_cut_lower_bound_coordinates(next) =
3291 local_global_min_max_coord_total_weight(i);
3292 // max coordinate
3293 local_cut_upper_bound_coordinates(next) =
3294 local_global_min_max_coord_total_weight(
3295 i + current_concurrent_num_parts);
3296 // total weight
3297 local_cut_upper_bound_weights(next) =
3298 local_global_min_max_coord_total_weight(
3299 i + 2 * current_concurrent_num_parts);
3300 local_cut_lower_bound_weights(next) = 0;
3301 if(local_distribute_points_on_cut_lines) {
3302 local_process_cut_line_weight_to_put_left(next) = 0;
3303 }
3304 ++next;
3305 }
3306 }
3307 });
3308
3309 // loop_count allows the kernel to behave differently on the first
3310 // iteration and subsequent ones: on the first iteration we do a binary
3311 // search, and on subsequent iterations we simply step towards our target.
3312 int loop_count = 0;
3313 while (total_incomplete_cut_count != 0) {
3314 this->mj_1D_part_get_part_weights(
3315 current_concurrent_num_parts,
3316 current_work_part,
3317 mj_current_dim_coords,
3318 loop_count);
3319 ++loop_count;
3320
3321 this->mj_combine_rightleft_and_weights(
3322 current_work_part,
3323 current_concurrent_num_parts);
3324
3325 // now sum up the results of mpi processors.
3326 if(!bSingleProcess) {
3327 // We're using explicit host here as Spectrum MPI would fail
3328 // with the prior HostMirror UVMSpace to UVMSpace setup.
3329 auto host_total_part_weight_left_right_closests =
3330 Kokkos::create_mirror_view(Kokkos::HostSpace(),
3331 total_part_weight_left_right_closests);
3332 auto host_global_total_part_weight_left_right_closests =
3333 Kokkos::create_mirror_view(Kokkos::HostSpace(),
3334 global_total_part_weight_left_right_closests);
3335
3336 Kokkos::deep_copy(host_total_part_weight_left_right_closests,
3337 total_part_weight_left_right_closests);
3338
3339 size_t host_view_total_reduction_size;
3340 Kokkos::parallel_reduce("Read single",
3341 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3342 KOKKOS_LAMBDA(int dummy, size_t & set_single) {
3343 set_single = view_total_reduction_size(0);
3344 }, host_view_total_reduction_size);
3345
3346 reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
3347 host_view_total_reduction_size,
3348 host_total_part_weight_left_right_closests.data(),
3349 host_global_total_part_weight_left_right_closests.data());
3350 Kokkos::deep_copy(global_total_part_weight_left_right_closests,
3351 host_global_total_part_weight_left_right_closests);
3352 }
3353 else {
3354 local_global_total_part_weight_left_right_closests =
3355 this->total_part_weight_left_right_closests;
3356 }
3357
3358 // how much the cuts will be shifted for the next part in the concurrent
3359 // part calculation.
3360 mj_part_t cut_shift = 0;
3361
3362 // how much the concatenated array will be shifted for the next part
3363 // in the concurrent part calculation.
3364 size_t tlr_shift = 0;
3365
3366 Kokkos::View<mj_part_t*, Kokkos::HostSpace>
3367 save_initial_incomplete_cut_count("save_initial_incomplete_cut_count",
3368 current_concurrent_num_parts);
3369
3370 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3371
3372 mj_part_t num_parts =
3373 host_num_partitioning_in_current_dim(current_work_part + kk);
3374
3375 mj_part_t num_cuts = num_parts - 1;
3376 size_t num_total_part = num_parts + size_t (num_cuts);
3377
3378 // if the cuts of this part have already been completed,
3379 // there is nothing to do for this part;
3380 // just update the shift amount and proceed.
3381 mj_part_t kk_incomplete_cut_count = this->incomplete_cut_count(kk);
3382
3383 if(kk_incomplete_cut_count == 0) {
3384 cut_shift += num_cuts;
3385 tlr_shift += (num_total_part + 2 * num_cuts);
3386 continue;
3387 }
3388
3389 Kokkos::View<mj_scalar_t *, device_t> current_local_part_weights =
3390 Kokkos::subview(this->total_part_weight_left_right_closests,
3391 std::pair<mj_lno_t, mj_lno_t>(
3392 tlr_shift,
3393 this->total_part_weight_left_right_closests.size()));
3394
3395 Kokkos::View<mj_scalar_t *, device_t> current_global_tlr =
3396 Kokkos::subview(
3397 local_global_total_part_weight_left_right_closests,
3398 std::pair<mj_lno_t, mj_lno_t>(
3399 tlr_shift,
3400 local_global_total_part_weight_left_right_closests.size()));
3401 Kokkos::View<mj_scalar_t *, device_t>
3402 current_global_left_closest_points =
3403 Kokkos::subview(current_global_tlr,
3404 std::pair<mj_lno_t, mj_lno_t>(
3405 num_total_part,
3406 current_global_tlr.size()));
3407 Kokkos::View<mj_scalar_t *, device_t>
3408 current_global_right_closest_points =
3409 Kokkos::subview(current_global_tlr,
3410 std::pair<mj_lno_t, mj_lno_t>(
3411 num_total_part + num_cuts,
3412 current_global_tlr.size()));
3413 Kokkos::View<mj_scalar_t *, device_t> current_global_part_weights =
3414 current_global_tlr;
3415
3416 Kokkos::View<bool *, device_t> current_cut_line_determined =
3417 Kokkos::subview(this->is_cut_line_determined,
3418 std::pair<mj_lno_t, mj_lno_t>(
3419 cut_shift,
3420 this->is_cut_line_determined.size()));
3421 Kokkos::View<mj_scalar_t *, device_t> current_part_target_weights =
3422 Kokkos::subview(local_target_part_weights,
3423 std::pair<mj_lno_t, mj_lno_t>(
3424 cut_shift + kk,
3425 local_target_part_weights.size()));
3426 Kokkos::View<mj_scalar_t *, device_t>
3427 current_part_cut_line_weight_to_put_left =
3428 Kokkos::subview(local_process_cut_line_weight_to_put_left,
3429 std::pair<mj_lno_t, mj_lno_t>(
3430 cut_shift,
3431 local_process_cut_line_weight_to_put_left.size()));
3432
3433 save_initial_incomplete_cut_count(kk) =
3434 kk_incomplete_cut_count;
3435
3436 Kokkos::View<mj_scalar_t *, device_t>
3437 current_cut_lower_bound_weights =
3438 Kokkos::subview(local_cut_lower_bound_weights,
3439 std::pair<mj_lno_t, mj_lno_t>(
3440 cut_shift,
3441 local_cut_lower_bound_weights.size()));
3442 Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_weights =
3443 Kokkos::subview(local_cut_upper_bound_weights,
3444 std::pair<mj_lno_t, mj_lno_t>(
3445 cut_shift,
3446 local_cut_upper_bound_weights.size()));
3447 Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_bounds =
3448 Kokkos::subview(local_cut_upper_bound_coordinates,
3449 std::pair<mj_lno_t, mj_lno_t>(
3450 cut_shift,
3451 local_cut_upper_bound_coordinates.size()));
3452 Kokkos::View<mj_scalar_t *, device_t> current_cut_lower_bounds =
3453 Kokkos::subview(local_cut_lower_bound_coordinates,
3454 std::pair<mj_lno_t, mj_lno_t>(
3455 cut_shift,
3456 local_cut_lower_bound_coordinates.size()));
3457
3458 // Now compute the new cut coordinates.
3459 Kokkos::View<mj_scalar_t*, device_t> sub_temp_cut_coords =
3460 Kokkos::subview(this->temp_cut_coords,
3461 std::pair<mj_lno_t, mj_lno_t>(
3462 cut_shift, this->temp_cut_coords.size()));
3463 Kokkos::View<mj_scalar_t*, device_t> sub_cut_coordinates_work_array =
3464 Kokkos::subview(this->cut_coordinates_work_array,
3465 std::pair<mj_lno_t, mj_lno_t>(
3466 cut_shift, this->cut_coordinates_work_array.size()));
3467
3468 this->mj_get_new_cut_coordinates(
3469 current_concurrent_num_parts,
3470 kk,
3471 num_cuts,
3472 used_imbalance_tolerance,
3473 current_global_part_weights,
3474 current_local_part_weights,
3475 current_part_target_weights,
3476 current_cut_line_determined,
3477 sub_temp_cut_coords,
3478 current_cut_upper_bounds,
3479 current_cut_lower_bounds,
3480 current_global_left_closest_points,
3481 current_global_right_closest_points,
3482 current_cut_lower_bound_weights,
3483 current_cut_upper_weights,
3484 sub_cut_coordinates_work_array,
3485 current_part_cut_line_weight_to_put_left,
3486 view_rectilinear_cut_count);
3487
3488 cut_shift += num_cuts;
3489 tlr_shift += (num_total_part + 2 * num_cuts);
3490 } // end of kk loop
3491
3492 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3493 mj_part_t iteration_complete_cut_count =
3494 save_initial_incomplete_cut_count(kk) - this->incomplete_cut_count(kk);
3495 total_incomplete_cut_count -= iteration_complete_cut_count;
3496 }
3497
3498 Kokkos::parallel_for(
3499 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3500 (0, local_temp_cut_coords.size()), KOKKOS_LAMBDA(int n) {
3501 auto t = local_temp_cut_coords(n);
3502 local_temp_cut_coords(n) = local_cut_coordinates_work_array(n);
3503 local_cut_coordinates_work_array(n) = t;
3504 });
3505 } // end of the while loop
3506
3507 // Needed only if keep_cuts; otherwise can simply swap array pointers
3508 // cutCoordinates and cutCoordinatesWork.
3509 // (at first iteration, cutCoordinates == cutCoordinates_tmp).
3510 // computed cuts must be in cutCoordinates.
3511 if(current_cut_coordinates != local_temp_cut_coords) {
3512 Kokkos::parallel_for(
3513 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3514 (0, 1), KOKKOS_LAMBDA(int dummy) {
3515 mj_part_t next = 0;
3516 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3517 mj_part_t num_parts = -1;
3518 num_parts = local_device_num_partitioning_in_current_dim(
3519 current_work_part + i);
3520 mj_part_t num_cuts = num_parts - 1;
3521 for(mj_part_t ii = 0; ii < num_cuts; ++ii) {
3522 current_cut_coordinates(next + ii) = local_temp_cut_coords(next + ii);
3523 }
3524 next += num_cuts;
3525 }
3526 for(int n = 0; n <
3527 static_cast<int>(local_cut_coordinates_work_array.size()); ++n) {
3528 local_cut_coordinates_work_array(n) = local_temp_cut_coords(n);
3529 }
3530 });
3531 }
3532
3533 delete reductionOp;
3534}
3535
3536template<class scalar_t>
3537struct Zoltan2_MJArrayType {
3538 scalar_t * ptr;
3539
3540 // With the new kokkos setup, parallel_reduce will call the empty
3541 // constructor and we update the ptr in the init method.
3542 KOKKOS_INLINE_FUNCTION
3543 Zoltan2_MJArrayType() : ptr(NULL) {};
3544
3545 KOKKOS_INLINE_FUNCTION
3546 Zoltan2_MJArrayType(scalar_t * pSetPtr) : ptr(pSetPtr) {};
3547
3548 Zoltan2_MJArrayType<scalar_t>& operator=(const volatile Zoltan2_MJArrayType<scalar_t>& zmj) {
3549 ptr = zmj.ptr;
3550 return *this;
3551 }
3552};
3553
3554#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3555
3556template<class policy_t, class scalar_t, class part_t>
3557struct ArrayCombinationReducer {
3558
3559 typedef ArrayCombinationReducer reducer;
3560 typedef Zoltan2_MJArrayType<scalar_t> value_type;
3561 scalar_t max_scalar;
3562 value_type * value;
3563 int value_count_rightleft;
3564 int value_count_weights;
3565
3566 KOKKOS_INLINE_FUNCTION ArrayCombinationReducer(
3567 scalar_t mj_max_scalar,
3568 value_type &val,
3569 int mj_value_count_rightleft,
3570 int mj_value_count_weights) :
3571 max_scalar(mj_max_scalar),
3572 value(&val),
3573 value_count_rightleft(mj_value_count_rightleft),
3574 value_count_weights(mj_value_count_weights)
3575 {}
3576
3577 KOKKOS_INLINE_FUNCTION
3578 value_type& reference() const {
3579 return *value;
3580 }
3581
3582 KOKKOS_INLINE_FUNCTION
3583 void join(value_type& dst, const value_type& src) const {
3584 for(int n = 0; n < value_count_weights; ++n) {
3585 dst.ptr[n] += src.ptr[n];
3586 }
3587
3588 for(int n = value_count_weights + 2;
3589 n < value_count_weights + value_count_rightleft - 2; n += 2) {
3590 if(src.ptr[n] > dst.ptr[n]) {
3591 dst.ptr[n] = src.ptr[n];
3592 }
3593 if(src.ptr[n+1] < dst.ptr[n+1]) {
3594 dst.ptr[n+1] = src.ptr[n+1];
3595 }
3596 }
3597 }
3598
3599 KOKKOS_INLINE_FUNCTION
3600 void join (volatile value_type& dst, const volatile value_type& src) const {
3601 for(int n = 0; n < value_count_weights; ++n) {
3602 dst.ptr[n] += src.ptr[n];
3603 }
3604
3605 for(int n = value_count_weights + 2;
3606 n < value_count_weights + value_count_rightleft - 2; n += 2) {
3607 if(src.ptr[n] > dst.ptr[n]) {
3608 dst.ptr[n] = src.ptr[n];
3609 }
3610 if(src.ptr[n+1] < dst.ptr[n+1]) {
3611 dst.ptr[n+1] = src.ptr[n+1];
3612 }
3613 }
3614 }
3615
3616 KOKKOS_INLINE_FUNCTION void init (value_type& dst) const {
3617 dst.ptr = value->ptr; // must update ptr
3618
3619 for(int n = 0; n < value_count_weights; ++n) {
3620 dst.ptr[n] = 0;
3621 }
3622
3623 for(int n = value_count_weights;
3624 n < value_count_weights + value_count_rightleft; n += 2) {
3625 dst.ptr[n] = -max_scalar;
3626 dst.ptr[n+1] = max_scalar;
3627 }
3628 }
3629};
3630#endif // !KOKKOS_ENABLE_CUDA && !KOKKOS_ENABLE_HIP
3631
3632template<class policy_t, class scalar_t, class part_t, class index_t,
3633 class device_t, class array_t>
3634struct ReduceWeightsFunctor {
3635 typedef typename policy_t::member_type member_type;
3636 typedef Kokkos::View<scalar_t*> scalar_view_t;
3637
3638#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3639 typedef array_t value_type[];
3640#endif
3641
3642 int loop_count;
3643 array_t max_scalar;
3644
3645 part_t concurrent_current_part;
3646 part_t num_cuts;
3647 part_t current_work_part;
3648 part_t current_concurrent_num_parts;
3649 part_t value_count_rightleft;
3650 part_t value_count_weights;
3651 part_t value_count;
3652 Kokkos::View<index_t*, device_t> permutations;
3653 Kokkos::View<scalar_t *, device_t> coordinates;
3654 Kokkos::View<scalar_t**, device_t> weights;
3655 Kokkos::View<part_t*, device_t> parts;
3656 Kokkos::View<scalar_t *, device_t> cut_coordinates;
3657 Kokkos::View<index_t *, device_t> part_xadj;
3658 bool uniform_weights0;
3659 scalar_t sEpsilon;
3660
3661#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3662 Kokkos::View<double *, device_t> current_part_weights;
3663 Kokkos::View<scalar_t *, device_t> current_left_closest;
3664 Kokkos::View<scalar_t *, device_t> current_right_closest;
3665#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3666
3667 ReduceWeightsFunctor(
3668 int mj_loop_count,
3669 array_t mj_max_scalar,
3670 part_t mj_concurrent_current_part,
3671 part_t mj_num_cuts,
3672 part_t mj_current_work_part,
3673 part_t mj_current_concurrent_num_parts,
3674 part_t mj_left_right_array_size,
3675 part_t mj_weight_array_size,
3676 Kokkos::View<index_t*, device_t> & mj_permutations,
3677 Kokkos::View<scalar_t *, device_t> & mj_coordinates,
3678 Kokkos::View<scalar_t**, device_t> & mj_weights,
3679 Kokkos::View<part_t*, device_t> & mj_parts,
3680 Kokkos::View<scalar_t *, device_t> & mj_cut_coordinates,
3681 Kokkos::View<index_t *, device_t> & mj_part_xadj,
3682 bool mj_uniform_weights0,
3683 scalar_t mj_sEpsilon
3684#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3685 ,Kokkos::View<double *, device_t> & mj_current_part_weights,
3686 Kokkos::View<scalar_t *, device_t> & mj_current_left_closest,
3687 Kokkos::View<scalar_t *, device_t> & mj_current_right_closest
3688#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3689 ) :
3690 loop_count(mj_loop_count),
3691 max_scalar(mj_max_scalar),
3692 concurrent_current_part(mj_concurrent_current_part),
3693 num_cuts(mj_num_cuts),
3694 current_work_part(mj_current_work_part),
3695 current_concurrent_num_parts(mj_current_concurrent_num_parts),
3696 value_count_rightleft(mj_left_right_array_size),
3697 value_count_weights(mj_weight_array_size),
3698 value_count(mj_weight_array_size+mj_left_right_array_size),
3699 permutations(mj_permutations),
3700 coordinates(mj_coordinates),
3701 weights(mj_weights),
3702 parts(mj_parts),
3703 cut_coordinates(mj_cut_coordinates),
3704 part_xadj(mj_part_xadj),
3705 uniform_weights0(mj_uniform_weights0),
3706 sEpsilon(mj_sEpsilon)
3707#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3708 ,current_part_weights(mj_current_part_weights),
3709 current_left_closest(mj_current_left_closest),
3710 current_right_closest(mj_current_right_closest)
3711#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3712 {
3713 }
3714
3715 size_t team_shmem_size (int team_size) const {
3716#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3717 int result = sizeof(array_t) *
3718 (value_count_weights + value_count_rightleft);
3719#else
3720 int result = sizeof(array_t) *
3721 (value_count_weights + value_count_rightleft) * team_size;
3722#endif
3723
3724 // pad this to a multiple of 8 or the shared memory will be corrupted
3725 int remainder = result % 8;
3726 if(remainder != 0) {
3727 result += 8 - remainder;
3728 }
3729 return result;
3730 }
3731
3732 KOKKOS_INLINE_FUNCTION
3733#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3734 void operator() (const member_type & teamMember) const {
3735#else
3736 void operator() (const member_type & teamMember, value_type teamSum) const {
3737#endif
3738
3739 index_t all_begin = (concurrent_current_part == 0) ? 0 :
3740 part_xadj(concurrent_current_part - 1);
3741 index_t all_end = part_xadj(concurrent_current_part);
3742
3743 index_t num_working_points = all_end - all_begin;
3744 int num_teams = teamMember.league_size();
3745
3746 index_t stride = num_working_points / num_teams;
3747 if((num_working_points % num_teams) > 0) {
3748 stride += 1; // make sure we have coverage for the final points
3749 }
3750
3751 // the last team may have less work than the other teams
3752 // the last team can be empty (begin > end) if num_teams > stride
3753 // which is true for many teams and small numbers of coords (tests)
3754 index_t begin = all_begin + stride * teamMember.league_rank();
3755 index_t end = begin + stride;
3756 if(end > all_end) {
3757 end = all_end;
3758 }
3759
3760#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3761 size_t sh_mem_size = sizeof(array_t) * (value_count_weights +
3762 value_count_rightleft);
3763
3764 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3765 sh_mem_size);
3766
3767 // init the shared array to 0
3768 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3769 for(int n = 0; n < value_count_weights; ++n) {
3770 shared_ptr[n] = 0;
3771 }
3772 for(int n = value_count_weights;
3773 n < value_count_weights + value_count_rightleft; n += 2) {
3774 shared_ptr[n] = -max_scalar;
3775 shared_ptr[n+1] = max_scalar;
3776 }
3777 });
3778 teamMember.team_barrier();
3779
3780 Kokkos::parallel_for(
3781 Kokkos::TeamThreadRange(teamMember, begin, end),
3782 [=] (index_t ii) {
3783#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3784 // create the team shared data - each thread gets one of the arrays
3785 size_t sh_mem_size = sizeof(array_t) * (value_count_weights +
3786 value_count_rightleft) * teamMember.team_size();
3787
3788 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3789 sh_mem_size);
3790
3791 // select the array for this thread
3792 Zoltan2_MJArrayType<array_t> array(&shared_ptr[teamMember.team_rank() *
3793 (value_count_weights + value_count_rightleft)]);
3794
3795 // create reducer which handles the Zoltan2_MJArrayType class
3796 ArrayCombinationReducer<policy_t, array_t, part_t> arraySumReducer(
3797 max_scalar, array,
3798 value_count_rightleft,
3799 value_count_weights);
3800
3801 Kokkos::parallel_reduce(
3802 Kokkos::TeamThreadRange(teamMember, begin, end),
3803 [=] (size_t ii, Zoltan2_MJArrayType<array_t>& threadSum) {
3804#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3805
3806 int i = permutations(ii);
3807 scalar_t coord = coordinates(i);
3808 array_t w = uniform_weights0 ? 1 : (array_t) weights(i,0);
3809
3810 // now check each part and its right cut
3811 index_t part = parts(i)/2;
3812
3813 int upper = num_cuts;
3814 int lower = 0;
3815
3816 // binary search - find matching part
3817 while(true) {
3818 scalar_t a = (part == 0) ? -max_scalar : cut_coordinates(part-1);
3819 scalar_t b = (part == num_cuts) ? max_scalar : cut_coordinates(part);
3820
3821 if(coord >= a + sEpsilon && coord <= b - sEpsilon) {
3822#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3823 Kokkos::atomic_add(&shared_ptr[part*2], w);
3824#else
3825 threadSum.ptr[part*2] += w;
3826#endif
3827
3828 parts(i) = part*2;
3829
3830 // now handle the left/right closest part
3831#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3832 array_t new_value = (array_t) coord;
3833 array_t prev_value = shared_ptr[value_count_weights + part * 2 + 1];
3834 while(new_value < prev_value) {
3835 prev_value = Kokkos::atomic_compare_exchange(
3836 &shared_ptr[value_count_weights + part * 2 + 1],
3837 prev_value, new_value);
3838 }
3839 prev_value = shared_ptr[value_count_weights + part * 2 + 2];
3840 while(new_value > prev_value) {
3841 prev_value = Kokkos::atomic_compare_exchange(
3842 &shared_ptr[value_count_weights + part * 2 + 2],
3843 prev_value, new_value);
3844 }
3845#else
3846 // note cut to left needs to set right closest and cut to right needs
3847 // to set left closest. It's index +1 and +2 instead of -1 and +0
3848 // because right/left segment is padded with an extra pair at
3849 // beginning and end to avoid branching with if checks.
3850 if(coord < threadSum.ptr[value_count_weights + part * 2 + 1]) {
3851 threadSum.ptr[value_count_weights + part * 2 + 1] = coord;
3852 }
3853 if(coord > threadSum.ptr[value_count_weights + part * 2 + 2]) {
3854 threadSum.ptr[value_count_weights + part * 2 + 2] = coord;
3855 }
3856#endif
3857
3858 break;
3859 }
3860 else if(part != num_cuts) {
3861 if(coord < b + sEpsilon && coord > b - sEpsilon) {
3862 // Note if on cut we set right/left closest to the cut itself
3863 // but we add +2 because we buffered the area with an extra slot
3864 // to reduce cuda branching. So it's +2, +3 instead of +0, +1.
3865#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3866 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3867 shared_ptr[value_count_weights + part * 2 + 2] = b;
3868 shared_ptr[value_count_weights + part * 2 + 3] = b;
3869#else
3870 threadSum.ptr[part*2+1] += w;
3871 threadSum.ptr[value_count_weights + part * 2 + 2] = b;
3872 threadSum.ptr[value_count_weights + part * 2 + 3] = b;
3873#endif
3874
3875 parts(i) = part*2+1;
3876
3877 // Need to scan up for any other cuts at the same coordinate
3878 // This is costly but it's only relevant for the fix4785 test
3879 // which loads a lot of coordinates on the same point, so without
3880 // this our cuts would all just sit at 0.
3881 part_t base_b = part;
3882 scalar_t base_coord = cut_coordinates(base_b);
3883 part += 1;
3884 while(part < num_cuts) {
3885 b = cut_coordinates(part);
3886 scalar_t delta = b - base_coord;
3887 if(delta < 0) delta = -delta;
3888 if(delta < sEpsilon) {
3889 // Note if on cut we set right/left closest to the cut itself
3890 // but we add +2 because we buffered the area with an extra slot
3891 // to reduce cuda branching. So it's +2, +3 instead of +0, +1.
3892#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3893 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3894 shared_ptr[value_count_weights + part * 2 + 2] = b;
3895 shared_ptr[value_count_weights + part * 2 + 3] = b;
3896#else
3897 threadSum.ptr[part*2+1] += w;
3898 threadSum.ptr[value_count_weights + part * 2 + 2] = b;
3899 threadSum.ptr[value_count_weights + part * 2 + 3] = b;
3900#endif
3901 }
3902 else { break; }
3903 ++part;
3904 }
3905 part = base_b - 1;
3906 while(part >= 0) {
3907 b = cut_coordinates(part);
3908 scalar_t delta = b - base_coord;
3909 if(delta < 0) delta = -delta;
3910 if(delta < sEpsilon) {
3911 // Note if on cut we set right/left closest to the cut itself
3912 // but we add +2 because we buffered the area with an extra slot
3913 // to reduce cuda branching. So it's +2, +3 instead of +0, +1.
3914#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3915 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3916 shared_ptr[value_count_weights + part * 2 + 2] = b;
3917 shared_ptr[value_count_weights + part * 2 + 3] = b;
3918#else
3919 threadSum.ptr[part*2+1] += w;
3920 threadSum.ptr[value_count_weights + part * 2 + 2] = b;
3921 threadSum.ptr[value_count_weights + part * 2 + 3] = b;
3922#endif
3923 }
3924 else { break; }
3925 --part;
3926 }
3927
3928 break;
3929 }
3930 }
3931
3932 if(loop_count != 0) {
3933 // subsequent loops can just step towards target
3934 if(coord < b) {
3935 part -= 1;
3936 }
3937 else {
3938 part += 1;
3939 }
3940 }
3941 else {
3942 // initial loop binary search
3943 if(coord < b) {
3944 if(part == lower + 1) {
3945 part = lower;
3946 }
3947 else {
3948 upper = part - 1;
3949 part -= (part - lower)/2;
3950 }
3951 }
3952 else if(part == upper - 1) {
3953 part = upper;
3954 }
3955 else {
3956 lower = part + 1;
3957 part += (upper - part)/2;
3958 }
3959 }
3960 }
3961#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3962 });
3963#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3964 }, arraySumReducer);
3965#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3966
3967 teamMember.team_barrier();
3968
3969 // collect all the team's results
3970 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3971 for(int n = 0; n < value_count_weights; ++n) {
3972#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3973 Kokkos::atomic_add(&current_part_weights(n),
3974 static_cast<double>(shared_ptr[n]));
3975#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3976 teamSum[n] += array.ptr[n];
3977#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3978 }
3979
3980#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3981 int insert_left = 0;
3982 int insert_right = 0;
3983#endif
3984
3985 for(int n = 2 + value_count_weights;
3986 n < value_count_weights + value_count_rightleft - 2; n += 2) {
3987#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3988 scalar_t new_value = shared_ptr[n+1];
3989 scalar_t prev_value = current_right_closest(insert_right);
3990 while(new_value < prev_value) {
3991 prev_value = Kokkos::atomic_compare_exchange(
3992 &current_right_closest(insert_right), prev_value, new_value);
3993 }
3994
3995 new_value = shared_ptr[n];
3996 prev_value = current_left_closest(insert_left);
3997 while(new_value > prev_value) {
3998 prev_value = Kokkos::atomic_compare_exchange(
3999 &current_left_closest(insert_left), prev_value, new_value);
4000 }
4001
4002 ++insert_left;
4003 ++insert_right;
4004#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4005 if(array.ptr[n] > teamSum[n]) {
4006 teamSum[n] = array.ptr[n];
4007 }
4008 if(array.ptr[n+1] < teamSum[n+1]) {
4009 teamSum[n+1] = array.ptr[n+1];
4010 }
4011#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4012 }
4013 });
4014
4015 teamMember.team_barrier();
4016 }
4017
4018#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4019 KOKKOS_INLINE_FUNCTION
4020 void join(value_type dst, const value_type src) const {
4021 for(int n = 0; n < value_count_weights; ++n) {
4022 dst[n] += src[n];
4023 }
4024
4025 for(int n = value_count_weights + 2;
4026 n < value_count_weights + value_count_rightleft - 2; n += 2) {
4027 if(src[n] > dst[n]) {
4028 dst[n] = src[n];
4029 }
4030 if(src[n+1] < dst[n+1]) {
4031 dst[n+1] = src[n+1];
4032 }
4033 }
4034 }
4035
4036 KOKKOS_INLINE_FUNCTION void init (value_type dst) const {
4037 for(int n = 0; n < value_count_weights; ++n) {
4038 dst[n] = 0;
4039 }
4040
4041 for(int n = value_count_weights;
4042 n < value_count_weights + value_count_rightleft; n += 2) {
4043 dst[n] = -max_scalar;
4044 dst[n+1] = max_scalar;
4045 }
4046 }
4047#endif // !KOKKOS_ENABLE_CUDA && !KOKKOS_ENABLE_HIP
4048};
4049
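4050/*! \brief Computes the local part weights and the closest coordinates to
4051 * each side of every cut line for the concurrently processed parts, using
4052 * a team-based Kokkos reduction (ReduceWeightsFunctor).
4053 * \param current_concurrent_num_parts number of parts processed concurrently.
4054 * \param current_work_part first part to be processed at this step.
4055 * \param mj_current_dim_coords coordinates along the current dimension.
4056 * \param loop_count iteration counter of the enclosing 1D loop. */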
4057template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4058 typename mj_part_t, typename mj_node_t>
4059void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4060 mj_1D_part_get_part_weights(
4061 mj_part_t current_concurrent_num_parts,
4062 mj_part_t current_work_part,
4063 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
4064 int loop_count)
4065{
4066 auto local_is_cut_line_determined = is_cut_line_determined;
4067 auto local_thread_part_weights = thread_part_weights;
4068 auto local_thread_cut_left_closest_point = thread_cut_left_closest_point;
4069 auto local_thread_cut_right_closest_point = thread_cut_right_closest_point;
4070
4071 // Create some locals so we don't capture 'this' inside the kernels,
4072 // which causes problems
4073 auto local_sEpsilon = this->sEpsilon;
4074 auto local_assigned_part_ids = this->assigned_part_ids;
4075 auto local_coordinate_permutations = this->coordinate_permutations;
4076 auto local_mj_weights = this->mj_weights;
4077 auto local_part_xadj = this->part_xadj;
4078 auto local_global_min_max_coord_total_weight =
4079 this->global_min_max_coord_total_weight;
4080
4081 typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;
4082
4083 auto local_device_num_partitioning_in_current_dim =
4084 device_num_partitioning_in_current_dim;
4085
4086 Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
4087 auto local_device_incomplete_cut_count = device_incomplete_cut_count;
4088
4089 mj_part_t total_part_shift = 0;
4090
4091 mj_part_t concurrent_cut_shifts = 0;
4092 for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
4093 Kokkos::View<mj_scalar_t *, device_t> local_temp_cut_coords =
4094 Kokkos::subview(temp_cut_coords, std::pair<mj_lno_t, mj_lno_t>(
4095 concurrent_cut_shifts, temp_cut_coords.size()));
4096
4097 mj_part_t num_parts =
4098 host_num_partitioning_in_current_dim(current_work_part + kk);
4099 mj_part_t num_cuts = num_parts - 1;
4100 mj_part_t total_part_count = num_parts + num_cuts;
4101 mj_part_t weight_array_length = num_cuts + num_parts;
4102
4103 // for right/left closest + buffer cut on either side
4104 mj_part_t right_left_array_length = (num_cuts + 2) * 2;
4105
4106 if(this->incomplete_cut_count(kk) == 0) {
4107 total_part_shift += total_part_count;
4108 concurrent_cut_shifts += num_cuts;
4109 continue;
4110 }
4111
4112 // if not set use 60 - initial testing amount, somewhat arbitrary
4113 auto policy_ReduceWeightsFunctor = policy_t(
4114 mj_num_teams ? mj_num_teams : 60, Kokkos::AUTO);
4115
4116#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4117 int total_array_length =
4118 weight_array_length + right_left_array_length;
4119#endif
4120
4121 // Using float here caused some numerical errors for coord on cut calculations.
4122 // Probably that could be fixed with a proper epsilon adjustment, but since
4123 // cuda doesn't reduce right now, the shared memory pressure is no longer relevant.
4124 // Just use scalar_t to match the original algorithm.
4125 typedef mj_scalar_t array_t;
4126
4127#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4128 Kokkos::View<array_t*, Kokkos::HostSpace> reduce_array("reduce_array", total_array_length);
4129#endif // !KOKKOS_ENABLE_CUDA && !KOKKOS_ENABLE_HIP
4130
4131 int offset_cuts = 0;
4132 for(int kk2 = 0; kk2 < kk; ++kk2) {
4133 offset_cuts +=
4134 host_num_partitioning_in_current_dim(current_work_part + kk2) - 1;
4135 }
4136 Kokkos::View<double *, device_t> my_current_part_weights =
4137 Kokkos::subview(local_thread_part_weights,
4138 std::pair<mj_lno_t, mj_lno_t>(total_part_shift,
4139 total_part_shift + total_part_count));
4140 Kokkos::View<mj_scalar_t *, device_t> my_current_left_closest =
4141 Kokkos::subview(local_thread_cut_left_closest_point,
4142 std::pair<mj_lno_t, mj_lno_t>(
4143 offset_cuts,
4144 local_thread_cut_left_closest_point.size()));
4145 Kokkos::View<mj_scalar_t *, device_t> my_current_right_closest =
4146 Kokkos::subview(local_thread_cut_right_closest_point,
4147 std::pair<mj_lno_t, mj_lno_t>(
4148 offset_cuts,
4149 local_thread_cut_right_closest_point.size()));
4150
4151 array_t max_scalar = std::numeric_limits<array_t>::max();
4152
4153#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4154 // initialize values
4155 Kokkos::parallel_for(
4156 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4157 KOKKOS_LAMBDA (int dummy) {
4158 for(int n = 0; n < weight_array_length; ++n) {
4159 my_current_part_weights(n) = 0;
4160 }
4161 for(int n = 0; n < num_cuts; ++n) {
4162 my_current_left_closest(n) = -max_scalar;
4163 my_current_right_closest(n) = max_scalar;
4164 }
4165 });
4166#endif
4167
4168 mj_part_t concurrent_current_part =
4169 current_work_part + kk;
4170
4171 ReduceWeightsFunctor<policy_t, mj_scalar_t, mj_part_t, mj_lno_t,
4172 typename mj_node_t::device_type, array_t>
4173 teamFunctor(
4174 loop_count,
4175 max_scalar,
4176 concurrent_current_part,
4177 num_cuts,
4178 current_work_part,
4179 current_concurrent_num_parts,
4180 right_left_array_length,
4181 weight_array_length,
4182 coordinate_permutations,
4183 mj_current_dim_coords,
4184 mj_weights,
4185 assigned_part_ids,
4186 local_temp_cut_coords,
4187 part_xadj,
4188 mj_uniform_weights(0), // host and currently only relevant to slot 0
4189 sEpsilon
4190#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4191 ,my_current_part_weights,
4192 my_current_left_closest,
4193 my_current_right_closest
4194#endif
4195 );
4196
4197#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4198 Kokkos::parallel_for(policy_ReduceWeightsFunctor, teamFunctor);
4199#else
4200 Kokkos::parallel_reduce(policy_ReduceWeightsFunctor,
4201 teamFunctor, reduce_array);
4202 Kokkos::fence();
4203#endif
4204
4205#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4206 auto hostArray = Kokkos::create_mirror_view(my_current_part_weights);
4207
4208 for(int i = 0; i < static_cast<int>(total_part_count); ++i) {
4209 hostArray(i) = reduce_array[i];
4210 }
4211
4212 Kokkos::deep_copy(my_current_part_weights, hostArray);
4213
4214 auto hostLeftArray = Kokkos::create_mirror_view(my_current_left_closest);
4215 auto hostRightArray = Kokkos::create_mirror_view(my_current_right_closest);
4216 for(mj_part_t cut = 0; cut < num_cuts; ++cut) {
4217 hostLeftArray(cut) = reduce_array[weight_array_length + (cut+1)*2+0];
4218 hostRightArray(cut) = reduce_array[weight_array_length + (cut+1)*2+1];
4219 }
4220 Kokkos::deep_copy(my_current_left_closest, hostLeftArray);
4221 Kokkos::deep_copy(my_current_right_closest, hostRightArray);
4222#endif
4223
4224 total_part_shift += total_part_count;
4225 concurrent_cut_shifts += num_cuts;
4226 }
4227
4228 auto local_temp_cut_coords = temp_cut_coords;
4229
4230 Kokkos::parallel_for(
4231 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
4232 (0, current_concurrent_num_parts), KOKKOS_LAMBDA(mj_part_t kk) {
4233 mj_part_t num_parts = local_device_num_partitioning_in_current_dim(
4234 current_work_part + kk);
4235 mj_part_t num_cuts = num_parts - 1;
4236 mj_part_t total_part_count = num_parts + num_cuts;
4237
4238 if(local_device_incomplete_cut_count(kk) > 0) {
4239 // get the prefix sum
4240 // This is an inefficiency but not sure if it matters much
4241 size_t offset = 0;
4242 size_t offset_cuts = 0;
4243 for(mj_part_t kk2 = 0; kk2 < kk; ++kk2) {
4244 auto num_parts_kk2 = local_device_num_partitioning_in_current_dim(
4245 current_work_part + kk2);
4246 offset += num_parts_kk2 * 2 - 1;
4247 offset_cuts += num_parts_kk2 - 1;
4248 }
4249
4250 for(mj_part_t i = 1; i < total_part_count; ++i) {
4251 // check for cuts sharing the same position; all cuts sharing a position
4252 // have the same weight == total weight for all cuts sharing the
4253 // position. Don't want to accumulate that total weight more than once.
4254 if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
4255 std::abs(local_temp_cut_coords(offset_cuts + i / 2) -
4256 local_temp_cut_coords(offset_cuts + i /2 - 1))
4257 < local_sEpsilon) {
4258 // i % 2 == 0 when part i represents the cut coordinate.
4259 // if it is a cut, and the next cut also has the same coordinate, then
4260 // don't add the weight up again.
4261 local_thread_part_weights(offset + i)
4262 = local_thread_part_weights(offset + i-2);
4263 continue;
4264 }
4265
4266 // otherwise do the prefix sum.
4267 local_thread_part_weights(offset + i) +=
4268 local_thread_part_weights(offset + i-1);
4269 }
4270 }
4271 });
4272}
4273
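4274/*! \brief Combines the local part weights and the left/right closest-point
4275 * arrays of the concurrently processed parts into the single
4276 * total_part_weight_left_right_closests array, which is later reduced
4277 * across processes.
4278 * \param current_work_part first part to be processed at this step.
4279 * \param current_concurrent_num_parts number of parts processed
4280 * concurrently. */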
4281template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4282 typename mj_part_t, typename mj_node_t>
4283void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4284 mj_combine_rightleft_and_weights(
4285 mj_part_t current_work_part,
4286 mj_part_t current_concurrent_num_parts)
4287{
4288 auto local_thread_part_weights = this->thread_part_weights;
4289 auto local_is_cut_line_determined = this->is_cut_line_determined;
4290 auto local_thread_cut_left_closest_point =
4291 this->thread_cut_left_closest_point;
4292 auto local_thread_cut_right_closest_point =
4293 this->thread_cut_right_closest_point;
4294 auto local_total_part_weight_left_right_closests =
4295 this->total_part_weight_left_right_closests;
4296 auto local_device_num_partitioning_in_current_dim =
4297 device_num_partitioning_in_current_dim;
4298 Kokkos::parallel_for(
4299 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0,1),
4300 KOKKOS_LAMBDA (int dummy) {
4301
4302 size_t tlr_array_shift = 0;
4303 mj_part_t cut_shift = 0;
4304 size_t total_part_array_shift = 0;
4305
4306 // iterate for all concurrent parts to find the left and right closest
4307 // points in the process.
4308 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
4309
4310 mj_part_t num_parts_in_part =
4311 local_device_num_partitioning_in_current_dim(current_work_part + i);
4312 mj_part_t num_cuts_in_part = num_parts_in_part - 1;
4313 size_t num_total_part_in_part =
4314 num_parts_in_part + size_t (num_cuts_in_part);
4315
4316 // iterate for cuts in a single part.
4317 for(int ii = 0; ii < num_cuts_in_part; ++ii) {
4318 mj_part_t next = tlr_array_shift + ii;
4319 mj_part_t cut_index = cut_shift + ii;
4320
4321 if(!local_is_cut_line_determined(cut_index)) {
4322 mj_scalar_t left_closest_in_process =
4323 local_thread_cut_left_closest_point(cut_index);
4324 mj_scalar_t right_closest_in_process =
4325 local_thread_cut_right_closest_point(cut_index);
4326
4327 // store the left and right closest points.
4328 local_total_part_weight_left_right_closests(
4329 num_total_part_in_part + next) = left_closest_in_process;
4330
4331 local_total_part_weight_left_right_closests(
4332 num_total_part_in_part + num_cuts_in_part + next) =
4333 right_closest_in_process;
4334 }
4335 }
4336
4337 for(size_t j = 0; j < num_total_part_in_part; ++j) {
4338 mj_part_t cut_ind = j / 2 + cut_shift;
4339
4340 // need to check j != num_total_part_in_part - 1
4341 // which is the same as j/2 != num_cuts_in_part.
4342 // we cannot check it using cut_ind, because of the concurrent part
4343 // concatenation.
4344 if(j == num_total_part_in_part - 1 ||
4345 !local_is_cut_line_determined(cut_ind)) {
4346 double pwj = local_thread_part_weights(total_part_array_shift + j);
4347 local_total_part_weight_left_right_closests(tlr_array_shift + j) = pwj;
4348 }
4349 }
4350
4351 // set the shift position in the arrays
4352 cut_shift += num_cuts_in_part;
4353 tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
4354 total_part_array_shift += num_total_part_in_part;
4355 }
4356 });
4357}
4358
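4359/*! \brief Picks a new trial position for a cut between its known lower and
4360 * upper bounds by linearly interpolating on weight, quantized to steps of
4361 * 1/20 of the coordinate range so that successive guesses make progress.
4362 * As an illustration (values invented for the example): with bounds [0,1],
4363 * bound weights {10,30} and expected weight 15, the shift is
4364 * (15-10)/(30-10) = 0.25, giving a new cut position of 0.25.
4365 * \param cut_upper_bound upper bound coordinate of the cut.
4366 * \param cut_lower_bound lower bound coordinate of the cut.
4367 * \param cut_upper_weight cumulative weight seen at the upper bound.
4368 * \param cut_lower_weight cumulative weight seen at the lower bound.
4369 * \param expected_weight target cumulative weight for this cut.
4370 * \param new_cut_position output: the new cut coordinate. */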
4371template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4372 typename mj_part_t, typename mj_node_t>
4373KOKKOS_INLINE_FUNCTION
4374void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
4375 mj_node_t>::mj_calculate_new_cut_position(mj_scalar_t cut_upper_bound,
4376 mj_scalar_t cut_lower_bound,
4377 mj_scalar_t cut_upper_weight,
4378 mj_scalar_t cut_lower_weight,
4379 mj_scalar_t expected_weight,
4380 mj_scalar_t &new_cut_position,
4381 mj_scalar_t sEpsilon) {
4382
4383 if(std::abs(cut_upper_bound - cut_lower_bound) < sEpsilon) {
4384 new_cut_position = cut_upper_bound; return; // or lower bound; does not matter.
4385 }
4386
4387 if(std::abs(cut_upper_weight - cut_lower_weight) < sEpsilon) {
4388 new_cut_position = cut_lower_bound; return; // avoid dividing by a ~0 weight range below.
4389 }
4390
4391 mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
4392 mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
4393 mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
4394
4395 mj_scalar_t required_shift = (my_weight_diff / weight_range);
4396 int scale_constant = 20;
4397 int shiftint= int (required_shift * scale_constant);
4398 if(shiftint == 0) shiftint = 1;
4399 required_shift = mj_scalar_t (shiftint) / scale_constant;
4400 new_cut_position = coordinate_range * required_shift + cut_lower_bound;
4401}
4402
4403#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4404
4405template<class policy_t, class scalar_t>
4406struct ArrayReducer {
4407
4408 typedef ArrayReducer reducer;
4409 typedef Zoltan2_MJArrayType<scalar_t> value_type;
4410 value_type * value;
4411 int value_count;
4412
4413 KOKKOS_INLINE_FUNCTION ArrayReducer(
4414 value_type &val,
4415 int mj_value_count) :
4416 value(&val),
4417 value_count(mj_value_count)
4418 {}
4419
4420 KOKKOS_INLINE_FUNCTION
4421 value_type& reference() const {
4422 return *value;
4423 }
4424
4425 KOKKOS_INLINE_FUNCTION
4426 void join(value_type& dst, const value_type& src) const {
4427 for(int n = 0; n < value_count; ++n) {
4428 dst.ptr[n] += src.ptr[n];
4429 }
4430 }
4431
4432 KOKKOS_INLINE_FUNCTION void init (value_type& dst) const {
4433 dst.ptr = value->ptr; // must update ptr
4434 for(int n = 0; n < value_count; ++n) {
4435 dst.ptr[n] = 0;
4436 }
4437 }
4438};
4439
4440#endif
4441
4442template<class policy_t, class scalar_t, class part_t, class index_t,
4443 class device_t, class array_t>
4444struct ReduceArrayFunctor {
4445 typedef typename policy_t::member_type member_type;
4446 typedef Kokkos::View<scalar_t*> scalar_view_t;
4447
4448#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4449 typedef array_t value_type[];
4450#endif
4451
4452 part_t concurrent_current_part;
4453 part_t value_count;
4454 Kokkos::View<index_t*, device_t> permutations;
4455 Kokkos::View<scalar_t *, device_t> coordinates;
4456 Kokkos::View<part_t*, device_t> parts;
4457 Kokkos::View<index_t *, device_t> part_xadj;
4458 Kokkos::View<index_t *, device_t> track_on_cuts;
4459
4460#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4461 Kokkos::View<int *, device_t> local_point_counts;
4462#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4463
4464 ReduceArrayFunctor(
4465 part_t mj_concurrent_current_part,
4466 part_t mj_weight_array_size,
4467 Kokkos::View<index_t*, device_t> & mj_permutations,
4468 Kokkos::View<scalar_t *, device_t> & mj_coordinates,
4469 Kokkos::View<part_t*, device_t> & mj_parts,
4470 Kokkos::View<index_t *, device_t> & mj_part_xadj,
4471 Kokkos::View<index_t *, device_t> & mj_track_on_cuts
4472#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4473 ,Kokkos::View<int *, device_t> & mj_local_point_counts
4474#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4475 ) :
4476 concurrent_current_part(mj_concurrent_current_part),
4477 value_count(mj_weight_array_size),
4478 permutations(mj_permutations),
4479 coordinates(mj_coordinates),
4480 parts(mj_parts),
4481 part_xadj(mj_part_xadj),
4482 track_on_cuts(mj_track_on_cuts)
4483#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4484 ,local_point_counts(mj_local_point_counts)
4485#endif
4486 {
4487 }
4488
4489 size_t team_shmem_size (int team_size) const {
4490#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4491 int result = sizeof(array_t) * (value_count);
4492#else
4493 int result = sizeof(array_t) * (value_count) * team_size;
4494#endif
4495
4496 // pad this to a multiple of 8 or the shared memory will be corrupted
4497 int remainder = result % 8;
4498 if(remainder != 0) {
4499 result += 8 - remainder;
4500 }
4501 return result;
4502 }
4503
4504 KOKKOS_INLINE_FUNCTION
4505#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4506 void operator() (const member_type & teamMember) const {
4507#else
4508 void operator() (const member_type & teamMember, value_type teamSum) const {
4509#endif
4510 index_t all_begin = (concurrent_current_part == 0) ? 0 :
4511 part_xadj(concurrent_current_part - 1);
4512 index_t all_end = part_xadj(concurrent_current_part);
4513
4514 index_t num_working_points = all_end - all_begin;
4515 int num_teams = teamMember.league_size();
4516
4517 index_t stride = num_working_points / num_teams;
4518 if((num_working_points % num_teams) > 0) {
4519 stride += 1; // make sure we have coverage for the final points
4520 }
4521
4522 index_t begin = all_begin + stride * teamMember.league_rank();
4523 index_t end = begin + stride;
4524 if(end > all_end) {
4525 end = all_end; // the last team may have less work than the other teams
4526 }
4527
4528 int track_on_cuts_insert_index = track_on_cuts.size() - 1;
4529
4530 // create the team shared data - each thread gets one of the arrays
4531#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4532 size_t sh_mem_size = sizeof(array_t) * (value_count);
4533#else
4534 size_t sh_mem_size =
4535 sizeof(array_t) * (value_count) * teamMember.team_size();
4536#endif
4537
4538 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
4539 sh_mem_size);
4540
4541#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4542 // init the shared array to 0
4543 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
4544 for(int n = 0; n < value_count; ++n) {
4545 shared_ptr[n] = 0;
4546 }
4547 });
4548 teamMember.team_barrier();
4549
4550 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, begin, end),
4551 [=] (index_t ii) {
4552#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4553 // select the array for this thread
4554 Zoltan2_MJArrayType<array_t> array(&shared_ptr[teamMember.team_rank() *
4555 (value_count)]);
4556
4557 // create reducer which handles the Zoltan2_MJArrayType class
4558 ArrayReducer<policy_t, array_t> arrayReducer(array, value_count);
4559
4560 Kokkos::parallel_reduce(
4561 Kokkos::TeamThreadRange(teamMember, begin, end),
4562 [=] (size_t ii, Zoltan2_MJArrayType<array_t>& threadSum) {
4563#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4564
4565 index_t coordinate_index = permutations(ii);
4566 part_t place = parts(coordinate_index);
4567 part_t part = place / 2;
4568 if(place % 2 == 0) {
4569#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4570 Kokkos::atomic_add(&shared_ptr[part], 1);
4571#else
4572 threadSum.ptr[part] += 1;
4573#endif
4574
4575 parts(coordinate_index) = part;
4576 }
4577 else {
4578 // fill a tracking array so we can process these slower points
4579 // in the next cycle
4580 index_t set_index = Kokkos::atomic_fetch_add(
4581 &track_on_cuts(track_on_cuts_insert_index), 1);
4582 track_on_cuts(set_index) = ii;
4583 }
4584#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4585 });
4586#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4587 }, arrayReducer);
4588#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4589
4590 teamMember.team_barrier();
4591
4592 // collect all the team's results
4593 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
4594 for(int n = 0; n < value_count; ++n) {
4595#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4596 Kokkos::atomic_add(&local_point_counts(n), shared_ptr[n]);
4597#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4598 teamSum[n] += array.ptr[n];
4599#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4600 }
4601 });
4602
4603 teamMember.team_barrier();
4604 }
4605
4606#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4607
4608 KOKKOS_INLINE_FUNCTION
4609 void join(value_type dst, const value_type src) const {
4610 for(int n = 0; n < value_count; ++n) {
4611 dst[n] += src[n];
4612 }
4613 }
4614
4615 KOKKOS_INLINE_FUNCTION void init (value_type dst) const {
4616 for(int n = 0; n < value_count; ++n) {
4617 dst[n] = 0;
4618 }
4619 }
4620#endif
4621};
4622
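4623/*! \brief Splits the coordinates of a part into num_parts new subparts once
4624 * all of its cut lines have been determined. Coordinates strictly between
4625 * two cuts are counted and permuted in parallel; coordinates lying exactly
4626 * on a cut are collected into track_on_cuts, sorted for determinism, and
4627 * then distributed serially according to the remaining weight each cut may
4628 * still place on its left.
4629 * \param num_parts number of parts the current part is divided into.
4630 * \param current_concurrent_work_part index of the part being split.
4631 * \param mj_current_dim_coords coordinates along the current dimension.
4632 * \param current_concurrent_cut_coordinate determined cut coordinates.
4633 * \param used_local_cut_line_weight_to_left weight to be placed on the left
4634 * of each cut when points sit exactly on the cut line.
4635 * \param out_part_xadj output: coordinate end offsets for the new parts,
4636 * in the same format as part_xadj. */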
4638template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4639 typename mj_part_t, typename mj_node_t>
4640void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4641mj_create_new_partitions(
4642 mj_part_t num_parts,
4643 mj_part_t current_concurrent_work_part,
4644 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
4645 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
4646 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
4647 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj)
4648{
4649 // Get locals for cuda
4650 auto local_thread_part_weight_work = this->thread_part_weight_work;
4651 auto local_point_counts = this->thread_point_counts;
4652 auto local_distribute_points_on_cut_lines =
4653 this->distribute_points_on_cut_lines;
4654 auto local_thread_cut_line_weight_to_put_left =
4655 this->thread_cut_line_weight_to_put_left;
4656 auto local_sEpsilon = this->sEpsilon;
4657 auto local_coordinate_permutations = this->coordinate_permutations;
4658 auto local_mj_weights = this->mj_weights;
4659 auto local_assigned_part_ids = this->assigned_part_ids;
4660 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
4661
4662 mj_part_t num_cuts = num_parts - 1;
4663
4664 Kokkos::parallel_for(
4665 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4666 KOKKOS_LAMBDA(int dummy) {
4667
4668 if(local_distribute_points_on_cut_lines) {
4669 for(int i = 0; i < num_cuts; ++i) {
4670 mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
4671 if(left_weight > local_sEpsilon) {
4672 // the weight of thread ii on cut.
4673 mj_scalar_t thread_ii_weight_on_cut =
4674 local_thread_part_weight_work(i * 2 + 1) -
4675 local_thread_part_weight_work(i * 2);
4676
4677 if(thread_ii_weight_on_cut < left_weight) {
4678 // if left weight is bigger than threads weight on cut.
4679 local_thread_cut_line_weight_to_put_left(i) =
4680 thread_ii_weight_on_cut;
4681 }
4682 else {
4683 // if thread's weight is bigger than space, then put only a portion.
4684 local_thread_cut_line_weight_to_put_left(i) = left_weight;
4685 }
4686 left_weight -= thread_ii_weight_on_cut;
4687 }
4688 else {
4689 local_thread_cut_line_weight_to_put_left(i) = 0;
4690 }
4691 }
4692
4693 // this is a special case. If cutlines share the same coordinate,
4694 // their weights are equal. We need to adjust the ratio for that.
4695 for(mj_part_t i = num_cuts - 1; i > 0 ; --i) {
4696 if(std::abs(current_concurrent_cut_coordinate(i) -
4697 current_concurrent_cut_coordinate(i -1)) < local_sEpsilon) {
4698 local_thread_cut_line_weight_to_put_left(i) -=
4699 local_thread_cut_line_weight_to_put_left(i - 1);
4700 }
4701 local_thread_cut_line_weight_to_put_left(i) =
4702 static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
4703 least_signifiance) * significance_mul) /
4704 static_cast<mj_scalar_t>(significance_mul);
4705 }
4706 }
4707
4708 for(mj_part_t i = 0; i < num_parts; ++i) {
4709 local_point_counts(i) = 0;
4710 }
4711 });
4712
4713 mj_lno_t coordinate_begin_index =
4714 current_concurrent_work_part == 0 ? 0 :
4715 host_part_xadj(current_concurrent_work_part - 1);
4716 mj_lno_t coordinate_end_index =
4717 host_part_xadj(current_concurrent_work_part);
4718
4719 mj_lno_t total_on_cut;
4720 Kokkos::parallel_reduce("Get total_on_cut",
4721 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (
4722 coordinate_begin_index, coordinate_end_index),
4723 KOKKOS_LAMBDA(int ii, mj_lno_t & val) {
4724 mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4725 mj_part_t coordinate_assigned_place =
4726 local_assigned_part_ids(coordinate_index);
4727 if(coordinate_assigned_place % 2 == 1) {
4728 val += 1;
4729 }
4730 }, total_on_cut);
4731
4732 Kokkos::View<mj_lno_t *, device_t> track_on_cuts;
4733 if(total_on_cut > 0) {
4734 track_on_cuts = Kokkos::View<mj_lno_t *, device_t>(
4735 "track_on_cuts", // would do WithoutInitialization but need last init to 0
4736 total_on_cut + 1); // extra index to use for tracking
4737 }
4738
4739 // here we need to parallel reduce an array to count coords in each part;
4740 // atomically adding, especially for a low part count, would kill us.
4741 // in the original setup we kept arrays allocated for each thread, but for
4742 // the cuda version we'd like to avoid allocating N arrays for the number
4743 // of teams/threads, which would be complicated based on running openmp or
4744 // cuda.
4745 typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;
4746
4747 // if not set use 60 - somewhat arbitrary based on initial performance tests
4748 int use_num_teams = mj_num_teams ? mj_num_teams : 60;
4749
4750 auto policy_ReduceFunctor = policy_t(use_num_teams, Kokkos::AUTO);
4751 typedef int array_t;
4752
4753 // just need parts - on the cuts will be handled in a separate serial
4754 // call after this.
4755#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4756 Kokkos::View<array_t*, Kokkos::HostSpace> reduce_array("reduce_array", num_parts);
4757#endif
4758
4759 ReduceArrayFunctor<policy_t, mj_scalar_t, mj_part_t, mj_lno_t,
4760 typename mj_node_t::device_type, array_t>teamFunctor(
4761 current_concurrent_work_part,
4762 num_parts,
4763 coordinate_permutations,
4764 mj_current_dim_coords,
4765 assigned_part_ids,
4766 part_xadj,
4767 track_on_cuts
4768#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4769 ,local_point_counts
4770#endif
4771 );
4772
4773#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4774 Kokkos::parallel_for(policy_ReduceFunctor, teamFunctor);
4775#else
4776 Kokkos::parallel_reduce(policy_ReduceFunctor, teamFunctor, reduce_array);
4777 Kokkos::fence();
4778#endif
4779
4780#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4781 for(mj_part_t part = 0; part < num_parts; ++part) {
4782 local_point_counts(part) = reduce_array[part];
4783 }
4784#endif
4785
4786 // the last member is a utility slot used for atomically inserting the values.
4787 // Sorting here avoids potential indeterminacy in the partitioning results.
4788 if(track_on_cuts.size() > 0) { // size 0 unused, or size is minimum of 2
4789 auto track_on_cuts_sort = Kokkos::subview(track_on_cuts,
4790 std::pair<mj_lno_t, mj_lno_t>(0, track_on_cuts.size() - 1)); // do not sort last element
4791 Kokkos::sort(track_on_cuts_sort);
4792 }
4793
4794 bool uniform_weights0 = this->mj_uniform_weights(0);
4795 Kokkos::parallel_for(
4796 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4797 KOKKOS_LAMBDA (int dummy) {
4798
4799 for(int j = 0; j < total_on_cut; ++j) {
4800 int ii = track_on_cuts(j);
4801 mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4802 mj_scalar_t coordinate_weight = uniform_weights0 ? 1 :
4803 local_mj_weights(coordinate_index,0);
4804 mj_part_t coordinate_assigned_place =
4805 local_assigned_part_ids(coordinate_index);
4806 mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
4807 // if it is on the cut.
4808 if(local_distribute_points_on_cut_lines &&
4809 local_thread_cut_line_weight_to_put_left(
4810 coordinate_assigned_part) > local_sEpsilon) {
4811 // if rectilinear partitioning is allowed,
4812 // and the thread still has space to put weight on the left of the cut,
4813 // then the thread puts the vertex to the left.
4814 local_thread_cut_line_weight_to_put_left(
4815 coordinate_assigned_part) -= coordinate_weight;
4816 // if putting the vertex to left increased the weight more
4817 // than expected, and if the next cut is on the same coordinate,
4818 // then we need to adjust how much weight next cut puts to its left as
4819 // well, in order to take care of the imbalance.
4820 if(local_thread_cut_line_weight_to_put_left(
4821 coordinate_assigned_part) < 0 && coordinate_assigned_part <
4822 num_cuts - 1 &&
4823 std::abs(current_concurrent_cut_coordinate(
4824 coordinate_assigned_part+1) -
4825 current_concurrent_cut_coordinate(
4826 coordinate_assigned_part)) < local_sEpsilon)
4827 {
4828 local_thread_cut_line_weight_to_put_left(
4829 coordinate_assigned_part + 1) +=
4830 local_thread_cut_line_weight_to_put_left(
4831 coordinate_assigned_part);
4832 }
4833 ++local_point_counts(coordinate_assigned_part);
4834 local_assigned_part_ids(coordinate_index) =
4835 coordinate_assigned_part;
4836 }
4837 else {
4838 // if there is no more space on the left, put the coordinate to the
4839 // right of the cut.
4840 ++coordinate_assigned_part;
4841 // this while loop is necessary when a line is partitioned into more
4842 // than 2 parts.
4843 while(local_distribute_points_on_cut_lines &&
4844 coordinate_assigned_part < num_cuts)
4845 {
4846 // traverse all the cut lines sharing the same position
4847 if(std::abs(current_concurrent_cut_coordinate(
4848 coordinate_assigned_part) -
4849 current_concurrent_cut_coordinate(
4850 coordinate_assigned_part - 1)) < local_sEpsilon)
4851 {
4852 // if line has enough space on left, put it there.
4853 if(local_thread_cut_line_weight_to_put_left(
4854 coordinate_assigned_part) > local_sEpsilon &&
4855 local_thread_cut_line_weight_to_put_left(
4856 coordinate_assigned_part) >=
4857 std::abs(local_thread_cut_line_weight_to_put_left(
4858 coordinate_assigned_part) - coordinate_weight))
4859 {
4860 local_thread_cut_line_weight_to_put_left(
4861 coordinate_assigned_part) -= coordinate_weight;
4862 // Again if it put too much on left of the cut,
4863 // update how much the next cut sharing the same coordinate will
4864 // put to its left.
4865 if(local_thread_cut_line_weight_to_put_left(
4866 coordinate_assigned_part) < 0 &&
4867 coordinate_assigned_part < num_cuts - 1 &&
4868 std::abs(current_concurrent_cut_coordinate(
4869 coordinate_assigned_part+1) -
4870 current_concurrent_cut_coordinate(
4871 coordinate_assigned_part)) < local_sEpsilon)
4872 {
4873 local_thread_cut_line_weight_to_put_left(
4874 coordinate_assigned_part + 1) +=
4875 local_thread_cut_line_weight_to_put_left(
4876 coordinate_assigned_part);
4877 }
4878 break;
4879 }
4880 }
4881 else {
4882 break;
4883 }
4884 ++coordinate_assigned_part;
4885 }
4886 local_point_counts(coordinate_assigned_part) += 1;
4887 local_assigned_part_ids(coordinate_index) = coordinate_assigned_part;
4888 }
4889 }
4890
4891 for(int j = 0; j < num_parts; ++j) {
4892 out_part_xadj(j) = local_point_counts(j);
4893 local_point_counts(j) = 0;
4894
4895 if(j != 0) {
4896 out_part_xadj(j) += out_part_xadj(j - 1);
4897 local_point_counts(j) += out_part_xadj(j - 1);
4898 }
4899 }
4900 });
4901
4902 // here we will determine insert indices for N teams
4903 // then all the teams can fill
4904
4905#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4906
4907 // This is the fastest so far - just straight atomic writes for CUDA.
4908 // However the write order is not deterministic since it is atomic;
4909 // the final partitioning result will still be deterministic.
4910 Kokkos::parallel_for(
4911 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
4912 coordinate_begin_index, coordinate_end_index),
4913 KOKKOS_LAMBDA (mj_lno_t ii) {
4914 mj_lno_t i = local_coordinate_permutations(ii);
4915 mj_part_t p = local_assigned_part_ids(i);
4916 mj_lno_t idx = Kokkos::atomic_fetch_add(&local_point_counts(p), 1);
4917 local_new_coordinate_permutations(coordinate_begin_index + idx) = i;
4918 });
4919
4920#else
4921
4922#ifdef KOKKOS_ENABLE_OPENMP
4923 // will return and fix this - revert back to 1 for clear auto testing
4924 const int num_threads = 1; // Kokkos::OpenMP::impl_max_hardware_threads();
4925#else
4926 const int num_threads = 1;
4927#endif
4928
4929 const int num_teams = 1; // cuda is handled above using a different format
4930
4931 // allow init - we want all 0's first
4932 Kokkos::View<mj_lno_t*, device_t>
4933 point_counter("insert indices", num_teams * num_threads * num_parts);
4934
4935 // count how many coords per thread
4936 // then we will fill each independently
4937 Kokkos::TeamPolicy<typename mj_node_t::execution_space>
4938 block_policy(num_teams, num_threads);
4939 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
4940 member_type member_type;
4941 mj_lno_t range = coordinate_end_index - coordinate_begin_index;
4942 mj_lno_t block_size = range / num_teams + 1;
4943 Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(member_type team_member) {
4944 int team = team_member.league_rank();
4945 int team_offset = team * num_threads * num_parts;
4946 mj_lno_t begin = coordinate_begin_index + team * block_size;
4947 mj_lno_t end = begin + block_size;
4948 if(end > coordinate_end_index) {
4949 end = coordinate_end_index;
4950 }
4951
4952 Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
4953 [=] (mj_lno_t ii) {
4954 int thread = team_member.team_rank();
4955 mj_lno_t i = local_coordinate_permutations(ii);
4956 mj_part_t p = local_assigned_part_ids(i);
4957 int index = team_offset + thread * num_parts + p;
4958 ++point_counter(index);
4959 });
4960 });
4961
4962 // now prefix sum
4963 // we currently have the counts in the slots
4964 // we want the first counter for each part to be 0
4965 // then the rest should be the sum of all the priors
4966 Kokkos::parallel_for(
4967 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4968 KOKKOS_LAMBDA (int dummy) {
4969 int num_sets = point_counter.size() / num_parts;
4970 for(int set = num_sets - 1; set >= 1; set -=1) {
4971 int base = set * num_parts;
4972 for(int part = 0; part < num_parts; ++part) {
4973 point_counter(base + part) = point_counter(base + part - num_parts);
4974 }
4975 }
4976
4977 for(int part = 0; part < num_parts; ++part) {
4978 point_counter(part) = 0;
4979 }
4980
4981 for(int set = 1; set < num_sets; ++set) {
4982 int base = set * num_parts;
4983 for(int part = 0; part < num_parts; ++part) {
4984 point_counter(base + part) += point_counter(base + part - num_parts);
4985 }
4986 }
4987 });
4988
4989 // now permute
4990 Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(member_type team_member) {
4991 int team = team_member.league_rank();
4992 int team_offset = team * num_threads * num_parts;
4993 mj_lno_t begin = coordinate_begin_index + team * block_size;
4994 mj_lno_t end = begin + block_size;
4995 if(end > coordinate_end_index) {
4996 end = coordinate_end_index;
4997 }
4998 Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
4999 [=] (mj_lno_t ii) {
5000 int thread = team_member.team_rank();
5001 mj_lno_t i = local_coordinate_permutations(ii);
5002 mj_part_t p = local_assigned_part_ids(i);
5003 int index = team_offset + thread * num_parts + p;
5004 int set_counter = (point_counter(index)++) + local_point_counts(p);
5005 local_new_coordinate_permutations(coordinate_begin_index + set_counter) = i;
5006 });
5007 });
5008#endif
5009}
5010
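5011/*! \brief Determines the new coordinates of the cut lines for one of the
5012 * concurrently processed parts after a global weight reduction. Cuts whose
5013 * left/right imbalance is within the tolerance are frozen; cuts that can be
5014 * satisfied by splitting the weight sitting exactly on the line are marked
5015 * for rectilinear distribution; the remaining cuts get tightened
5016 * lower/upper bounds and a new trial position from
5017 * mj_calculate_new_cut_position.
5018 * \param current_concurrent_num_parts number of parts processed concurrently.
5019 * \param kk index of this part within the concurrent group.
5020 * \param num_cuts number of cut lines in this part.
5021 * \param used_imbalance_tolerance allowed imbalance on each side of a cut.
5022 * \param current_global_part_weights global cumulative part/cut weights.
5023 * \param current_local_part_weights local cumulative part/cut weights.
5024 * \param current_part_target_weights target cumulative weight for each cut.
5025 * \param current_cut_line_determined flags marking already finalized cuts.
5026 * \param current_cut_coordinates current trial cut coordinates.
5027 * \param current_cut_upper_bounds upper bound coordinate of each cut.
5028 * \param current_cut_lower_bounds lower bound coordinate of each cut.
5029 * \param new_current_cut_coordinates output: updated cut coordinates.
5030 * \param view_rectilinear_cut_count output: number of cuts that require
5031 * rectilinear weight distribution. */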
5054template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5055 typename mj_part_t, typename mj_node_t>
5056void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
5057 mj_node_t>::mj_get_new_cut_coordinates(
5058 mj_part_t current_concurrent_num_parts,
5059 mj_part_t kk,
5060 const mj_part_t &num_cuts,
5061 const double &used_imbalance_tolerance,
5062 Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
5063 Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
5064 Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
5065 Kokkos::View<bool *, device_t> & current_cut_line_determined,
5066 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
5067 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
5068 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
5069 Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
5070 Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
5071 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
5072 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
5073 Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
5074 Kokkos::View<mj_scalar_t *, device_t> &
5075 current_part_cut_line_weight_to_put_left,
5076 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count)
5077{
5078 Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
5079
5080 auto local_device_incomplete_cut_count = device_incomplete_cut_count;
5081 auto local_sEpsilon = sEpsilon;
5082 auto local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
5083 auto local_global_rectilinear_cut_weight = global_rectilinear_cut_weight;
5084 auto local_process_rectilinear_cut_weight = process_rectilinear_cut_weight;
5085 auto local_global_min_max_coord_total_weight =
5086 global_min_max_coord_total_weight;
5087
5088 const auto _sEpsilon = this->sEpsilon;
5089 // Note for a 22 part system I tried removing the outer loop
5090 // and doing each sub loop as a simple parallel_for over num_cuts.
5091 // But that was about twice as slow (10ms) as the current form (5ms)
5092 // so I think the overhead of launching the new global parallel kernels
5093 // is costly. This form is just running one team so effectively using
5094 // a single warp to process the cuts. I expect with a lot of parts this
5095 // might need changing.
5096 Kokkos::TeamPolicy<typename mj_node_t::execution_space>
5097 policy_one_team(1, Kokkos::AUTO());
5098 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
5099 member_type member_type;
5100 Kokkos::parallel_for(policy_one_team, KOKKOS_LAMBDA(member_type team_member) {
5101
5102 mj_scalar_t min_coordinate =
5103 local_global_min_max_coord_total_weight(kk);
5104 mj_scalar_t max_coordinate =
5105 local_global_min_max_coord_total_weight(
5106 kk + current_concurrent_num_parts);
5107 mj_scalar_t global_total_weight =
5108 local_global_min_max_coord_total_weight(
5109 kk + current_concurrent_num_parts * 2);
5110
5111 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, num_cuts),
5112 [=] (mj_part_t i) {
5113 // if left and right closest points are not set yet,
5114 // set it to the cut itself.
5115 if(min_coordinate -
5116 current_global_left_closest_points(i) > local_sEpsilon) {
5117 current_global_left_closest_points(i) =
5118 current_cut_coordinates(i);
5119 }
5120 if(current_global_right_closest_points(i) -
5121 max_coordinate > local_sEpsilon) {
5122 current_global_right_closest_points(i) =
5123 current_cut_coordinates(i);
5124 }
5125 });
5126 team_member.team_barrier(); // for end of Kokkos::TeamThreadRange
5127
5128 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, num_cuts),
5129 [=] (mj_part_t i) {
5130 using algMJ_t = AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
5131 mj_node_t>;
5132 // seen weight in the part
5133 mj_scalar_t seen_weight_in_part = 0;
5134 // expected weight for part.
5135 mj_scalar_t expected_weight_in_part = 0;
5136 // imbalance for the left and right side of the cut.
5137 double imbalance_on_left = 0, imbalance_on_right = 0;
5138 if(local_distribute_points_on_cut_lines) {
5139 // init the weight on the cut.
5140 local_global_rectilinear_cut_weight(i) = 0;
5141 local_process_rectilinear_cut_weight(i) = 0;
5142 }
5143 bool bContinue = false;
5144 // if already determined at previous iterations,
5145 // then just write the coordinate to new array, and proceed.
5146 if(current_cut_line_determined(i)) {
5147 new_current_cut_coordinates(i) =
5148 current_cut_coordinates(i);
5149 bContinue = true;
5150 }
5151 if(!bContinue) {
5152 //current weight of the part at the left of the cut line.
5153 seen_weight_in_part = current_global_part_weights(i * 2);
5154
5155 //expected ratio
5156 expected_weight_in_part = current_part_target_weights(i);
5157
5158 //leftImbalance = imbalanceOf(seenW, globalTotalWeight, expected);
5159 imbalance_on_left = algMJ_t::calculate_imbalance(seen_weight_in_part,
5160 expected_weight_in_part);
5161 // rightImbalance = imbalanceOf(globalTotalWeight - seenW,
5162 // globalTotalWeight, 1 - expected);
5163 imbalance_on_right = algMJ_t::calculate_imbalance(global_total_weight -
5164 seen_weight_in_part, global_total_weight - expected_weight_in_part);
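 // For illustration, assuming calculate_imbalance(w, expected) returns
 // w / expected - 1: seen_weight_in_part = 40 with
 // expected_weight_in_part = 50 gives imbalance_on_left = -0.2, i.e.
 // the part is underweight and the cut must move right.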
5165 bool is_left_imbalance_valid = std::abs(imbalance_on_left) -
5166 used_imbalance_tolerance < local_sEpsilon;
5167 bool is_right_imbalance_valid = std::abs(imbalance_on_right) -
5168 used_imbalance_tolerance < local_sEpsilon;
5169 //if the cut line reaches the desired imbalance.
5170 if(is_left_imbalance_valid && is_right_imbalance_valid) {
5171 current_cut_line_determined(i) = true;
5172 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5173 new_current_cut_coordinates(i) = current_cut_coordinates(i);
5174 }
5175 else if(imbalance_on_left < 0) {
5176 //if left imbalance < 0 then we need to move the cut to the right.
5177 if(local_distribute_points_on_cut_lines) {
5178 // if it is okay to distribute the coordinates on
5179 // the cut line to both left and right,
5180 // then check if we can reach the target weight by including
5181 // the coordinates on the cut in the part.
5182 if(current_global_part_weights(i * 2 + 1) ==
5183 expected_weight_in_part) {
5184 // if it is, we are done.
5185 current_cut_line_determined(i) = true;
5186 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5187
5188 //then assign everything on the cut to the left of the cut.
5189 new_current_cut_coordinates(i) =
5190 current_cut_coordinates(i);
5191 //for this cut, all the weight on the cut will be put to the left.
5192 current_part_cut_line_weight_to_put_left(i) =
5193 current_local_part_weights(i * 2 + 1) -
5194 current_local_part_weights(i * 2);
5195 bContinue = true;
5196 }
5197 else if(current_global_part_weights(i * 2 + 1) >
5198 expected_weight_in_part) {
5199 // if the weight is larger than the expected weight,
5200 // then we need to distribute some points to left, some to right.
5201 current_cut_line_determined(i) = true;
5202 Kokkos::atomic_add(&view_rectilinear_cut_count(0), 1);
5203
5204 // increase the num cuts to be determined with rectilinear
5205 // partitioning.
5206 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5207 new_current_cut_coordinates(i) =
5208 current_cut_coordinates(i);
5209 local_process_rectilinear_cut_weight[i] =
5210 current_local_part_weights(i * 2 + 1) -
5211 current_local_part_weights(i * 2);
5212 bContinue = true;
5213 }
5214 }
5215
5216 if(!bContinue) {
5217
5218 // we need to move further right, so set the lower bound to the
5219 // current line, and shift it to the closest point on the right.
5220 current_cut_lower_bounds(i) =
5221 current_global_right_closest_points(i);
5222
5223 //set the lower bound weight to the weight we have seen.
5224 current_cut_lower_bound_weights(i) = seen_weight_in_part;
5225
5226 // compare the upper bound with what was found in the
5227 // last iteration.
5228 // we try to establish stricter bounds for the cut here.
5229 for(mj_part_t ii = i + 1; ii < num_cuts ; ++ii) {
5230 mj_scalar_t p_weight = current_global_part_weights(ii * 2);
5231 mj_scalar_t line_weight =
5232 current_global_part_weights(ii * 2 + 1);
5233 if(p_weight >= expected_weight_in_part) {
5234 // if a cut on the right has the expected weight, then we found
5235 // our cut position. Set the upper and lower coordinates to this
5236 // new cut coordinate, but we need one more iteration to
5237 // finalize the cut position, as we need to update the part ids.
5238 if(p_weight == expected_weight_in_part) {
5239 current_cut_upper_bounds(i) =
5240 current_cut_coordinates(ii);
5241 current_cut_upper_weights(i) = p_weight;
5242 current_cut_lower_bounds(i) =
5243 current_cut_coordinates(ii);
5244 current_cut_lower_bound_weights(i) = p_weight;
5245 } else if(p_weight < current_cut_upper_weights(i)) {
5246 // if a part weight is larger than my expected weight,
5247 // but lower than my upper bound weight, update the upper bound.
5248 current_cut_upper_bounds(i) =
5249 current_global_left_closest_points(ii);
5250 current_cut_upper_weights(i) = p_weight;
5251 }
5252 break;
5253 }
5254 // if we reach here, then p_weight < expected_weight_in_part;
5255 // compare the weight against the line weight.
5256 if(line_weight >= expected_weight_in_part) {
5257 // if the line weight is larger than the expected weight, then we
5258 // need to reach the balance by distributing coordinates on
5259 // this line.
5260 current_cut_upper_bounds(i) =
5261 current_cut_coordinates(ii);
5262 current_cut_upper_weights(i) = line_weight;
5263 current_cut_lower_bounds(i) =
5264 current_cut_coordinates(ii);
5265 current_cut_lower_bound_weights(i) = p_weight;
5266 break;
5267 }
5268 // if a stricter lower bound is found,
5269 // update the lower bound.
5270 if(p_weight <= expected_weight_in_part && p_weight >=
5271 current_cut_lower_bound_weights(i)) {
5272 current_cut_lower_bounds(i) =
5273 current_global_right_closest_points(ii);
5274 current_cut_lower_bound_weights(i) = p_weight;
5275 }
5276 }
5277
5278 mj_scalar_t new_cut_position = 0;
5279 algMJ_t::mj_calculate_new_cut_position(
5280 current_cut_upper_bounds(i),
5281 current_cut_lower_bounds(i),
5282 current_cut_upper_weights(i),
5283 current_cut_lower_bound_weights(i),
5284 expected_weight_in_part, new_cut_position,
5285 _sEpsilon);
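 // For illustration (a sketch of the expected behavior): the new
 // position is roughly a linear interpolation between the bounds,
 //   new_cut_position = lower + (expected - lower_weight) *
 //     (upper - lower) / (upper_weight - lower_weight),
 // with bounds that coincide within _sEpsilon collapsing to a bound.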
5286
5287 // if the cut line does not move significantly,
5288 // then finalize the search.
5289 if(std::abs(current_cut_coordinates(i) -
5290 new_cut_position) < local_sEpsilon) {
5291 current_cut_line_determined(i) = true;
5292 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5293
5294 //set the cut coordinate and proceed.
5295 new_current_cut_coordinates(i) =
5296 current_cut_coordinates(i);
5297 } else {
5298 new_current_cut_coordinates(i) = new_cut_position;
5299 }
5300 } // bContinue
5301 } else {
5302 // need to move the cut line to the left.
5303 // set the upper bound to the current line.
5304 current_cut_upper_bounds(i) =
5305 current_global_left_closest_points(i);
5306 current_cut_upper_weights(i) =
5307 seen_weight_in_part;
5308 // compare the current cut line weights with
5309 // previous upper and lower bounds.
5310 for(int ii = i - 1; ii >= 0; --ii) {
5311 mj_scalar_t p_weight =
5312 current_global_part_weights(ii * 2);
5313 mj_scalar_t line_weight =
5314 current_global_part_weights(ii * 2 + 1);
5315 if(p_weight <= expected_weight_in_part) {
5316 if(p_weight == expected_weight_in_part) {
5317 // if the weight of the part is my expected weight,
5318 // then we have found the solution.
5319 current_cut_upper_bounds(i) =
5320 current_cut_coordinates(ii);
5321 current_cut_upper_weights(i) = p_weight;
5322 current_cut_lower_bounds(i) =
5323 current_cut_coordinates(ii);
5324 current_cut_lower_bound_weights(i) = p_weight;
5325 }
5326 else if(p_weight > current_cut_lower_bound_weights(i)) {
5327 // if the found weight is bigger than the lower bound,
5328 // then update the lower bound.
5329 current_cut_lower_bounds(i) =
5330 current_global_right_closest_points(ii);
5331 current_cut_lower_bound_weights(i) = p_weight;
5332
5333 // at the same time, if weight of line is bigger than the
5334 // expected weight, then update the upper bound as well.
5335 // in this case the balance will be obtained by distributing
5336 // weights on this cut position.
5337 if(line_weight > expected_weight_in_part) {
5338 current_cut_upper_bounds(i) =
5339 current_global_right_closest_points(ii);
5340 current_cut_upper_weights(i) = line_weight;
5341 }
5342 }
5343 break;
5344 }
5345 // if the weight of the cut on the left is still bigger than
5346 // my expected weight, and the weight is either smaller than the
5347 // current upper weight, or equal to the current upper weight but
5348 // located to the left of the current upper bound, then update
5349 // the upper bound.
5350 if(p_weight >= expected_weight_in_part &&
5351 (p_weight < current_cut_upper_weights(i) ||
5352 (p_weight == current_cut_upper_weights(i) &&
5353 current_cut_upper_bounds(i) >
5354 current_global_left_closest_points(ii)))) {
5355 current_cut_upper_bounds(i) =
5356 current_global_left_closest_points(ii);
5357 current_cut_upper_weights(i) = p_weight;
5358 }
5359 }
5360 mj_scalar_t new_cut_position = 0;
5361 algMJ_t::mj_calculate_new_cut_position(
5362 current_cut_upper_bounds(i),
5363 current_cut_lower_bounds(i),
5364 current_cut_upper_weights(i),
5365 current_cut_lower_bound_weights(i),
5366 expected_weight_in_part,
5367 new_cut_position,
5368 _sEpsilon);
5369
5370 // if the cut line does not move significantly.
5371 if(std::abs(current_cut_coordinates(i) -
5372 new_cut_position) < local_sEpsilon) {
5373 current_cut_line_determined(i) = true;
5374 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5375 //set the cut coordinate and proceed.
5376 new_current_cut_coordinates(i) =
5377 current_cut_coordinates(i);
5378 } else {
5379 new_current_cut_coordinates(i) =
5380 new_cut_position;
5381 }
5382 }
5383 } // bContinue
5384 });
5385
5386 team_member.team_barrier(); // for end of Kokkos::TeamThreadRange
5387 });
5388
5389 // read view_rectilinear_cut_count back to the host.
5390 mj_part_t rectilinear_cut_count;
5391 Kokkos::parallel_reduce("Read view_rectilinear_cut_count",
5392 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
5393 KOKKOS_LAMBDA(int dummy, int & set_single) {
5394 set_single = view_rectilinear_cut_count(0);
5395 }, rectilinear_cut_count);
5396
5397 if(rectilinear_cut_count > 0) {
5398 auto host_local_process_rectilinear_cut_weight =
5399 Kokkos::create_mirror_view(Kokkos::HostSpace(),
5400 local_process_rectilinear_cut_weight);
5401 auto host_local_global_rectilinear_cut_weight =
5402 Kokkos::create_mirror_view(Kokkos::HostSpace(),
5403 local_global_rectilinear_cut_weight);
5404 Kokkos::deep_copy(host_local_process_rectilinear_cut_weight,
5405 local_process_rectilinear_cut_weight);
5406 Kokkos::deep_copy(host_local_global_rectilinear_cut_weight,
5407 local_global_rectilinear_cut_weight);
5408 Teuchos::scan<int,mj_scalar_t>(
5409 *comm, Teuchos::REDUCE_SUM,
5410 num_cuts,
5411 host_local_process_rectilinear_cut_weight.data(),
5412 host_local_global_rectilinear_cut_weight.data());
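 // For illustration: Teuchos::scan with REDUCE_SUM computes an
 // inclusive prefix sum across ranks, e.g. per-rank on-cut weights
 // 2, 3, 1 on ranks 0, 1, 2 scan to 2, 5, 6 respectively.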
5413 Kokkos::deep_copy(local_process_rectilinear_cut_weight,
5414 host_local_process_rectilinear_cut_weight);
5415 Kokkos::deep_copy(local_global_rectilinear_cut_weight,
5416 host_local_global_rectilinear_cut_weight);
5417
5418 Kokkos::parallel_for("finish up mj_get_new_cut_coordinates",
5419 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
5420 KOKKOS_LAMBDA(int dummy) {
5421 for(mj_part_t i = 0; i < num_cuts; ++i) {
5422 // if there is cut line weight to be distributed.
5423 if(local_global_rectilinear_cut_weight(i) > 0) {
5424 // expected weight to go to the left of the cut.
5425 mj_scalar_t expected_part_weight = current_part_target_weights(i);
5426 // the weight that should be put to left of the cut.
5427 mj_scalar_t necessary_weight_on_line_for_left =
5428 expected_part_weight - current_global_part_weights(i * 2);
5429
5430 // the weight of the cut in the process
5431 mj_scalar_t my_weight_on_line =
5432 local_process_rectilinear_cut_weight(i);
5433
5434 // the sum of the cut weights up to this process,
5435 // including the weight of this process.
5436 mj_scalar_t weight_on_line_upto_process_inclusive =
5437 local_global_rectilinear_cut_weight(i);
5438 // the space remaining on the left side of the cut after all
5439 // processes up to and including this process
5440 // put their on-cut weights to the left.
5441 mj_scalar_t space_to_put_left =
5442 necessary_weight_on_line_for_left -
5443 weight_on_line_upto_process_inclusive;
5444 // add my weight to this space to find out how much space
5445 // is left to me.
5446 mj_scalar_t space_left_to_me =
5447 space_to_put_left + my_weight_on_line;
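 // Worked example: if necessary_weight_on_line_for_left = 10,
 // my_weight_on_line = 5, and weight_on_line_upto_process_inclusive
 // = 12, then space_to_put_left = 10 - 12 = -2 and
 // space_left_to_me = -2 + 5 = 3, so this process puts 3 units of
 // its on-cut weight to the left (the final branch below).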
5448
5449 /*
5450 cout << "expected_part_weight:" << expected_part_weight
5451 << " necessary_weight_on_line_for_left:"
5452 << necessary_weight_on_line_for_left
5453 << " my_weight_on_line" << my_weight_on_line
5454 << " weight_on_line_upto_process_inclusive:"
5455 << weight_on_line_upto_process_inclusive
5456 << " space_to_put_left:" << space_to_put_left
5457 << " space_left_to_me" << space_left_to_me << endl;
5458 */
5459
5460 if(space_left_to_me < 0) {
5461 // space_left_to_me is negative, and I don't need to put
5462 // anything to the left.
5463 current_part_cut_line_weight_to_put_left(i) = 0;
5464 }
5465 else if(space_left_to_me >= my_weight_on_line) {
5466 // the space left to me is bigger than this processor's
5467 // weight on the cut,
5468 // so put everything to the left.
5469 current_part_cut_line_weight_to_put_left(i) =
5470 my_weight_on_line;
5471 // cout << "setting current_part_cut_line_weight_to_put_left
5472 // to my_weight_on_line:" << my_weight_on_line << endl;
5473 }
5474 else {
5475 // put only as much weight as the space allows.
5476 current_part_cut_line_weight_to_put_left(i) =
5477 space_left_to_me;
5478 // cout << "setting current_part_cut_line_weight_to_put_left
5479 // to space_left_to_me:" << space_left_to_me << endl;
5480 }
5481 }
5482 }
5483 view_rectilinear_cut_count(0) = 0;
5484 });
5485 }
5486
5487 Kokkos::deep_copy(this->incomplete_cut_count, device_incomplete_cut_count);
5488}
5489
5499template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5500 typename mj_part_t, typename mj_node_t>
5501void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5502 get_processor_num_points_in_parts(
5503 mj_part_t num_procs,
5504 mj_part_t num_parts,
5505 mj_gno_t *&num_points_in_all_processor_parts)
5506{
5507 // allocation: num_parts entries per processor, plus num_parts global sums
5508 size_t allocation_size = num_parts * (num_procs + 1);
5509
5510 // this will be the output:
5511 // it holds how many points each processor has in each part;
5512 // the last portion is the sum of all processors' points in each part.
5513
5514 // allocate memory for the local num coordinates in each part.
5515 mj_gno_t *num_local_points_in_each_part_to_reduce_sum =
5516 new mj_gno_t[allocation_size];
5517
5518 // this is the portion of the memory which will be used
5519 // in the summation to obtain the total number of points in each part.
5520 mj_gno_t *my_local_points_to_reduce_sum =
5521 num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
5522
5523 // this is the portion of the memory where each processor stores its
5524 // local counts. this information is needed by other processors.
5525 mj_gno_t *my_local_point_counts_in_each_part =
5526 num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
5527
5528 // initialize the array with 0's.
5529 memset(num_local_points_in_each_part_to_reduce_sum, 0,
5530 sizeof(mj_gno_t)*allocation_size);
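 // Layout, for illustration with 2 procs and 3 parts:
 //   [proc0: c0 c1 c2 | proc1: c0 c1 c2 | global: s0 s1 s2]
 // each rank fills only its own block and the trailing block before
 // the reduceAll below sums them across ranks.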
5531
5532 auto local_new_part_xadj = this->new_part_xadj;
5533 Kokkos::View<mj_gno_t *, typename mj_node_t::device_type> points_per_part(
5534 Kokkos::ViewAllocateWithoutInitializing("points per part"), num_parts);
5535 Kokkos::parallel_for("get vals on device",
5536 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_gno_t>
5537 (0, num_parts), KOKKOS_LAMBDA(mj_gno_t i) {
5538 points_per_part(i) =
5539 local_new_part_xadj(i) - ((i == 0) ? 0 : local_new_part_xadj(i-1));
5540 });
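 // For illustration: new_part_xadj holds the inclusive prefix sum of
 // part sizes, so new_part_xadj = {3, 5, 9} means the three parts own
 // 3, 2, and 4 coordinates respectively.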
5541 auto host_points_per_part = Kokkos::create_mirror_view(points_per_part);
5542 Kokkos::deep_copy(host_points_per_part, points_per_part);
5543 for(int i = 0; i < num_parts; ++i) {
5544 my_local_points_to_reduce_sum[i] = host_points_per_part(i);
5545 }
5546
5547 // copy the local counts to the last portion of the array, so that this
5548 // portion will hold the global num points in each part after the reduction.
5549 memcpy (my_local_point_counts_in_each_part, my_local_points_to_reduce_sum,
5550 sizeof(mj_gno_t) * (num_parts) );
5551
5552 // reduceAll operation.
5553 // the portion that belongs to the processor with index p
5554 // starts at p * num_parts; the global number of points in each part
5555 // is held in the final num_parts entries.
5556 try{
5557 reduceAll<int, mj_gno_t>(
5558 *(this->comm),
5559 Teuchos::REDUCE_SUM,
5560 allocation_size,
5561 num_local_points_in_each_part_to_reduce_sum,
5562 num_points_in_all_processor_parts);
5563 }
5564 Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
5565
5566 delete [] num_local_points_in_each_part_to_reduce_sum;
5567}
5568
5584template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5585 typename mj_part_t, typename mj_node_t>
5586bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5587 mj_check_to_migrate(
5588 size_t migration_reduce_all_population,
5589 mj_lno_t num_coords_for_last_dim_part,
5590 mj_part_t num_procs,
5591 mj_part_t num_parts,
5592 mj_gno_t *num_points_in_all_processor_parts)
5593{
5594 // if reduce all count and population in the last dim is too high
5595 if(migration_reduce_all_population > future_reduceall_cutoff) {
5596 return true;
5597 }
5598
5599 // if the work in a part per processor in the last dim is too low.
5600 if(num_coords_for_last_dim_part < min_work_last_dim) {
5601 return true;
5602 }
5603
5604 // if migration is to be checked and the imbalance is too high
5605 if(this->check_migrate_avoid_migration_option == 0) {
5606 double global_imbalance = 0;
5607 // global shift to reach the sum of coordinate counts in each part.
5608 size_t global_shift = num_procs * num_parts;
5609
5610 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5611 for(mj_part_t i = 0; i < num_parts; ++i) {
5612 double ideal_num = num_points_in_all_processor_parts[global_shift + i]
5613 / double(num_procs);
5614
5615 global_imbalance += std::abs(ideal_num -
5616 num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
5617 }
5618 }
5619 global_imbalance /= num_parts;
5620 global_imbalance /= num_procs;
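 // Worked example: with 2 procs, 1 part, and per-proc counts {6, 2},
 // ideal_num = 8 / 2 = 4 and global_imbalance =
 // (|4 - 6| / 4 + |4 - 2| / 4) / 1 / 2 = 0.5.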
5621
5622 if(global_imbalance <= this->minimum_migration_imbalance) {
5623 return false;
5624 }
5625 else {
5626 return true;
5627 }
5628 }
5629 else {
5630 // if migration is forced
5631 return true;
5632 }
5633}
5634
5648template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5649 typename mj_part_t, typename mj_node_t>
5650void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5651 assign_send_destinations(
5652 mj_part_t num_parts,
5653 mj_part_t *part_assignment_proc_begin_indices,
5654 mj_part_t *processor_chains_in_parts,
5655 mj_lno_t *send_count_to_each_proc,
5656 int *coordinate_destinations) {
5657
5658 auto host_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
5659 deep_copy(host_new_part_xadj, this->new_part_xadj);
5660
5661 auto host_new_coordinate_permutations =
5662 Kokkos::create_mirror_view(this->new_coordinate_permutations);
5663 deep_copy(host_new_coordinate_permutations, this->new_coordinate_permutations);
5664
5665 for(mj_part_t p = 0; p < num_parts; ++p) {
5666 mj_lno_t part_begin = 0;
5667 if(p > 0) part_begin = host_new_part_xadj(p - 1);
5668 mj_lno_t part_end = host_new_part_xadj(p);
5669 // get the first processor to which the current processor sends part-p.
5670 mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
5671 // initialize how many points I have sent to this processor.
5672 mj_lno_t num_total_send = 0;
5673 for(mj_lno_t j=part_begin; j < part_end; j++) {
5674 mj_lno_t local_ind = host_new_coordinate_permutations(j);
5675 while (num_total_send >= send_count_to_each_proc[proc_to_sent]) {
5676 // then get the next processor to send the points in part p.
5677 num_total_send = 0;
5678 // assign new processor to part_assign_begin[p]
5679 part_assignment_proc_begin_indices[p] =
5680 processor_chains_in_parts[proc_to_sent];
5681 // remove the previous processor
5682 processor_chains_in_parts[proc_to_sent] = -1;
5683 // choose the next processor as the next one to send.
5684 proc_to_sent = part_assignment_proc_begin_indices[p];
5685 }
5686 // write the gno index to corresponding position in sendBuf.
5687 coordinate_destinations[local_ind] = proc_to_sent;
5688 ++num_total_send;
5689 }
5690 }
5691}
5692
5713template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5714 typename mj_part_t, typename mj_node_t>
5715void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5716 mj_assign_proc_to_parts(
5717 mj_gno_t * num_points_in_all_processor_parts,
5718 mj_part_t num_parts,
5719 mj_part_t num_procs,
5720 mj_lno_t *send_count_to_each_proc,
5721 std::vector<mj_part_t> &processor_ranks_for_subcomm,
5722 std::vector<mj_part_t> *next_future_num_parts_in_parts,
5723 mj_part_t &out_part_index,
5724 mj_part_t &output_part_numbering_begin_index,
5725 int * coordinate_destinations) {
5726 mj_gno_t *global_num_points_in_parts =
5727 num_points_in_all_processor_parts + num_procs * num_parts;
5728 mj_part_t *num_procs_assigned_to_each_part = new mj_part_t[num_parts];
5729
5730 // boolean flag set when this process finds the part it is assigned to.
5731 bool did_i_find_my_group = false;
5732
5733 mj_part_t num_free_procs = num_procs;
5734 mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
5735
5736 double max_imbalance_difference = 0;
5737 mj_part_t max_differing_part = 0;
5738
5739 // find how many processor each part requires.
5740 for(mj_part_t i = 0; i < num_parts; i++) {
5741
5742 // fractional number of processors required for this part
5743 double scalar_required_proc = num_procs *
5744 (double (global_num_points_in_parts[i]) /
5745 double (this->num_global_coords));
5746
5747 // round it to the closest integer; make sure we have at least one proc.
5748 mj_part_t required_proc =
5749 static_cast<mj_part_t> (0.5 + scalar_required_proc);
5750 if(required_proc == 0) required_proc = 1;
5751
5752 // if assigning the required num procs creates problems for the rest
5753 // of the parts, then only assign {num_free_procs -
5754 // (minimum_num_procs_required_for_rest_of_parts)} procs to this part.
5755 if(num_free_procs -
5756 required_proc < minimum_num_procs_required_for_rest_of_parts) {
5757 required_proc = num_free_procs -
5758 (minimum_num_procs_required_for_rest_of_parts);
5759 }
5760
5761 // reduce the free processor count
5762 num_free_procs -= required_proc;
5763
5764 // reduce the minimum processor count required for the rest of the
5765 // parts by 1.
5766 --minimum_num_procs_required_for_rest_of_parts;
5767
5768 // part (i) is assigned to (required_proc) processors.
5769 num_procs_assigned_to_each_part[i] = required_proc;
5770
5771 // because of rounding, some processors might be left unassigned.
5772 // we want to assign those processors to the part with the most imbalance.
5773 // find the part with the maximum imbalance here.
5774 double imbalance_wrt_ideal =
5775 (scalar_required_proc - required_proc) / required_proc;
5776 if(imbalance_wrt_ideal > max_imbalance_difference) {
5777 max_imbalance_difference = imbalance_wrt_ideal;
5778 max_differing_part = i;
5779 }
5780 }
5781
5782 // assign the extra processors to the part with the maximum
5783 // imbalance relative to the ideal.
5784 if(num_free_procs > 0) {
5785 num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
5786 }
5787
5788 // now find what are the best processors with least migration for each part.
5789
5790 // part_assignment_proc_begin_indices[i] holds the first processor in
5791 // the chain of processors to which this processor sends its part-i data.
5792 mj_part_t *part_assignment_proc_begin_indices = new mj_part_t[num_parts];
5793
5794 // the next processor to send to is found in processor_chains_in_parts,
5795 // in a linked-list manner.
5796 mj_part_t *processor_chains_in_parts = new mj_part_t [num_procs];
5797 mj_part_t *processor_part_assignments = new mj_part_t[num_procs];
5798
5799 // initialize the assignment of each processor.
5800 // this has a linked list implementation.
5801 // the first of the processors assigned
5802 // to each part is held at part_assignment_proc_begin_indices[part].
5803 // the next processor assigned to that part is located at
5804 // processor_chains_in_parts[part_assignment_proc_begin_indices[part]];
5805 // the chain continues until the value -1 is reached.
5806 for(int i = 0; i < num_procs; ++i ) {
5807 processor_part_assignments[i] = -1;
5808 processor_chains_in_parts[i] = -1;
5809 }
5810 for(int i = 0; i < num_parts; ++i ) {
5811 part_assignment_proc_begin_indices[i] = -1;
5812 }
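 // For illustration: if part 2 is served by processors 5 and 3, the
 // chain is stored as part_assignment_proc_begin_indices[2] = 5,
 // processor_chains_in_parts[5] = 3, processor_chains_in_parts[3] = -1,
 // and assign_send_destinations() walks it in that order.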
5813
5814 // std::cout << "Before migration: mig type:" <<
5815 // this->migration_type << std::endl;
5816 // Allocate memory for sorting data structure.
5817 uSignedSortItem<mj_part_t, mj_gno_t, char> *
5818 sort_item_num_part_points_in_procs =
5819 new uSignedSortItem<mj_part_t, mj_gno_t, char>[num_procs];
5820
5821 for(mj_part_t i = 0; i < num_parts; ++i) {
5822 // the algorithm tries to minimize the cost of migration by assigning
5823 // to each part the processors holding the most coordinates of that part.
5824 // here we might want to implement a maximum weighted bipartite matching
5825 // algorithm.
5826 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5827 sort_item_num_part_points_in_procs[ii].id = ii;
5828 // if processor is not assigned yet.
5829 // add its num points to the sort data structure.
5830 if(processor_part_assignments[ii] == -1) {
5831 sort_item_num_part_points_in_procs[ii].val =
5832 num_points_in_all_processor_parts[ii * num_parts + i];
5833 // indicate that the processor has positive weight.
5834 sort_item_num_part_points_in_procs[ii].signbit = 1;
5835 }
5836 else {
5837 // if the processor is already assigned, insert -nLocal - 1 so that it
5838 // won't be selected again.
5839 // setting it to -1 would work as well, but this provides more
5840 // information (used later) at no extra cost.
5841 // sort_item_num_part_points_in_procs[ii].val =
5842 // -num_points_in_all_processor_parts[ii * num_parts + i] - 1;
5843
5844 // UPDATE: since the above produces a warning when an unsigned type
5845 // is used for the value, we added an extra sign bit to the sort item.
5846 // It is 1 for positives, 0 for negatives.
5847 sort_item_num_part_points_in_procs[ii].val =
5848 num_points_in_all_processor_parts[ii * num_parts + i];
5849 sort_item_num_part_points_in_procs[ii].signbit = 0;
5850 }
5851 }
5852
5853 // sort the processors in the part.
5854 uqSignsort<mj_part_t, mj_gno_t,char>
5855 (num_procs, sort_item_num_part_points_in_procs);
5856
5857 /*
5858 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5859 std::cout << "ii:" << ii << " " <<
5860 sort_item_num_part_points_in_procs[ii].id <<
5861 " " << sort_item_num_part_points_in_procs[ii].val <<
5862 " " << int(sort_item_num_part_points_in_procs[ii].signbit) <<
5863 std::endl;
5864 }
5865 */
5866
5867 mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
5868 mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
5869 mj_gno_t ideal_num_points_in_a_proc = Teuchos::as<mj_gno_t>(
5870 ceil(total_num_points_in_part / double (required_proc_count)));
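 // For illustration: a part with total_num_points_in_part = 10 shared
 // by required_proc_count = 3 processors gives
 // ideal_num_points_in_a_proc = ceil(10 / 3.0) = 4.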
5871
5872 // start sending from the lightest of the assigned processors.
5873 mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
5874 mj_part_t next_proc_to_send_id =
5875 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
5876 mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc -
5877 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
5878
5879 // find the processors that will be assigned to this part, which are the
5880 // heaviest non assigned processors.
5881 for(mj_part_t ii = num_procs - 1;
5882 ii >= num_procs - required_proc_count; --ii) {
5883 mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].id;
5884 // assign processor to part - i.
5885 processor_part_assignments[proc_id] = i;
5886 }
5887
5888 bool did_change_sign = false;
5889 // if a processor has a negative count (signbit == 0), reverse it.
5890 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5891 // TODO: THE LINE BELOW PRODUCES A WARNING IF gno_t IS UNSIGNED
5892 // TODO: SEE BUG 6194
5893 if(sort_item_num_part_points_in_procs[ii].signbit == 0) {
5894 did_change_sign = true;
5895 sort_item_num_part_points_in_procs[ii].signbit = 1;
5896 }
5897 else {
5898 break;
5899 }
5900 }
5901
5902 if(did_change_sign) {
5903 // re-sort the processors in the part, for the remaining processors
5904 // that are not assigned.
5905 uqSignsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count,
5906 sort_item_num_part_points_in_procs);
5907 }
5908
5909 /*
5910 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5911 std::cout << "after resort ii:" << ii << " " <<
5912 sort_item_num_part_points_in_procs[ii].id <<
5913 " " << sort_item_num_part_points_in_procs[ii].val <<
5914 " " << int(sort_item_num_part_points_in_procs[ii].signbit ) <<
5915 std::endl;
5916 }
5917 */
5918
5919 // check if this processor is one of the procs assigned to this part.
5920 // if it is, then get the group.
5921 if(!did_i_find_my_group) {
5922 for(mj_part_t ii = num_procs - 1; ii >=
5923 num_procs - required_proc_count; --ii) {
5924
5925 mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id;
5926
5927 // add the proc to the group.
5928 processor_ranks_for_subcomm.push_back(proc_id_to_assign);
5929
5930 if(proc_id_to_assign == this->myRank) {
5931 // if the assigned process is me, then I have found my group.
5932 did_i_find_my_group = true;
5933
5934 // set the beginning of part i to my rank.
5935 part_assignment_proc_begin_indices[i] = this->myRank;
5936 processor_chains_in_parts[this->myRank] = -1;
5937
5938 // set send count to myself to the number of points that I have
5939 // in part i.
5940 send_count_to_each_proc[this->myRank] =
5941 sort_item_num_part_points_in_procs[ii].val;
5942
5943 // calculate the shift required for the
5944 // output_part_numbering_begin_index
5945 for(mj_part_t in = 0; in < i; ++in) {
5946 output_part_numbering_begin_index +=
5947 (*next_future_num_parts_in_parts)[in];
5948 }
5949 out_part_index = i;
5950 }
5951 }
5952
5953 // if this was not my group,
5954 // clear the subcommunicator processor array.
5955 if(!did_i_find_my_group) {
5956 processor_ranks_for_subcomm.clear();
5957 }
5958 }
5959
5960 // send the points of the nonassigned processors to the assigned ones,
5961 // starting from the heaviest nonassigned processor.
5962 // TODO we might want to play with this part, that allows more
5963 // computational imbalance but having better communication balance.
5964 for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii) {
5965 mj_part_t nonassigned_proc_id =
5966 sort_item_num_part_points_in_procs[ii].id;
5967 mj_lno_t num_points_to_sent =
5968 sort_item_num_part_points_in_procs[ii].val;
5969
5970 // we set the number of points to -to_sent - 1 for the assigned
5971 // processors and reverse it here. Negative values should not occur,
5972 // as we have already reversed them above.
5973#ifdef MJ_DEBUG
5974 if(num_points_to_sent < 0) {
5975 std::cout << "Migration - processor assignments - for part:" << i
5976 << " from proc:" << nonassigned_proc_id << " num_points_to_sent:"
5977 << num_points_to_sent << std::endl;
5978 std::terminate();
5979 }
5980#endif
5981
5982 switch (migration_type) {
5983 case 0:
5984 {
5985 // now sends the points to the assigned processors.
5986 while (num_points_to_sent > 0) {
5987 // if the processor has enough space.
5988 if(num_points_to_sent <= space_left_in_sent_proc) {
5989 // reduce the space left in the processor.
5990 space_left_in_sent_proc -= num_points_to_sent;
5991 // if my rank is the one that is sending the coordinates.
5992 if(this->myRank == nonassigned_proc_id) {
5993 // set my sent count to the sent processor.
5994 send_count_to_each_proc[next_proc_to_send_id] =
5995 num_points_to_sent;
5996 // save the processor in the list (processor_chains_in_parts
5997 // and part_assignment_proc_begin_indices)
5998 // that the processor will send its point in part-i.
5999 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6000 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6001 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6002 }
6003 num_points_to_sent = 0;
6004 }
6005 else {
6006 // there might be no space left in the processor.
6007 if(space_left_in_sent_proc > 0) {
6008 num_points_to_sent -= space_left_in_sent_proc;
6009
6010 //send as the space left in the processor.
6011 if(this->myRank == nonassigned_proc_id) {
6012 // send as much as the space in this case.
6013 send_count_to_each_proc[next_proc_to_send_id] =
6014 space_left_in_sent_proc;
6015 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6016 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6017 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6018 }
6019 }
6020 // move on to the next processor to send to.
6021 ++next_proc_to_send_index;
6022
6023#ifdef MJ_DEBUG
6024 if(next_proc_to_send_index < num_procs - required_proc_count) {
6025 std::cout << "Migration - processor assignments - for part:"
6026 << i
6027 << " next_proc_to_send_index:" << next_proc_to_send_index
6028 << " num_procs:" << num_procs
6029 << " required_proc_count:" << required_proc_count
6030 << " Error: next_proc_to_send_index <"
6031 << " num_procs - required_proc_count" << std::endl;
6032 std::terminate();
6033 }
6034#endif
6035 // send the new id.
6036 next_proc_to_send_id =
6037 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
6038 // set the new space in the processor.
6039 space_left_in_sent_proc = ideal_num_points_in_a_proc -
6040 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
6041 }
6042 }
6043 }
6044 break;
6045 default:
6046 {
6047 // to minimize messages, we want each processor to send its
6048 // coordinates to only a single processor.
6049 // we do not respect imbalances here; we send all points to the
6050 // next processor.
6051 if(this->myRank == nonassigned_proc_id) {
6052 // set my sent count to the sent processor.
6053 send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
6054 // save the processor in the list (processor_chains_in_parts and
6055 // part_assignment_proc_begin_indices)
6056 // that the processor will send its point in part-i.
6057 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6058 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6059 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6060 }
6061 num_points_to_sent = 0;
6062 ++next_proc_to_send_index;
6063
6064 // if we advanced past the heaviest processor, wrap around
6065 // (round robin) back to the first assigned processor.
6066 if(next_proc_to_send_index == num_procs) {
6067 next_proc_to_send_index = num_procs - required_proc_count;
6068 }
6069 // send the new id.
6070 next_proc_to_send_id =
6071 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
6072 // set the new space in the processor.
6073 space_left_in_sent_proc = ideal_num_points_in_a_proc -
6074 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
6075 }
6076 }
6077 }
6078 }
6079
6080 /*
6081 for(int i = 0; i < num_procs;++i) {
6082 std::cout << "me:" << this->myRank << " to part:" << i << " sends:" <<
6083 send_count_to_each_proc[i] << std::endl;
6084 }
6085 */
6086
6087 this->assign_send_destinations(
6088 num_parts,
6089 part_assignment_proc_begin_indices,
6090 processor_chains_in_parts,
6091 send_count_to_each_proc,
6092 coordinate_destinations);
6093 delete [] part_assignment_proc_begin_indices;
6094 delete [] processor_chains_in_parts;
6095 delete [] processor_part_assignments;
6096 delete [] sort_item_num_part_points_in_procs;
6097 delete [] num_procs_assigned_to_each_part;
6098}
6099
6115template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6116 typename mj_part_t, typename mj_node_t>
6117void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6118 assign_send_destinations2(
6119 mj_part_t num_parts,
6120 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
6121 int *coordinate_destinations,
6122 mj_part_t &output_part_numbering_begin_index,
6123 std::vector<mj_part_t> *next_future_num_parts_in_parts)
6124{
6125 mj_part_t part_shift_amount = output_part_numbering_begin_index;
6126 mj_part_t previous_processor = -1;
6127
6128 auto local_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
6129 Kokkos::deep_copy(local_new_part_xadj, this->new_part_xadj);
6130
6131 auto local_new_coordinate_permutations =
6132 Kokkos::create_mirror_view(this->new_coordinate_permutations);
6133 Kokkos::deep_copy(local_new_coordinate_permutations,
6134 this->new_coordinate_permutations);
6135
6136 for(mj_part_t i = 0; i < num_parts; ++i) {
6137 mj_part_t p = sort_item_part_to_proc_assignment[i].id;
6138
6139 // assigned processors are sorted.
6140 mj_lno_t part_begin_index = 0;
6141
6142 if(p > 0) {
6143 part_begin_index = local_new_part_xadj(p - 1);
6144 }
6145
6146 mj_lno_t part_end_index = local_new_part_xadj(p);
6147
6148 mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].val;
6149 if(this->myRank == assigned_proc && previous_processor != assigned_proc) {
6150 output_part_numbering_begin_index = part_shift_amount;
6151 }
6152 previous_processor = assigned_proc;
6153 part_shift_amount += (*next_future_num_parts_in_parts)[p];
6154
6155 for(mj_lno_t j= part_begin_index; j < part_end_index; j++) {
6156 mj_lno_t localInd = local_new_coordinate_permutations(j);
6157 coordinate_destinations[localInd] = assigned_proc;
6158 }
6159 }
6160}
6161
6183template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6184 typename mj_part_t, typename mj_node_t>
6185void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6186 mj_assign_parts_to_procs(
6187 mj_gno_t * num_points_in_all_processor_parts,
6188 mj_part_t num_parts,
6189 mj_part_t num_procs,
6190 mj_lno_t *send_count_to_each_proc,
6191 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6192 mj_part_t &out_num_part,
6193 std::vector<mj_part_t> &out_part_indices,
6194 mj_part_t &output_part_numbering_begin_index,
6195 int *coordinate_destinations) {
6196
6197 out_num_part = 0;
6198 mj_gno_t *global_num_points_in_parts =
6199 num_points_in_all_processor_parts + num_procs * num_parts;
6200 out_part_indices.clear();
6201
6202 // to sort the parts that are assigned to the processors.
6203 // id is the part number, sort value is the assigned processor id.
6204 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment =
6205 new uSortItem<mj_part_t, mj_part_t>[num_parts];
6206 uSortItem<mj_part_t, mj_gno_t> * sort_item_num_points_of_proc_in_part_i =
6207 new uSortItem<mj_part_t, mj_gno_t>[num_procs];
6208
6209 // calculate the optimal number of coordinates that should be assigned
6210 // to each processor.
6211 mj_lno_t work_each =
6212 mj_lno_t (this->num_global_coords / (double (num_procs)) + 0.5f);
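 // For illustration: with num_global_coords = 1000 and num_procs = 8,
 // work_each = mj_lno_t(1000 / 8.0 + 0.5) = 125 coordinates per proc.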
6213
6214 // to hold the remaining space, as the number of coordinates up to
6215 // the optimal number in each proc.
6216 mj_lno_t *space_in_each_processor = new mj_lno_t[num_procs];
6217
6218 // initialize left space in each.
6219 for(mj_part_t i = 0; i < num_procs; ++i) {
6220 space_in_each_processor[i] = work_each;
6221 }
6222
6223 // we keep track of how many parts each processor is assigned to.
6224 // because for some unusual inputs it is possible that some
6225 // processors are not assigned to any part. Using these variables,
6226 // we force each processor to have at least one part.
6227 mj_part_t *num_parts_proc_assigned = new mj_part_t[num_procs];
6228 memset(num_parts_proc_assigned, 0, sizeof(mj_part_t) * num_procs);
6229 int empty_proc_count = num_procs;
6230
6231 // to sort the parts with respect to the number of their coordinates.
6232 // ids are the part numbers, sort value is the number of points in each.
6233 uSortItem<mj_part_t, mj_gno_t> * sort_item_point_counts_in_parts =
6234 new uSortItem<mj_part_t, mj_gno_t>[num_parts];
6235
6236 // initially we will sort the parts according to the number of coordinates
6237 // they have, so that we will start assigning with the part that has
6238 // the most coordinates.
6239 for(mj_part_t i = 0; i < num_parts; ++i) {
6240 sort_item_point_counts_in_parts[i].id = i;
6241 sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i];
6242 }
6243
6244 // sort parts with increasing order of loads.
6245 uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);
6246
6247 // assigning parts to the processors
6248 // traverse the parts in decreasing order of load.
6249 // first assign the heaviest part.
6250 for(mj_part_t j = 0; j < num_parts; ++j) {
6251 // sorted in increasing order; traverse in reverse.
6252 mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id;
6253
6254 // load of the part
6255 mj_gno_t load = global_num_points_in_parts[i];
6256
6257 // assigned processors
6258 mj_part_t assigned_proc = -1;
6259
6260 // sort processors with increasing number of points in this part.
6261 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
6262 sort_item_num_points_of_proc_in_part_i[ii].id = ii;
6263
6264 // if there are still enough parts to fill the empty processors, then
6265 // proceed normally; but if the empty processor count equals the number
6266 // of remaining parts, force part assignments only to empty processors.
6267 if(empty_proc_count < num_parts - j ||
6268 num_parts_proc_assigned[ii] == 0) {
6269 // how many points does processor ii have in part i?
6270 sort_item_num_points_of_proc_in_part_i[ii].val =
6271 num_points_in_all_processor_parts[ii * num_parts + i];
6272 }
6273 else {
6274 sort_item_num_points_of_proc_in_part_i[ii].val = -1;
6275 }
6276 }
6277
6278 uqsort<mj_part_t, mj_gno_t>(num_procs,
6279 sort_item_num_points_of_proc_in_part_i);
6280
6281 // traverse all processors with decreasing load.
6282 for(mj_part_t iii = num_procs - 1; iii >= 0; --iii) {
6283 mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
6284 if(assigned_proc == -1 ||
6285 (space_in_each_processor[ii] > space_in_each_processor[assigned_proc])) {
6286 assigned_proc = ii;
6287 }
6288 else if(space_in_each_processor[ii] == space_in_each_processor[assigned_proc]) {
6289 if(ii < assigned_proc) {
6290 // ties go to the lower proc.
6291 // not necessary for a valid result but allows testing to compare
6292 // MPI results and have part numbers assigned to the same boxes.
6293 // We don't break here because we may have more ties still to check.
6294 // The indeterminate state before this is due to CUDA using
6295 // atomics to refill the permutation array. So non-CUDA runs don't
6296 // actually need this since they will always have the same pattern.
6297 assigned_proc = ii;
6298 }
6299 }
6300 else {
6301 break; // now we can break - we have our part and no more ties.
6302 }
6303 }
6304
6305 if(num_parts_proc_assigned[assigned_proc]++ == 0) {
6306 --empty_proc_count;
6307 }
6308
6309 space_in_each_processor[assigned_proc] -= load;
6310 // to sort later; part i is assigned to processor assigned_proc.
6311 sort_item_part_to_proc_assignment[j].id = i; //part i
6312
6313 // assigned to processor - assignment.
6314 sort_item_part_to_proc_assignment[j].val = assigned_proc;
6315
6316 // if assigned processor is me, increase the number.
6317 if(assigned_proc == this->myRank) {
6318 out_num_part++;//assigned_part_count;
6319 out_part_indices.push_back(i);
6320 }
6321
6322 // increase the send count to that processor by the number of points
6323 // in that part, as everyone sends their coordinates in this part to
6324 // the processor assigned to this part.
6325 send_count_to_each_proc[assigned_proc] +=
6326 num_points_in_all_processor_parts[this->myRank * num_parts + i];
6327 }
6328
6329 delete [] num_parts_proc_assigned;
6330 delete [] sort_item_num_points_of_proc_in_part_i;
6331 delete [] sort_item_point_counts_in_parts;
6332 delete [] space_in_each_processor;
6333
6334 // sort assignments with respect to the assigned processors.
6335 uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);
6336
6337 // fill sendBuf.
6338 this->assign_send_destinations2(
6339 num_parts,
6340 sort_item_part_to_proc_assignment,
6341 coordinate_destinations,
6342 output_part_numbering_begin_index,
6343 next_future_num_parts_in_parts);
6344
6345 delete [] sort_item_part_to_proc_assignment;
6346}
6347
6348
6372template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6373 typename mj_part_t, typename mj_node_t>
6374void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6375 mj_migration_part_proc_assignment(
6376 mj_gno_t * num_points_in_all_processor_parts,
6377 mj_part_t num_parts,
6378 mj_part_t num_procs,
6379 mj_lno_t *send_count_to_each_proc,
6380 std::vector<mj_part_t> &processor_ranks_for_subcomm,
6381 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6382 mj_part_t &out_num_part,
6383 std::vector<mj_part_t> &out_part_indices,
6384 mj_part_t &output_part_numbering_begin_index,
6385 int *coordinate_destinations)
6386{
6387 processor_ranks_for_subcomm.clear();
6388 // if(this->num_local_coords > 0)
6389 if(num_procs > num_parts) {
6390 // if there are more processors than the number of current parts,
6391 // then processors share the existing parts.
6392 // at the end each processor will have a single part,
6393 // but a part will be shared by a group of processors.
6394 mj_part_t out_part_index = 0;
6395
6396 this->mj_assign_proc_to_parts(
6397 num_points_in_all_processor_parts,
6398 num_parts,
6399 num_procs,
6400 send_count_to_each_proc,
6401 processor_ranks_for_subcomm,
6402 next_future_num_parts_in_parts,
6403 out_part_index,
6404 output_part_numbering_begin_index,
6405 coordinate_destinations
6406 );
6407
6408 out_num_part = 1;
6409 out_part_indices.clear();
6410 out_part_indices.push_back(out_part_index);
6411 }
6412 else {
6413
6414 // there are more parts than processors,
6415 // therefore a processor will be assigned multiple parts and
6416 // the subcommunicators will only have a single processor.
6417 processor_ranks_for_subcomm.push_back(this->myRank);
6418
6419 // since there are more parts than procs,
6420 // assign multiple parts to processors.
6421
6422 this->mj_assign_parts_to_procs(
6423 num_points_in_all_processor_parts,
6424 num_parts,
6425 num_procs,
6426 send_count_to_each_proc,
6427 next_future_num_parts_in_parts,
6428 out_num_part,
6429 out_part_indices,
6430 output_part_numbering_begin_index,
6431 coordinate_destinations);
6432 }
6433}
6434
6448template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6449 typename mj_part_t, typename mj_node_t>
6450void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6451 mj_migrate_coords(
6452 mj_part_t num_procs,
6453 mj_lno_t &num_new_local_points,
6454 std::string iteration,
6455 int *coordinate_destinations,
6456 mj_part_t num_parts)
6457{
6458
6459#ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
6460 if(sizeof(mj_lno_t) <= sizeof(int)) {
6461 // Cannot use Zoltan_Comm with local ordinals larger than ints.
6462 // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
6463 // may overflow.
6464 ZOLTAN_COMM_OBJ *plan = NULL;
6465 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->comm));
6466 int num_incoming_gnos = 0;
6467 int message_tag = 7859;
6468
6469 this->mj_env->timerStart(MACRO_TIMERS,
6470 mj_timer_base_string + "Migration Z1PlanCreating-" + iteration);
6471 int ierr = Zoltan_Comm_Create(
6472 &plan,
6473 int(this->num_local_coords),
6474 coordinate_destinations,
6475 mpi_comm,
6476 message_tag,
6477 &num_incoming_gnos);
6478
6479 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6480 this->mj_env->timerStop(MACRO_TIMERS,
6481 mj_timer_base_string + "Migration Z1PlanCreating-" + iteration);
6482
6483 this->mj_env->timerStart(MACRO_TIMERS,
6484 mj_timer_base_string + "Migration Z1Migration-" + iteration);
6485
6486 // MPI Buffers should be on Kokkos::HostSpace not Kokkos::CudaUVMSpace
6487 // Note, with UVM space, create_mirror_view does NOT create a non-UVM
6488 // view; need the explicit Host creation and deep_copy.
6489
6490 // migrate gnos.
6491 {
6492 auto host_current_mj_gnos = Kokkos::create_mirror_view(
6493 Kokkos::HostSpace(), this->current_mj_gnos);
6494 Kokkos::deep_copy(host_current_mj_gnos, this->current_mj_gnos);
6495 Kokkos::View<mj_gno_t*, device_t> dst_gnos(
6496 Kokkos::ViewAllocateWithoutInitializing("dst_gnos"), num_incoming_gnos);
6497 auto host_dst_gnos = Kokkos::create_mirror_view(
6498 Kokkos::HostSpace(), dst_gnos);
6499 message_tag++;
6500 ierr = Zoltan_Comm_Do(
6501 plan,
6502 message_tag,
6503 (char *) host_current_mj_gnos.data(),
6504 sizeof(mj_gno_t),
6505 (char *) host_dst_gnos.data());
6506 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6507 Kokkos::deep_copy(dst_gnos, host_dst_gnos);
6508 this->current_mj_gnos = dst_gnos;
6509 }
6510
6511 //migrate coordinates
6512 {
6513 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
6514 auto host_src_coordinates = Kokkos::create_mirror_view(
6515 Kokkos::HostSpace(), this->mj_coordinates);
6516 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6517 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6518 dst_coordinates(Kokkos::ViewAllocateWithoutInitializing("mj_coordinates"),
6519 num_incoming_gnos, this->coord_dim);
6520 auto host_dst_coordinates = Kokkos::create_mirror_view(
6521 Kokkos::HostSpace(), dst_coordinates);
6522 for(int i = 0; i < this->coord_dim; ++i) {
6523 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sub_host_src_coordinates
6524 = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6525 Kokkos::View<mj_scalar_t *, Kokkos::HostSpace> sub_host_dst_coordinates
6526 = Kokkos::subview(host_dst_coordinates, Kokkos::ALL, i);
6527 // Note Layout Left means we can do these in contiguous blocks
6528 message_tag++;
6529 ierr = Zoltan_Comm_Do(
6530 plan,
6531 message_tag,
6532 (char *) sub_host_src_coordinates.data(),
6533 sizeof(mj_scalar_t),
6534 (char *) sub_host_dst_coordinates.data());
6535 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6536 }
6537 deep_copy(dst_coordinates, host_dst_coordinates);
6538 this->mj_coordinates = dst_coordinates;
6539 }
6540
6541 // migrate weights.
6542 {
6543 auto host_src_weights = Kokkos::create_mirror_view(
6544 Kokkos::HostSpace(), this->mj_weights);
6545 Kokkos::deep_copy(host_src_weights, this->mj_weights);
6546 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6547 Kokkos::ViewAllocateWithoutInitializing("mj_weights"),
6548 num_incoming_gnos, this->num_weights_per_coord);
6549 auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
6550 for(int i = 0; i < this->num_weights_per_coord; ++i) {
6551 auto sub_host_src_weights
6552 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6553 auto sub_host_dst_weights
6554 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6555 ArrayRCP<mj_scalar_t> sent_weight(this->num_local_coords);
6556 // Copy because of layout
6557 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6558 sent_weight[n] = sub_host_src_weights(n);
6559 }
6560 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
6561 message_tag++;
6562 ierr = Zoltan_Comm_Do(
6563 plan,
6564 message_tag,
6565 (char *) sent_weight.getRawPtr(),
6566 sizeof(mj_scalar_t),
6567 (char *) received_weight.getRawPtr());
6568 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6569 // Again we copy by index due to layout
6570 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6571 sub_host_dst_weights(n) = received_weight[n];
6572 }
6573 }
6574 deep_copy(dst_weights, host_dst_weights);
6575 this->mj_weights = dst_weights;
6576 }
6577
6578 // migrate owners.
6579 {
6580 // Note that owners we kept on Serial
6581 Kokkos::View<int *, Kokkos::HostSpace> dst_owners_of_coordinate(
6582 Kokkos::ViewAllocateWithoutInitializing("owner_of_coordinate"),
6583 num_incoming_gnos);
6584 message_tag++;
6585 ierr = Zoltan_Comm_Do(
6586 plan,
6587 message_tag,
6588 (char *) owner_of_coordinate.data(),
6589 sizeof(int),
6590 (char *) dst_owners_of_coordinate.data());
6591 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6592 this->owner_of_coordinate = dst_owners_of_coordinate;
6593 }
6594
6595 // if num procs is less than num parts,
6596 // we need the part assignment arrays as well, since
6597 // there will be multiple parts per processor.
6598 {
6599 auto host_src_assigned_part_ids = Kokkos::create_mirror_view(
6600 Kokkos::HostSpace(), this->assigned_part_ids);
6601 Kokkos::deep_copy(host_src_assigned_part_ids, this->assigned_part_ids);
6602 Kokkos::View<int *, device_t> dst_assigned_part_ids(
6603 Kokkos::ViewAllocateWithoutInitializing("assigned_part_ids"),
6604 num_incoming_gnos);
6605 auto host_dst_assigned_part_ids = Kokkos::create_mirror_view(
6606 Kokkos::HostSpace(), dst_assigned_part_ids);
6607 mj_part_t *new_parts = new mj_part_t[num_incoming_gnos];
6608 if(num_procs < num_parts) {
6609 message_tag++;
6610 ierr = Zoltan_Comm_Do(
6611 plan,
6612 message_tag,
6613 (char *) host_src_assigned_part_ids.data(),
6614 sizeof(mj_part_t),
6615 (char *) host_dst_assigned_part_ids.data());
6616 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6617 Kokkos::deep_copy(dst_assigned_part_ids, host_dst_assigned_part_ids);
6618 }
6619 // In original code this would just assign to an uninitialized array
6620 // if num_procs < num_parts. We're doing the same here.
6621 this->assigned_part_ids = dst_assigned_part_ids;
6622 }
6623
6624 ierr = Zoltan_Comm_Destroy(&plan);
6625 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6626 num_new_local_points = num_incoming_gnos;
6627 this->mj_env->timerStop(MACRO_TIMERS,
6628 mj_timer_base_string + "Migration Z1Migration-" + iteration);
6629 }
6630 else
6631#endif // ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
6632 {
6633 this->mj_env->timerStart(MACRO_TIMERS, mj_timer_base_string +
6634 "Migration DistributorPlanCreating-" + iteration);
6635
6636 Tpetra::Distributor distributor(this->comm);
6637 ArrayView<const mj_part_t> destinations( coordinate_destinations,
6638 this->num_local_coords);
6639 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
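 // createFromSends builds the communication plan from the
 // per-coordinate destination ranks and returns the number of
 // elements this rank will receive.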
6640 this->mj_env->timerStop(MACRO_TIMERS, mj_timer_base_string +
6641 "Migration DistributorPlanCreating-" + iteration);
6642 this->mj_env->timerStart(MACRO_TIMERS, mj_timer_base_string +
6643 "Migration DistributorMigration-" + iteration);
6644
6645 // note MPI buffers should all be on Kokkos::HostSpace and not
6646 // Kokkos::CudaUVMSpace.
6647 // Note, with UVM space, create_mirror_view does NOT create a non-UVM
6648 // view; need the explicit Host creation and deep_copy.
6649 // migrate gnos.
6650 {
6651 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
6652 Kokkos::ViewAllocateWithoutInitializing("received_gnos"),
6653 num_incoming_gnos);
6654
6655 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
6656 Kokkos::ViewAllocateWithoutInitializing("sent_gnos"),
6657 this->current_mj_gnos.extent(0));
6658 Kokkos::deep_copy(sent_gnos, this->current_mj_gnos);
6659
6660 distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
6661
6662 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
6663 Kokkos::ViewAllocateWithoutInitializing("gids"), num_incoming_gnos);
6664
6665 Kokkos::deep_copy(this->current_mj_gnos, received_gnos);
6666 }
6667
6668 // migrate coordinates
6669 // coordinates in MJ are LayoutLeft since Tpetra Multivector is LayoutLeft
6670 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6671 dst_coordinates("mj_coordinates", num_incoming_gnos, this->coord_dim);
6672
6673 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>
6674 host_src_coordinates(
6675 Kokkos::ViewAllocateWithoutInitializing("host_coords"),
6676 this->mj_coordinates.extent(0), this->mj_coordinates.extent(1));
6677 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6678
6679 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_coord(
6680 Kokkos::ViewAllocateWithoutInitializing("received_coord"),
6681 num_incoming_gnos);
6682
6683 for(int i = 0; i < this->coord_dim; ++i) {
6684
6685 // Note Layout Left means we can do these in contiguous blocks
6686
6687 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_coord
6688 = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6689
6690 distributor.doPostsAndWaits(sent_coord, 1, received_coord);
6691
6692 Kokkos::deep_copy(Kokkos::subview(dst_coordinates, Kokkos::ALL, i),
6693 received_coord);
6694
6695 // Kokkos::deep_copy (without an execution space argument) fences,
6696 // so it is safe to reuse received_coord in the next loop iteration
6697 }
6698 this->mj_coordinates = dst_coordinates;
6699
6700 // migrate weights.
6701 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6702 "mj_weights", num_incoming_gnos, this->num_weights_per_coord);
6703 auto host_dst_weights = Kokkos::create_mirror_view(Kokkos::HostSpace(),
6704 dst_weights);
6705
6706 auto host_src_weights = Kokkos::create_mirror_view_and_copy(
6707 Kokkos::HostSpace(), this->mj_weights);
6708
6709 // contiguous buffers to gather potentially strided data
6710 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_weight(
6711 Kokkos::ViewAllocateWithoutInitializing("send_weight_buffer"),
6712 this->num_local_coords);
6713
6714 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_weight(
6715 Kokkos::ViewAllocateWithoutInitializing("received_weight_buffer"),
6716 num_incoming_gnos);
6717
6718 for(int i = 0; i < this->num_weights_per_coord; ++i) {
6719
6720 auto sub_host_src_weights
6721 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6722
6723 auto sub_host_dst_weights
6724 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6725
6726
6727    // Layout Right means the weights are not contiguous.
6728    // However we don't have any systems set up with more than one weight,
6729    // so this code is untested for num_weights > 1.
6730    // I think the element-wise copy below is the right approach.
6731 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6732 sent_weight[n] = sub_host_src_weights(n);
6733 }
6734
6735 distributor.doPostsAndWaits(sent_weight, 1, received_weight);
6736
6737 // Again we copy by index due to layout
6738 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6739 sub_host_dst_weights(n) = received_weight[n];
6740 }
6741 }
6742 Kokkos::deep_copy(dst_weights, host_dst_weights);
6743 this->mj_weights = dst_weights;
6744
6745 // migrate owners
6746 {
6747    // Note: owner_of_coordinate is kept in host (Serial) memory.
6748 Kokkos::View<int *, Kokkos::HostSpace> received_owners(
6749 Kokkos::ViewAllocateWithoutInitializing("owner_of_coordinate"),
6750 num_incoming_gnos);
6751
6752 distributor.doPostsAndWaits(owner_of_coordinate, 1, received_owners);
6753
6754 this->owner_of_coordinate = received_owners;
6755 }
6756
6757  // if the number of procs is less than the number of parts,
6758  // we need to migrate the part assignment arrays as well, since
6759  // there will be multiple parts per processor.
6760 if(num_procs < num_parts) {
6761 Kokkos::View<mj_part_t*, Kokkos::HostSpace> sent_partids(
6762 Kokkos::ViewAllocateWithoutInitializing("host_parts"),
6763 this->assigned_part_ids.extent(0));
6764 Kokkos::deep_copy(sent_partids, assigned_part_ids);
6765
6766 Kokkos::View<mj_part_t*, Kokkos::HostSpace> received_partids(
6767 Kokkos::ViewAllocateWithoutInitializing("received_partids"),
6768 num_incoming_gnos);
6769
6770 distributor.doPostsAndWaits(sent_partids, 1, received_partids);
6771
6772 this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6773 ("assigned_part_ids", num_incoming_gnos);
6774 Kokkos::deep_copy(this->assigned_part_ids, received_partids);
6775 }
6776 else {
6777 this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6778 ("assigned_part_ids", num_incoming_gnos);
6779 }
6780  this->mj_env->timerStop(MACRO_TIMERS, mj_timer_base_string +
6781 "Migration DistributorMigration-" + iteration);
6782
6783 num_new_local_points = num_incoming_gnos;
6784 }
6785}
6786
6792template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6793 typename mj_part_t, typename mj_node_t>
6794void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6795 create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm)
6796{
6797 mj_part_t group_size = processor_ranks_for_subcomm.size();
6798 mj_part_t *ids = new mj_part_t[group_size];
6799 for(mj_part_t i = 0; i < group_size; ++i) {
6800 ids[i] = processor_ranks_for_subcomm[i];
6801 }
6802 ArrayView<const mj_part_t> idView(ids, group_size);
6803 this->comm = this->comm->createSubcommunicator(idView);
6804 delete [] ids;
6805}
6806
6812template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6813 typename mj_part_t, typename mj_node_t>
6814void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6815 fill_permutation_array(
6816 mj_part_t output_num_parts,
6817 mj_part_t num_parts)
6818{
6819  // if there is a single output part, simply fill the permutation array.
6820 if(output_num_parts == 1) {
6821 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
6822 Kokkos::parallel_for(
6823 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
6824 (0, this->num_local_coords),
6825 KOKKOS_LAMBDA(mj_lno_t i) {
6826 local_new_coordinate_permutations(i) = i;
6827 });
6828 auto local_new_part_xadj = this->new_part_xadj;
6829 auto local_num_local_coords = this->num_local_coords;
6830 Kokkos::parallel_for(
6831 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
6832 KOKKOS_LAMBDA(int dummy) {
6833 local_new_part_xadj(0) = local_num_local_coords;
6834 });
6835 }
6836 else {
6837 auto local_num_local_coords = this->num_local_coords;
6838 auto local_assigned_part_ids = this->assigned_part_ids;
6839 auto local_new_part_xadj = this->new_part_xadj;
6840 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
6841
6842    // part_shifts holds which new part number each old part number maps to.
6843 Kokkos::View<mj_part_t*, device_t> part_shifts("part_shifts", num_parts);
6844
6845    // otherwise we need to count how many points there are in each part.
6846    // we allocate with size num_parts, because the sent part ids go up to
6847    // num_parts, although there are only output_num_parts distinct parts.
6848 Kokkos::View<mj_lno_t*, device_t> num_points_in_parts(
6849 "num_points_in_parts", num_parts);
6850
6851 Kokkos::parallel_for(
6852 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
6853 KOKKOS_LAMBDA(int dummy) {
6854
6855 for(mj_lno_t i = 0; i < local_num_local_coords; ++i) {
6856 mj_part_t ii = local_assigned_part_ids(i);
6857 ++num_points_in_parts(ii);
6858 }
6859
6860 // write the end points of the parts.
6861 mj_part_t p = 0;
6862 mj_lno_t prev_index = 0;
6863 for(mj_part_t i = 0; i < num_parts; ++i) {
6864 if(num_points_in_parts(i) > 0) {
6865 local_new_part_xadj(p) = prev_index + num_points_in_parts(i);
6866 prev_index += num_points_in_parts(i);
6867 part_shifts(i) = p++;
6868 }
6869 }
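        // (illustrative values: with num_parts = 4 and counts {0, 5, 0, 3},
        // the loop above yields local_new_part_xadj = {5, 8} and maps old
        // parts 1 and 3 to new parts 0 and 1 through part_shifts)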
6870
6871        // for the remaining parts, write the last end index as their end point.
6872 mj_part_t assigned_num_parts = p - 1;
6873 for(;p < num_parts; ++p) {
6874 local_new_part_xadj(p) =
6875 local_new_part_xadj(assigned_num_parts);
6876 }
6877 for(mj_part_t i = 0; i < output_num_parts; ++i) {
6878 num_points_in_parts(i) = local_new_part_xadj(i);
6879 }
6880
6881        // write the permutation array here.
6882        // get the part of coordinate i and shift it to obtain the new part
6883        // number; write the coordinate at the end of its new part's range.
6884 for(mj_lno_t i = local_num_local_coords - 1; i >= 0; --i) {
6885 mj_part_t part =
6886 part_shifts[mj_part_t(local_assigned_part_ids(i))];
6887 local_new_coordinate_permutations(--num_points_in_parts[part]) = i;
6888 }
6889 });
6890 }
6891}
6892
6917template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6918 typename mj_part_t, typename mj_node_t>
6919bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6920 mj_perform_migration(
6921 mj_part_t input_num_parts,
6922 mj_part_t &output_num_parts,
6923 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6924 mj_part_t &output_part_begin_index,
6925 size_t migration_reduce_all_population,
6926 mj_lno_t num_coords_for_last_dim_part,
6927 std::string iteration,
6928 RCP<mj_partBoxVector_t> &input_part_boxes,
6929 RCP<mj_partBoxVector_t> &output_part_boxes)
6930{
6931 mj_part_t num_procs = this->comm->getSize();
6932 this->myRank = this->comm->getRank();
6933
6934 // this array holds how many points each processor has in each part.
6935  // to access how many points processor i has in part j, use
6936  // num_points_in_all_processor_parts[i * num_parts + j].
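  // the final block, at offset num_procs * num_parts, holds the global
  // (summed over all processors) per-part counts; it is read below through
  // global_num_points_in_parts.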
6937 mj_gno_t *num_points_in_all_processor_parts =
6938 new mj_gno_t[input_num_parts * (num_procs + 1)];
6939
6940 // get the number of coordinates in each part in each processor.
6941 this->get_processor_num_points_in_parts(
6942 num_procs,
6943 input_num_parts,
6944 num_points_in_all_processor_parts);
6945
6946 // check if migration will be performed or not.
6947 if(!this->mj_check_to_migrate(
6948 migration_reduce_all_population,
6949 num_coords_for_last_dim_part,
6950 num_procs,
6951 input_num_parts,
6952 num_points_in_all_processor_parts)) {
6953 delete [] num_points_in_all_processor_parts;
6954 return false;
6955 }
6956
6957 mj_lno_t *send_count_to_each_proc = NULL;
6958 int *coordinate_destinations = new int[this->num_local_coords];
6959 send_count_to_each_proc = new mj_lno_t[num_procs];
6960
6961 for(int i = 0; i < num_procs; ++i) {
6962 send_count_to_each_proc[i] = 0;
6963 }
6964
6965 std::vector<mj_part_t> processor_ranks_for_subcomm;
6966 std::vector<mj_part_t> out_part_indices;
6967
6968 // determine which processors are assigned to which parts
6969 this->mj_migration_part_proc_assignment(
6970 num_points_in_all_processor_parts,
6971 input_num_parts,
6972 num_procs,
6973 send_count_to_each_proc,
6974 processor_ranks_for_subcomm,
6975 next_future_num_parts_in_parts,
6976 output_num_parts,
6977 out_part_indices,
6978 output_part_begin_index,
6979 coordinate_destinations);
6980
6981 delete [] send_count_to_each_proc;
6982 std::vector <mj_part_t> tmpv;
6983
6984 std::sort (out_part_indices.begin(), out_part_indices.end());
6985 mj_part_t outP = out_part_indices.size();
6986 mj_gno_t new_global_num_points = 0;
6987 mj_gno_t *global_num_points_in_parts =
6988 num_points_in_all_processor_parts + num_procs * input_num_parts;
6989
6990 if(this->mj_keep_part_boxes) {
6991 input_part_boxes->clear();
6992 }
6993
6994 // now we calculate the new values for next_future_num_parts_in_parts.
6995 // same for the part boxes.
6996 for(mj_part_t i = 0; i < outP; ++i) {
6997 mj_part_t ind = out_part_indices[i];
6998 new_global_num_points += global_num_points_in_parts[ind];
6999 tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
7000 if(this->mj_keep_part_boxes) {
7001 input_part_boxes->push_back((*output_part_boxes)[ind]);
7002 }
7003 }
7004
7005 // swap the input and output part boxes.
7006 if(this->mj_keep_part_boxes) {
7007 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7008 input_part_boxes = output_part_boxes;
7009 output_part_boxes = tmpPartBoxes;
7010 }
7011 next_future_num_parts_in_parts->clear();
7012 for(mj_part_t i = 0; i < outP; ++i) {
7013 mj_part_t p = tmpv[i];
7014 next_future_num_parts_in_parts->push_back(p);
7015 }
7016
7017 delete [] num_points_in_all_processor_parts;
7018
7019 mj_lno_t num_new_local_points = 0;
7020  // perform the actual migration operation here.
7021 this->mj_migrate_coords(
7022 num_procs,
7023 num_new_local_points,
7024 iteration,
7025 coordinate_destinations,
7026 input_num_parts);
7027
7028 delete [] coordinate_destinations;
7029 if(this->num_local_coords != num_new_local_points) {
7030 this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7031 (Kokkos::ViewAllocateWithoutInitializing("new_coordinate_permutations"),
7032 num_new_local_points);
7033 this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7034 (Kokkos::ViewAllocateWithoutInitializing("coordinate_permutations"),
7035 num_new_local_points);
7036 }
7037 this->num_local_coords = num_new_local_points;
7038 this->num_global_coords = new_global_num_points;
7039
7040 // create subcommunicator.
7041 this->create_sub_communicator(processor_ranks_for_subcomm);
7042
7043 processor_ranks_for_subcomm.clear();
7044
7045 // fill the new permutation arrays.
7046 this->fill_permutation_array(output_num_parts, input_num_parts);
7047
7048 return true;
7049}
7050
7069template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7070 typename mj_part_t, typename mj_node_t>
7071void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7072 create_consistent_chunks(
7073 mj_part_t num_parts,
7074 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
7075 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
7076 mj_lno_t coordinate_begin,
7077 mj_lno_t coordinate_end,
7078 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
7079 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
7080 int coordInd,
7081 bool longest_dim_part,
7082 uSignedSortItem<int, mj_scalar_t, char> * p_coord_dimension_range_sorted)
7083{
7084  // Note that this method is only used by the task mapper.
7085  // All code in this file has been verified to run with UVM off by running
7086  // mj tests and task mapper tests with UVM off. However for this particular
7087  // method I did not do much for UVM off. I heavily use device-to-host copies
7088  // and more or less preserve the original logic. Due to the handling of
7089  // arrays it will be a bit of work to convert this to a better form.
7090  // Since it's only relevant to the task mapper and I wasn't sure how much
7091  // priority to give it, I put that on hold until further discussion.
7092 mj_part_t no_cuts = num_parts - 1;
7093
7094 // now if the rectilinear partitioning is allowed we decide how
7095 // much weight each thread should put to left and right.
7096 if(this->distribute_points_on_cut_lines) {
7097 auto local_thread_cut_line_weight_to_put_left =
7098 this->thread_cut_line_weight_to_put_left;
7099 auto local_thread_part_weight_work =
7100 this->thread_part_weight_work;
7101 auto local_sEpsilon = this->sEpsilon;
7102
7103 Kokkos::parallel_for(
7104 Kokkos::RangePolicy<typename mj_node_t::execution_space,
7105 mj_part_t> (0, no_cuts), KOKKOS_LAMBDA (mj_part_t i) {
7106      // the weight to be put on the left of the cut.
7107 mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
7108 if(left_weight > local_sEpsilon) {
7109        // the weight of this thread on cut i.
7110 mj_scalar_t thread_ii_weight_on_cut =
7111 local_thread_part_weight_work(i * 2 + 1) -
7112 local_thread_part_weight_work(i * 2);
7113 if(thread_ii_weight_on_cut < left_weight) {
7114 local_thread_cut_line_weight_to_put_left(i) =
7115 thread_ii_weight_on_cut;
7116 }
7117 else {
7118 local_thread_cut_line_weight_to_put_left(i) = left_weight;
7119 }
7120 }
7121 else {
7122 local_thread_cut_line_weight_to_put_left(i) = 0;
7123 }
7124 });
7125
7126 if(no_cuts > 0) {
7127 auto local_least_signifiance = least_signifiance;
7128 auto local_significance_mul = significance_mul;
7129 Kokkos::parallel_for(
7130 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
7131 (0, 1), KOKKOS_LAMBDA (int dummy) {
7132 // this is a special case. If cutlines share the same coordinate,
7133 // their weights are equal.
7134 // we need to adjust the ratio for that.
7135 for(mj_part_t i = no_cuts - 1; i > 0 ; --i) {
7136 mj_scalar_t cut1 = current_concurrent_cut_coordinate(i-1);
7137 mj_scalar_t cut2 = current_concurrent_cut_coordinate(i);
7138 mj_scalar_t delta = cut2 - cut1;
7139 mj_scalar_t abs_delta = (delta > 0) ? delta : -delta;
7140 if(abs_delta < local_sEpsilon) {
7141 local_thread_cut_line_weight_to_put_left(i) -=
7142 local_thread_cut_line_weight_to_put_left(i - 1);
7143 }
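          // the statement below rounds the adjusted weight to a fixed
          // precision: scaling by significance_mul, truncating through the
          // long long cast, and scaling back drops the least significant
          // digits, so cuts sharing a position end up with identical weights.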
7144 local_thread_cut_line_weight_to_put_left(i) =
7145 static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
7146 local_least_signifiance) * local_significance_mul) /
7147 static_cast<mj_scalar_t>(local_significance_mul);
7148 }
7149 });
7150 }
7151 }
7152
7153 auto local_thread_point_counts = this->thread_point_counts;
7154 Kokkos::parallel_for(
7155 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
7156 (0, num_parts), KOKKOS_LAMBDA (mj_part_t i) {
7157 local_thread_point_counts(i) = 0;
7158 });
7159
7160  // for this specific case we don't want to distribute the points along
7161  // the cut position randomly, as we need a specific ordering of them.
7162  // Instead, we put the coordinates into a sort item, and sort them by
7163  // their coordinates in the other dimensions and by their index.
7164
7165 // some of the cuts might share the same position.
7166 // in this case, if cut i and cut j share the same position
7167 // cut_map[i] = cut_map[j] = sort item index.
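  // (illustrative values: cuts at coordinates {1.0, 1.0, 2.5} give
  // cut_map = {0, 0, 1}, so the first two cuts share one sort vector)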
7168 mj_part_t *cut_map = new mj_part_t[no_cuts];
7169
7170 typedef uMultiSortItem<mj_lno_t, int, mj_scalar_t> multiSItem;
7171 typedef std::vector< multiSItem > multiSVector;
7172 typedef std::vector<multiSVector> multiS2Vector;
7173
7174 // to keep track of the memory allocated.
7175 std::vector<mj_scalar_t *>allocated_memory;
7176
7177 // vector for which the coordinates will be sorted.
7178 multiS2Vector sort_vector_points_on_cut;
7179
7180 // the number of cuts that have different coordinates.
7181 mj_part_t different_cut_count = 1;
7182 cut_map[0] = 0;
7183
7184  // now we insert one sort vector for each distinct cut position.
7185  // if multiple cuts are at the same position, they share a sort vector.
7187 multiSVector tmpMultiSVector;
7188 sort_vector_points_on_cut.push_back(tmpMultiSVector);
7189
7190 auto local_current_concurrent_cut_coordinate =
7191 current_concurrent_cut_coordinate;
7192 auto host_current_concurrent_cut_coordinate =
7193 Kokkos::create_mirror_view(local_current_concurrent_cut_coordinate);
7194 Kokkos::deep_copy(host_current_concurrent_cut_coordinate,
7195 local_current_concurrent_cut_coordinate);
7196
7197 for(mj_part_t i = 1; i < no_cuts ; ++i) {
7198 // if cuts share the same cut coordinates
7199 // set the cutmap accordingly.
7200 if(std::abs(host_current_concurrent_cut_coordinate(i) -
7201 host_current_concurrent_cut_coordinate(i-1)) < this->sEpsilon) {
7202 cut_map[i] = cut_map[i-1];
7203 }
7204 else {
7205 cut_map[i] = different_cut_count++;
7206 multiSVector tmp2MultiSVector;
7207 sort_vector_points_on_cut.push_back(tmp2MultiSVector);
7208 }
7209 }
7210 Kokkos::deep_copy(current_concurrent_cut_coordinate,
7211 host_current_concurrent_cut_coordinate);
7212
7213  // now the actual part assignment.
7214 auto host_coordinate_permutations =
7215 Kokkos::create_mirror_view(coordinate_permutations);
7216 Kokkos::deep_copy(host_coordinate_permutations, coordinate_permutations);
7217
7218 auto host_assigned_part_ids = Kokkos::create_mirror_view(assigned_part_ids);
7219 Kokkos::deep_copy(host_assigned_part_ids, assigned_part_ids);
7220
7221 auto host_mj_coordinates = Kokkos::create_mirror_view(mj_coordinates);
7222 Kokkos::deep_copy(host_mj_coordinates, mj_coordinates);
7223
7224 auto host_thread_point_counts = Kokkos::create_mirror_view(thread_point_counts);
7225 Kokkos::deep_copy(host_thread_point_counts, thread_point_counts);
7226
7227 auto local_coord_dim = this->coord_dim;
7228
7229 for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7230 mj_lno_t i = host_coordinate_permutations(ii);
7231 mj_part_t pp = host_assigned_part_ids(i);
7232 mj_part_t p = pp / 2;
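    // (assigned ids encode 2*p for a point strictly inside part p and
    // 2*p + 1 for a point sitting on cut p, hence the divide-by-two
    // above and the parity test below)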
7233 // if the coordinate is on a cut.
7234 if(pp % 2 == 1 ) {
7235 mj_scalar_t *vals = new mj_scalar_t[local_coord_dim -1];
7236 allocated_memory.push_back(vals);
7237
7238 // we insert the coordinates to the sort item here.
7239 int val_ind = 0;
7240
7241 if(longest_dim_part) {
7242 // std::cout << std::endl << std::endl;
7243 for(int dim = local_coord_dim - 2; dim >= 0; --dim) {
7244 // uSignedSortItem<int, mj_scalar_t, char>
7245 // *p_coord_dimension_range_sorted
7246 int next_largest_coord_dim = p_coord_dimension_range_sorted[dim].id;
7247 // std::cout << "next_largest_coord_dim: " <<
7248 // next_largest_coord_dim << " ";
7249 // Note refactor in progress
7250 vals[val_ind++] =
7251 host_mj_coordinates(i,next_largest_coord_dim);
7252 }
7253 }
7254 else {
7255 for(int dim = coordInd + 1; dim < local_coord_dim; ++dim) {
7256 vals[val_ind++] = host_mj_coordinates(i,dim);
7257 }
7258 for(int dim = 0; dim < coordInd; ++dim) {
7259 vals[val_ind++] = host_mj_coordinates(i,dim);
7260 }
7261 }
7262
7263 multiSItem tempSortItem(i, local_coord_dim -1, vals);
7264      // insert the point into the sort vector indicated by cut_map[p].
7265 mj_part_t cmap = cut_map[p];
7266 sort_vector_points_on_cut[cmap].push_back(tempSortItem);
7267 }
7268 else {
7269      // if it is not on a cut, assign it to its part directly.
7270 ++host_thread_point_counts(p);
7271 host_assigned_part_ids(i) = p;
7272 }
7273 }
7274
7275 // sort all the sort vectors.
7276 for(mj_part_t i = 0; i < different_cut_count; ++i) {
7277 std::sort (sort_vector_points_on_cut[i].begin(),
7278 sort_vector_points_on_cut[i].end());
7279 }
7280
7281 mj_part_t previous_cut_map = cut_map[0];
7282
7283 auto host_thread_cut_line_weight_to_put_left =
7284 Kokkos::create_mirror_view(thread_cut_line_weight_to_put_left);
7285 Kokkos::deep_copy(host_thread_cut_line_weight_to_put_left,
7286 thread_cut_line_weight_to_put_left);
7287
7288 auto host_mj_weights = Kokkos::create_mirror_view(mj_weights);
7289 Kokkos::deep_copy(host_mj_weights, mj_weights);
7290
7291  // this is how much of the current part's weight the previous part
7292  // has taken. when the target part weight is 1.6, and the part on the
7293  // left is given 2, the left has an extra 0.4, while the right is
7294  // missing 0.4 from the previous cut.
7295  // This parameter is used to balance these issues.
7296  // in the above example weight_stolen_from_previous_part will be 0.4.
7297  // if the left part's target is 2.2 but it is given 2,
7298  // then weight_stolen_from_previous_part will be -0.2.
7299 mj_scalar_t weight_stolen_from_previous_part = 0;
7300 for(mj_part_t p = 0; p < no_cuts; ++p) {
7301 mj_part_t mapped_cut = cut_map[p];
7302
7303    // if the previous cut is done and does not map to the same sort
7304    // vector, assign all points remaining on that cut to the part on its right.
7305 if(previous_cut_map != mapped_cut) {
7306 mj_lno_t sort_vector_end = (mj_lno_t)
7307 sort_vector_points_on_cut[previous_cut_map].size() - 1;
7308 for(; sort_vector_end >= 0; --sort_vector_end) {
7309 multiSItem t =
7310 sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7311 mj_lno_t i = t.index;
7312 ++host_thread_point_counts(p);
7313 host_assigned_part_ids(i) = p;
7314 }
7315 sort_vector_points_on_cut[previous_cut_map].clear();
7316 }
7317
7318    // TODO: MD: I don't remember why I iterate in reverse order here.
7319 mj_lno_t sort_vector_end = (mj_lno_t)
7320 sort_vector_points_on_cut[mapped_cut].size() - 1;
7321 // mj_lno_t sort_vector_begin= 0;
7322 // mj_lno_t sort_vector_size =
7323 // (mj_lno_t)sort_vector_points_on_cut[mapped_cut].size();
7324
7325 // TODO commented for reverse order
7326 for(; sort_vector_end >= 0; --sort_vector_end) {
7327 // for(; sort_vector_begin < sort_vector_size; ++sort_vector_begin) {
7328 // TODO COMMENTED FOR REVERSE ORDER
7329 multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
7330 //multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_begin];
7331 mj_lno_t i = t.index;
7332      mj_scalar_t w = this->mj_uniform_weights(0) ? 1 :
7333        host_mj_weights(i,0); // use the host copy; mj_weights is a device view
7334      // if part p has enough space for point i, assign point i to part p.
7335 if(host_thread_cut_line_weight_to_put_left(p) +
7336 weight_stolen_from_previous_part> this->sEpsilon &&
7337 host_thread_cut_line_weight_to_put_left(p) +
7338 weight_stolen_from_previous_part -
7339 std::abs(host_thread_cut_line_weight_to_put_left(p) +
7340 weight_stolen_from_previous_part - w)> this->sEpsilon)
7341 {
7342 host_thread_cut_line_weight_to_put_left(p) -= w;
7343
7344 sort_vector_points_on_cut[mapped_cut].pop_back();
7345
7346 ++host_thread_point_counts(p);
7347 host_assigned_part_ids(i) = p;
7348        // if putting this weight to the left overfills the left part, then
7349        // increase the space for the next cut using
7350        // weight_stolen_from_previous_part.
7351 if(p < no_cuts - 1 &&
7352 host_thread_cut_line_weight_to_put_left(p) < this->sEpsilon) {
7353 if(mapped_cut == cut_map[p + 1] ) {
7354            // if the cut before cut p was at a different position, start
7355            // the stolen weight fresh; the same-position case accumulates.
7356 if(previous_cut_map != mapped_cut) {
7357 weight_stolen_from_previous_part =
7358 host_thread_cut_line_weight_to_put_left(p);
7359 }
7360 else {
7361            // the cut before cut p was at the same position, so
7362            // we assign the extra weights cumulatively in this case.
7363 weight_stolen_from_previous_part +=
7364 host_thread_cut_line_weight_to_put_left(p);
7365 }
7366 }
7367 else{
7368 weight_stolen_from_previous_part =
7369 -host_thread_cut_line_weight_to_put_left(p);
7370 }
7371 // end assignment for part p
7372 break;
7373 }
7374 } else {
7375        // if part p does not have enough space for this point
7376        // and if there is another cut sharing the same position,
7377        // again increase the space for the next cut
7378 if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]) {
7379 if(previous_cut_map != mapped_cut) {
7380 weight_stolen_from_previous_part =
7381 host_thread_cut_line_weight_to_put_left(p);
7382 }
7383 else {
7384 weight_stolen_from_previous_part +=
7385 host_thread_cut_line_weight_to_put_left(p);
7386 }
7387 }
7388 else{
7389 weight_stolen_from_previous_part =
7390 -host_thread_cut_line_weight_to_put_left(p);
7391 }
7392 // end assignment for part p
7393 break;
7394 }
7395 }
7396 previous_cut_map = mapped_cut;
7397 }
7398
7399 // TODO commented for reverse order
7400 // put everything left on the last cut to the last part.
7401 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[
7402 previous_cut_map].size() - 1;
7403
7404 // mj_lno_t sort_vector_begin= 0;
7405 // mj_lno_t sort_vector_size = (mj_lno_t)
7406 // sort_vector_points_on_cut[previous_cut_map].size();
7407 // TODO commented for reverse order
7408 for(; sort_vector_end >= 0; --sort_vector_end) {
7409 // TODO commented for reverse order
7410 multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7411 // multiSItem t =
7412 // sort_vector_points_on_cut[previous_cut_map][sort_vector_begin];
7413 mj_lno_t i = t.index;
7414 ++host_thread_point_counts(no_cuts);
7415 host_assigned_part_ids(i) = no_cuts;
7416 }
7417
7418 sort_vector_points_on_cut[previous_cut_map].clear();
7419 delete [] cut_map;
7420
7421  // free the memory allocated for the vertex sort items.
7422 mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
7423 for(mj_lno_t i = 0; i < vSize; ++i) {
7424 delete [] allocated_memory[i];
7425 }
7426
7427 auto local_out_part_xadj = out_part_xadj;
7428 auto host_out_part_xadj = Kokkos::create_mirror_view(local_out_part_xadj);
7429 Kokkos::deep_copy(host_out_part_xadj, out_part_xadj);
7430
7431  // create part_xadj as in the usual case.
7432 for(mj_part_t j = 0; j < num_parts; ++j) {
7433 host_out_part_xadj(j) = host_thread_point_counts(j);
7434 host_thread_point_counts(j) = 0;
7435 }
7436
7437  // perform a prefix sum over the number of points in each part.
7438 for(mj_part_t j = 1; j < num_parts; ++j) {
7439 host_out_part_xadj(j) += host_out_part_xadj(j - 1);
7440 }
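  // (illustrative values: per-part counts {3, 1, 2} become end offsets
  // {3, 4, 6}, so part j owns the index range [xadj(j-1), xadj(j)))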
7441
7442  // shift the per-part point counts to obtain the
7443  // beginning write index of each part's private space.
7444 for(mj_part_t j = 1; j < num_parts; ++j) {
7445 host_thread_point_counts(j) += host_out_part_xadj(j - 1);
7446 }
7447
7448 auto host_new_coordinate_permutations =
7449 Kokkos::create_mirror_view(new_coordinate_permutations);
7450 Kokkos::deep_copy(host_new_coordinate_permutations,
7451 new_coordinate_permutations);
7452
7453  // now walk the coordinates and write each coordinate's index into
7454  // the permutation array at the position computed for its part.
7455 for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7456 mj_lno_t i = host_coordinate_permutations(ii);
7457 mj_part_t p = host_assigned_part_ids(i);
7458 host_new_coordinate_permutations(coordinate_begin +
7459 host_thread_point_counts(p)++) = i;
7460 }
7461
7462 Kokkos::deep_copy(thread_point_counts, host_thread_point_counts);
7463 Kokkos::deep_copy(new_coordinate_permutations,
7464 host_new_coordinate_permutations);
7465 Kokkos::deep_copy(local_out_part_xadj, host_out_part_xadj);
7466}
7467
7477template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7478 typename mj_part_t, typename mj_node_t>
7479void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7480 set_final_parts(
7481 mj_part_t current_num_parts,
7482 mj_part_t output_part_begin_index,
7483 RCP<mj_partBoxVector_t> &output_part_boxes,
7484 bool is_data_ever_migrated)
7485{
7486 this->mj_env->timerStart(MACRO_TIMERS,
7487 mj_timer_base_string + "Part_Assignment");
7488
7489 auto local_part_xadj = part_xadj;
7490 auto local_mj_keep_part_boxes = mj_keep_part_boxes;
7491 auto local_coordinate_permutations = coordinate_permutations;
7492 auto local_assigned_part_ids = assigned_part_ids;
7493
7494 if(local_mj_keep_part_boxes) {
7495 for(int i = 0; i < current_num_parts; ++i) {
7496 (*output_part_boxes)[i].setpId(i + output_part_begin_index);
7497 }
7498 }
7499
7500 Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy(
7501 current_num_parts, Kokkos::AUTO());
7502 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
7503 member_type member_type;
7504 Kokkos::parallel_for(policy, KOKKOS_LAMBDA(member_type team_member) {
7505 int i = team_member.league_rank();
7506 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, (i != 0) ?
7507 local_part_xadj(i-1) : 0, local_part_xadj(i)),
7508 [=] (mj_lno_t ii) {
7509 mj_lno_t k = local_coordinate_permutations(ii);
7510 local_assigned_part_ids(k) = i + output_part_begin_index;
7511 });
7512 });
7513
7514 if(is_data_ever_migrated) {
7515#ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
7516 if(sizeof(mj_lno_t) <= sizeof(int)) {
7517
7518 // Cannot use Zoltan_Comm with local ordinals larger than ints.
7519 // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
7520 // may overflow.
7521
7522 // if data is migrated, then send part numbers to the original owners.
7523 ZOLTAN_COMM_OBJ *plan = NULL;
7524 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->mj_problemComm));
7525
7526 int incoming = 0;
7527 int message_tag = 7856;
7528
7529 this->mj_env->timerStart(MACRO_TIMERS,
7530 mj_timer_base_string + "Final Z1PlanCreating");
7531
7532 // setup incoming count
7533 int ierr = Zoltan_Comm_Create( &plan, int(this->num_local_coords),
7534 this->owner_of_coordinate.data(), mpi_comm, message_tag, &incoming);
7535
7536 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7537 this->mj_env->timerStop(MACRO_TIMERS,
7538 mj_timer_base_string + "Final Z1PlanCreating" );
7539
7540 this->mj_env->timerStart(MACRO_TIMERS,
7541 mj_timer_base_string + "Final Z1PlanComm");
7542
7543 // MPI Buffers should be on Kokkos::HostSpace not Kokkos::CudaUVMSpace
7544 // Note, with UVM space, create_mirror_view does NOT create a non-UVM
7545 // view; need the explicit Host creation and deep_copy.
7546
7547 // migrate gnos to actual owners.
7548 auto host_current_mj_gnos = Kokkos::create_mirror_view(
7549 Kokkos::HostSpace(), this->current_mj_gnos);
7550 deep_copy(host_current_mj_gnos, this->current_mj_gnos);
7551 Kokkos::View<mj_gno_t*, device_t> dst_gnos(
7552 Kokkos::ViewAllocateWithoutInitializing("dst_gnos"), incoming);
7553 auto host_dst_gnos = Kokkos::create_mirror_view(
7554 Kokkos::HostSpace(), dst_gnos);
7555 message_tag++;
7556 ierr = Zoltan_Comm_Do( plan, message_tag,
7557 (char *) host_current_mj_gnos.data(),
7558 sizeof(mj_gno_t), (char *) host_dst_gnos.data());
7559 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7560 Kokkos::deep_copy(dst_gnos, host_dst_gnos);
7561 this->current_mj_gnos = dst_gnos;
7562
7563 // migrate part ids to actual owners.
7564 auto host_src_part_ids = Kokkos::create_mirror_view(
7565 Kokkos::HostSpace(), this->assigned_part_ids);
7566 deep_copy(host_src_part_ids, this->assigned_part_ids);
7567 Kokkos::View<mj_part_t*, device_t> dst_part_ids(
7568 Kokkos::ViewAllocateWithoutInitializing("dst_part_ids"), incoming);
7569 auto host_dst_part_ids = Kokkos::create_mirror_view(
7570 Kokkos::HostSpace(), dst_part_ids);
7571 message_tag++;
7572 ierr = Zoltan_Comm_Do( plan, message_tag,
7573 (char *) host_src_part_ids.data(),
7574 sizeof(mj_part_t), (char *) host_dst_part_ids.data());
7575 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7576 Kokkos::deep_copy(dst_part_ids, host_dst_part_ids);
7577 this->assigned_part_ids = dst_part_ids;
7578
7579 ierr = Zoltan_Comm_Destroy(&plan);
7580 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7581
7582 this->num_local_coords = incoming;
7583
7584 this->mj_env->timerStop(MACRO_TIMERS,
7585 mj_timer_base_string + "Final Z1PlanComm");
7586 }
7587 else
7588#endif // ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
7589 {
7590 // setup incoming count
7591 this->mj_env->timerStart(MACRO_TIMERS,
7592 mj_timer_base_string + "Final DistributorPlanCreating");
7593 Tpetra::Distributor distributor(this->mj_problemComm);
7594 ArrayView<const mj_part_t> owners_of_coords(
7595 this->owner_of_coordinate.data(), this->num_local_coords);
7596 mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
7597 this->mj_env->timerStop(MACRO_TIMERS,
7598 mj_timer_base_string + "Final DistributorPlanCreating" );
7599
7600 this->mj_env->timerStart(MACRO_TIMERS,
7601 mj_timer_base_string + "Final DistributorPlanComm");
7602
7603 // migrate gnos to actual owners.
7604 // MPI buffers should be Kokkos::HostSpace, not Kokkos::CudaUVMSpace
7605 // Note, with UVM space, create_mirror_view does NOT create a non-UVM
7606 // view; need the explicit Host creation and deep_copy.
7607 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
7608 Kokkos::ViewAllocateWithoutInitializing("sent_gnos"),
7609 this->current_mj_gnos.extent(0));
7610 Kokkos::deep_copy(sent_gnos, this->current_mj_gnos);
7611
7612 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
7613 Kokkos::ViewAllocateWithoutInitializing("received_gnos"),
7614 incoming);
7615
7616 distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
7617
7618 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
7619 Kokkos::ViewAllocateWithoutInitializing("current_mj_gnos"), incoming);
7620
7621 Kokkos::deep_copy(this->current_mj_gnos, received_gnos);
7622
7623 // migrate part ids to actual owners.
7624 Kokkos::View<mj_part_t *, Kokkos::HostSpace> sent_partids(
7625 Kokkos::ViewAllocateWithoutInitializing("sent_partids"),
7626 this->assigned_part_ids.extent(0));
7627 Kokkos::deep_copy(sent_partids, this->assigned_part_ids);
7628
7629 Kokkos::View<mj_part_t *, Kokkos::HostSpace> received_partids(
7630 Kokkos::ViewAllocateWithoutInitializing("received_partids"),
7631 incoming);
7632
7633 distributor.doPostsAndWaits(sent_partids, 1, received_partids);
7634
7635 this->assigned_part_ids =
7636 Kokkos::View<mj_part_t*, device_t>(
7637 Kokkos::ViewAllocateWithoutInitializing("assigned_part_ids"),
7638 incoming);
7639
7640 Kokkos::deep_copy(this->assigned_part_ids, received_partids);
7641 this->num_local_coords = incoming;
7642
7643 this->mj_env->timerStop(MACRO_TIMERS,
7644 mj_timer_base_string + "Final DistributorPlanComm");
7645 }
7646 }
7647
7648 this->mj_env->timerStop(MACRO_TIMERS,
7649 mj_timer_base_string + "Part_Assignment");
7650
7651 this->mj_env->timerStart(MACRO_TIMERS,
7652 mj_timer_base_string + "Solution_Part_Assignment");
7653
7654 // ArrayRCP<mj_part_t> partId;
7655 // partId = arcp(this->assigned_part_ids, 0, this->num_local_coords, true);
7656
7657 if(this->mj_keep_part_boxes) {
7658 this->kept_boxes = compute_global_box_boundaries(output_part_boxes);
7659 }
7660
7661 this->mj_env->timerStop(MACRO_TIMERS,
7662 mj_timer_base_string + "Solution_Part_Assignment");
7663}
7664
7677template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7678 typename mj_part_t, typename mj_node_t>
7679void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7680  set_partitioning_parameters(
7681 bool distribute_points_on_cut_lines_,
7682 int max_concurrent_part_calculation_,
7683 int check_migrate_avoid_migration_option_,
7684 double minimum_migration_imbalance_,
7685 int migration_type_)
7686{
7687 this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
7688 this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
7689 this->check_migrate_avoid_migration_option =
7690 check_migrate_avoid_migration_option_;
7691 this->minimum_migration_imbalance = minimum_migration_imbalance_;
7692 this->migration_type = migration_type_;
7693}
7694
7722template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7723 typename mj_part_t, typename mj_node_t>
7724void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7725  multi_jagged_part(
7726 const RCP<const Environment> &env,
7727 RCP<const Comm<int> > &problemComm,
7728 double imbalance_tolerance_,
7729 int num_teams_,
7730 size_t num_global_parts_,
7731 Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array_,
7732 int recursion_depth_,
7733 int coord_dim_,
7734 mj_lno_t num_local_coords_,
7735 mj_gno_t num_global_coords_,
7736 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
7737 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
7738 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
7739 int num_weights_per_coord_,
7740 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights_,
7741 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
7742 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts_,
7743 Kokkos::View<mj_part_t *, device_t> & result_assigned_part_ids_,
7744 Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos_)
7745{
7746
7747 // see comment above for Zoltan2_AlgMJ_TrackCallsCounter
7748  int execute_counter = Zoltan2_AlgMJ_TrackCallsCounter::get_counter_AlgMJ();
7749 this->mj_timer_base_string = "MJ(" + std::to_string(execute_counter) + ") - ";
7750
7751 this->mj_env = env;
7752 this->mj_problemComm = problemComm;
7753 this->myActualRank = this->myRank = this->mj_problemComm->getRank();
7754 this->mj_env->timerStart(MACRO_TIMERS,
7755 mj_timer_base_string + "Total");
7756 this->mj_env->debug(3, "In MultiJagged Jagged");
7757 this->imbalance_tolerance = imbalance_tolerance_;
7758 this->mj_num_teams = num_teams_;
7759 this->num_global_parts = num_global_parts_;
7760 this->part_no_array = part_no_array_;
7761 this->recursion_depth = recursion_depth_;
7762 this->coord_dim = coord_dim_;
7763 this->num_local_coords = num_local_coords_;
7764 this->num_global_coords = num_global_coords_;
7765 this->mj_coordinates = mj_coordinates_;
7766 this->initial_mj_gnos = initial_mj_gnos_;
7767 this->num_weights_per_coord = num_weights_per_coord_;
7768 this->mj_uniform_weights = mj_uniform_weights_;
7769 this->mj_weights = mj_weights_;
7770 this->mj_uniform_parts = mj_uniform_parts_;
7771
7772 // this->set_input_data();
7773
7774 this->set_part_specifications();
7775
7776 this->mj_env->timerStart(MACRO_TIMERS,
7777 mj_timer_base_string + "Allocate Views");
7778 this->allocate_set_work_memory();
7779 this->mj_env->timerStop(MACRO_TIMERS,
7780 mj_timer_base_string + "Allocate Views");
7781
7782 // We duplicate the comm as we create subcommunicators during migration.
7783 // We keep the problemComm as it is, while comm changes after each migration.
7784 this->comm = this->mj_problemComm->duplicate();
7785
7786#ifdef print_debug
7787 if(comm->getRank() == 0) {
7788 std::cout << "size of gno:" << sizeof(mj_gno_t) << std::endl;
7789 std::cout << "size of lno:" << sizeof(mj_lno_t) << std::endl;
7790 std::cout << "size of mj_scalar_t:" << sizeof(mj_scalar_t) << std::endl;
7791 }
7792#endif
7793
7794 // initially there is a single partition
7795 mj_part_t current_num_parts = 1;
7796 Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
7797 this->all_cut_coordinates;
7798 this->mj_env->timerStart(MACRO_TIMERS,
7799 mj_timer_base_string + "Problem_Partitioning");
7800 mj_part_t output_part_begin_index = 0;
7801 mj_part_t future_num_parts = this->total_num_part;
7802 bool is_data_ever_migrated = false;
7803
7804 std::vector<mj_part_t> *future_num_part_in_parts =
7805 new std::vector<mj_part_t> ();
7806 std::vector<mj_part_t> *next_future_num_parts_in_parts =
7807 new std::vector<mj_part_t> ();
7808
7809 next_future_num_parts_in_parts->push_back(this->num_global_parts);
7810
7811 RCP<mj_partBoxVector_t> input_part_boxes;
7812 RCP<mj_partBoxVector_t> output_part_boxes;
7813
7814 if(this->mj_keep_part_boxes) {
7815 input_part_boxes = RCP<mj_partBoxVector_t>(new mj_partBoxVector_t(), true);
7816 output_part_boxes = RCP<mj_partBoxVector_t>(new mj_partBoxVector_t(), true);
7817 compute_global_box();
7818 this->init_part_boxes(output_part_boxes);
7819 }
7820
7821 auto local_part_xadj = this->part_xadj;
7822
7823 // Need a device counter - how best to allocate?
7824 // Putting this allocation in the loops is very costly so moved out here.
7825 Kokkos::View<mj_part_t*, device_t>
7826 view_rectilinear_cut_count("view_rectilinear_cut_count", 1);
7827 Kokkos::View<size_t*, device_t>
7828 view_total_reduction_size("view_total_reduction_size", 1);
7829
7830 for(int i = 0; i < this->recursion_depth; ++i) {
7831
7832 // convert i to string to be used for debugging purposes.
7833 std::string istring = std::to_string(i);
7834
7835    // next_future_num_parts_in_parts will have one entry per output part,
7836    // holding how many more parts each output part should be divided into.
7837    // this array is also used to determine the weight ratios of the parts.
7838    // swap the arrays to reuse them iteratively.
7839 std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
7840 future_num_part_in_parts = next_future_num_parts_in_parts;
7841 next_future_num_parts_in_parts = tmpPartVect;
7842
7843 // clear next_future_num_parts_in_parts array as
7844 // getPartitionArrays expects it to be empty.
7845 next_future_num_parts_in_parts->clear();
7846 if(this->mj_keep_part_boxes) {
7847 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7848 input_part_boxes = output_part_boxes;
7849 output_part_boxes = tmpPartBoxes;
7850 output_part_boxes->clear();
7851 }
7852
7853 // returns the total no. of output parts for this dimension partitioning.
7854 mj_part_t output_part_count_in_dimension =
7855 this->update_part_num_arrays(
7856 future_num_part_in_parts,
7857 next_future_num_parts_in_parts,
7858 future_num_parts,
7859 current_num_parts,
7860 i,
7861 input_part_boxes,
7862 output_part_boxes, 1);
7863
7864    // if the number of obtained parts equals the current number of parts,
7865    // skip this dimension. For example, this happens when a 1 is given in
7866    // the input part array, e.g. P=4,5,1,2.
7867 if(output_part_count_in_dimension == current_num_parts) {
7868      // still need to swap the input and output arrays.
7869 tmpPartVect= future_num_part_in_parts;
7870 future_num_part_in_parts = next_future_num_parts_in_parts;
7871 next_future_num_parts_in_parts = tmpPartVect;
7872
7873 if(this->mj_keep_part_boxes) {
7874 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7875 input_part_boxes = output_part_boxes;
7876 output_part_boxes = tmpPartBoxes;
7877 }
7878 continue;
7879 }
7880
7881 // get the coordinate axis along which the partitioning will be done.
7882 int coordInd = i % this->coord_dim;
7883
7884 Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
7885 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
7886
7887 this->mj_env->timerStart(MACRO_TIMERS,
7888 mj_timer_base_string + "Problem_Partitioning_" + istring);
7889
7890    // allocate memory for the indices that mark where each part begins
7891    // in the permutation array.
7892 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
7893 "new part xadj", output_part_count_in_dimension);
7894
7895    // the index in new_part_xadj at which the next value will be written.
7896 mj_part_t output_part_index = 0;
7897
7898    // whatever is written at output_part_index will be offset by
7899    // output_coordinate_end_index so that the point indices are shifted.
7900 mj_part_t output_coordinate_end_index = 0;
7901
7902 mj_part_t current_work_part = 0;
7903 mj_part_t current_concurrent_num_parts =
7904 std::min(current_num_parts - current_work_part,
7905 this->max_concurrent_part_calculation);
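    // (illustrative values: with current_num_parts = 10 and
    // max_concurrent_part_calculation = 4, the loop below processes the
    // chunks [0..3], [4..7] and [8..9])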
7906
7907 mj_part_t obtained_part_index = 0;
7908
7909 auto host_process_local_min_max_coord_total_weight =
7910 Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
7911 auto host_global_min_max_coord_total_weight =
7912 Kokkos::create_mirror_view(global_min_max_coord_total_weight);
7913
7914 // run for all available parts.
7915 for(; current_work_part < current_num_parts;
7916 current_work_part += current_concurrent_num_parts) {
7917
7918 current_concurrent_num_parts =
7919 std::min(current_num_parts - current_work_part,
7920 this->max_concurrent_part_calculation);
7921
7922 int bDoingWork_int; // Can't reduce on bool so use int
7923 auto local_device_num_partitioning_in_current_dim =
7924 device_num_partitioning_in_current_dim;
7925 Kokkos::parallel_reduce("Read bDoingWork",
7926 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
7927 KOKKOS_LAMBDA(int dummy, int & set_single) {
7928 set_single = 0;
7929 for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
7930 if(local_device_num_partitioning_in_current_dim(
7931 current_work_part + kk) != 1) {
7932 set_single = 1;
7933 break;
7934 }
7935 }
7936 }, bDoingWork_int);
7937      bool bDoingWork = (bDoingWork_int != 0);
7938
7939 this->mj_get_local_min_max_coord_totW(
7940 current_work_part,
7941 current_concurrent_num_parts,
7942 mj_current_dim_coords);
7943
7944 // 1D partitioning
7945 if(bDoingWork) {
7946        // obtain the global min/max of the part.
7947 this->mj_get_global_min_max_coord_totW(
7948 current_concurrent_num_parts,
7949 this->process_local_min_max_coord_total_weight,
7950 this->global_min_max_coord_total_weight);
7951
7952 // represents the total number of cutlines
7953 // whose coordinate should be determined.
7954 mj_part_t total_incomplete_cut_count = 0;
7955
7956 // Compute weight ratios for parts & cuts:
7957 // e.g., 0.25 0.25 0.5 0.5 0.75 0.75 1
7958 // part0 cut0 part1 cut1 part2 cut2 part3
7959 mj_part_t concurrent_part_cut_shift = 0;
7960 mj_part_t concurrent_part_part_shift = 0;
7961
7962 for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
7963
7964 Kokkos::deep_copy(host_global_min_max_coord_total_weight,
7965 global_min_max_coord_total_weight);
7966
7967 mj_scalar_t min_coordinate =
7968 host_global_min_max_coord_total_weight(kk);
7969 mj_scalar_t max_coordinate =
7970 host_global_min_max_coord_total_weight(
7971 kk + current_concurrent_num_parts);
7972
7973 mj_scalar_t global_total_weight =
7974 host_global_min_max_coord_total_weight(
7975 kk + 2 * current_concurrent_num_parts);
7976
7977 mj_part_t concurrent_current_part_index = current_work_part + kk;
7978
7979 mj_part_t partition_count = host_num_partitioning_in_current_dim(
7980 concurrent_current_part_index);
7981
7982 Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
7983 Kokkos::subview(current_cut_coordinates,
7984 std::pair<mj_lno_t, mj_lno_t>(
7985 concurrent_part_cut_shift, current_cut_coordinates.size()));
7986 Kokkos::View<mj_scalar_t *, device_t>
7987 current_target_part_weights =
7988 Kokkos::subview(target_part_weights,
7989 std::pair<mj_lno_t, mj_lno_t>(
7990 concurrent_part_part_shift, target_part_weights.size()));
7991
7992          // advance the cut-coordinate offset by the number of cuts.
7993 concurrent_part_cut_shift += partition_count - 1;
7994          // advance the part-weight offset by the number of parts.
7995 concurrent_part_part_shift += partition_count;
7996
7997 // calculate only if part is not empty,
7998 // and part will be further partitioned.
7999 if(partition_count > 1 && min_coordinate <= max_coordinate) {
8000
8001            // increase the total number of cuts to be determined by the
8002            // current part's cut count.
8003 total_incomplete_cut_count += partition_count - 1;
8004
8005 this->incomplete_cut_count(kk) = partition_count - 1;
8006
8007 // get the target weights of the parts
8008 this->mj_get_initial_cut_coords_target_weights(
8009 min_coordinate,
8010 max_coordinate,
8011 partition_count - 1,
8012 global_total_weight,
8013 usedCutCoordinate,
8014 current_target_part_weights,
8015 future_num_part_in_parts,
8016 next_future_num_parts_in_parts,
8017 concurrent_current_part_index,
8018 obtained_part_index);
8019
8020 mj_lno_t coordinate_end_index =
8021 host_part_xadj(concurrent_current_part_index);
8022 mj_lno_t coordinate_begin_index =
8023 concurrent_current_part_index==0 ? 0 :
8024 host_part_xadj(concurrent_current_part_index - 1);
8025
8026 this->set_initial_coordinate_parts(
8027 max_coordinate,
8028 min_coordinate,
8029 coordinate_begin_index, coordinate_end_index,
8030 this->coordinate_permutations,
8031 mj_current_dim_coords,
8032 this->assigned_part_ids,
8033 partition_count);
8034 }
8035 else {
8036            // e.g., if we have fewer coordinates than parts, we don't need
8037            // to do the next dimension.
8038 this->incomplete_cut_count(kk) = 0;
8039 }
8040
8041 obtained_part_index += partition_count;
8042 }
8043
8044        // the imbalance used here is always 0, as it is difficult to
8045        // estimate a suitable range.
8046 double used_imbalance = 0;
8047        // Determine cut lines for all concurrent parts here.
8048 this->mj_env->timerStart(MACRO_TIMERS,
8049 mj_timer_base_string + "Problem_Partitioning Get Part Weights");
8050
8051 this->mj_1D_part(
8052 mj_current_dim_coords,
8053 used_imbalance,
8054 current_work_part,
8055 current_concurrent_num_parts,
8056 current_cut_coordinates,
8057 total_incomplete_cut_count,
8058 view_rectilinear_cut_count,
8059 view_total_reduction_size);
8060
8061 this->mj_env->timerStop(MACRO_TIMERS,
8062 mj_timer_base_string + "Problem_Partitioning Get Part Weights");
8063 }
8064
8065 // create new part chunks
8066 {
8067 mj_part_t output_array_shift = 0;
8068 mj_part_t cut_shift = 0;
8069 size_t tlr_shift = 0;
8070 size_t partweight_array_shift = 0;
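        // (layout note: per part the weight work array holds
        // 2*(num_parts-1)+1 interleaved entries, part0 cut0 part1 cut1 ...,
        // and the tlr array adds the left/right closest coordinates of each
        // cut for 4*(num_parts-1)+1 entries; the shifts below step over
        // these blocks)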
8071 for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
8072
8073 mj_part_t current_concurrent_work_part = current_work_part + kk;
8074
8075 mj_part_t num_parts = host_num_partitioning_in_current_dim(
8076 current_concurrent_work_part);
8077
8078 // if the part is empty, skip the part.
8079 int coordinateA_bigger_than_coordinateB =
8080 host_global_min_max_coord_total_weight(kk) >
8081 host_global_min_max_coord_total_weight(
8082 kk + current_concurrent_num_parts);
8083
8084 if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
8085            // we still need to write the begin and end points of the empty part.
8086            // simply set them to zero; the array indices will be shifted later.
8087 auto local_new_part_xadj = this->new_part_xadj;
8088 Kokkos::parallel_for(
8089 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8090 (0, num_parts), KOKKOS_LAMBDA (mj_part_t jj) {
8091 local_new_part_xadj(
8092 output_part_index + output_array_shift + jj) = 0;
8093 });
8094
8095 cut_shift += num_parts - 1;
8096 tlr_shift += (4 *(num_parts - 1) + 1);
8097 output_array_shift += num_parts;
8098 partweight_array_shift += (2 * (num_parts - 1) + 1);
8099 continue;
8100 }
8101
8102 Kokkos::View<mj_scalar_t *, device_t>
8103 current_concurrent_cut_coordinate =
8104 Kokkos::subview(current_cut_coordinates,
8105 std::pair<mj_lno_t, mj_lno_t>(
8106 cut_shift,
8107 current_cut_coordinates.size()));
8108 Kokkos::View<mj_scalar_t *, device_t>
8109 used_local_cut_line_weight_to_left =
8110 Kokkos::subview(process_cut_line_weight_to_put_left,
8111 std::pair<mj_lno_t, mj_lno_t>(
8112 cut_shift,
8113 process_cut_line_weight_to_put_left.size()));
8114
8115 this->thread_part_weight_work =
8116 Kokkos::subview(
8117 this->thread_part_weights,
8118 std::pair<mj_lno_t, mj_lno_t>(
8119 partweight_array_shift,
8120 this->thread_part_weights.extent(0)));
8121
8122 if(num_parts > 1) {
8123 if(this->mj_keep_part_boxes) {
8124              // if part boxes are to be stored, update the boundaries.
8125 for(mj_part_t j = 0; j < num_parts - 1; ++j) {
8126 mj_scalar_t temp_get_val;
8127 Kokkos::parallel_reduce("Read single",
8128 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8129 KOKKOS_LAMBDA(int dummy, mj_scalar_t & set_single) {
8130 set_single = current_concurrent_cut_coordinate(j);
8131 }, temp_get_val);
8132 (*output_part_boxes)
8133 [output_array_shift + output_part_index + j].
8134 updateMinMax(temp_get_val, 1 /*update max*/, coordInd);
8135 (*output_part_boxes)
8136 [output_array_shift + output_part_index + j + 1].
8137                  updateMinMax(temp_get_val, 0 /*update min*/, coordInd);
8138 }
8139 }
8140
8141 // Rewrite the indices based on the computed cuts.
8142 Kokkos::View<mj_lno_t*, device_t> sub_new_part_xadj =
8143 Kokkos::subview(this->new_part_xadj,
8144 std::pair<mj_lno_t, mj_lno_t>(
8145 output_part_index + output_array_shift,
8146 this->new_part_xadj.size()));
8147
8148 this->mj_create_new_partitions(
8149 num_parts,
8150 current_concurrent_work_part,
8151 mj_current_dim_coords,
8152 current_concurrent_cut_coordinate,
8153 used_local_cut_line_weight_to_left,
8154 sub_new_part_xadj);
8155 }
8156 else {
8157
8158 mj_lno_t coordinate_end = host_part_xadj(
8159 current_concurrent_work_part);
8160 mj_lno_t coordinate_begin =
8161 current_concurrent_work_part==0 ? 0 : host_part_xadj(
8162 current_concurrent_work_part - 1);
8163
8164            // if this part is partitioned into a single part, just copy
8165            // the old values.
8166 mj_lno_t part_size = coordinate_end - coordinate_begin;
8167
8168 // Awkward here to set one value - need some broader
8169 // refactoring to improve this one.
8170 auto local_new_part_xadj = this->new_part_xadj;
8171 Kokkos::parallel_for(
8172 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
8173 (0, 1), KOKKOS_LAMBDA (int dummy) {
8174 local_new_part_xadj(
8175 output_part_index + output_array_shift) = part_size;
8176 });
8177
8178 auto subview_new_coordinate_permutations =
8179 Kokkos::subview(this->new_coordinate_permutations,
8180 std::pair<mj_lno_t, mj_lno_t>(
8181 coordinate_begin,
8182 coordinate_begin + part_size));
8183 auto subview_coordinate_permutations =
8184 Kokkos::subview(this->coordinate_permutations,
8185 std::pair<mj_lno_t, mj_lno_t>(
8186 coordinate_begin,
8187 coordinate_begin + part_size));
8188 Kokkos::deep_copy(subview_new_coordinate_permutations,
8189 subview_coordinate_permutations);
8190 }
8191 cut_shift += num_parts - 1;
8192 output_array_shift += num_parts;
8193 partweight_array_shift += (2 * (num_parts - 1) + 1);
8194 }
8195
8196        // shift cut coordinates so that all cut coordinates are stored.
8197        // no shift now because we don't keep the cuts.
8198        // current_cut_coordinates += cut_shift;
8199        // mj_create_new_partitions partitioned the coordinates into parts
8200        // and wrote the indices as if there were a single part.
8201        // now we need to shift the beginning indices.
8202 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
8203 mj_part_t num_parts =
8204 host_num_partitioning_in_current_dim(current_work_part + kk);
8205
8206 // These two kernels are a bit awkward but need broader redesign to
8207 // avoid this situation.
8208 auto local_new_part_xadj = this->new_part_xadj;
8209 Kokkos::parallel_for(
8210 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8211 (0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
8212 local_new_part_xadj(output_part_index+ii) +=
8213 output_coordinate_end_index;
8214 });
8215
8216          // update the running end index with the current chunk's last end point.
8217 mj_part_t temp_get;
8218 Kokkos::parallel_reduce("Read single",
8219 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8220 KOKKOS_LAMBDA(int dummy, mj_part_t & set_single) {
8221 set_single =
8222 local_new_part_xadj(output_part_index + num_parts - 1);
8223 }, temp_get);
8224 output_coordinate_end_index = temp_get;
8225          // advance the current output part index.
8226 output_part_index += num_parts;
8227 }
8228 }
8229 }
8230
8231 // end of this partitioning dimension
8232 int current_world_size = this->comm->getSize();
8233 long migration_reduce_all_population =
8234 this->total_dim_num_reduce_all * current_world_size;
8235 bool is_migrated_in_current_dimension = false;
8236
8237    // we migrate if there are more partitionings to be done after this
8238    // step, if migration is not forced to be avoided,
8239    // and if the operation is not sequential.
8240 if(future_num_parts > 1 &&
8241 this->check_migrate_avoid_migration_option >= 0 &&
8242 current_world_size > 1) {
8243 this->mj_env->timerStart(MACRO_TIMERS,
8244 mj_timer_base_string + "Problem_Migration-" + istring);
8245 mj_part_t num_parts = output_part_count_in_dimension;
8246
8247 if(this->mj_perform_migration(
8248 num_parts,
8249 current_num_parts, //output
8250 next_future_num_parts_in_parts, //output
8251 output_part_begin_index,
8252 migration_reduce_all_population,
8253 this->num_global_coords / (future_num_parts * current_num_parts),
8254 istring,
8255 input_part_boxes, output_part_boxes) )
8256 {
8257 is_migrated_in_current_dimension = true;
8258 is_data_ever_migrated = true;
8259 this->mj_env->timerStop(MACRO_TIMERS,
8260 mj_timer_base_string + "Problem_Migration-" + istring);
8261 // since data is migrated, we reduce the number of reduceAll
8262 // operations for the last part.
8263 this->total_dim_num_reduce_all /= num_parts;
8264 }
8265 else {
8266 is_migrated_in_current_dimension = false;
8267 this->mj_env->timerStop(MACRO_TIMERS,
8268 mj_timer_base_string + "Problem_Migration-" + istring);
8269 }
8270 }
8271
8272 // swap the coordinate permutations for the next dimension.
8273 Kokkos::View<mj_lno_t*, device_t> tmp =
8274 this->coordinate_permutations;
8275 this->coordinate_permutations =
8276 this->new_coordinate_permutations;
8277
8278 this->new_coordinate_permutations = tmp;
8279 if(!is_migrated_in_current_dimension) {
8280 this->total_dim_num_reduce_all -= current_num_parts;
8281 current_num_parts = output_part_count_in_dimension;
8282 }
8283
8284 {
8285 this->part_xadj = this->new_part_xadj;
8286 local_part_xadj = this->new_part_xadj;
8287 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
8288 Kokkos::deep_copy(host_part_xadj, part_xadj); // keep in sync
8289
8290 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>("empty", 0);
8291 this->mj_env->timerStop(MACRO_TIMERS,
8292 mj_timer_base_string + "Problem_Partitioning_" + istring);
8293 }
8294 }
8295
8296 // Partitioning is done
8297 delete future_num_part_in_parts;
8298 delete next_future_num_parts_in_parts;
8299 this->mj_env->timerStop(MACRO_TIMERS,
8300 mj_timer_base_string + "Problem_Partitioning");
8302
8303 //get the final parts of each initial coordinate
8304 //the results will be written to
8305 //this->assigned_part_ids for gnos given in this->current_mj_gnos
8306 this->set_final_parts(
8307 current_num_parts,
8308 output_part_begin_index,
8309 output_part_boxes,
8310 is_data_ever_migrated);
8311
8312 result_assigned_part_ids_ = this->assigned_part_ids;
8313 result_mj_gnos_ = this->current_mj_gnos;
8314 this->mj_env->timerStop(MACRO_TIMERS,
8315 mj_timer_base_string + "Total");
8316 this->mj_env->debug(3, "Out of MultiJagged");
8317}
8318
8319template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
8320 typename mj_part_t, typename mj_node_t>
8321RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8322 mj_partBoxVector_t>
8323 AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,mj_node_t>::
8324 get_kept_boxes() const
8325{
8326 if(this->mj_keep_part_boxes) {
8327 return this->kept_boxes;
8328 }
8329 else {
8330 throw std::logic_error("Error: part boxes are not stored.");
8331 }
8332}
8333
8334template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
8335 typename mj_part_t, typename mj_node_t>
8336RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8337 mj_partBoxVector_t>
8338 AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,mj_node_t>::
8339 compute_global_box_boundaries(RCP<mj_partBoxVector_t> &localPartBoxes) const
8340{
8341 typedef typename Zoltan2::coordinateModelPartBox::coord_t coord_t;
8342 mj_part_t ntasks = this->num_global_parts;
8343 int dim = (*localPartBoxes)[0].getDim();
8344 coord_t *localPartBoundaries = new coord_t[ntasks * 2 * dim];
8345
8346 memset(localPartBoundaries, 0, sizeof(coord_t) * ntasks * 2 * dim);
8347
8348 coord_t *globalPartBoundaries = new coord_t[ntasks * 2 * dim];
8349 memset(globalPartBoundaries, 0, sizeof(coord_t) * ntasks * 2 * dim);
8350
8351 coord_t *localPartMins = localPartBoundaries;
8352 coord_t *localPartMaxs = localPartBoundaries + ntasks * dim;
8353
8354 coord_t *globalPartMins = globalPartBoundaries;
8355 coord_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
8356
8357 mj_part_t boxCount = localPartBoxes->size();
8358 for(mj_part_t i = 0; i < boxCount; ++i) {
8359 mj_part_t pId = (*localPartBoxes)[i].getpId();
8360
8361 // cout << "me:" << comm->getRank() << " has:" << pId << endl;
8362
8363 coord_t *lmins = (*localPartBoxes)[i].getlmins();
8364 coord_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
8365
8366 for(int j = 0; j < dim; ++j) {
8367 localPartMins[dim * pId + j] = lmins[j];
8368 localPartMaxs[dim * pId + j] = lmaxs[j];
8369
8370 /*
8371 std::cout << "me:" << comm->getRank() <<
8372 " dim * pId + j:"<< dim * pId + j <<
8373 " localMin:" << localPartMins[dim * pId + j] <<
8374 " localMax:" << localPartMaxs[dim * pId + j] << std::endl;
8375 */
8376 }
8377 }
8378
8379 Teuchos::Zoltan2_BoxBoundaries<int, coord_t> reductionOp(ntasks * 2 * dim);
8380
8381 reduceAll<int, coord_t>(*mj_problemComm, reductionOp,
8382 ntasks * 2 * dim, localPartBoundaries, globalPartBoundaries);
8383
8384 RCP<mj_partBoxVector_t> pB(new mj_partBoxVector_t(),true);
8385 for(mj_part_t i = 0; i < ntasks; ++i) {
8386 coordinateModelPartBox tpb(i, dim,
8387 globalPartMins + dim * i,
8388 globalPartMaxs + dim * i);
8389
8390 /*
8391 for(int j = 0; j < dim; ++j) {
8392 std::cout << "me:" << comm->getRank() <<
8393 " dim * pId + j:"<< dim * i + j <<
8394 " globalMin:" << globalPartMins[dim * i + j] <<
8395 " globalMax:" << globalPartMaxs[dim * i + j] << std::endl;
8396 }
8397 */
8398
8399 pB->push_back(tpb);
8400 }
8401 delete []localPartBoundaries;
8402 delete []globalPartBoundaries;
8403 //RCP <mj_partBoxVector_t> tmpRCPBox(pB, true);
8404 return pB;
8405}
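// Worked illustration of the boundary exchange above (hypothetical
// numbers): with 2 ranks, 2 parts and dim = 1, rank 0 owning part 0 with
// bounds [0.5, 1.5] and rank 1 owning part 1 with [2.5, 3.5], each rank
// leaves the slots of parts it does not own at exactly 0. After the
// element-wise reduceAll every rank holds
//   globalPartMins = { 0.5, 2.5 },  globalPartMaxs = { 1.5, 3.5 },
// since the reduction keeps the nonzero entry for each slot.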
8406
8407 /*! \brief Multi Jagged coordinate partitioning algorithm.
8408 */
8409template <typename Adapter>
8410class Zoltan2_AlgMJ : public Algorithm<Adapter>
8411{
8412
8413private:
8414
8415#ifndef DOXYGEN_SHOULD_SKIP_THIS
8416 // For coordinates and weights, MJ needs floats or doubles
8417 // But Adapter can provide other scalars, e.g., ints.
8418 // So have separate scalar_t for MJ and adapter.
8419 typedef typename Adapter::scalar_t adapter_scalar_t;
8420
8421 // Provide a default type for mj_scalar_t;
8422 typedef float default_mj_scalar_t;
8423
8424 // If Adapter provided float or double scalar_t, use it (prevents copies).
8425 // Otherwise, use the default type of mj_scalar_t;
8426 typedef typename
8427 std::conditional<
8428 (std::is_same<adapter_scalar_t, float>::value ||
8429 std::is_same<adapter_scalar_t, double>::value),
8430 adapter_scalar_t, default_mj_scalar_t>::type mj_scalar_t;
8431
8432 typedef typename Adapter::gno_t mj_gno_t;
8433 typedef typename Adapter::lno_t mj_lno_t;
8434 typedef typename Adapter::part_t mj_part_t;
8435 typedef typename Adapter::node_t mj_node_t;
8436 typedef coordinateModelPartBox mj_partBox_t;
8437 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
8438 typedef typename mj_node_t::device_type device_t;
8439#endif
8440
8441 AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t> mj_partitioner;
8442
8443 RCP<const Environment> mj_env; // the environment object
8444 RCP<const Comm<int> > mj_problemComm; // initial comm object
8445 RCP<const typename Adapter::base_adapter_t> mj_adapter; // coordinate adapter
8446
8447 // PARAMETERS
8448 double imbalance_tolerance; // input imbalance tolerance.
8449
8450 int num_teams; // how many teams to run main loop with
8451
8452 size_t num_global_parts; // the targeted number of parts
8453
8454 // input part array specifying num part to divide along each dim.
8455 Kokkos::View<mj_part_t*, Kokkos::HostSpace> part_no_array;
8456
8457 // the number of steps that partitioning will be solved in.
8458 int recursion_depth;
8459
8460 int coord_dim; // coordinate dimension.
8461 mj_lno_t num_local_coords; //number of local coords.
8462 mj_gno_t num_global_coords; //number of global coords.
8463
8464 // initial global ids of the coordinates.
8465 Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
8466
8467 // two dimension coordinate array.
8468 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8469 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8470 mj_coordinates;
8471
8472 int num_weights_per_coord; // number of weights per coordinate
8473
8474 // if the coordinate weights are uniform.
8475 Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_weights;
8476
8477 // two dimensional weight array.
8478 Kokkos::View<mj_scalar_t**, device_t> mj_weights;
8479
8480 // if the target parts are uniform
8481 Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_parts;
8482
8483 // Nonuniform first level partitioning
8484 // Currently used for Dragonfly task mapping by partitioning Dragonfly RCA
8485 // machine coordinates and application coordinates.
8486 // An optimization that completely partitions the most important machine
8487 // dimension first (i.e. the Dragonfly group coordinate, or RCA's x
8488 // coordinate). The standard MJ alg follows after the nonuniform first level
8489 // partitioning.
8490 // If used, number of parts for the first level partitioning
8491 mj_part_t num_first_level_parts;
8492
8493 // If used, the distribution of parts for the nonuniform
8494 // first level partitioning
8495 Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
8496
8497 // if partitioning can distribute points on the same coordinate to
8498 // different parts.
8499 bool distribute_points_on_cut_lines;
8500
8501 // how many parts we can calculate concurrently.
8502 mj_part_t max_concurrent_part_calculation;
8503
8504 // whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
8505 int check_migrate_avoid_migration_option;
8506
8507 // when doing the migration, 0 will aim for perfect load balance,
8508 // 1 for minimized messages.
8509 int migration_type;
8510
8511
8512 // when MJ decides whether to migrate, the minimum imbalance for migration.
8513 double minimum_migration_imbalance;
8514 bool mj_keep_part_boxes; //if the boxes need to be kept.
8515
8516 // if this is set, then recursion depth is adjusted to its maximum value.
8517 bool mj_run_as_rcb;
8518 int mj_premigration_option;
8519 int min_coord_per_rank_for_premigration;
8520
8521 // communication graph xadj
8522 ArrayRCP<mj_part_t> comXAdj_;
8523
8524 // communication graph adj.
8525 ArrayRCP<mj_part_t> comAdj_;
8526
8527 void copy(
8528 const RCP<PartitioningSolution<Adapter> >&solution);
8529
8530 void set_input_parameters(const Teuchos::ParameterList &p);
8531
8532 RCP<mj_partBoxVector_t> getGlobalBoxBoundaries() const;
8533
8534 bool mj_premigrate_to_subset(
8535 int used_num_ranks,
8536 int migration_selection_option,
8537 RCP<const Environment> mj_env_,
8538 RCP<const Comm<int> > mj_problemComm_,
8539 int coord_dim_,
8540 mj_lno_t num_local_coords_,
8541 mj_gno_t num_global_coords_, size_t num_global_parts_,
8542 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8543 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8544 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8545 mj_coordinates_,
8546 int num_weights_per_coord_,
8547 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8548 //results
8549 RCP<const Comm<int> > &result_problemComm_,
8550 mj_lno_t & result_num_local_coords_,
8551 Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8552 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8553 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8554 result_mj_coordinates_,
8555 Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8556 int * &result_actual_owner_rank_);
8557
8558public:
8559
8560 Zoltan2_AlgMJ(const RCP<const Environment> &env,
8561 RCP<const Comm<int> > &problemComm,
8562 const RCP<const typename Adapter::base_adapter_t> &adapter) :
8563 mj_partitioner(),
8564 mj_env(env),
8565 mj_problemComm(problemComm),
8566 mj_adapter(adapter),
8567 imbalance_tolerance(0),
8568 num_teams(0),
8569 num_global_parts(1),
8570 recursion_depth(0),
8571 coord_dim(0),
8572 num_local_coords(0),
8573 num_global_coords(0),
8574 num_weights_per_coord(0),
8575 num_first_level_parts(1),
8576 distribute_points_on_cut_lines(true),
8577 max_concurrent_part_calculation(1),
8578 check_migrate_avoid_migration_option(0),
8579 migration_type(0),
8580 minimum_migration_imbalance(0.30),
8581 mj_keep_part_boxes(false),
8582 mj_run_as_rcb(false),
8583 mj_premigration_option(0),
8584 min_coord_per_rank_for_premigration(32000),
8585 comXAdj_(),
8586 comAdj_()
8587 {
8588 }
8589
8590 ~Zoltan2_AlgMJ()
8591 {
8592 }
8593
8594 /*! \brief Set up validators specific to this algorithm.
8595 */
8596 static void getValidParameters(ParameterList & pl)
8597 {
8598 const bool bUnsorted = true; // this clarifies that the flag is for unsorted
8599 RCP<Zoltan2::IntegerRangeListValidator<int>> mj_parts_Validator =
8600 Teuchos::rcp( new Zoltan2::IntegerRangeListValidator<int>(bUnsorted) );
8601 pl.set("mj_parts", "0", "list of parts for multiJagged partitioning "
8602 "algorithm. As many as the dimension count.", mj_parts_Validator);
8603
8604 pl.set("mj_concurrent_part_count", 1, "The number of parts whose cut "
8605 "coordinates will be calculated concurently.",
8607
8608 pl.set("mj_minimum_migration_imbalance", 1.1,
8609 "mj_minimum_migration_imbalance, the minimum imbalance of the "
8610 "processors to avoid migration",
8612
8613 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_option_validator =
8614 Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 2) );
8615 pl.set("mj_migration_option", 1, "Migration option, 0 for decision "
8616 "depending on the imbalance, 1 for forcing migration, 2 for "
8617 "avoiding migration", mj_migration_option_validator);
8618
8619 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_type_validator =
8620 Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 1) );
8621 pl.set("mj_migration_type", 0,
8622 "Migration type, 0 for migration to minimize the imbalance "
8623 "1 for migration to minimize messages exchanged the migration.",
8624 mj_migration_option_validator);
8625
8626 // bool parameter
8627 pl.set("mj_keep_part_boxes", false, "Keep the part boundaries of the "
8628 "geometric partitioning.", Environment::getBoolValidator());
8629
8630 // bool parameter
8631 pl.set("mj_enable_rcb", false, "Use MJ as RCB.",
8632 Environment::getBoolValidator());
8633
8634 pl.set("mj_recursion_depth", -1, "Recursion depth for MJ: Must be "
8635 "greater than 0.", Environment::getAnyIntValidator());
8636
8637 RCP<Teuchos::EnhancedNumberValidator<int>>
8638 mj_num_teams_validator =
8639 Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(
8640 0, Teuchos::EnhancedNumberTraits<int>::max()) );
8641 pl.set("mj_num_teams", 0,
8642 "How many teams for the main kernel loop"
8643 , mj_num_teams_validator);
8644
8645 RCP<Teuchos::EnhancedNumberValidator<int>>
8646 mj_premigration_option_validator =
8647 Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 1024) );
8648
8649 pl.set("mj_premigration_option", 0,
8650 "Whether to do premigration or not. 0 for no migration "
8651 "x > 0 for migration to consecutive processors, "
8652 "the subset will be 0,x,2x,3x,...subset ranks."
8653 , mj_premigration_option_validator);
8654
8655 pl.set("mj_premigration_coordinate_count", 32000, "How many coordinate to "
8656 "assign each rank in multijagged after premigration"
8658 }
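// Usage sketch (illustrative, not part of this header): these parameters
// would typically be supplied through the Teuchos::ParameterList handed
// to the Zoltan2 problem, e.g.:
/*
  Teuchos::ParameterList params("test params");
  params.set("algorithm", "multijagged");
  params.set("mj_concurrent_part_count", 2);
  params.set("mj_migration_option", 1);   // force migration
  params.set("mj_keep_part_boxes", true); // needed for pointAssign/boxAssign
*/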
8659
8660 /*! \brief Multi Jagged coordinate partitioning algorithm.
8661 *
8662 * \param solution a PartitioningSolution: on input it carries the target
8663 * part information; on return it also contains the computed parts.
8664 */
8665 void partition(const RCP<PartitioningSolution<Adapter> > &solution);
8666
8667 mj_partBoxVector_t &getPartBoxesView() const
8668 {
8669 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
8670 return *pBoxes;
8671 }
8672
8673 mj_part_t pointAssign(int dim, adapter_scalar_t *point) const;
8674
8675 void boxAssign(int dim, adapter_scalar_t *lower, adapter_scalar_t *upper,
8676 size_t &nPartsFound, mj_part_t **partsFound) const;
8677
8678 /*! \brief returns communication graph resulting from MJ partitioning.
8679 */
8680 void getCommunicationGraph(
8681 const PartitioningSolution<Adapter> *solution,
8682 ArrayRCP<mj_part_t> &comXAdj,
8683 ArrayRCP<mj_part_t> &comAdj);
8684
8685 void set_up_partitioning_data( // public for CUDA
8686 const RCP<PartitioningSolution<Adapter> >&solution);
8687
8688 private:
8689 std::string timer_base_string; // used for making timers
8690
8691 // After loading views from coordinate adapter we may need to copy them
8692 // if mj type is different, but otherwise we just want to assign the view.
8693 // So purpose of this code is to make that assign only happen when the types
8694 // match. The empty case would otherwise not compile.
8695 // If they don't match the internal code handles allocating the new view
8696 // and copying the elements. See the test Zoltan2_mj_int_coordinates.
8697 template<class dst_t, class src_t> // version for same types
8698 typename std::enable_if<std::is_same<typename dst_t::value_type,
8699 typename src_t::value_type>::value>::type
8700 assign_if_same(dst_t & dst, const src_t & src) {
8701 dst = src;
8702 }
8703 template<class dst_t, class src_t> // version for different types
8704 typename std::enable_if<!std::is_same<typename dst_t::value_type,
8705 typename src_t::value_type>::value>::type
8706 assign_if_same(dst_t & dst, const src_t & src) {
8707 // do nothing - handled manually
8708 }
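// A minimal standalone sketch of the enable_if dispatch above
// (hypothetical names A, B, maybe_assign): exactly one overload
// participates for a given type pair, so the shallow view assignment is
// only ever compiled when the value types match.
/*
  template<class A, class B>
  typename std::enable_if<std::is_same<A, B>::value>::type
  maybe_assign(A & dst, const B & src) { dst = src; }

  template<class A, class B>
  typename std::enable_if<!std::is_same<A, B>::value>::type
  maybe_assign(A &, const B &) { } // deep copy handled elsewhere
*/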
8709};
8710
8711template <typename Adapter>
8712bool Zoltan2_AlgMJ<Adapter>::mj_premigrate_to_subset(
8713 int used_num_ranks,
8714 int migration_selection_option,
8715 RCP<const Environment> mj_env_,
8716 RCP<const Comm<int> > mj_problemComm_,
8717 int coord_dim_,
8718 mj_lno_t num_local_coords_,
8719 mj_gno_t num_global_coords_, size_t num_global_parts_,
8720 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8721 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8722 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
8723 int num_weights_per_coord_,
8724 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8725 //results
8726 RCP<const Comm<int> > & result_problemComm_,
8727 mj_lno_t &result_num_local_coords_,
8728 Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8729 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8730 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8731 result_mj_coordinates_,
8732 Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8733 int * &result_actual_owner_rank_)
8734{
8735 mj_env_->timerStart(MACRO_TIMERS,
8736 timer_base_string + "PreMigration DistributorPlanCreating");
8737
8738 int myRank = mj_problemComm_->getRank();
8739 int worldSize = mj_problemComm_->getSize();
8740
8741 mj_part_t groupsize = worldSize / used_num_ranks;
8742
8743 std::vector<mj_part_t> group_begins(used_num_ranks + 1, 0);
8744
8745 mj_part_t i_am_sending_to = 0;
8746 bool am_i_a_receiver = false;
8747
8748 for(int i = 0; i < used_num_ranks; ++i) {
8749 group_begins[i + 1] = group_begins[i] + groupsize;
8750 if(worldSize % used_num_ranks > i) group_begins[i + 1] += 1;
8751 if(i == used_num_ranks - 1) group_begins[i + 1] = worldSize;
8752 if(myRank >= group_begins[i] && myRank < group_begins[i + 1]) {
8753 i_am_sending_to = group_begins[i];
8754 }
8755 if(myRank == group_begins[i]) {
8756 am_i_a_receiver = true;
8757 }
8758 }
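// Worked example of the grouping above (hypothetical sizes):
// worldSize = 10 and used_num_ranks = 4 give groupsize = 2 with a
// remainder of 2, so the first two groups receive one extra rank and
// group_begins = { 0, 3, 6, 8, 10 }. Ranks 0-2 send to rank 0, ranks 3-5
// to rank 3, ranks 6-7 to rank 6, ranks 8-9 to rank 8; the group heads
// 0, 3, 6 and 8 are the receivers that remain in the subcommunicator.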
8759
8760 ArrayView<const mj_part_t> idView(&(group_begins[0]), used_num_ranks );
8761 result_problemComm_ = mj_problemComm_->createSubcommunicator(idView);
8762
8763 Tpetra::Distributor distributor(mj_problemComm_);
8764
8765 std::vector<mj_part_t>
8766 coordinate_destinations(num_local_coords_, i_am_sending_to);
8767
8768 ArrayView<const mj_part_t>
8769 destinations(&(coordinate_destinations[0]), num_local_coords_);
8770 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
8771 result_num_local_coords_ = num_incoming_gnos;
8772 mj_env_->timerStop(MACRO_TIMERS,
8773 timer_base_string + "PreMigration DistributorPlanCreating");
8774
8775 mj_env_->timerStart(MACRO_TIMERS,
8776 timer_base_string + "PreMigration DistributorMigration");
8777
8778
8779 // migrate gnos.
8780 // MPI buffers should be on Kokkos::HostSpace not Kokkos::CudaUVMSpace
8781 // Note, with UVM space, create_mirror_view does NOT create a non-UVM
8782 // view; need the explicit Host creation and deep_copy.
8783 {
8784 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
8785 Kokkos::ViewAllocateWithoutInitializing("sent_gnos"),
8786 initial_mj_gnos_.size()); // initial_mj_gnos_ is const mj_gno_t *
8787 Kokkos::deep_copy(sent_gnos, initial_mj_gnos_);
8788
8789 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos (
8790 Kokkos::ViewAllocateWithoutInitializing("received_gnos"),
8791 num_incoming_gnos);
8792
8793 distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
8794
8795 result_initial_mj_gnos_ = Kokkos::View<mj_gno_t*, device_t>(
8796 Kokkos::ViewAllocateWithoutInitializing("result_initial_mj_gnos_"),
8797 num_incoming_gnos);
8798 Kokkos::deep_copy(result_initial_mj_gnos_, received_gnos);
8799 }
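// A minimal sketch of the staging pattern used for every buffer in this
// function (hypothetical names dev_src/dev_dst, sizes n_send/n_recv):
// device data is copied to a host buffer, exchanged through the
// distributor, and copied back into a device view.
/*
  Kokkos::View<double*, device_t> dev_src("dev_src", n_send);
  auto host_src = Kokkos::create_mirror_view_and_copy(
    Kokkos::HostSpace(), dev_src);
  Kokkos::View<double*, Kokkos::HostSpace> host_dst(
    Kokkos::ViewAllocateWithoutInitializing("host_dst"), n_recv);
  distributor.doPostsAndWaits(host_src, 1, host_dst);
  Kokkos::View<double*, device_t> dev_dst(
    Kokkos::ViewAllocateWithoutInitializing("dev_dst"), n_recv);
  Kokkos::deep_copy(dev_dst, host_dst);
*/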
8800
8801 // migrate coordinates
8802 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8803
8804 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>
8805 host_src_coordinates(
8806 Kokkos::ViewAllocateWithoutInitializing("mj_coordinates"),
8807 this->mj_coordinates.extent(0), this->mj_coordinates.extent(1));
8808
8809 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
8810
8811 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> dst_coordinates(
8812 Kokkos::ViewAllocateWithoutInitializing("mj_coordinates"),
8813 num_incoming_gnos, this->coord_dim);
8814
8815 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_coord(
8816 Kokkos::ViewAllocateWithoutInitializing("received_coord"),
8817 num_incoming_gnos);
8818
8819 for(int i = 0; i < this->coord_dim; ++i) {
8820
8821 auto sent_coord = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
8822
8823 distributor.doPostsAndWaits(sent_coord, 1, received_coord);
8824
8825 Kokkos::deep_copy(Kokkos::subview(dst_coordinates, Kokkos::ALL, i),
8826 received_coord);
8827 Kokkos::fence();
8828 }
8829 result_mj_coordinates_ = dst_coordinates;
8830
8831 // migrate weights.
8832
8833 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
8834 Kokkos::ViewAllocateWithoutInitializing("mj_weights"),
8835 num_incoming_gnos, this->num_weights_per_coord);
8836 auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
8837
8838 auto host_src_weights = Kokkos::create_mirror_view_and_copy(
8839 Kokkos::HostSpace(), this->mj_weights);
8840
8841 // contiguous buffers to gather potentially strided data
8842 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_weight(
8843 Kokkos::ViewAllocateWithoutInitializing("send_weight_buffer"),
8844 this->num_local_coords);
8845
8846 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_weight(
8847 Kokkos::ViewAllocateWithoutInitializing("received_weight_buffer"),
8848 num_incoming_gnos);
8849
8850 for(int i = 0; i < this->num_weights_per_coord; ++i) {
8851
8852 auto sub_host_src_weights
8853 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
8854 auto sub_host_dst_weights
8855 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
8856
8857 // Layout Right means these weights are not contiguous
8858 // However, we don't have any systems set up with more than 1 weight, so
8859 // this code has not really been tested with num weights > 1.
8860 // I think this is the right thing to do. Note that there are other
8861 // places in the code which don't handle the possibility of more weights.
8862 // So evaluating all that and adding tests would be another project.
8863 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
8864 sent_weight[n] = sub_host_src_weights(n);
8865 }
8866
8867 distributor.doPostsAndWaits(sent_weight, 1, received_weight);
8868
8869 // Again we copy by index due to layout
8870 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
8871 sub_host_dst_weights(n) = received_weight[n];
8872 }
8873 }
8874 Kokkos::deep_copy(dst_weights, host_dst_weights);
8875 result_mj_weights_ = dst_weights;
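// Layout illustration for the gather loops above (hypothetical 3 x 2
// example): a LayoutRight view stores its elements row by row,
//   w(0,0) w(0,1) w(1,0) w(1,1) w(2,0) w(2,1)
// so the column subview for weight 0 touches offsets 0, 2, 4. Copying the
// column element by element into sent_weight produces the contiguous
// buffer the distributor requires.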
8876
8877 // migrate the owners of the coordinates
8878 {
8879 Kokkos::View<int*, Kokkos::HostSpace> sent_owners(
8880 Kokkos::ViewAllocateWithoutInitializing("sent_owners"),
8881 num_local_coords_);
8882 Kokkos::deep_copy(sent_owners, myRank);
8883
8884 Kokkos::View<int*, Kokkos::HostSpace> received_owners(
8885 Kokkos::ViewAllocateWithoutInitializing("received_owners"),
8886 num_incoming_gnos);
8887
8888 distributor.doPostsAndWaits(sent_owners, 1, received_owners);
8889
8890 result_actual_owner_rank_ = new int[num_incoming_gnos];
8891 memcpy(
8892 result_actual_owner_rank_,
8893 received_owners.data(),
8894 num_incoming_gnos * sizeof(int));
8895 }
8896
8897 mj_env_->timerStop(MACRO_TIMERS,
8898 timer_base_string + "PreMigration DistributorMigration");
8899 return am_i_a_receiver;
8900}
8901
8909template <typename Adapter>
8910 void Zoltan2_AlgMJ<Adapter>::partition(
8911 const RCP<PartitioningSolution<Adapter> > &solution)
8912{
8913 // purpose of this code is to validate node and UVM status for the tests
8914 // std::cout << "Memory Space: " << mj_node_t::memory_space::name() << " "
8915 // << "Execution Space: " << mj_node_t::execution_space::name()
8916 // << std::endl;
8917
8918 int execute_counter =
8920 timer_base_string = "partition(" + std::to_string(execute_counter) + ") - ";
8921
8922 this->mj_env->timerStart(MACRO_TIMERS, timer_base_string + "all");
8923 {
8924 this->mj_env->timerStart(MACRO_TIMERS, timer_base_string + "setup");
8925
8926 this->set_up_partitioning_data(solution);
8927
8928 this->set_input_parameters(this->mj_env->getParameters());
8929 if(this->mj_keep_part_boxes) {
8930 this->mj_partitioner.set_to_keep_part_boxes();
8931 }
8932
8933 this->mj_partitioner.set_partitioning_parameters(
8934 this->distribute_points_on_cut_lines,
8935 this->max_concurrent_part_calculation,
8936 this->check_migrate_avoid_migration_option,
8937 this->minimum_migration_imbalance, this->migration_type);
8938
8939 RCP<const Comm<int> > result_problemComm = this->mj_problemComm;
8940 mj_lno_t result_num_local_coords = this->num_local_coords;
8941 Kokkos::View<mj_gno_t*, device_t> result_initial_mj_gnos;
8942 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8943 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8944 result_mj_coordinates = this->mj_coordinates;
8945 Kokkos::View<mj_scalar_t**, device_t> result_mj_weights =
8946 this->mj_weights;
8947 int *result_actual_owner_rank = NULL;
8948
8949 Kokkos::View<const mj_gno_t*, device_t> result_initial_mj_gnos_ =
8950 this->initial_mj_gnos;
8951
8952 // TODO: MD 08/2017: Further discussion is required.
8953 // MueLu calls MJ when it has very few coordinates per processor,
8954 // such as 10. For example, it begins with 1K processors with 1K
8955 // coordinates each. Coarsening then reduces this to 10 coordinates
8956 // per processor, and MJ is called to repartition these to 10 parts.
8957 // MJ runs with 1K processors, 10 coordinates each, and partitions to
8958 // 10 parts. As expected, strong scaling is a problem here, because
8959 // computation is almost 0 and the communication cost of MJ increases linearly.
8960 // The premigration option gathers the coordinates to 10 parts before MJ
8961 // starts, so MJ runs on a smaller subset of the problem.
8962 // Below, I am migrating the coordinates if mj_premigration_option is set,
8963 // the resulting part count is less than the current part count, and the
8964 // average number of local coordinates is less than some threshold.
8965 // For example, premigration may not help if 1000 processors are
8966 // partitioning data to 10 parts but each already has 1M coordinates;
8967 // in that case premigration would not help.
8968 int current_world_size = this->mj_problemComm->getSize();
8969 mj_lno_t threshold_num_local_coords =
8970 this->min_coord_per_rank_for_premigration;
8971 bool is_pre_migrated = false;
8972 bool am_i_in_subset = true;
8973
8974 // Note that we need to add testing for migration and should also cover the
8975 // zoltan case when ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION is defined.
8976 // Currently did a minimal test of this code by running mjTest with
8977 // PM=1, TB=0 then run again with C=3 instead of C=4 (numProcs is 4).
8978 if(mj_premigration_option > 0 &&
8979 size_t (current_world_size) > this->num_global_parts &&
8980 this->num_global_coords < mj_gno_t (
8981 current_world_size * threshold_num_local_coords))
8982 {
8983 if(this->mj_keep_part_boxes) {
8984 throw std::logic_error("Multijagged: mj_keep_part_boxes and "
8985 "mj_premigration_option are not supported together yet.");
8986 }
8987
8988 is_pre_migrated = true;
8989 int migration_selection_option = mj_premigration_option;
8990 if(migration_selection_option * this->num_global_parts >
8991 (size_t) (current_world_size)) {
8992 migration_selection_option =
8993 current_world_size / this->num_global_parts;
8994 }
8995
8996 int used_num_ranks = int (this->num_global_coords /
8997 float (threshold_num_local_coords) + 0.5);
8998
8999 if(used_num_ranks == 0) {
9000 used_num_ranks = 1;
9001 }
9002
9003 am_i_in_subset = this->mj_premigrate_to_subset(
9004 used_num_ranks,
9005 migration_selection_option,
9006 this->mj_env,
9007 this->mj_problemComm,
9008 this->coord_dim,
9009 this->num_local_coords,
9010 this->num_global_coords,
9011 this->num_global_parts,
9012 this->initial_mj_gnos,
9013 this->mj_coordinates,
9014 this->num_weights_per_coord,
9015 this->mj_weights,
9016 //results
9017 result_problemComm,
9018 result_num_local_coords,
9019 result_initial_mj_gnos,
9020 result_mj_coordinates,
9021 result_mj_weights,
9022 result_actual_owner_rank);
9023
9024 result_initial_mj_gnos_ = result_initial_mj_gnos;
9025 }
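// Worked example of the rank-subset size (hypothetical counts): with
// num_global_coords = 100000 and threshold_num_local_coords = 32000,
// used_num_ranks = int(100000 / 32000.0 + 0.5) = int(3.625) = 3, so the
// coordinates are gathered onto 3 ranks before MJ runs.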
9026
9027 Kokkos::View<mj_part_t *, device_t> result_assigned_part_ids;
9028 Kokkos::View<mj_gno_t*, device_t> result_mj_gnos;
9029
9030 this->mj_env->timerStop(MACRO_TIMERS, timer_base_string + "setup");
9031
9032 if(am_i_in_subset) {
9033 this->mj_partitioner.multi_jagged_part(
9034 this->mj_env,
9035 result_problemComm, //this->mj_problemComm,
9036 this->imbalance_tolerance,
9037 this->num_teams,
9038 this->num_global_parts,
9039 this->part_no_array,
9040 this->recursion_depth,
9041 this->coord_dim,
9042 result_num_local_coords, //this->num_local_coords,
9043 this->num_global_coords,
9044 result_initial_mj_gnos_,
9045 result_mj_coordinates,
9046 this->num_weights_per_coord,
9047 this->mj_uniform_weights,
9048 result_mj_weights,
9049 this->mj_uniform_parts,
9050 result_assigned_part_ids,
9051 result_mj_gnos
9052 );
9053 }
9054
9055 this->mj_env->timerStart(MACRO_TIMERS, timer_base_string + "cleanup");
9056
9057 // Reorder results so that they match the order of the input
9058 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
9059 localGidToLid.reserve(result_num_local_coords);
9060 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> host_result_initial_mj_gnos(
9061 Kokkos::ViewAllocateWithoutInitializing("host_result_initial_mj_gnos"),
9062 result_initial_mj_gnos_.size());
9063 Kokkos::deep_copy(host_result_initial_mj_gnos, result_initial_mj_gnos_);
9064 for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9065 localGidToLid[host_result_initial_mj_gnos(i)] = i;
9066 }
9067
9068 ArrayRCP<mj_part_t> partId = arcp(new mj_part_t[result_num_local_coords],
9069 0, result_num_local_coords, true);
9070 auto host_result_assigned_part_ids =
9071 Kokkos::create_mirror_view(result_assigned_part_ids);
9072 Kokkos::deep_copy(host_result_assigned_part_ids, result_assigned_part_ids);
9073 auto host_result_mj_gnos = Kokkos::create_mirror_view(result_mj_gnos);
9074 Kokkos::deep_copy(host_result_mj_gnos, result_mj_gnos);
9075 for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9076 mj_lno_t origLID = localGidToLid[host_result_mj_gnos(i)];
9077 partId[origLID] = host_result_assigned_part_ids(i);
9078 }
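// Illustration of the reorder above (hypothetical values): if this rank's
// input GIDs were {42, 7, 19} and MJ returned GIDs {7, 19, 42} with parts
// {1, 0, 2}, then localGidToLid maps 42->0, 7->1, 19->2 and the loop
// writes partId = {2, 1, 0}, i.e. part ids in the original input order.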
9079
9080 // Now the results are reordered, but if premigration occurred,
9081 // then we need to send these ids back to the actual owners.
9082 if(is_pre_migrated) {
9083 this->mj_env->timerStart(MACRO_TIMERS, timer_base_string +
9084 "PostMigration DistributorPlanCreating");
9085 Tpetra::Distributor distributor(this->mj_problemComm);
9086
9087 ArrayView<const mj_part_t> actual_owner_destinations(
9088 result_actual_owner_rank , result_num_local_coords);
9089
9090 mj_lno_t num_incoming_gnos = distributor.createFromSends(
9091 actual_owner_destinations);
9092
9093 if(num_incoming_gnos != this->num_local_coords) {
9094 throw std::logic_error("Zoltan2 - Multijagged Post Migration - "
9095 "num incoming is not equal to num local coords");
9096 }
9097
9098 mj_env->timerStop(MACRO_TIMERS, timer_base_string +
9099 "PostMigration DistributorPlanCreating");
9100 mj_env->timerStart(MACRO_TIMERS, timer_base_string +
9101 "PostMigration DistributorMigration");
9102
9103 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
9104 Kokkos::ViewAllocateWithoutInitializing("received_gnos"),
9105 num_incoming_gnos);
9106 Kokkos::View<mj_part_t*, Kokkos::HostSpace> received_partids(
9107 Kokkos::ViewAllocateWithoutInitializing("received_partids"),
9108 num_incoming_gnos);
9109
9110 distributor.doPostsAndWaits(host_result_initial_mj_gnos, 1,
9111 received_gnos);
9112 {
9113 Kokkos::View<mj_part_t*, Kokkos::HostSpace> sent_partnos;
9114 if (partId.size() > 0) {
9115 sent_partnos = Kokkos::View<mj_part_t*, Kokkos::HostSpace>(
9116 partId.getRawPtr(), partId.size()); //unmanaged
9117 }
9118 distributor.doPostsAndWaits(sent_partnos, 1, received_partids);
9119 }
9120
9121 partId = arcp(new mj_part_t[this->num_local_coords],
9122 0, this->num_local_coords, true);
9123
9124 {
9125 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid2;
9126 localGidToLid2.reserve(this->num_local_coords);
9127 auto host_initial_mj_gnos =
9128 Kokkos::create_mirror_view(this->initial_mj_gnos);
9129 Kokkos::deep_copy(host_initial_mj_gnos,
9130 this->initial_mj_gnos);
9131 for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9132 localGidToLid2[host_initial_mj_gnos(i)] = i;
9133 }
9134
9135 for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9136 mj_lno_t origLID = localGidToLid2[received_gnos[i]];
9137 partId[origLID] = received_partids[i];
9138 }
9139 }
9140
9141 {
9142 delete [] result_actual_owner_rank;
9143 }
9144 mj_env->timerStop(MACRO_TIMERS,
9145 timer_base_string + "PostMigration DistributorMigration");
9146 }
9147 solution->setParts(partId);
9148 this->mj_env->timerStop(MACRO_TIMERS, timer_base_string + "cleanup");
9149 }
9150
9151 this->mj_env->timerStop(MACRO_TIMERS, timer_base_string + "all");
9152
9153 // reset the view (release the reference to device data)
9154 this->mj_coordinates = Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>();
9155}
9156
9157/*! \brief Sets the partitioning data for the multijagged algorithm.
9158 */
9159template <typename Adapter>
9160 void Zoltan2_AlgMJ<Adapter>::set_up_partitioning_data(
9161 const RCP<PartitioningSolution<Adapter> > &solution
9162)
9163{
9164 modelFlag_t flags;
9165 CoordinateModel<Adapter> mj_coords(mj_adapter, mj_env, mj_problemComm, flags);
9166
9167 this->coord_dim = mj_coords.getCoordinateDim();
9168 this->num_weights_per_coord = mj_coords.getNumWeightsPerCoordinate();
9169 this->num_local_coords = mj_coords.getLocalNumCoordinates();
9170 this->num_global_coords = mj_coords.getGlobalNumCoordinates();
9171
9172 int criteria_dim = (this->num_weights_per_coord ?
9173 this->num_weights_per_coord : 1);
9174 // From the Solution we get part information.
9175 // If the part sizes for a given criteria are not uniform,
9176 // then they are values that sum to 1.0.
9177 this->num_global_parts = solution->getTargetGlobalNumberOfParts();
9178 // allocate only the two dimensional views;
9179 // raw pointer addresses will be obtained from the multivector.
9180 this->mj_uniform_parts = Kokkos::View<bool *, Kokkos::HostSpace>(
9181 "uniform parts", criteria_dim);
9182 this->mj_uniform_weights = Kokkos::View<bool *, Kokkos::HostSpace>(
9183 "uniform weights", criteria_dim);
9184
9185 Kokkos::View<const mj_gno_t *, device_t> gnos;
9186 Kokkos::View<adapter_scalar_t **, Kokkos::LayoutLeft, device_t> xyz_adapter;
9187 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
9188 Kokkos::View<adapter_scalar_t **, device_t> wgts_adapter;
9189 mj_coords.getCoordinatesKokkos(gnos, xyz_adapter, wgts_adapter);
9190 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
9191 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> xyz;
9192 Kokkos::View<mj_scalar_t **, device_t> wgts;
9193
9194 // Now we must get the data from the adapter.
9195 // If the types match we point to the view but if not, we must copy.
9196 if(std::is_same<mj_scalar_t, adapter_scalar_t>()) {
9197 // we can just point at the adapter views, but we must dispatch through
9198 // assign_if_same because the assignment only compiles when types match.
9199 assign_if_same(xyz, xyz_adapter);
9200 assign_if_same(wgts, wgts_adapter);
9201 }
9202 else {
9203 // we only allocate a new view if we are going to copy
9204 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
9205 xyz = Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
9206 (Kokkos::ViewAllocateWithoutInitializing(
9207 "xyz"), xyz_adapter.extent(0), xyz_adapter.extent(1));
9208 wgts = Kokkos::View<mj_scalar_t **, device_t>(
9209 Kokkos::ViewAllocateWithoutInitializing("wgts"),
9210 wgts_adapter.extent(0), wgts_adapter.extent(1));
9211
9212 typedef typename Kokkos::View<mj_scalar_t **, device_t>::size_type view_size_t;
9213 Kokkos::parallel_for(
9214 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9215 (0, xyz_adapter.extent(0)), KOKKOS_LAMBDA (int i) {
9216 for(view_size_t n = 0; n < xyz_adapter.extent(1); ++n) {
9217 xyz(i, n) = static_cast<mj_scalar_t>(xyz_adapter(i, n));
9218 }
9219 });
9220 Kokkos::parallel_for(
9221 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9222 (0, wgts.extent(0)), KOKKOS_LAMBDA (int i) {
9223 for(view_size_t n = 0; n < wgts.extent(1); ++n) {
9224 wgts(i, n) = static_cast<mj_scalar_t>(wgts_adapter(i, n));
9225 }
9226 });
9227 }
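// A minimal standalone sketch of the conversion branch above (hypothetical
// int adapter scalar, names src/dst): the same parallel_for pattern casts
// each entry into the scalar type MJ uses.
/*
  Kokkos::View<int**, device_t> src("src", n, d);
  Kokkos::View<float**, device_t> dst(
    Kokkos::ViewAllocateWithoutInitializing("dst"), n, d);
  Kokkos::parallel_for(
    Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, n),
    KOKKOS_LAMBDA(int i) {
      for(int j = 0; j < d; ++j) dst(i, j) = static_cast<float>(src(i, j));
    });
*/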
9228
9229 // obtain global ids.
9230 this->initial_mj_gnos = gnos;
9231 // extract coordinates from multivector.
9232 this->mj_coordinates = xyz;
9233 // if no weights are provided set uniform weight.
9234
9235 if(this->num_weights_per_coord == 0) {
9236 this->mj_uniform_weights(0) = true;
9237 Kokkos::resize(this->mj_weights, 0, 0);
9238 }
9239 else{
9240 this->mj_weights = wgts;
9241 for(int wdim = 0; wdim < this->num_weights_per_coord; ++wdim) {
9242 this->mj_uniform_weights(wdim) = false;
9243 }
9244 }
9245
9246 for(int wdim = 0; wdim < criteria_dim; ++wdim) {
9247 if(solution->criteriaHasUniformPartSizes(wdim)) {
9248 this->mj_uniform_parts(wdim) = true;
9249 }
9250 else {
9251 printf("Error: MJ does not support non-uniform target part weights\n");
9252 std::terminate();
9253 }
9254 }
9255}
9256
9257/*! \brief Sets the partitioning parameters for the multijagged algorithm.
9258 * \param pl the parameter list provided to the zoltan2 call.
9259 */
9260template <typename Adapter>
9261 void Zoltan2_AlgMJ<Adapter>::set_input_parameters(
9262 const Teuchos::ParameterList &pl)
9263{
9264 const Teuchos::ParameterEntry *pe = pl.getEntryPtr("imbalance_tolerance");
9265 if(pe) {
9266 double tol;
9267 tol = pe->getValue(&tol);
9268 this->imbalance_tolerance = tol - 1.0;
9269 }
9270
9271 // TODO: Maybe a more relaxed tolerance is needed. RCB uses 10%
9272 if(this->imbalance_tolerance <= 0) {
9273 this->imbalance_tolerance = 10e-4;
9274 }
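// Worked example: a user-supplied imbalance_tolerance of 1.03 (3% slack)
// is stored internally as 0.03. Any input at or below 1.0 would make the
// stored value non-positive, so it falls back to 10e-4 (0.001).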
9275
9276 // reset any input partitioning array from a previous run.
9277 Kokkos::resize(this->part_no_array, 0);
9278
9279 // reset the recursion depth; mj_parts or mj_recursion_depth may set it below.
9280 this->recursion_depth = 0;
9281
9282 if(pl.getPtr<int>("mj_num_teams")) {
9283 this->num_teams = pl.get<int>("mj_num_teams");
9284 }
9285
9286 if(pl.getPtr<Array <mj_part_t> >("mj_parts")) {
9287 auto mj_parts = pl.get<Array <mj_part_t> >("mj_parts");
9288 int mj_parts_size = static_cast<int>(mj_parts.size());
9289
9290 // build the view we'll have data on and copy values from host
9291 this->part_no_array = Kokkos::View<mj_part_t*, Kokkos::HostSpace>(
9292 "part_no_array", mj_parts_size);
9293 for(int i = 0; i < mj_parts_size; ++i) {
9294 this->part_no_array(i) = mj_parts.getRawPtr()[i];
9295 }
9296
9297 this->recursion_depth = mj_parts_size - 1;
9298 this->mj_env->debug(2, "mj_parts provided by user");
9299 }
9300
9301 // get mj specific parameters.
9302 this->distribute_points_on_cut_lines = true;
9303 this->max_concurrent_part_calculation = 1;
9304
9305 this->mj_run_as_rcb = false;
9306 this->mj_premigration_option = 0;
9307 this->min_coord_per_rank_for_premigration = 32000;
9308
9309 int mj_user_recursion_depth = -1;
9310 this->mj_keep_part_boxes = false;
9311 this->check_migrate_avoid_migration_option = 0;
9312 this->migration_type = 0;
9313 this->minimum_migration_imbalance = 0.35;
9314
9315 pe = pl.getEntryPtr("mj_minimum_migration_imbalance");
9316 if(pe) {
9317 double imb;
9318 imb = pe->getValue(&imb);
9319 this->minimum_migration_imbalance = imb - 1.0;
9320 }
9321
9322 pe = pl.getEntryPtr("mj_migration_option");
9323 if(pe) {
9324 this->check_migrate_avoid_migration_option =
9325 pe->getValue(&this->check_migrate_avoid_migration_option);
9326 } else {
9327 this->check_migrate_avoid_migration_option = 0;
9328 }
9329 if(this->check_migrate_avoid_migration_option > 1) {
9330 this->check_migrate_avoid_migration_option = -1;
9331 }
9332
9334 pe = pl.getEntryPtr("mj_migration_type");
9335 if(pe) {
9336 this->migration_type = pe->getValue(&this->migration_type);
9337 } else {
9338 this->migration_type = 0;
9339 }
9340
9341 //std::cout << " this->migration_type:" << this->migration_type << std::endl;
9343
9344 pe = pl.getEntryPtr("mj_concurrent_part_count");
9345 if(pe) {
9346 this->max_concurrent_part_calculation =
9347 pe->getValue(&this->max_concurrent_part_calculation);
9348 } else {
9349 this->max_concurrent_part_calculation = 1; // Set to 1 if not provided.
9350 }
9351
9352 pe = pl.getEntryPtr("mj_keep_part_boxes");
9353 if(pe) {
9354 this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
9355 } else {
9356 this->mj_keep_part_boxes = false; // Set to default value
9357 }
9358
9359 // For now, need keep_part_boxes to do pointAssign and boxAssign.
9360 // pe = pl.getEntryPtr("keep_cuts");
9361 // if(pe) {
9362 // int tmp = pe->getValue(&tmp);
9363 // if(tmp) this->mj_keep_part_boxes = true;
9364 // }
9365
9366 //need to keep part boxes if mapping type is geometric.
9367 if(this->mj_keep_part_boxes == false) {
9368 pe = pl.getEntryPtr("mapping_type");
9369 if(pe) {
9370 int mapping_type = -1;
9371 mapping_type = pe->getValue(&mapping_type);
9372 if(mapping_type == 0) {
9373 mj_keep_part_boxes = true;
9374 }
9375 }
9376 }
9377
9378 // check whether the user wants to run MJ as RCB.
9379 pe = pl.getEntryPtr("mj_enable_rcb");
9380 if(pe) {
9381 this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
9382 } else {
9383 this->mj_run_as_rcb = false; // Set to default value
9384 }
9385
9386 pe = pl.getEntryPtr("mj_premigration_option");
9387 if(pe) {
9388 mj_premigration_option = pe->getValue(&mj_premigration_option);
9389 } else {
9390 mj_premigration_option = 0;
9391 }
9392
9393 pe = pl.getEntryPtr("mj_premigration_coordinate_count");
9394 if(pe) {
9395 min_coord_per_rank_for_premigration =
9396 pe->getValue(&min_coord_per_rank_for_premigration);
9396 } else {
9397 min_coord_per_rank_for_premigration = 32000;
9398 }
9399
9400 pe = pl.getEntryPtr("mj_recursion_depth");
9401 if(pe) {
9402 mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
9403 } else {
9404 mj_user_recursion_depth = -1; // Set to invalid value
9405 }
9406
9407 bool val = false;
9408 pe = pl.getEntryPtr("rectilinear");
9409 if(pe) {
9410 val = pe->getValue(&val);
9411 }
9412 if(val) {
9413 this->distribute_points_on_cut_lines = false;
9414 } else {
9415 this->distribute_points_on_cut_lines = true;
9416 }
9417
9418 if(this->mj_run_as_rcb) {
9419 mj_user_recursion_depth =
9420 (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
9421 }
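// Worked example: with num_global_parts = 10, RCB mode sets the user
// recursion depth to ceil(log(10) / log(2)) = ceil(3.32) = 4, i.e. enough
// bisection steps to reach at least 10 parts (2^4 = 16 >= 10).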
9422 if(this->recursion_depth < 1) {
9423 if(mj_user_recursion_depth > 0) {
9424 this->recursion_depth = mj_user_recursion_depth;
9425 }
9426 else {
9427 this->recursion_depth = this->coord_dim;
9428 }
9429 }
9430}
9431
9433template <typename Adapter>
9434 void Zoltan2_AlgMJ<Adapter>::boxAssign(
9435 int dim,
9436 adapter_scalar_t *lower,
9437 adapter_scalar_t *upper,
9438 size_t &nPartsFound,
9439 typename Adapter::part_t **partsFound) const
9440{
9441 // TODO: Implement with cuts rather than boxes to reduce algorithmic
9442 // TODO: complexity. Or at least do a search through the boxes, using
9443 // TODO: p x q x r x ... if possible.
9444
9445 nPartsFound = 0;
9446 *partsFound = NULL;
9447
9448 if(this->mj_keep_part_boxes) {
9449
9450 // Get vector of part boxes
9451 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9452
9453 size_t nBoxes = (*partBoxes).size();
9454 if(nBoxes == 0) {
9455 throw std::logic_error("no part boxes exist");
9456 }
9457
9458 // Determine whether the box overlaps the globalBox at all
9459 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9460
9461 if(globalBox->boxesOverlap(dim, lower, upper)) {
9462
9463 std::vector<typename Adapter::part_t> partlist;
9464
9465 // box overlaps the global box; find specific overlapping boxes
9466 for(size_t i = 0; i < nBoxes; i++) {
9467 try {
9468 if((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
9469 nPartsFound++;
9470 partlist.push_back((*partBoxes)[i].getpId());
9471 /*
9472 std::cout << "Given box (";
9473 for(int j = 0; j < dim; j++)
9474 std::cout << lower[j] << " ";
9475 std::cout << ") x (";
9476 for(int j = 0; j < dim; j++)
9477 std::cout << upper[j] << " ";
9478 std::cout << ") overlaps PartBox "
9479 << (*partBoxes)[i].getpId() << " (";
9480 for(int j = 0; j < dim; j++)
9481 std::cout << (*partBoxes)[i].getlmins()[j] << " ";
9482 std::cout << ") x (";
9483 for(int j = 0; j < dim; j++)
9484 std::cout << (*partBoxes)[i].getlmaxs()[j] << " ";
9485 std::cout << ")" << std::endl;
9486 */
9487 }
9488 }
9489 Z2_FORWARD_EXCEPTIONS;
9490 }
9491 if(nPartsFound) {
9492 *partsFound = new mj_part_t[nPartsFound];
9493 for(size_t i = 0; i < nPartsFound; i++)
9494 (*partsFound)[i] = partlist[i];
9495 }
9496 }
9497 else {
9498 // Box does not overlap the domain at all. Find the closest part
9499 // Not sure how to perform this operation for MJ without having the
9500 // cuts. With the RCB cuts, the concept of a part extending to
9501 // infinity was natural. With the boxes, it is much more difficult.
9502 // TODO: For now, return information indicating NO OVERLAP.
9503 }
9504 }
9505 else {
9506 throw std::logic_error("need to use keep_cuts parameter for boxAssign");
9507 }
9508}
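// Usage sketch (illustrative; "alg" is a hypothetical Zoltan2_AlgMJ
// instance and mj_keep_part_boxes must be enabled): the caller owns the
// returned array and must delete it.
/*
  size_t nParts = 0;
  mj_part_t *parts = NULL;
  adapter_scalar_t lo[2] = {0.0, 0.0}, hi[2] = {1.0, 1.0};
  alg.boxAssign(2, lo, hi, nParts, &parts);
  for(size_t i = 0; i < nParts; ++i) std::cout << parts[i] << "\n";
  delete [] parts;
*/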
9509
9511template <typename Adapter>
9512 typename Zoltan2_AlgMJ<Adapter>::mj_part_t Zoltan2_AlgMJ<Adapter>::pointAssign(
9513 int dim,
9514 adapter_scalar_t *point) const
9515{
9516 // TODO: Implement with cuts rather than boxes to reduce algorithmic
9517 // TODO: complexity. Or at least do a search through the boxes, using
9518 // TODO: p x q x r x ... if possible.
9519
9520 if(this->mj_keep_part_boxes) {
9521 typename Adapter::part_t foundPart = -1;
9522
9523 // Get vector of part boxes
9524 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9525
9526 size_t nBoxes = (*partBoxes).size();
9527 if(nBoxes == 0) {
9528 throw std::logic_error("no part boxes exist");
9529 }
9530
9531 // Determine whether the point is within the global domain
9532 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9533
9534 if(globalBox->pointInBox(dim, point)) {
9535
9536 // point is in the global domain; determine in which part it is.
9537 size_t i;
9538 for(i = 0; i < nBoxes; i++) {
9539 try {
9540 if((*partBoxes)[i].pointInBox(dim, point)) {
9541 foundPart = (*partBoxes)[i].getpId();
9542 // std::cout << "Point (";
9543 // for(int j = 0; j < dim; j++) std::cout << point[j] << " ";
9544 // std::cout << ") found in box " << i << " part " << foundPart
9545 // << std::endl;
9546 // (*partBoxes)[i].print();
9547 break;
9548 }
9549 }
9550 Z2_FORWARD_EXCEPTIONS;
9551 }
9552
9553 if(i == nBoxes) {
9554 // This error should never occur
9555 std::ostringstream oss;
9556 oss << "Point (";
9557 for(int j = 0; j < dim; j++) oss << point[j] << " ";
9558 oss << ") not found in domain";
9559 throw std::logic_error(oss.str());
9560 }
9561 }
9562
9563 else {
9564 // Point is outside the global domain.
9565 // Determine to which part it is closest.
9566 // TODO: with cuts, would not need this special case
9567
9568 typedef typename Zoltan2::coordinateModelPartBox::coord_t coord_t;
9569 size_t closestBox = 0;
9570 coord_t minDistance = std::numeric_limits<coord_t>::max();
9571 coord_t *centroid = new coord_t[dim];
9572 for(size_t i = 0; i < nBoxes; i++) {
9573 (*partBoxes)[i].computeCentroid(centroid);
9574 coord_t sum = 0.;
9575 coord_t diff;
9576 for(int j = 0; j < dim; j++) {
9577 diff = centroid[j] - point[j];
9578 sum += diff * diff;
9579 }
9580 if(sum < minDistance) {
9581 minDistance = sum;
9582 closestBox = i;
9583 }
9584 }
9585 foundPart = (*partBoxes)[closestBox].getpId();
9586 delete [] centroid;
9587 }
9588
9589 return foundPart;
9590 }
9591 else {
9592 throw std::logic_error("need to use keep_cuts parameter for pointAssign");
9593 }
9594}
9595
9596template <typename Adapter>
9597 void Zoltan2_AlgMJ<Adapter>::getCommunicationGraph(
9598 const PartitioningSolution<Adapter> *solution,
9599 ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comXAdj,
9600 ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comAdj)
9601{
9602 if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL) {
9603 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
9604 mj_part_t ntasks = (*pBoxes).size();
9605 int dim = (*pBoxes)[0].getDim();
9606 GridHash grid(pBoxes, ntasks, dim);
9607 grid.getAdjArrays(comXAdj_, comAdj_);
9608 }
9609 comAdj = comAdj_;
9610 comXAdj = comXAdj_;
9611}
9612
9613template <typename Adapter>
9614RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
9616{
9617 return this->mj_partitioner.get_kept_boxes();
9618}
9619} // namespace Zoltan2
9620
9621#endif