49#ifndef _ZOLTAN2_ALGMultiJagged_HPP_
50#define _ZOLTAN2_ALGMultiJagged_HPP_
59#include <Tpetra_Distributor.hpp>
60#include <Teuchos_StandardParameterEntryValidators.hpp>
61#include <Teuchos_ParameterList.hpp>
62#include <Kokkos_Sort.hpp>
66#include <unordered_map>
68#ifdef ZOLTAN2_USEZOLTANCOMM
69#ifdef HAVE_ZOLTAN2_MPI
70#define ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
71#include "zoltan_comm_cpp.h"
72#include "zoltan_types.h"
81template <
typename Ordinal,
typename T>
105 void reduce(
const Ordinal count,
const T inBuffer[], T inoutBuffer[])
const {
106 for(Ordinal i = 0; i < count; i++) {
108 inoutBuffer[i] = inBuffer[i];
124template <
typename IT,
typename CT,
typename WT>
139 this->epsilon = std::numeric_limits<WT>::epsilon() * 100;
144 this->index = index_;
145 this->count = count_;
147 this->epsilon = std::numeric_limits<WT>::epsilon() * 100;
153 void set(IT index_ ,CT count_, WT *vals_) {
154 this->index = index_;
155 this->count = count_;
159 bool operator<(
const uMultiSortItem<IT,CT,WT>& other)
const {
160 assert(this->count == other.count);
161 for(CT i = 0; i < this->
count; ++i) {
163 if(std::abs(this->val[i] - other.val[i]) < this->epsilon) {
167 if(this->val[i] < other.val[i]) {
176 return this->index < other.index;
182template <
class IT,
class WT>
193template <
class IT,
class WT>
194void uqsort(IT n, uSortItem<IT, WT> * arr) {
195 const int NSTACK = 50;
197 IT i, ir=n, j, k, l=1;
198 IT jstack=0, istack[NSTACK];
205 for(j=l+1;j<=ir;j++) {
208 for(i=j-1;i>=1;i--) {
209 if(arr[i].val <= aval)
222 std::swap(arr[k],arr[l+1]);
223 if(arr[l+1].val > arr[ir].val) {
224 std::swap(arr[l+1],arr[ir]);
226 if(arr[l].val > arr[ir].val) {
227 std::swap(arr[l],arr[ir]);
229 if(arr[l+1].val > arr[l].val) {
230 std::swap(arr[l+1],arr[l]);
237 do i++;
while (arr[i].val < aval);
238 do j--;
while (arr[j].val > aval);
240 std::swap(arr[i],arr[j]);
245 if(jstack > NSTACK) {
246 std::cout <<
"uqsort: NSTACK too small in sort." << std::endl;
263template <
class IT,
class WT,
class SIGN>
269 bool operator<(
const uSignedSortItem<IT, WT, SIGN>& rhs)
const {
271 if(this->signbit < rhs.signbit) {
275 else if(this->signbit == rhs.signbit) {
276 if(this->val < rhs.val) {
280 else if(this->val > rhs.val) {
295 return (this->val == rhs.val && this->signbit == rhs.signbit) || (*
this < rhs);
302template <
class IT,
class WT,
class SIGN>
304 const IT NSTACK = 50;
306 IT i, ir=n, j, k, l=1;
307 IT jstack=0, istack[NSTACK];
308 uSignedSortItem<IT,WT,SIGN> a;
313 for(j=l+1;j<=ir;j++) {
315 for(i=j-1;i>=1;i--) {
331 std::swap(arr[k],arr[l+1]);
332 if(arr[ir] < arr[l+1]) {
333 std::swap(arr[l+1],arr[ir]);
335 if(arr[ir] < arr[l] ) {
336 std::swap(arr[l],arr[ir]);
338 if(arr[l] < arr[l+1]) {
339 std::swap(arr[l+1],arr[l]);
345 do i++;
while (arr[i] < a);
346 do j--;
while (a < arr[j]);
348 std::swap(arr[i],arr[j]);
353 if(jstack > NSTACK) {
354 std::cout <<
"uqsort: NSTACK too small in sort." << std::endl;
390 static int counter = 0;
394 static int counter = 0;
401template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
402 typename mj_part_t,
typename mj_node_t>
406 typedef typename mj_node_t::device_type device_t;
408 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
413 static constexpr size_t future_reduceall_cutoff = 1500000;
417 static constexpr mj_lno_t min_work_last_dim = 1000;
419 static constexpr mj_scalar_t least_signifiance = 0.0001;
420 static constexpr int significance_mul = 1000;
422 std::string mj_timer_base_string;
424 RCP<const Environment> mj_env;
425 RCP<const Comm<int> > mj_problemComm;
426 RCP<Comm<int> > comm;
427 double imbalance_tolerance;
430 int num_weights_per_coord;
431 size_t initial_num_loc_coords;
433 mj_lno_t num_local_coords;
434 mj_gno_t num_global_coords;
435 mj_scalar_t sEpsilon;
438 bool distribute_points_on_cut_lines;
441 mj_part_t max_concurrent_part_calculation;
444 int mj_user_recursion_depth;
445 bool mj_keep_part_boxes;
448 int check_migrate_avoid_migration_option;
455 double minimum_migration_imbalance;
472 mj_part_t num_first_level_parts;
476 Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
478 mj_part_t total_num_cut ;
479 mj_part_t total_num_part;
481 mj_part_t max_num_part_along_dim ;
482 mj_part_t max_num_cut_along_dim;
485 size_t max_num_total_part_along_dim;
487 mj_part_t total_dim_num_reduce_all;
491 mj_part_t last_dim_num_part;
494 Kokkos::View<mj_part_t *, Kokkos::HostSpace> part_no_array;
498 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
502 Kokkos::View<mj_scalar_t **, device_t> mj_weights;
505 Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_parts;
508 Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_weights;
512 size_t num_global_parts;
515 RCP<mj_partBoxVector_t> kept_boxes;
517 RCP<mj_partBox_t> global_box;
522 bool divide_to_prime_first;
525 Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
528 Kokkos::View<mj_gno_t*, device_t> current_mj_gnos;
531 Kokkos::View<int*, Kokkos::HostSpace> owner_of_coordinate;
534 Kokkos::View<mj_lno_t*, device_t> coordinate_permutations;
537 Kokkos::View<mj_lno_t*, device_t> new_coordinate_permutations;
540 Kokkos::View<mj_part_t*, device_t> assigned_part_ids;
543 Kokkos::View<mj_lno_t *, device_t> part_xadj;
546 Kokkos::View<mj_lno_t *, device_t> new_part_xadj;
548 Kokkos::View<mj_scalar_t *, device_t> all_cut_coordinates;
551 Kokkos::View<mj_scalar_t *, device_t>
552 process_cut_line_weight_to_put_left;
555 Kokkos::View<mj_scalar_t *, device_t>
556 thread_cut_line_weight_to_put_left;
562 Kokkos::View<mj_scalar_t *, device_t> cut_coordinates_work_array;
565 Kokkos::View<mj_scalar_t *, device_t> temp_cut_coords;
568 Kokkos::View<mj_scalar_t *, device_t> target_part_weights;
571 Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_coordinates;
574 Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_coordinates;
577 Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_weights;
580 Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_weights;
584 Kokkos::View<mj_scalar_t *, device_t>
585 process_local_min_max_coord_total_weight;
588 Kokkos::View<mj_scalar_t *, device_t>
589 global_min_max_coord_total_weight;
593 Kokkos::View<bool *, device_t> is_cut_line_determined;
599 Kokkos::View<mj_part_t *, device_t> device_incomplete_cut_count;
600 typename decltype(device_incomplete_cut_count)::HostMirror
601 incomplete_cut_count;
604 typename decltype (part_xadj)::HostMirror host_part_xadj;
607 Kokkos::View<double *, device_t>
611 Kokkos::View<double *, device_t>
612 thread_part_weight_work;
616 Kokkos::View<mj_scalar_t *, device_t>
617 thread_cut_left_closest_point;
621 Kokkos::View<mj_scalar_t *, device_t>
622 thread_cut_right_closest_point;
625 Kokkos::View<mj_lno_t *, device_t>
628 Kokkos::View<mj_scalar_t *, device_t> process_rectilinear_cut_weight;
629 Kokkos::View<mj_scalar_t *, device_t> global_rectilinear_cut_weight;
635 Kokkos::View<mj_scalar_t *, device_t>
636 total_part_weight_left_right_closests;
637 Kokkos::View<mj_scalar_t *, device_t>
638 global_total_part_weight_left_right_closests;
640 Kokkos::View<mj_part_t*, device_t> device_num_partitioning_in_current_dim;
641 typename decltype(device_num_partitioning_in_current_dim)::HostMirror
642 host_num_partitioning_in_current_dim;
649 KOKKOS_INLINE_FUNCTION
650 double calculate_imbalance(mj_scalar_t achieved, mj_scalar_t expected) {
651 return static_cast<double>(achieved) /
static_cast<double>(expected) - 1.0;
660 void set_part_specifications();
667 inline mj_part_t get_part_count(
668 mj_part_t num_total_future,
677 void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);
698 mj_part_t update_part_num_arrays(
699 std::vector<mj_part_t> *future_num_part_in_parts,
700 std::vector<mj_part_t> *next_future_num_parts_in_parts,
701 mj_part_t &future_num_parts,
702 mj_part_t current_num_parts,
703 int current_iteration,
704 RCP<mj_partBoxVector_t> input_part_boxes,
705 RCP<mj_partBoxVector_t> output_part_boxes,
706 mj_part_t atomic_part_count);
720 KOKKOS_INLINE_FUNCTION
721 void mj_calculate_new_cut_position (
722 mj_scalar_t cut_upper_bound,
723 mj_scalar_t cut_lower_bound,
724 mj_scalar_t cut_upper_weight,
725 mj_scalar_t cut_lower_weight,
726 mj_scalar_t expected_weight,
727 mj_scalar_t &new_cut_position,
728 mj_scalar_t sEpsilon);
754 bool mj_perform_migration(
755 mj_part_t in_num_parts,
756 mj_part_t &out_num_parts,
757 std::vector<mj_part_t> *next_future_num_parts_in_parts,
758 mj_part_t &output_part_begin_index,
759 size_t migration_reduce_all_population,
760 mj_lno_t num_coords_for_last_dim_part,
761 std::string iteration,
762 RCP<mj_partBoxVector_t> &input_part_boxes,
763 RCP<mj_partBoxVector_t> &output_part_boxes);
782 bool mj_check_to_migrate(
783 size_t migration_reduce_all_population,
784 mj_lno_t num_coords_for_last_dim_part,
787 mj_gno_t *num_points_in_all_processor_parts);
813 void mj_migration_part_proc_assignment(
814 mj_gno_t * num_points_in_all_processor_parts,
817 mj_lno_t *send_count_to_each_proc,
818 std::vector<mj_part_t> &processor_ranks_for_subcomm,
819 std::vector<mj_part_t> *next_future_num_parts_in_parts,
820 mj_part_t &out_num_part,
821 std::vector<mj_part_t> &out_part_indices,
822 mj_part_t &output_part_numbering_begin_index,
823 int *coordinate_destinations);
850 void mj_assign_proc_to_parts(
851 mj_gno_t * num_points_in_all_processor_parts,
854 mj_lno_t *send_count_to_each_proc,
855 std::vector<mj_part_t> &processor_ranks_for_subcomm,
856 std::vector<mj_part_t> *next_future_num_parts_in_parts,
857 mj_part_t &out_part_index,
858 mj_part_t &output_part_numbering_begin_index,
859 int *coordinate_destinations);
876 void assign_send_destinations(
878 mj_part_t *part_assignment_proc_begin_indices,
879 mj_part_t *processor_chains_in_parts,
880 mj_lno_t *send_count_to_each_proc,
881 int *coordinate_destinations);
897 void assign_send_destinations2(
899 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
900 int *coordinate_destinations,
901 mj_part_t &output_part_numbering_begin_index,
902 std::vector<mj_part_t> *next_future_num_parts_in_parts);
926 void mj_assign_parts_to_procs(
927 mj_gno_t * num_points_in_all_processor_parts,
930 mj_lno_t *send_count_to_each_proc,
931 std::vector<mj_part_t> *next_future_num_parts_in_parts,
932 mj_part_t &out_num_part,
933 std::vector<mj_part_t> &out_part_indices,
934 mj_part_t &output_part_numbering_begin_index,
935 int *coordinate_destinations);
950 void mj_migrate_coords(
952 mj_lno_t &num_new_local_points,
953 std::string iteration,
954 int *coordinate_destinations,
955 mj_part_t num_parts);
962 void create_sub_communicator(
963 std::vector<mj_part_t> &processor_ranks_for_subcomm);
969 mj_part_t find_largest_prime_factor(mj_part_t num_parts) {
970 mj_part_t largest_factor = 1;
971 mj_part_t n = num_parts;
972 mj_part_t divisor = 2;
974 while (n % divisor == 0) {
976 largest_factor = divisor;
979 if(divisor * divisor > n) {
986 return largest_factor;
1019 const RCP<const Environment> &env,
1020 RCP<
const Comm<int> > &problemComm,
1021 double imbalance_tolerance,
1023 size_t num_global_parts,
1024 Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array,
1025 int recursion_depth,
1027 mj_lno_t num_local_coords,
1028 mj_gno_t num_global_coords,
1029 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos,
1031 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates,
1032 int num_weights_per_coord,
1033 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights,
1034 Kokkos::View<mj_scalar_t**, device_t> & mj_weights,
1035 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts,
1036 Kokkos::View<mj_part_t*, device_t> & result_assigned_part_ids,
1037 Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos);
1052 bool distribute_points_on_cut_lines_,
1053 int max_concurrent_part_calculation_,
1054 int check_migrate_avoid_migration_option_,
1055 double minimum_migration_imbalance_,
1056 int migration_type_ = 0);
1073 RCP<mj_partBoxVector_t> &localPartBoxes)
const;
1114 const RCP<const Environment> &env,
1115 mj_lno_t num_total_coords,
1116 mj_lno_t num_selected_coords,
1117 size_t num_target_part,
1120 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
1121 Kokkos::View<mj_lno_t *, device_t> &
1122 initial_selected_coords_output_permutation,
1123 mj_lno_t *output_xadj,
1124 int recursion_depth_,
1125 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array,
1126 bool partition_along_longest_dim,
1127 int num_ranks_per_node,
1128 bool divide_to_prime_first_,
1129 mj_part_t num_first_level_parts_ = 1,
1130 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_
1131 = Kokkos::View<mj_part_t *, Kokkos::HostSpace>());
1133#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
1141 void allocate_set_work_memory();
1144 void compute_global_box();
1153 void mj_get_local_min_max_coord_totW(
1154 mj_part_t current_work_part,
1155 mj_part_t current_concurrent_num_parts,
1156 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords);
1170 void mj_get_global_min_max_coord_totW(
1171 mj_part_t current_concurrent_num_parts,
1172 Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
1173 Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total);
1205 void mj_get_initial_cut_coords_target_weights(
1206 mj_scalar_t min_coord,
1207 mj_scalar_t max_coord,
1208 mj_part_t num_cuts ,
1209 mj_scalar_t global_weight,
1210 Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
1211 Kokkos::View<mj_scalar_t *, device_t> & target_part_weights,
1212 std::vector <mj_part_t> *future_num_part_in_parts,
1213 std::vector <mj_part_t> *next_future_num_parts_in_parts,
1214 mj_part_t concurrent_current_part,
1215 mj_part_t obtained_part_index,
1216 mj_part_t num_target_first_level_parts = 1,
1217 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist =
1218 Kokkos::View<mj_part_t *, Kokkos::HostSpace>());
1236 void set_initial_coordinate_parts(
1237 mj_scalar_t &max_coordinate,
1238 mj_scalar_t &min_coordinate,
1239 mj_lno_t coordinate_begin_index,
1240 mj_lno_t coordinate_end_index,
1241 Kokkos::View<mj_lno_t *, device_t> &
1242 mj_current_coordinate_permutations,
1243 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1244 Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
1245 mj_part_t &partition_count);
1264 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1265 double imbalanceTolerance,
1266 mj_part_t current_work_part,
1267 mj_part_t current_concurrent_num_parts,
1268 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
1269 mj_part_t total_incomplete_cut_count,
1270 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
1271 Kokkos::View<size_t*, device_t> & view_total_reduction_size);
1278 void mj_1D_part_get_part_weights(
1279 mj_part_t current_concurrent_num_parts,
1280 mj_part_t current_work_part,
1281 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1291 void mj_combine_rightleft_and_weights(
1292 mj_part_t current_work_part,
1293 mj_part_t current_concurrent_num_parts);
1307 void mj_create_new_partitions(
1308 mj_part_t num_parts,
1309 mj_part_t current_concurrent_work_part,
1310 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1311 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
1312 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
1313 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj);
1350 void mj_get_new_cut_coordinates(
1351 mj_part_t current_concurrent_num_parts,
1353 const mj_part_t &num_cuts,
1354 const double &used_imbalance_tolerance,
1355 Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
1356 Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
1357 Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
1358 Kokkos::View<bool *, device_t> & current_cut_line_determined,
1359 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
1360 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
1361 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
1362 Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
1363 Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
1364 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
1365 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
1366 Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
1367 Kokkos::View<mj_scalar_t *, device_t> &
1368 current_part_cut_line_weight_to_put_left,
1369 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count);
1380 void get_processor_num_points_in_parts(
1381 mj_part_t num_procs,
1382 mj_part_t num_parts,
1383 mj_gno_t *&num_points_in_all_processor_parts);
1389 void fill_permutation_array(
1390 mj_part_t output_num_parts,
1391 mj_part_t num_parts);
1414 void create_consistent_chunks(
1415 mj_part_t num_parts,
1416 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1417 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
1418 mj_lno_t coordinate_begin,
1419 mj_lno_t coordinate_end,
1420 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
1421 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
1423 bool longest_dim_part,
1424 uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted);
1434 void set_final_parts(
1435 mj_part_t current_num_parts,
1436 mj_part_t output_part_begin_index,
1437 RCP<mj_partBoxVector_t> &output_part_boxes,
1438 bool is_data_ever_migrated);
1443template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1444 typename mj_part_t,
typename mj_node_t>
1446 mj_env(), mj_problemComm(), comm(), imbalance_tolerance(0),
1447 recursion_depth(0), coord_dim(0),
1448 num_weights_per_coord(0), initial_num_loc_coords(0),
1449 initial_num_glob_coords(0),
1450 num_local_coords(0), num_global_coords(0),
1451 sEpsilon(std::numeric_limits<mj_scalar_t>::
epsilon() * 100),
1452 distribute_points_on_cut_lines(true),
1453 max_concurrent_part_calculation(1),
1454 mj_run_as_rcb(false), mj_user_recursion_depth(0),
1455 mj_keep_part_boxes(false),
1456 check_migrate_avoid_migration_option(0), migration_type(0),
1457 minimum_migration_imbalance(0.30),
1458 num_first_level_parts(1),
1459 total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
1460 max_num_cut_along_dim(0),
1461 max_num_total_part_along_dim(0),
1462 total_dim_num_reduce_all(0),
1463 last_dim_num_part(0),
1465 num_global_parts(1),
1466 kept_boxes(), global_box(),
1467 myRank(0), myActualRank(0),
1468 divide_to_prime_first(false)
1515template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1516 typename mj_part_t,
typename mj_node_t>
1519 const RCP<const Environment> &env,
1520 mj_lno_t num_total_coords,
1521 mj_lno_t num_selected_coords,
1522 size_t num_target_part,
1525 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> &
1527 Kokkos::View<mj_lno_t *, device_t> & initial_adjList_output_adjlist,
1528 mj_lno_t *output_xadj,
1529 int recursion_depth_,
1530 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array_,
1531 bool partition_along_longest_dim,
1532 int num_ranks_per_node,
1533 bool divide_to_prime_first_,
1534 mj_part_t num_first_level_parts_,
1535 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_)
1538 const RCP<Comm<int> > commN;
1539 this->mj_problemComm = Teuchos::DefaultComm<int>::getDefaultSerialComm(commN);
1540 this->comm = Teuchos::rcp_const_cast<Comm<int> >(this->mj_problemComm);
1541 this->myActualRank = this->myRank = 1;
1543 this->divide_to_prime_first = divide_to_prime_first_;
1548 this->imbalance_tolerance = 0;
1549 this->num_global_parts = num_target_part;
1550 this->part_no_array = part_no_array_;
1551 this->recursion_depth = recursion_depth_;
1555 this->num_first_level_parts = num_first_level_parts_;
1557 this->first_level_distribution = first_level_distribution_;
1559 this->coord_dim = coord_dim_;
1560 this->num_local_coords = num_total_coords;
1562 this->num_global_coords = num_total_coords;
1563 this->mj_coordinates = mj_coordinates_;
1566 this->initial_mj_gnos =
1567 Kokkos::View<mj_gno_t*, device_t>(
"gids", this->num_local_coords);
1569 this->num_weights_per_coord = 0;
1571 this->mj_uniform_weights = Kokkos::View<bool*, Kokkos::HostSpace>(
1572 "uniform weights", 1);
1573 this->mj_uniform_weights(0) =
true;
1575 this->mj_weights = Kokkos::View<mj_scalar_t**, device_t>
1578 this->mj_uniform_parts =
1579 Kokkos::View<bool*, Kokkos::HostSpace>(
"uniform parts", 1);
1580 this->mj_uniform_parts(0) =
true;
1582 this->set_part_specifications();
1584 this->allocate_set_work_memory();
1587 auto local_part_xadj = this->part_xadj;
1588 Kokkos::parallel_for(
1589 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
1590 KOKKOS_LAMBDA (
int dummy) {
1591 local_part_xadj(0) =
static_cast<mj_lno_t
>(num_selected_coords);
1594 Kokkos::deep_copy(coordinate_permutations, initial_adjList_output_adjlist);
1596 mj_part_t current_num_parts = 1;
1598 Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
1599 this->all_cut_coordinates;
1601 mj_part_t future_num_parts = this->total_num_part;
1603 std::vector<mj_part_t> *future_num_part_in_parts =
1604 new std::vector<mj_part_t>();
1605 std::vector<mj_part_t> *next_future_num_parts_in_parts =
1606 new std::vector<mj_part_t>();
1607 next_future_num_parts_in_parts->push_back(this->num_global_parts);
1608 RCP<mj_partBoxVector_t> t1;
1609 RCP<mj_partBoxVector_t> t2;
1611 std::vector <uSignedSortItem<int, mj_scalar_t, char>>
1612 coord_dimension_range_sorted(this->coord_dim);
1613 uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted =
1614 &(coord_dimension_range_sorted[0]);
1615 std::vector <mj_scalar_t> coord_dim_mins(this->coord_dim);
1616 std::vector <mj_scalar_t> coord_dim_maxs(this->coord_dim);
1620 Kokkos::View<mj_part_t*, device_t>
1621 view_rectilinear_cut_count(
"view_rectilinear_cut_count", 1);
1622 Kokkos::View<size_t*, device_t>
1623 view_total_reduction_size(
"view_total_reduction_size", 1);
1625 for(
int rd = 0; rd < this->recursion_depth; ++rd) {
1631 std::vector<mj_part_t> *tmpPartVect = future_num_part_in_parts;
1632 future_num_part_in_parts = next_future_num_parts_in_parts;
1633 next_future_num_parts_in_parts = tmpPartVect;
1637 next_future_num_parts_in_parts->clear();
1640 mj_part_t output_part_count_in_dimension =
1641 this->update_part_num_arrays(
1642 future_num_part_in_parts,
1643 next_future_num_parts_in_parts,
1648 t2, num_ranks_per_node);
1653 if(output_part_count_in_dimension == current_num_parts) {
1654 tmpPartVect = future_num_part_in_parts;
1655 future_num_part_in_parts = next_future_num_parts_in_parts;
1656 next_future_num_parts_in_parts = tmpPartVect;
1661 std::string istring = std::to_string(rd);
1665 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
1666 "new part xadj", output_part_count_in_dimension);
1670 mj_part_t output_part_index = 0;
1674 mj_part_t output_coordinate_end_index = 0;
1676 mj_part_t current_work_part = 0;
1677 mj_part_t current_concurrent_num_parts = 1;
1679 mj_part_t obtained_part_index = 0;
1682 int coordInd = rd % this->coord_dim;
1684 Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
1685 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1687 auto host_process_local_min_max_coord_total_weight =
1688 Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
1689 auto host_global_min_max_coord_total_weight =
1690 Kokkos::create_mirror_view(global_min_max_coord_total_weight);
1693 for(; current_work_part < current_num_parts;
1694 current_work_part += current_concurrent_num_parts) {
1696 mj_part_t actual_work_part_count = 0;
1701 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1702 mj_part_t current_work_part_in_concurrent_parts =
1703 current_work_part + kk;
1707 mj_part_t partition_count = host_num_partitioning_in_current_dim(
1708 current_work_part_in_concurrent_parts);
1709 if(partition_count == 1) {
1712 ++actual_work_part_count;
1713 if(partition_along_longest_dim) {
1714 auto local_process_local_min_max_coord_total_weight =
1715 this->process_local_min_max_coord_total_weight;
1716 for(
int coord_traverse_ind = 0;
1717 coord_traverse_ind < this->coord_dim; ++coord_traverse_ind) {
1719 Kokkos::View<mj_scalar_t *, device_t> coords =
1720 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coord_traverse_ind);
1722 this->mj_get_local_min_max_coord_totW(
1724 current_concurrent_num_parts,
1727 coord_dimension_range_sorted[coord_traverse_ind].id =
1729 coord_dimension_range_sorted[coord_traverse_ind].signbit = 1;
1731 Kokkos::deep_copy(host_process_local_min_max_coord_total_weight,
1732 process_local_min_max_coord_total_weight);
1734 coord_dim_mins[coord_traverse_ind] =
1735 host_process_local_min_max_coord_total_weight(kk);
1736 coord_dim_maxs[coord_traverse_ind] =
1737 host_process_local_min_max_coord_total_weight(
1738 kk + current_concurrent_num_parts);
1739 coord_dimension_range_sorted[coord_traverse_ind].val =
1740 host_process_local_min_max_coord_total_weight(
1741 kk + current_concurrent_num_parts) -
1742 host_process_local_min_max_coord_total_weight(kk);
1745 uqSignsort(this->coord_dim, p_coord_dimension_range_sorted);
1746 coordInd = p_coord_dimension_range_sorted[this->coord_dim - 1].id;
1747 auto set_min = coord_dim_mins[coordInd];
1748 auto set_max = coord_dim_maxs[coordInd];
1749 Kokkos::parallel_for(
1750 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
1751 (0, 1), KOKKOS_LAMBDA (
int dummy) {
1752 local_process_local_min_max_coord_total_weight(kk) = set_min;
1753 local_process_local_min_max_coord_total_weight(
1754 kk + current_concurrent_num_parts) = set_max;
1757 mj_current_dim_coords =
1758 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1761 Kokkos::View<mj_scalar_t *, device_t> coords =
1762 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1763 this->mj_get_local_min_max_coord_totW(
1765 current_concurrent_num_parts,
1771 if(actual_work_part_count > 0) {
1773 this->mj_get_global_min_max_coord_totW(
1774 current_concurrent_num_parts,
1775 this->process_local_min_max_coord_total_weight,
1776 this->global_min_max_coord_total_weight);
1779 Kokkos::deep_copy(host_global_min_max_coord_total_weight,
1780 global_min_max_coord_total_weight);
1784 mj_part_t total_incomplete_cut_count = 0;
1789 mj_part_t concurrent_part_cut_shift = 0;
1790 mj_part_t concurrent_part_part_shift = 0;
1791 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1792 mj_scalar_t min_coordinate =
1793 host_global_min_max_coord_total_weight(kk);
1794 mj_scalar_t max_coordinate = host_global_min_max_coord_total_weight(
1795 kk + current_concurrent_num_parts);
1796 mj_scalar_t global_total_weight = host_global_min_max_coord_total_weight(
1797 kk + 2*current_concurrent_num_parts);
1799 mj_part_t concurrent_current_part_index = current_work_part + kk;
1801 mj_part_t partition_count = host_num_partitioning_in_current_dim(
1802 concurrent_current_part_index);
1804 Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
1805 Kokkos::subview(current_cut_coordinates,
1806 std::pair<mj_lno_t, mj_lno_t>(
1807 concurrent_part_cut_shift,
1808 current_cut_coordinates.size()));
1809 Kokkos::View<mj_scalar_t *, device_t>
1810 current_target_part_weights =
1811 Kokkos::subview(target_part_weights,
1812 std::pair<mj_lno_t, mj_lno_t>(
1813 concurrent_part_part_shift,
1814 target_part_weights.size()));
1817 concurrent_part_cut_shift += partition_count - 1;
1819 concurrent_part_part_shift += partition_count;
1822 if(partition_count > 1 && min_coordinate <= max_coordinate) {
1825 total_incomplete_cut_count += partition_count - 1;
1827 this->incomplete_cut_count(kk) = partition_count - 1;
1835 this->mj_get_initial_cut_coords_target_weights(
1838 partition_count - 1,
1839 global_total_weight,
1841 current_target_part_weights,
1842 future_num_part_in_parts,
1843 next_future_num_parts_in_parts,
1844 concurrent_current_part_index,
1845 obtained_part_index,
1846 rd == 0 ? this->num_first_level_parts : 1,
1847 this->first_level_distribution);
1849 mj_lno_t coordinate_end_index =
1850 host_part_xadj(concurrent_current_part_index);
1851 mj_lno_t coordinate_begin_index =
1852 (concurrent_current_part_index==0) ? 0 :
1853 host_part_xadj[concurrent_current_part_index - 1];
1856 this->set_initial_coordinate_parts(
1859 coordinate_begin_index, coordinate_end_index,
1860 this->coordinate_permutations,
1861 mj_current_dim_coords,
1862 this->assigned_part_ids,
1868 this->incomplete_cut_count(kk) = 0;
1870 obtained_part_index += partition_count;
1875 double used_imbalance = 0;
1878 this->mj_env->timerStart(MACRO_TIMERS,
1879 mj_timer_base_string +
"mj_1D_part()");
1882 mj_current_dim_coords,
1885 current_concurrent_num_parts,
1886 current_cut_coordinates,
1887 total_incomplete_cut_count,
1888 view_rectilinear_cut_count,
1889 view_total_reduction_size);
1891 this->mj_env->timerStop(MACRO_TIMERS,
1892 mj_timer_base_string +
"mj_1D_part()");
1895 obtained_part_index += current_concurrent_num_parts;
1899 mj_part_t output_array_shift = 0;
1900 mj_part_t cut_shift = 0;
1901 size_t tlr_shift = 0;
1902 size_t partweight_array_shift = 0;
1904 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1905 mj_part_t current_concurrent_work_part = current_work_part + kk;
1907 mj_part_t num_parts = host_num_partitioning_in_current_dim(
1908 current_concurrent_work_part);
1911 int coordinateA_bigger_than_coordinateB =
1912 host_global_min_max_coord_total_weight(kk) >
1913 host_global_min_max_coord_total_weight(
1914 kk + current_concurrent_num_parts);
1916 if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
1919 auto local_new_part_xadj = this->new_part_xadj;
1920 Kokkos::parallel_for(
1921 Kokkos::RangePolicy<
typename mj_node_t::execution_space,
1922 mj_part_t> (0, num_parts), KOKKOS_LAMBDA(mj_part_t jj) {
1923 local_new_part_xadj(
1924 output_part_index + output_array_shift + jj) = 0;
1927 cut_shift += num_parts - 1;
1928 tlr_shift += (4 *(num_parts - 1) + 1);
1929 output_array_shift += num_parts;
1930 partweight_array_shift += (2 * (num_parts - 1) + 1);
1933 mj_lno_t coordinate_end =
1934 host_part_xadj(current_concurrent_work_part);
1935 mj_lno_t coordinate_begin =
1936 current_concurrent_work_part==0 ? 0 :
1937 host_part_xadj(current_concurrent_work_part-1);
1939 Kokkos::View<mj_scalar_t *, device_t>
1940 current_concurrent_cut_coordinate =
1941 Kokkos::subview(current_cut_coordinates,
1942 std::pair<mj_lno_t, mj_lno_t>(
1944 current_cut_coordinates.size()));
1945 Kokkos::View<mj_scalar_t *, device_t>
1946 used_local_cut_line_weight_to_left =
1947 Kokkos::subview(process_cut_line_weight_to_put_left,
1948 std::pair<mj_lno_t, mj_lno_t>(
1950 process_cut_line_weight_to_put_left.size()));
1952 this->thread_part_weight_work =
1954 this->thread_part_weights,
1955 std::pair<mj_lno_t, mj_lno_t>(
1956 partweight_array_shift,
1957 this->thread_part_weights.size()));
1961 Kokkos::View<mj_lno_t *, device_t> subview_new_part_xadj =
1962 Kokkos::subview(this->new_part_xadj,
1963 std::pair<mj_lno_t, mj_lno_t>(
1964 output_part_index + output_array_shift,
1965 this->new_part_xadj.size()));
1967 this->create_consistent_chunks(
1969 mj_current_dim_coords,
1970 current_concurrent_cut_coordinate,
1973 used_local_cut_line_weight_to_left,
1974 subview_new_part_xadj,
1976 partition_along_longest_dim,
1977 p_coord_dimension_range_sorted);
1982 mj_lno_t part_size = coordinate_end - coordinate_begin;
1984 auto local_new_part_xadj = this->new_part_xadj;
1985 Kokkos::parallel_for(
1986 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
1987 (0, 1), KOKKOS_LAMBDA (
int dummy) {
1988 local_new_part_xadj(output_part_index + output_array_shift)
1992 auto subview_new_coordinate_permutations =
1993 Kokkos::subview(this->new_coordinate_permutations,
1994 std::pair<mj_lno_t, mj_lno_t>(
1996 coordinate_begin + part_size));
1997 auto subview_coordinate_permutations =
1998 Kokkos::subview(this->coordinate_permutations,
1999 std::pair<mj_lno_t, mj_lno_t>(
2001 coordinate_begin + part_size));
2002 Kokkos::deep_copy(subview_new_coordinate_permutations,
2003 subview_coordinate_permutations);
2006 cut_shift += num_parts - 1;
2007 tlr_shift += (4 *(num_parts - 1) + 1);
2008 output_array_shift += num_parts;
2009 partweight_array_shift += (2 * (num_parts - 1) + 1);
2018 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
2019 mj_part_t num_parts =
2020 host_num_partitioning_in_current_dim(current_work_part + kk);
2021 auto local_new_part_xadj = this->new_part_xadj;
2022 auto local_mj_current_dim_coords = mj_current_dim_coords;
2023 auto local_new_coordinate_permutations =
2024 new_coordinate_permutations;
2025 Kokkos::parallel_for(
2026 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t> (
2027 0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
2029 local_new_part_xadj(output_part_index+ii) +=
2030 output_coordinate_end_index;
2033 mj_lno_t coordinate_end =
2034 local_new_part_xadj(output_part_index+ii);
2035 mj_lno_t coordinate_begin =
2036 local_new_part_xadj(output_part_index);
2038 for(mj_lno_t task_traverse = coordinate_begin;
2039 task_traverse < coordinate_end; ++task_traverse) {
2040 mj_lno_t l = local_new_coordinate_permutations(task_traverse);
2042 local_mj_current_dim_coords(l) = -local_mj_current_dim_coords(l);
2048 mj_part_t get_single;
2049 Kokkos::parallel_reduce(
"Read new_part_xadj",
2050 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
2051 KOKKOS_LAMBDA(
int dummy, mj_part_t & set_single) {
2052 set_single = local_new_part_xadj(output_part_index + num_parts - 1);
2055 output_coordinate_end_index = get_single;
2057 output_part_index += num_parts;
2064 current_num_parts = output_part_count_in_dimension;
2067 Kokkos::View<mj_lno_t *, device_t> tmp = this->coordinate_permutations;
2068 this->coordinate_permutations = this->new_coordinate_permutations;
2069 this->new_coordinate_permutations = tmp;
2071 this->part_xadj = this->new_part_xadj;
2072 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
2073 Kokkos::deep_copy(host_part_xadj, part_xadj);
2074 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
"empty", 0);
2077 Kokkos::deep_copy(initial_adjList_output_adjlist, coordinate_permutations);
2081 for(
size_t i = 0; i < this->num_global_parts ; ++i) {
2082 output_xadj[i+1] = host_part_xadj(i);
2085 delete future_num_part_in_parts;
2086 delete next_future_num_parts_in_parts;
2092template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2093 typename mj_part_t,
typename mj_node_t>
2095 <mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,mj_node_t>::mj_partBox_t>
2099 return this->global_box;
2104template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2105 typename mj_part_t,
typename mj_node_t>
2106void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2107 mj_node_t>::set_to_keep_part_boxes()
2109 this->mj_keep_part_boxes =
true;
2119template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2120 typename mj_part_t,
typename mj_node_t>
2124 this->total_num_cut = 0;
2125 this->total_num_part = 1;
2126 this->max_num_part_along_dim = 0;
2127 this->total_dim_num_reduce_all = 0;
2128 this->last_dim_num_part = 1;
2131 this->max_num_cut_along_dim = 0;
2132 this->max_num_total_part_along_dim = 0;
2134 if(this->part_no_array.size()) {
2135 auto local_recursion_depth = this->recursion_depth;
2137 this->total_dim_num_reduce_all =
2138 this->total_num_part * this->recursion_depth;
2140 this->total_num_part = 1;
2141 for(
int i = 0; i < local_recursion_depth; ++i) {
2142 this->total_num_part *= this->part_no_array(i);
2145 mj_part_t track_max = 0;
2146 for(
int i = 0; i < local_recursion_depth; ++i) {
2147 if(part_no_array(i) > track_max) {
2148 track_max = this->part_no_array(i);
2152 this->last_dim_num_part = this->total_num_part /
2153 this->part_no_array(local_recursion_depth-1);
2155 this->max_num_part_along_dim = track_max;
2156 this->num_global_parts = this->total_num_part;
2158 mj_part_t future_num_parts = this->num_global_parts;
2162 if (this->first_level_distribution.size() != 0 &&
2163 this->num_first_level_parts > 1) {
2164 this->max_num_part_along_dim = this->num_first_level_parts;
2169 for(
int rd = 0; rd < this->recursion_depth; ++rd) {
2170 mj_part_t maxNoPartAlongI = 0;
2171 mj_part_t nfutureNumParts = 0;
2177 this->first_level_distribution.size() != 0 &&
2178 this->num_first_level_parts > 1) {
2180 maxNoPartAlongI = this->num_first_level_parts;
2181 this->max_num_part_along_dim = this->num_first_level_parts;
2183 mj_part_t sum_first_level_dist = 0;
2184 mj_part_t max_part = 0;
2187 for (
int i = 0; i < this->num_first_level_parts; ++i) {
2188 sum_first_level_dist += this->first_level_distribution(i);
2189 if (this->first_level_distribution(i) > max_part)
2190 max_part = this->first_level_distribution(i);
2196 this->num_global_parts * max_part / sum_first_level_dist;
2200 maxNoPartAlongI = this->get_part_count(future_num_parts,
2201 1.0f / (this->recursion_depth - rd));
2202 if (maxNoPartAlongI > this->max_num_part_along_dim)
2203 this->max_num_part_along_dim = maxNoPartAlongI;
2204 nfutureNumParts = future_num_parts / maxNoPartAlongI;
2205 if (future_num_parts % maxNoPartAlongI) {
2209 future_num_parts = nfutureNumParts;
2211 this->total_num_part = this->num_global_parts;
2213 if(this->divide_to_prime_first) {
2214 this->total_dim_num_reduce_all = this->num_global_parts * 2;
2215 this->last_dim_num_part = this->num_global_parts;
2222 for(
int i = 0; i < this->recursion_depth; ++i) {
2223 this->total_dim_num_reduce_all += p;
2224 p *= this->max_num_part_along_dim;
2227 if(p / this->max_num_part_along_dim > this->num_global_parts) {
2228 this->last_dim_num_part = this->num_global_parts;
2231 this->last_dim_num_part = p / this->max_num_part_along_dim;
2236 this->total_num_cut = this->total_num_part - 1;
2237 this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
2238 this->max_num_total_part_along_dim = this->max_num_part_along_dim +
2239 size_t(this->max_num_cut_along_dim);
2244 if(this->max_concurrent_part_calculation > this->last_dim_num_part) {
2245 if(this->mj_problemComm->getRank() == 0) {
2246 std::cerr <<
"Warning: Concurrent part count (" <<
2247 this->max_concurrent_part_calculation <<
2248 ") has been set bigger than maximum amount that can be used." <<
2249 " Setting to:" << this->last_dim_num_part <<
"." << std::endl;
2251 this->max_concurrent_part_calculation = this->last_dim_num_part;
2260template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2261 typename mj_part_t,
typename mj_node_t>
2262inline mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2263 get_part_count(mj_part_t num_total_future,
double root)
2265 double fp = pow(num_total_future, root);
2266 mj_part_t ip = mj_part_t(fp);
2267 if(fp - ip < std::numeric_limits<float>::epsilon() * 100) {
2294template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2295 typename mj_part_t,
typename mj_node_t>
2296mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2297 update_part_num_arrays(
2298 std::vector<mj_part_t> *future_num_part_in_parts,
2299 std::vector<mj_part_t> *next_future_num_parts_in_parts,
2300 mj_part_t &future_num_parts,
2301 mj_part_t current_num_parts,
2302 int current_iteration,
2303 RCP<mj_partBoxVector_t> input_part_boxes,
2304 RCP<mj_partBoxVector_t> output_part_boxes,
2305 mj_part_t atomic_part_count)
2307 std::vector<mj_part_t> num_partitioning_in_current_dim;
2310 mj_part_t output_num_parts = 0;
2311 if(this->part_no_array.size()) {
2315 mj_part_t current_part_no_array =
2316 this->part_no_array(current_iteration);
2318 if(current_part_no_array < 1) {
2319 std::cout <<
"Current recursive iteration: " << current_iteration <<
2320 " part_no_array[" << current_iteration <<
"] is given as:" <<
2321 current_part_no_array << std::endl;
2324 if(current_part_no_array == 1) {
2325 return current_num_parts;
2329 if (this->first_level_distribution.size() != 0 &&
2330 current_iteration == 0 &&
2331 current_part_no_array != this->num_first_level_parts) {
2332 std::cout <<
"Current recursive iteration: " << current_iteration
2333 <<
" part_no_array[" << current_iteration <<
"] is given as: " <<
2334 current_part_no_array <<
" and contradicts num_first_level_parts: " <<
2335 this->num_first_level_parts << std::endl;
2339 for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
2340 num_partitioning_in_current_dim.push_back(current_part_no_array);
2357 future_num_parts /= num_partitioning_in_current_dim[0];
2358 output_num_parts = current_num_parts *
2359 num_partitioning_in_current_dim[0];
2360 if(this->mj_keep_part_boxes) {
2361 for(mj_part_t k = 0; k < current_num_parts; ++k) {
2363 for(mj_part_t j = 0; j <
2364 num_partitioning_in_current_dim[0]; ++j) {
2365 output_part_boxes->push_back((*input_part_boxes)[k]);
2373 for(mj_part_t ii = 0; ii < output_num_parts; ++ii) {
2374 next_future_num_parts_in_parts->push_back(future_num_parts);
2384 future_num_parts = 1;
2387 for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
2389 mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];
2393 mj_part_t num_partitions_in_current_dim =
2394 this->get_part_count(future_num_parts_of_part_ii,
2395 1.0 / (this->recursion_depth - current_iteration)
2397 if(num_partitions_in_current_dim > this->max_num_part_along_dim) {
2398 std::cerr <<
"ERROR: maxPartNo calculation is wrong."
2399 " num_partitions_in_current_dim: "
2400 << num_partitions_in_current_dim <<
" this->max_num_part_along_dim: "
2401 << this->max_num_part_along_dim <<
2402 " this->recursion_depth: " << this->recursion_depth <<
2403 " current_iteration:" << current_iteration <<
2404 " future_num_parts_of_part_ii: " << future_num_parts_of_part_ii <<
2405 " might need to fix max part no calculation for "
2406 "largest_prime_first partitioning." <<
2418 if (current_iteration == 0 &&
2419 this->first_level_distribution.size() != 0 &&
2420 this->num_first_level_parts > 1) {
2423 num_partitioning_in_current_dim.push_back(this->num_first_level_parts);
2426 output_num_parts = this->num_first_level_parts;
2429 future_num_parts /= this->num_first_level_parts;
2431 mj_part_t max_part = 0;
2432 mj_part_t sum_first_level_dist = 0;
2436 for (
int i = 0; i < this->num_first_level_parts; ++i) {
2437 sum_first_level_dist += this->first_level_distribution(i);
2439 if (this->first_level_distribution(i) > max_part)
2440 max_part = this->first_level_distribution(i);
2444 future_num_parts = this->num_global_parts * max_part / sum_first_level_dist;
2448 for (
int i = 0; i < this->num_first_level_parts; ++i) {
2449 next_future_num_parts_in_parts->push_back(this->first_level_distribution(i) *
2450 this->num_global_parts / sum_first_level_dist);
2453 else if (this->divide_to_prime_first) {
2455 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2457 mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2460 output_num_parts += num_partitions_in_current_dim;
2462 if (future_num_parts_of_part_ii == atomic_part_count ||
2463 future_num_parts_of_part_ii % atomic_part_count != 0) {
2464 atomic_part_count = 1;
2467 largest_prime_factor =
2468 this->find_largest_prime_factor(future_num_parts_of_part_ii / atomic_part_count);
2475 if (largest_prime_factor < num_partitions_in_current_dim) {
2476 largest_prime_factor = num_partitions_in_current_dim;
2479 mj_part_t ideal_num_future_parts_in_part =
2480 (future_num_parts_of_part_ii / atomic_part_count) / largest_prime_factor;
2482 mj_part_t ideal_prime_scale = largest_prime_factor / num_partitions_in_current_dim;
2490 for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2492 mj_part_t my_ideal_primescale = ideal_prime_scale;
2494 if (iii < (largest_prime_factor) % num_partitions_in_current_dim) {
2495 ++my_ideal_primescale;
2498 mj_part_t num_future_parts_for_part_iii =
2499 ideal_num_future_parts_in_part * my_ideal_primescale;
2502 if (iii < (future_num_parts_of_part_ii / atomic_part_count) % largest_prime_factor) {
2504 ++num_future_parts_for_part_iii;
2507 next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);
2510 if (this->mj_keep_part_boxes) {
2511 output_part_boxes->push_back((*input_part_boxes)[ii]);
2515 if (num_future_parts_for_part_iii > future_num_parts)
2516 future_num_parts = num_future_parts_for_part_iii;
2522 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2525 output_num_parts += num_partitions_in_current_dim;
2527 if((future_num_parts_of_part_ii == atomic_part_count) ||
2528 (future_num_parts_of_part_ii % atomic_part_count != 0)) {
2529 atomic_part_count = 1;
2532 mj_part_t ideal_num_future_parts_in_part =
2533 (future_num_parts_of_part_ii / atomic_part_count) /
2534 num_partitions_in_current_dim;
2535 for(mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2536 mj_part_t num_future_parts_for_part_iii =
2537 ideal_num_future_parts_in_part;
2540 if(iii < (future_num_parts_of_part_ii / atomic_part_count) %
2541 num_partitions_in_current_dim) {
2543 ++num_future_parts_for_part_iii;
2546 next_future_num_parts_in_parts->push_back(
2547 num_future_parts_for_part_iii * atomic_part_count);
2551 if(this->mj_keep_part_boxes) {
2552 output_part_boxes->push_back((*input_part_boxes)[ii]);
2555 if(num_future_parts_for_part_iii > future_num_parts)
2556 future_num_parts = num_future_parts_for_part_iii;
2562 device_num_partitioning_in_current_dim = Kokkos::View<
2563 mj_part_t*, device_t>(
"test", num_partitioning_in_current_dim.size());
2564 host_num_partitioning_in_current_dim =
2565 Kokkos::create_mirror_view(device_num_partitioning_in_current_dim);
2566 for(
size_t n = 0; n < num_partitioning_in_current_dim.size(); ++n) {
2567 host_num_partitioning_in_current_dim(n) =
2568 num_partitioning_in_current_dim[n];
2573 Kokkos::deep_copy(device_num_partitioning_in_current_dim,
2574 host_num_partitioning_in_current_dim);
2575 return output_num_parts;
2580template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2581 typename mj_part_t,
typename mj_node_t>
2582void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2583 allocate_set_work_memory()
2588 this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2589 Kokkos::ViewAllocateWithoutInitializing(
"coordinate_permutations"),
2590 this->num_local_coords);
2591 auto local_coordinate_permutations = coordinate_permutations;
2592 Kokkos::parallel_for(
2593 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
2594 0, this->num_local_coords), KOKKOS_LAMBDA (mj_lno_t i) {
2595 local_coordinate_permutations(i) = i;
2599 this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2600 Kokkos::ViewAllocateWithoutInitializing(
"num_local_coords"),
2601 this->num_local_coords);
2603 this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2604 Kokkos::ViewAllocateWithoutInitializing(
"assigned parts"), 0);
2605 if(this->num_local_coords > 0) {
2606 this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2607 Kokkos::ViewAllocateWithoutInitializing(
"assigned part ids"),
2608 this->num_local_coords);
2615 this->part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2616 Kokkos::ViewAllocateWithoutInitializing(
"part xadj"), 1);
2617 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
2618 host_part_xadj(0) = num_local_coords;
2619 Kokkos::deep_copy(this->part_xadj, host_part_xadj);
2622 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2623 Kokkos::ViewAllocateWithoutInitializing(
"empty"), 0);
2626 this->all_cut_coordinates = Kokkos::View<mj_scalar_t*, device_t>(
2627 Kokkos::ViewAllocateWithoutInitializing(
"all cut coordinates"),
2628 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2631 this->process_cut_line_weight_to_put_left = Kokkos::View<mj_scalar_t*,
2632 device_t>(Kokkos::ViewAllocateWithoutInitializing(
"empty"), 0);
2636 this->thread_cut_line_weight_to_put_left =
2637 Kokkos::View<mj_scalar_t*, device_t>(
2638 Kokkos::ViewAllocateWithoutInitializing(
"empty"), 0);
2640 if(this->distribute_points_on_cut_lines) {
2641 this->process_cut_line_weight_to_put_left =
2642 Kokkos::View<mj_scalar_t *, device_t>(
2643 Kokkos::ViewAllocateWithoutInitializing(
2644 "process_cut_line_weight_to_put_left"),
2645 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2646 this->thread_cut_line_weight_to_put_left =
2647 Kokkos::View<mj_scalar_t *, device_t>(
2648 Kokkos::ViewAllocateWithoutInitializing(
2649 "thread_cut_line_weight_to_put_left"),
2650 this->max_num_cut_along_dim);
2651 this->process_rectilinear_cut_weight =
2652 Kokkos::View<mj_scalar_t *, device_t>(
2653 Kokkos::ViewAllocateWithoutInitializing(
"process_rectilinear_cut_weight"),
2654 this->max_num_cut_along_dim);
2655 this->global_rectilinear_cut_weight =
2656 Kokkos::View<mj_scalar_t *, device_t>(
2657 Kokkos::ViewAllocateWithoutInitializing(
"global_rectilinear_cut_weight"),
2658 this->max_num_cut_along_dim);
2665 this->cut_coordinates_work_array =
2666 Kokkos::View<mj_scalar_t *, device_t>(
2667 Kokkos::ViewAllocateWithoutInitializing(
"cut_coordinates_work_array"),
2668 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2671 this->target_part_weights = Kokkos::View<mj_scalar_t*, device_t>(
2672 Kokkos::ViewAllocateWithoutInitializing(
"target_part_weights"),
2673 this->max_num_part_along_dim * this->max_concurrent_part_calculation);
2676 this->cut_upper_bound_coordinates =
2677 Kokkos::View<mj_scalar_t*, device_t>(
2678 Kokkos::ViewAllocateWithoutInitializing(
"cut_upper_bound_coordinates"),
2679 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2682 this->cut_lower_bound_coordinates =
2683 Kokkos::View<mj_scalar_t*, device_t>(
2684 Kokkos::ViewAllocateWithoutInitializing(
"cut_lower_bound_coordinates"),
2685 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2688 this->cut_lower_bound_weights =
2689 Kokkos::View<mj_scalar_t*, device_t>(
2690 Kokkos::ViewAllocateWithoutInitializing(
"cut_lower_bound_weights"),
2691 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2694 this->cut_upper_bound_weights =
2695 Kokkos::View<mj_scalar_t*, device_t>(
2696 Kokkos::ViewAllocateWithoutInitializing(
"cut_upper_bound_weights"),
2697 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2701 this->process_local_min_max_coord_total_weight =
2702 Kokkos::View<mj_scalar_t*, device_t>(
2703 Kokkos::ViewAllocateWithoutInitializing(
2704 "process_local_min_max_coord_total_weight"),
2705 3 * this->max_concurrent_part_calculation);
2708 this->global_min_max_coord_total_weight =
2709 Kokkos::View<mj_scalar_t*, device_t>(
2710 Kokkos::ViewAllocateWithoutInitializing(
"global_min_max_coord_total_weight"),
2711 3 * this->max_concurrent_part_calculation);
2716 this->is_cut_line_determined = Kokkos::View<bool *, device_t>(
2717 Kokkos::ViewAllocateWithoutInitializing(
"is_cut_line_determined"),
2718 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2724 this->device_incomplete_cut_count = Kokkos::View<mj_part_t *, device_t>(
2725 Kokkos::ViewAllocateWithoutInitializing(
"device_incomplete_cut_count"),
2726 this->max_concurrent_part_calculation);
2727 this->incomplete_cut_count =
2728 Kokkos::create_mirror_view(device_incomplete_cut_count);
2731 this->thread_part_weights = Kokkos::View<double *, device_t>(
2732 Kokkos::ViewAllocateWithoutInitializing(
"thread_part_weights"),
2733 this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
2735 this->thread_cut_left_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2736 Kokkos::ViewAllocateWithoutInitializing(
"thread_cut_left_closest_point"),
2737 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2741 this->thread_cut_right_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2742 Kokkos::ViewAllocateWithoutInitializing(
"thread_cut_right_closest_point"),
2743 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2746 this->thread_point_counts = Kokkos::View<mj_lno_t *, device_t>(
2747 Kokkos::ViewAllocateWithoutInitializing(
"thread_point_counts"),
2748 this->max_num_part_along_dim);
2754 this->total_part_weight_left_right_closests =
2755 Kokkos::View<mj_scalar_t*, device_t>(
2756 Kokkos::ViewAllocateWithoutInitializing(
2757 "total_part_weight_left_right_closests"),
2758 (this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) *
2759 this->max_concurrent_part_calculation);
2761 this->global_total_part_weight_left_right_closests =
2762 Kokkos::View<mj_scalar_t*, device_t>(
2763 Kokkos::ViewAllocateWithoutInitializing(
2764 "global_total_part_weight_left_right_closests"),
2765 (this->max_num_total_part_along_dim +
2766 this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2768 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
2769 Kokkos::ViewAllocateWithoutInitializing(
"gids"), num_local_coords);
2771 this->owner_of_coordinate = Kokkos::View<int *, Kokkos::HostSpace>(
2772 Kokkos::ViewAllocateWithoutInitializing(
"owner_of_coordinate"),
2778 Kokkos::deep_copy(owner_of_coordinate, myActualRank);
2780 auto local_current_mj_gnos = current_mj_gnos;
2781 auto local_initial_mj_gnos = initial_mj_gnos;
2782 Kokkos::parallel_for(
2783 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2784 (0, num_local_coords), KOKKOS_LAMBDA (mj_lno_t j) {
2785 local_current_mj_gnos(j) = local_initial_mj_gnos(j);
2791template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2792 typename mj_part_t,
typename mj_node_t>
2793void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,
2794 mj_node_t>::compute_global_box()
2797 mj_scalar_t *mins =
new mj_scalar_t[this->coord_dim];
2799 mj_scalar_t *gmins =
new mj_scalar_t[this->coord_dim];
2801 mj_scalar_t *maxs =
new mj_scalar_t[this->coord_dim];
2803 mj_scalar_t *gmaxs =
new mj_scalar_t[this->coord_dim];
2805 auto local_mj_coordinates = this->mj_coordinates;
2809 for(
int i = 0; i < this->coord_dim; ++i) {
2814 for(
int i = 0; i < std::min(this->recursion_depth, this->coord_dim); ++i) {
2815 Kokkos::parallel_reduce(
"MinReduce",
2816 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2817 (0, this->num_local_coords),
2818 KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_min) {
2819 if(local_mj_coordinates(j,i) < running_min) {
2820 running_min = local_mj_coordinates(j,i);
2822 }, Kokkos::Min<mj_scalar_t>(mins[i]));
2823 Kokkos::parallel_reduce(
"MaxReduce",
2824 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2825 (0, this->num_local_coords),
2826 KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_max) {
2827 if(local_mj_coordinates(j,i) > running_max) {
2828 running_max = local_mj_coordinates(j,i);
2830 }, Kokkos::Max<mj_scalar_t>(maxs[i]));
2833 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2834 this->coord_dim, mins, gmins
2837 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2838 this->coord_dim, maxs, gmaxs
2842 global_box = rcp(
new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2856template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2857 typename mj_part_t,
typename mj_node_t>
2858void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2859 mj_node_t>::init_part_boxes(
2860 RCP<mj_partBoxVector_t> & initial_partitioning_boxes)
2862 mj_partBox_t tmp_box(*global_box);
2863 initial_partitioning_boxes->push_back(tmp_box);
2870template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2873void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2874 mj_get_local_min_max_coord_totW(
2875 mj_part_t current_work_part,
2876 mj_part_t current_concurrent_num_parts,
2877 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords)
2879 auto local_coordinate_permutations = this->coordinate_permutations;
2880 auto local_process_local_min_max_coord_total_weight =
2881 this->process_local_min_max_coord_total_weight;
2882 auto local_mj_weights = this->mj_weights;
2884 bool bUniformWeights = mj_uniform_weights(0);
2886 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
2888 mj_part_t concurrent_current_part = current_work_part + kk;
2889 mj_lno_t coordinate_begin_index = concurrent_current_part == 0 ? 0 :
2890 host_part_xadj(concurrent_current_part - 1);
2891 mj_lno_t coordinate_end_index =
2892 host_part_xadj(concurrent_current_part);
2894 mj_scalar_t my_min_coord = 0;
2895 mj_scalar_t my_max_coord = 0;
2896 mj_scalar_t my_total_weight;
2899 if(coordinate_begin_index >= coordinate_end_index)
2901 my_min_coord = std::numeric_limits<mj_scalar_t>::max();
2902 my_max_coord = -std::numeric_limits<mj_scalar_t>::max();
2903 my_total_weight = 0;
2907 Kokkos::parallel_reduce(
"get min",
2908 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2909 (coordinate_begin_index, coordinate_end_index),
2910 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_min) {
2911 int i = local_coordinate_permutations(j);
2912 if(mj_current_dim_coords(i) < running_min)
2913 running_min = mj_current_dim_coords(i);
2914 }, Kokkos::Min<mj_scalar_t>(my_min_coord));
2916 Kokkos::parallel_reduce(
"get max",
2917 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2918 (coordinate_begin_index, coordinate_end_index),
2919 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_max) {
2920 int i = local_coordinate_permutations(j);
2921 if(mj_current_dim_coords(i) > running_max)
2922 running_max = mj_current_dim_coords(i);
2923 }, Kokkos::Max<mj_scalar_t>(my_max_coord));
2924 if(bUniformWeights) {
2925 my_total_weight = coordinate_end_index - coordinate_begin_index;
2928 my_total_weight = 0;
2929 Kokkos::parallel_reduce(
"get weight",
2930 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2931 (coordinate_begin_index, coordinate_end_index),
2932 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & lsum) {
2933 int i = local_coordinate_permutations(j);
2934 lsum += local_mj_weights(i,0);
2935 }, my_total_weight);
2940 Kokkos::parallel_for(
2941 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
2942 (0, 1), KOKKOS_LAMBDA (
int dummy) {
2943 local_process_local_min_max_coord_total_weight(kk) =
2945 local_process_local_min_max_coord_total_weight(
2946 kk + current_concurrent_num_parts) = my_max_coord;
2947 local_process_local_min_max_coord_total_weight(
2948 kk + 2*current_concurrent_num_parts) = my_total_weight;
2965template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2966 typename mj_part_t,
typename mj_node_t>
2967void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2968 mj_node_t>::mj_get_global_min_max_coord_totW(
2969 mj_part_t current_concurrent_num_parts,
2970 Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
2971 Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total) {
2975 if(this->comm->getSize() > 1) {
2978 auto host_local_min_max_total =
2979 Kokkos::create_mirror_view(Kokkos::HostSpace(), local_min_max_total);
2980 auto host_global_min_max_total =
2981 Kokkos::create_mirror_view(Kokkos::HostSpace(), global_min_max_total);
2982 Kokkos::deep_copy(host_local_min_max_total, local_min_max_total);
2984 reductionOp(current_concurrent_num_parts,
2985 current_concurrent_num_parts, current_concurrent_num_parts);
2987 reduceAll<int, mj_scalar_t>(
2990 3 * current_concurrent_num_parts,
2991 host_local_min_max_total.data(),
2992 host_global_min_max_total.data());
2995 Kokkos::deep_copy(global_min_max_total, host_global_min_max_total);
2998 mj_part_t s = 3 * current_concurrent_num_parts;
2999 Kokkos::parallel_for(
3000 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3001 (0, s), KOKKOS_LAMBDA (mj_part_t i) {
3002 global_min_max_total(i) = local_min_max_total(i);
3039template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3040 typename mj_part_t,
typename mj_node_t>
3041void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
3042 mj_get_initial_cut_coords_target_weights(
3043 mj_scalar_t min_coord,
3044 mj_scalar_t max_coord,
3045 mj_part_t num_cuts ,
3046 mj_scalar_t global_weight,
3048 Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
3050 Kokkos::View<mj_scalar_t *, device_t> & current_target_part_weights ,
3051 std::vector <mj_part_t> *future_num_part_in_parts,
3052 std::vector <mj_part_t> *next_future_num_parts_in_parts,
3053 mj_part_t concurrent_current_part,
3054 mj_part_t obtained_part_index,
3055 mj_part_t num_target_first_level_parts,
3056 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist)
3058 mj_scalar_t coord_range = max_coord - min_coord;
3065 bool bUniformPartsCheck =
3066 num_target_first_level_parts <= 1 && this->mj_uniform_parts(0);
3068 if(!bUniformPartsCheck) {
3069 bool bValidNonUniformTargetWeights =
3070 (num_target_first_level_parts > 1 && target_first_level_dist.size() != 0);
3071 if(!bValidNonUniformTargetWeights) {
3072 std::cerr <<
"MJ does not support non uniform part weights beyond the first partition" << std::endl;
3077 Kokkos::View<mj_scalar_t*, device_t> device_cumulative(
3078 "device_cumulative", num_cuts);
3079 auto host_cumulative = Kokkos::create_mirror_view(device_cumulative);
3081 mj_scalar_t cumulative = 0;
3083 if(bUniformPartsCheck) {
3085 mj_scalar_t total_future_part_count_in_part =
3086 static_cast<mj_scalar_t
>((*future_num_part_in_parts)[concurrent_current_part]);
3089 mj_scalar_t unit_part_weight =
3090 global_weight / total_future_part_count_in_part;
3092 for(mj_part_t i = 0; i < num_cuts; ++i) {
3093 cumulative += unit_part_weight *
static_cast<mj_scalar_t
>((*next_future_num_parts_in_parts)[i + obtained_part_index]);
3094 host_cumulative(i) = cumulative;
3099 mj_scalar_t sum_target_first_level_dist = 0.0;
3100 for (
int i = 0; i < num_target_first_level_parts; ++i) {
3101 sum_target_first_level_dist += target_first_level_dist(i);
3104 for(mj_part_t i = 0; i < num_cuts; ++i) {
3105 cumulative += global_weight * target_first_level_dist(i) /
3106 sum_target_first_level_dist;
3107 host_cumulative(i) = cumulative;
3111 Kokkos::deep_copy(device_cumulative, host_cumulative);
3113 Kokkos::parallel_for(
"Write num in parts",
3114 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3115 (0, num_cuts), KOKKOS_LAMBDA(mj_part_t cut) {
3117 current_target_part_weights(cut) = device_cumulative(cut);
3118 initial_cut_coords(cut) = min_coord +
3119 (coord_range * device_cumulative(cut)) / global_weight;
3121 current_target_part_weights(num_cuts) = global_weight;
3127 if (!bUniformPartsCheck || this->mj_uniform_weights[0]) {
3128 Kokkos::parallel_for(
3129 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3131 KOKKOS_LAMBDA (mj_part_t i) {
3132 current_target_part_weights(i) =
3133 long(current_target_part_weights(i) + 0.5);
3154template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3155 typename mj_part_t,
typename mj_node_t>
3156void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
3157 set_initial_coordinate_parts(
3158 mj_scalar_t &max_coordinate,
3159 mj_scalar_t &min_coordinate,
3160 mj_lno_t coordinate_begin_index,
3161 mj_lno_t coordinate_end_index,
3162 Kokkos::View<mj_lno_t *, device_t> & mj_current_coordinate_permutations,
3163 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3164 Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
3165 mj_part_t &partition_count)
3167 mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
3171 if(std::abs(coordinate_range) < this->sEpsilon ) {
3172 Kokkos::parallel_for(
3173 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3174 (coordinate_begin_index, coordinate_end_index),
3175 KOKKOS_LAMBDA (mj_lno_t ii) {
3176 mj_part_ids(mj_current_coordinate_permutations[ii]) = 0;
3182 mj_scalar_t slice = coordinate_range / partition_count;
3183 Kokkos::parallel_for(
3184 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3185 (coordinate_begin_index, coordinate_end_index),
3186 KOKKOS_LAMBDA (mj_lno_t ii) {
3187 mj_lno_t iii = mj_current_coordinate_permutations[ii];
3189 mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
3190 if(pp >= partition_count) {
3191 pp = partition_count - 1;
3193 mj_part_ids[iii] = 2 * pp;
3212template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3213 typename mj_part_t,
typename mj_node_t>
3214void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,mj_node_t>::mj_1D_part(
3215 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3216 double used_imbalance_tolerance,
3217 mj_part_t current_work_part,
3218 mj_part_t current_concurrent_num_parts,
3219 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
3220 mj_part_t total_incomplete_cut_count,
3221 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
3222 Kokkos::View<size_t*, device_t> & view_total_reduction_size)
3224 this->temp_cut_coords = current_cut_coordinates;
3227 *reductionOp = NULL;
3229 bool bSingleProcess = (this->comm->getSize() == 1);
3231 std::vector<mj_part_t> temp(host_num_partitioning_in_current_dim.size());
3232 if(!bSingleProcess) {
3233 for(
size_t n = 0; n < host_num_partitioning_in_current_dim.size(); ++n) {
3234 temp[n] = host_num_partitioning_in_current_dim(n);
3237 <mj_part_t, mj_scalar_t>(
3240 current_concurrent_num_parts);
3243 auto local_cut_lower_bound_coordinates =
3244 cut_lower_bound_coordinates;
3245 auto local_cut_upper_bound_coordinates =
3246 cut_upper_bound_coordinates;
3247 auto local_cut_upper_bound_weights = cut_upper_bound_weights;
3248 auto local_cut_lower_bound_weights = cut_lower_bound_weights;
3249 bool local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
3250 auto local_process_cut_line_weight_to_put_left =
3251 process_cut_line_weight_to_put_left;
3252 auto local_temp_cut_coords = temp_cut_coords;
3253 auto local_global_total_part_weight_left_right_closests =
3254 global_total_part_weight_left_right_closests;
3255 auto local_cut_coordinates_work_array =
3256 cut_coordinates_work_array;
3257 auto local_part_xadj = part_xadj;
3258 auto local_global_min_max_coord_total_weight =
3259 global_min_max_coord_total_weight;
3260 auto local_target_part_weights =
3261 target_part_weights;
3262 auto local_global_rectilinear_cut_weight =
3263 global_rectilinear_cut_weight;
3264 auto local_process_rectilinear_cut_weight =
3265 process_rectilinear_cut_weight;
3267 auto local_is_cut_line_determined = this->is_cut_line_determined;
3268 auto local_device_num_partitioning_in_current_dim =
3269 device_num_partitioning_in_current_dim;
3271 Kokkos::parallel_for(
3272 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3273 KOKKOS_LAMBDA (
int dummy) {
3276 view_rectilinear_cut_count(0) = 0;
3277 view_total_reduction_size(0) = 0;
3281 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3282 mj_part_t num_part_in_dim =
3283 local_device_num_partitioning_in_current_dim(current_work_part + i);
3284 mj_part_t num_cut_in_dim = num_part_in_dim - 1;
3285 view_total_reduction_size(0) += (4 * num_cut_in_dim + 1);
3287 for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii) {
3288 local_is_cut_line_determined(next) =
false;
3290 local_cut_lower_bound_coordinates(next) =
3291 local_global_min_max_coord_total_weight(i);
3293 local_cut_upper_bound_coordinates(next) =
3294 local_global_min_max_coord_total_weight(
3295 i + current_concurrent_num_parts);
3297 local_cut_upper_bound_weights(next) =
3298 local_global_min_max_coord_total_weight(
3299 i + 2 * current_concurrent_num_parts);
3300 local_cut_lower_bound_weights(next) = 0;
3301 if(local_distribute_points_on_cut_lines) {
3302 local_process_cut_line_weight_to_put_left(next) = 0;
3313 while (total_incomplete_cut_count != 0) {
3314 this->mj_1D_part_get_part_weights(
3315 current_concurrent_num_parts,
3317 mj_current_dim_coords,
3321 this->mj_combine_rightleft_and_weights(
3323 current_concurrent_num_parts);
3326 if(!bSingleProcess) {
3329 auto host_total_part_weight_left_right_closests =
3330 Kokkos::create_mirror_view(Kokkos::HostSpace(),
3331 total_part_weight_left_right_closests);
3332 auto host_global_total_part_weight_left_right_closests =
3333 Kokkos::create_mirror_view(Kokkos::HostSpace(),
3334 global_total_part_weight_left_right_closests);
3336 Kokkos::deep_copy(host_total_part_weight_left_right_closests,
3337 total_part_weight_left_right_closests);
3339 size_t host_view_total_reduction_size;
3340 Kokkos::parallel_reduce(
"Read single",
3341 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3342 KOKKOS_LAMBDA(
int dummy,
size_t & set_single) {
3343 set_single = view_total_reduction_size(0);
3344 }, host_view_total_reduction_size);
3346 reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
3347 host_view_total_reduction_size,
3348 host_total_part_weight_left_right_closests.data(),
3349 host_global_total_part_weight_left_right_closests.data());
3350 Kokkos::deep_copy(global_total_part_weight_left_right_closests,
3351 host_global_total_part_weight_left_right_closests);
3354 local_global_total_part_weight_left_right_closests =
3355 this->total_part_weight_left_right_closests;
3360 mj_part_t cut_shift = 0;
3364 size_t tlr_shift = 0;
3366 Kokkos::View<mj_part_t*, Kokkos::HostSpace>
3367 save_initial_incomplete_cut_count(
"save_initial_incomplete_cut_count",
3368 current_concurrent_num_parts);
3370 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3372 mj_part_t num_parts =
3373 host_num_partitioning_in_current_dim(current_work_part + kk);
3375 mj_part_t num_cuts = num_parts - 1;
3376 size_t num_total_part = num_parts + size_t (num_cuts);
3381 mj_part_t kk_incomplete_cut_count = this->incomplete_cut_count(kk);
3383 if(kk_incomplete_cut_count == 0) {
3384 cut_shift += num_cuts;
3385 tlr_shift += (num_total_part + 2 * num_cuts);
3389 Kokkos::View<mj_scalar_t *, device_t> current_local_part_weights =
3390 Kokkos::subview(this->total_part_weight_left_right_closests,
3391 std::pair<mj_lno_t, mj_lno_t>(
3393 this->total_part_weight_left_right_closests.size()));
3395 Kokkos::View<mj_scalar_t *, device_t> current_global_tlr =
3397 local_global_total_part_weight_left_right_closests,
3398 std::pair<mj_lno_t, mj_lno_t>(
3400 local_global_total_part_weight_left_right_closests.size()));
3401 Kokkos::View<mj_scalar_t *, device_t>
3402 current_global_left_closest_points =
3403 Kokkos::subview(current_global_tlr,
3404 std::pair<mj_lno_t, mj_lno_t>(
3406 current_global_tlr.size()));
3407 Kokkos::View<mj_scalar_t *, device_t>
3408 current_global_right_closest_points =
3409 Kokkos::subview(current_global_tlr,
3410 std::pair<mj_lno_t, mj_lno_t>(
3411 num_total_part + num_cuts,
3412 current_global_tlr.size()));
3413 Kokkos::View<mj_scalar_t *, device_t> current_global_part_weights =
3416 Kokkos::View<bool *, device_t> current_cut_line_determined =
3417 Kokkos::subview(this->is_cut_line_determined,
3418 std::pair<mj_lno_t, mj_lno_t>(
3420 this->is_cut_line_determined.size()));
3421 Kokkos::View<mj_scalar_t *, device_t> current_part_target_weights =
3422 Kokkos::subview(local_target_part_weights,
3423 std::pair<mj_lno_t, mj_lno_t>(
3425 local_target_part_weights.size()));
3426 Kokkos::View<mj_scalar_t *, device_t>
3427 current_part_cut_line_weight_to_put_left =
3428 Kokkos::subview(local_process_cut_line_weight_to_put_left,
3429 std::pair<mj_lno_t, mj_lno_t>(
3431 local_process_cut_line_weight_to_put_left.size()));
3433 save_initial_incomplete_cut_count(kk) =
3434 kk_incomplete_cut_count;
3436 Kokkos::View<mj_scalar_t *, device_t>
3437 current_cut_lower_bound_weights =
3438 Kokkos::subview(local_cut_lower_bound_weights,
3439 std::pair<mj_lno_t, mj_lno_t>(
3441 local_cut_lower_bound_weights.size()));
3442 Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_weights =
3443 Kokkos::subview(local_cut_upper_bound_weights,
3444 std::pair<mj_lno_t, mj_lno_t>(
3446 local_cut_upper_bound_weights.size()));
3447 Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_bounds =
3448 Kokkos::subview(local_cut_upper_bound_coordinates,
3449 std::pair<mj_lno_t, mj_lno_t>(
3451 local_cut_upper_bound_coordinates.size()));
3452 Kokkos::View<mj_scalar_t *, device_t> current_cut_lower_bounds =
3453 Kokkos::subview(local_cut_lower_bound_coordinates,
3454 std::pair<mj_lno_t, mj_lno_t>(
3456 local_cut_lower_bound_coordinates.size()));
3459 Kokkos::View<mj_scalar_t*, device_t> sub_temp_cut_coords =
3460 Kokkos::subview(this->temp_cut_coords,
3461 std::pair<mj_lno_t, mj_lno_t>(
3462 cut_shift, this->temp_cut_coords.size()));
3463 Kokkos::View<mj_scalar_t*, device_t> sub_cut_coordinates_work_array =
3464 Kokkos::subview(this->cut_coordinates_work_array,
3465 std::pair<mj_lno_t, mj_lno_t>(
3466 cut_shift, this->cut_coordinates_work_array.size()));
3468 this->mj_get_new_cut_coordinates(
3469 current_concurrent_num_parts,
3472 used_imbalance_tolerance,
3473 current_global_part_weights,
3474 current_local_part_weights,
3475 current_part_target_weights,
3476 current_cut_line_determined,
3477 sub_temp_cut_coords,
3478 current_cut_upper_bounds,
3479 current_cut_lower_bounds,
3480 current_global_left_closest_points,
3481 current_global_right_closest_points,
3482 current_cut_lower_bound_weights,
3483 current_cut_upper_weights,
3484 sub_cut_coordinates_work_array,
3485 current_part_cut_line_weight_to_put_left,
3486 view_rectilinear_cut_count);
3488 cut_shift += num_cuts;
3489 tlr_shift += (num_total_part + 2 * num_cuts);
3492 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3493 mj_part_t iteration_complete_cut_count =
3494 save_initial_incomplete_cut_count(kk) - this->incomplete_cut_count(kk);
3495 total_incomplete_cut_count -= iteration_complete_cut_count;
3498 Kokkos::parallel_for(
3499 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3500 (0, local_temp_cut_coords.size()), KOKKOS_LAMBDA(
int n) {
3501 auto t = local_temp_cut_coords(n);
3502 local_temp_cut_coords(n) = local_cut_coordinates_work_array(n);
3503 local_cut_coordinates_work_array(n) = t;
3511 if(current_cut_coordinates != local_temp_cut_coords) {
3512 Kokkos::parallel_for(
3513 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3514 (0, 1), KOKKOS_LAMBDA(
int dummy) {
3516 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3517 mj_part_t num_parts = -1;
3518 num_parts = local_device_num_partitioning_in_current_dim(
3519 current_work_part + i);
3520 mj_part_t num_cuts = num_parts - 1;
3521 for(mj_part_t ii = 0; ii < num_cuts; ++ii) {
3522 current_cut_coordinates(next + ii) = local_temp_cut_coords(next + ii);
3527 static_cast<int>(local_cut_coordinates_work_array.size()); ++n) {
3528 local_cut_coordinates_work_array(n) = local_temp_cut_coords(n);
3536template<
class scalar_t>
3542 KOKKOS_INLINE_FUNCTION
3545 KOKKOS_INLINE_FUNCTION
3548 Zoltan2_MJArrayType<scalar_t>&
operator=(
const volatile Zoltan2_MJArrayType<scalar_t>& zmj) {
3554#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3556template<
class policy_t,
class scalar_t,
class part_t>
3567 scalar_t mj_max_scalar,
3569 int mj_value_count_rightleft,
3570 int mj_value_count_weights) :
3577 KOKKOS_INLINE_FUNCTION
3582 KOKKOS_INLINE_FUNCTION
3585 dst.ptr[n] += src.ptr[n];
3590 if(src.ptr[n] > dst.ptr[n]) {
3591 dst.ptr[n] = src.ptr[n];
3593 if(src.ptr[n+1] < dst.ptr[n+1]) {
3594 dst.ptr[n+1] = src.ptr[n+1];
3599 KOKKOS_INLINE_FUNCTION
3602 dst.ptr[n] += src.ptr[n];
3607 if(src.ptr[n] > dst.ptr[n]) {
3608 dst.ptr[n] = src.ptr[n];
3610 if(src.ptr[n+1] < dst.ptr[n+1]) {
3611 dst.ptr[n+1] = src.ptr[n+1];
3617 dst.ptr =
value->ptr;
3632template<
class policy_t,
class scalar_t,
class part_t,
class index_t,
3633 class device_t,
class array_t>
3638#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3661#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3662 Kokkos::View<double *, device_t> current_part_weights;
3663 Kokkos::View<scalar_t *, device_t> current_left_closest;
3664 Kokkos::View<scalar_t *, device_t> current_right_closest;
3669 array_t mj_max_scalar,
3670 part_t mj_concurrent_current_part,
3672 part_t mj_current_work_part,
3673 part_t mj_current_concurrent_num_parts,
3674 part_t mj_left_right_array_size,
3675 part_t mj_weight_array_size,
3676 Kokkos::View<index_t*, device_t> & mj_permutations,
3677 Kokkos::View<scalar_t *, device_t> & mj_coordinates,
3678 Kokkos::View<scalar_t**, device_t> & mj_weights,
3679 Kokkos::View<part_t*, device_t> & mj_parts,
3680 Kokkos::View<scalar_t *, device_t> & mj_cut_coordinates,
3681 Kokkos::View<index_t *, device_t> & mj_part_xadj,
3682 bool mj_uniform_weights0,
3683 scalar_t mj_sEpsilon
3684#
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3685 ,Kokkos::View<double *, device_t> & mj_current_part_weights,
3686 Kokkos::View<scalar_t *, device_t> & mj_current_left_closest,
3687 Kokkos::View<scalar_t *, device_t> & mj_current_right_closest
3698 value_count(mj_weight_array_size+mj_left_right_array_size),
3707#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3708 ,current_part_weights(mj_current_part_weights),
3709 current_left_closest(mj_current_left_closest),
3710 current_right_closest(mj_current_right_closest)
3716#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3717 int result =
sizeof(array_t) *
3720 int result =
sizeof(array_t) *
3725 int remainder = result % 8;
3726 if(remainder != 0) {
3727 result += 8 - remainder;
3732 KOKKOS_INLINE_FUNCTION
3733#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3743 index_t num_working_points = all_end - all_begin;
3744 int num_teams = teamMember.league_size();
3746 index_t stride = num_working_points / num_teams;
3747 if((num_working_points % num_teams) > 0) {
3754 index_t begin = all_begin + stride * teamMember.league_rank();
3755 index_t end = begin + stride;
3760#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3764 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3768 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3778 teamMember.team_barrier();
3780 Kokkos::parallel_for(
3781 Kokkos::TeamThreadRange(teamMember, begin, end),
3788 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3792 Zoltan2_MJArrayType<array_t> array(&shared_ptr[teamMember.team_rank() *
3796 ArrayCombinationReducer<policy_t, array_t, part_t> arraySumReducer(
3801 Kokkos::parallel_reduce(
3802 Kokkos::TeamThreadRange(teamMember, begin, end),
3803 [=] (
size_t ii, Zoltan2_MJArrayType<array_t>& threadSum) {
3811 index_t part =
parts(i)/2;
3822#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3823 Kokkos::atomic_add(&shared_ptr[part*2], w);
3825 threadSum.ptr[part*2] += w;
3831#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3832 array_t new_value = (array_t) coord;
3834 while(new_value < prev_value) {
3835 prev_value = Kokkos::atomic_compare_exchange(
3837 prev_value, new_value);
3840 while(new_value > prev_value) {
3841 prev_value = Kokkos::atomic_compare_exchange(
3843 prev_value, new_value);
3861 if(coord < b + sEpsilon && coord > b -
sEpsilon) {
3865#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3866 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3870 threadSum.ptr[part*2+1] += w;
3875 parts(i) = part*2+1;
3886 scalar_t delta = b - base_coord;
3887 if(delta < 0) delta = -delta;
3892#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3893 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3897 threadSum.ptr[part*2+1] += w;
3908 scalar_t delta = b - base_coord;
3909 if(delta < 0) delta = -delta;
3914#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3915 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3919 threadSum.ptr[part*2+1] += w;
3944 if(part == lower + 1) {
3949 part -= (part - lower)/2;
3952 else if(part == upper - 1) {
3957 part += (upper - part)/2;
3961#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3964 }, arraySumReducer);
3967 teamMember.team_barrier();
3970 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3972#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3973 Kokkos::atomic_add(¤t_part_weights(n),
3974 static_cast<double>(shared_ptr[n]));
3976 teamSum[n] += array.ptr[n];
3980#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3981 int insert_left = 0;
3982 int insert_right = 0;
3987#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3988 scalar_t new_value = shared_ptr[n+1];
3989 scalar_t prev_value = current_right_closest(insert_right);
3990 while(new_value < prev_value) {
3991 prev_value = Kokkos::atomic_compare_exchange(
3992 ¤t_right_closest(insert_right), prev_value, new_value);
3995 new_value = shared_ptr[n];
3996 prev_value = current_left_closest(insert_left);
3997 while(new_value > prev_value) {
3998 prev_value = Kokkos::atomic_compare_exchange(
3999 ¤t_left_closest(insert_left), prev_value, new_value);
4005 if(array.ptr[n] > teamSum[n]) {
4006 teamSum[n] = array.ptr[n];
4008 if(array.ptr[n+1] < teamSum[n+1]) {
4009 teamSum[n+1] = array.ptr[n+1];
4015 teamMember.team_barrier();
4018#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4019 KOKKOS_INLINE_FUNCTION
4027 if(src[n] > dst[n]) {
4030 if(src[n+1] < dst[n+1]) {
4031 dst[n+1] = src[n+1];
4057template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4058 typename mj_part_t,
typename mj_node_t>
4059void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t,mj_part_t, mj_node_t>::
4060 mj_1D_part_get_part_weights(
4063 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
4066 auto local_is_cut_line_determined = is_cut_line_determined;
4067 auto local_thread_part_weights = thread_part_weights;
4068 auto local_thread_cut_left_closest_point = thread_cut_left_closest_point;
4069 auto local_thread_cut_right_closest_point = thread_cut_right_closest_point;
4073 auto local_sEpsilon = this->
sEpsilon;
4074 auto local_assigned_part_ids = this->assigned_part_ids;
4075 auto local_coordinate_permutations = this->coordinate_permutations;
4076 auto local_mj_weights = this->mj_weights;
4078 auto local_global_min_max_coord_total_weight =
4079 this->global_min_max_coord_total_weight;
4081 typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;
4083 auto local_device_num_partitioning_in_current_dim =
4084 device_num_partitioning_in_current_dim;
4086 Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
4087 auto local_device_incomplete_cut_count = device_incomplete_cut_count;
4089 mj_part_t total_part_shift = 0;
4091 mj_part_t concurrent_cut_shifts = 0;
4093 Kokkos::View<mj_scalar_t *, device_t> local_temp_cut_coords =
4094 Kokkos::subview(temp_cut_coords, std::pair<mj_lno_t, mj_lno_t>(
4095 concurrent_cut_shifts, temp_cut_coords.size()));
4097 mj_part_t num_parts =
4099 mj_part_t
num_cuts = num_parts - 1;
4100 mj_part_t total_part_count = num_parts +
num_cuts;
4101 mj_part_t weight_array_length =
num_cuts + num_parts;
4104 mj_part_t right_left_array_length = (
num_cuts + 2) * 2;
4106 if(this->incomplete_cut_count(kk) == 0) {
4107 total_part_shift += total_part_count;
4113 auto policy_ReduceWeightsFunctor = policy_t(
4114 mj_num_teams ? mj_num_teams : 60, Kokkos::AUTO);
4116#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4117 int total_array_length =
4118 weight_array_length + right_left_array_length;
4125 typedef mj_scalar_t array_t;
4127#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4128 Kokkos::View<array_t*, Kokkos::HostSpace> reduce_array(
"reduce_array", total_array_length);
4131 int offset_cuts = 0;
4132 for(
int kk2 = 0; kk2 < kk; ++kk2) {
4136 Kokkos::View<double *, device_t> my_current_part_weights =
4137 Kokkos::subview(local_thread_part_weights,
4138 std::pair<mj_lno_t, mj_lno_t>(total_part_shift,
4139 total_part_shift + total_part_count));
4140 Kokkos::View<mj_scalar_t *, device_t> my_current_left_closest =
4141 Kokkos::subview(local_thread_cut_left_closest_point,
4142 std::pair<mj_lno_t, mj_lno_t>(
4144 local_thread_cut_left_closest_point.size()));
4145 Kokkos::View<mj_scalar_t *, device_t> my_current_right_closest =
4146 Kokkos::subview(local_thread_cut_right_closest_point,
4147 std::pair<mj_lno_t, mj_lno_t>(
4149 local_thread_cut_right_closest_point.size()));
4151 array_t
max_scalar = std::numeric_limits<array_t>::max();
4153#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4155 Kokkos::parallel_for(
4156 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4157 KOKKOS_LAMBDA (
int dummy) {
4158 for(
int n = 0; n < weight_array_length; ++n) {
4159 my_current_part_weights(n) = 0;
4161 for(
int n = 0; n <
num_cuts; ++n) {
4172 typename mj_node_t::device_type, array_t>
4180 right_left_array_length,
4181 weight_array_length,
4182 coordinate_permutations,
4183 mj_current_dim_coords,
4186 local_temp_cut_coords,
4188 mj_uniform_weights(0),
4190#
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4191 ,my_current_part_weights,
4192 my_current_left_closest,
4193 my_current_right_closest
4197#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4198 Kokkos::parallel_for(policy_ReduceWeightsFunctor, teamFunctor);
4200 Kokkos::parallel_reduce(policy_ReduceWeightsFunctor,
4201 teamFunctor, reduce_array);
4205#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4206 auto hostArray = Kokkos::create_mirror_view(my_current_part_weights);
4208 for(
int i = 0; i < static_cast<int>(total_part_count); ++i) {
4209 hostArray(i) = reduce_array[i];
4212 Kokkos::deep_copy(my_current_part_weights, hostArray);
4214 auto hostLeftArray = Kokkos::create_mirror_view(my_current_left_closest);
4215 auto hostRightArray = Kokkos::create_mirror_view(my_current_right_closest);
4216 for(mj_part_t cut = 0; cut <
num_cuts; ++cut) {
4217 hostLeftArray(cut) = reduce_array[weight_array_length + (cut+1)*2+0];
4218 hostRightArray(cut) = reduce_array[weight_array_length + (cut+1)*2+1];
4220 Kokkos::deep_copy(my_current_left_closest, hostLeftArray);
4221 Kokkos::deep_copy(my_current_right_closest, hostRightArray);
4224 total_part_shift += total_part_count;
4228 auto local_temp_cut_coords = temp_cut_coords;
4230 Kokkos::parallel_for(
4231 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
4233 mj_part_t num_parts = local_device_num_partitioning_in_current_dim(
4235 mj_part_t
num_cuts = num_parts - 1;
4236 mj_part_t total_part_count = num_parts +
num_cuts;
4238 if(local_device_incomplete_cut_count(kk) > 0) {
4242 size_t offset_cuts = 0;
4243 for(mj_part_t kk2 = 0; kk2 < kk; ++kk2) {
4244 auto num_parts_kk2 = local_device_num_partitioning_in_current_dim(
4246 offset += num_parts_kk2 * 2 - 1;
4247 offset_cuts += num_parts_kk2 - 1;
4250 for(mj_part_t i = 1; i < total_part_count; ++i) {
4254 if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
4255 std::abs(local_temp_cut_coords(offset_cuts + i / 2) -
4256 local_temp_cut_coords(offset_cuts + i /2 - 1))
4261 local_thread_part_weights(offset + i)
4262 = local_thread_part_weights(offset + i-2);
4267 local_thread_part_weights(offset + i) +=
4268 local_thread_part_weights(offset + i-1);
4281template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4282 typename mj_part_t,
typename mj_node_t>
4283void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4284 mj_combine_rightleft_and_weights(
4288 auto local_thread_part_weights = this->thread_part_weights;
4289 auto local_is_cut_line_determined = this->is_cut_line_determined;
4290 auto local_thread_cut_left_closest_point =
4291 this->thread_cut_left_closest_point;
4292 auto local_thread_cut_right_closest_point =
4293 this->thread_cut_right_closest_point;
4294 auto local_total_part_weight_left_right_closests =
4295 this->total_part_weight_left_right_closests;
4296 auto local_device_num_partitioning_in_current_dim =
4297 device_num_partitioning_in_current_dim;
4298 Kokkos::parallel_for(
4299 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0,1),
4300 KOKKOS_LAMBDA (
int dummy) {
4302 size_t tlr_array_shift = 0;
4303 mj_part_t cut_shift = 0;
4304 size_t total_part_array_shift = 0;
4310 mj_part_t num_parts_in_part =
4312 mj_part_t num_cuts_in_part = num_parts_in_part - 1;
4313 size_t num_total_part_in_part =
4314 num_parts_in_part + size_t (num_cuts_in_part);
4317 for(
int ii = 0; ii < num_cuts_in_part; ++ii) {
4318 mj_part_t next = tlr_array_shift + ii;
4319 mj_part_t cut_index = cut_shift + ii;
4321 if(!local_is_cut_line_determined(cut_index)) {
4322 mj_scalar_t left_closest_in_process =
4323 local_thread_cut_left_closest_point(cut_index);
4324 mj_scalar_t right_closest_in_process =
4325 local_thread_cut_right_closest_point(cut_index);
4328 local_total_part_weight_left_right_closests(
4329 num_total_part_in_part + next) = left_closest_in_process;
4331 local_total_part_weight_left_right_closests(
4332 num_total_part_in_part + num_cuts_in_part + next) =
4333 right_closest_in_process;
4337 for(
size_t j = 0; j < num_total_part_in_part; ++j) {
4338 mj_part_t cut_ind = j / 2 + cut_shift;
4344 if(j == num_total_part_in_part - 1 ||
4345 !local_is_cut_line_determined(cut_ind)) {
4346 double pwj = local_thread_part_weights(total_part_array_shift + j);
4347 local_total_part_weight_left_right_closests(tlr_array_shift + j) = pwj;
4352 cut_shift += num_cuts_in_part;
4353 tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
4354 total_part_array_shift += num_total_part_in_part;
4371template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4372 typename mj_part_t,
typename mj_node_t>
4373KOKKOS_INLINE_FUNCTION
4374void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
4375 mj_node_t>::mj_calculate_new_cut_position(mj_scalar_t cut_upper_bound,
4376 mj_scalar_t cut_lower_bound,
4377 mj_scalar_t cut_upper_weight,
4378 mj_scalar_t cut_lower_weight,
4379 mj_scalar_t expected_weight,
4380 mj_scalar_t &new_cut_position,
4383 if(std::abs(cut_upper_bound - cut_lower_bound) <
sEpsilon) {
4384 new_cut_position = cut_upper_bound;
4387 if(std::abs(cut_upper_weight - cut_lower_weight) <
sEpsilon) {
4388 new_cut_position = cut_lower_bound;
4391 mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
4392 mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
4393 mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
4395 mj_scalar_t required_shift = (my_weight_diff / weight_range);
4396 int scale_constant = 20;
4397 int shiftint= int (required_shift * scale_constant);
4398 if(shiftint == 0) shiftint = 1;
4399 required_shift = mj_scalar_t (shiftint) / scale_constant;
4400 new_cut_position = coordinate_range * required_shift + cut_lower_bound;
4403#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4405template<
class policy_t,
class scalar_t>
4415 int mj_value_count) :
4417 value_count(mj_value_count)
4420 KOKKOS_INLINE_FUNCTION
4425 KOKKOS_INLINE_FUNCTION
4427 for(
int n = 0; n < value_count; ++n) {
4428 dst.ptr[n] += src.ptr[n];
4433 dst.ptr = value->ptr;
4434 for(
int n = 0; n < value_count; ++n) {
4442template<
class policy_t,
class scalar_t,
class part_t,
class index_t,
4443 class device_t,
class array_t>
4448#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4449 typedef array_t value_type[];
4460#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4461 Kokkos::View<int *, device_t> local_point_counts;
4465 part_t mj_concurrent_current_part,
4466 part_t mj_weight_array_size,
4467 Kokkos::View<index_t*, device_t> & mj_permutations,
4468 Kokkos::View<scalar_t *, device_t> & mj_coordinates,
4469 Kokkos::View<part_t*, device_t> & mj_parts,
4470 Kokkos::View<index_t *, device_t> & mj_part_xadj,
4471 Kokkos::View<index_t *, device_t> & mj_track_on_cuts
4472#
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4473 ,Kokkos::View<int *, device_t> & mj_local_point_counts
4476 concurrent_current_part(mj_concurrent_current_part),
4477 value_count(mj_weight_array_size),
4478 permutations(mj_permutations),
4481 part_xadj(mj_part_xadj),
4482 track_on_cuts(mj_track_on_cuts)
4483#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4484 ,local_point_counts(mj_local_point_counts)
4490#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4491 int result =
sizeof(array_t) * (value_count);
4493 int result =
sizeof(array_t) * (value_count) * team_size;
4497 int remainder = result % 8;
4498 if(remainder != 0) {
4499 result += 8 - remainder;
4504 KOKKOS_INLINE_FUNCTION
4505#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4506 void operator() (
const member_type & teamMember)
const {
4508 void operator() (
const member_type & teamMember, value_type teamSum)
const {
4510 index_t all_begin = (concurrent_current_part == 0) ? 0 :
4511 part_xadj(concurrent_current_part - 1);
4512 index_t all_end = part_xadj(concurrent_current_part);
4514 index_t num_working_points = all_end - all_begin;
4515 int num_teams = teamMember.league_size();
4517 index_t stride = num_working_points / num_teams;
4518 if((num_working_points % num_teams) > 0) {
4522 index_t begin = all_begin + stride * teamMember.league_rank();
4523 index_t end = begin + stride;
4528 int track_on_cuts_insert_index = track_on_cuts.size() - 1;
4531#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4532 size_t sh_mem_size =
sizeof(array_t) * (value_count);
4534 size_t sh_mem_size =
4535 sizeof(array_t) * (value_count) * teamMember.team_size();
4538 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
4541#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4543 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
4544 for(
int n = 0; n < value_count; ++n) {
4548 teamMember.team_barrier();
4550 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, begin, end),
4554 Zoltan2_MJArrayType<array_t> array(&shared_ptr[teamMember.team_rank() *
4558 ArrayReducer<policy_t, array_t> arrayReducer(array, value_count);
4560 Kokkos::parallel_reduce(
4561 Kokkos::TeamThreadRange(teamMember, begin, end),
4562 [=] (
size_t ii, Zoltan2_MJArrayType<array_t>& threadSum) {
4565 index_t coordinate_index = permutations(ii);
4566 part_t place = parts(coordinate_index);
4568 if(place % 2 == 0) {
4569#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4570 Kokkos::atomic_add(&shared_ptr[part], 1);
4572 threadSum.ptr[part] += 1;
4575 parts(coordinate_index) = part;
4580 index_t set_index = Kokkos::atomic_fetch_add(
4581 &track_on_cuts(track_on_cuts_insert_index), 1);
4582 track_on_cuts(set_index) = ii;
4584#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4590 teamMember.team_barrier();
4593 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
4594 for(
int n = 0; n < value_count; ++n) {
4595#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4596 Kokkos::atomic_add(&local_point_counts(n), shared_ptr[n]);
4598 teamSum[n] += array.ptr[n];
4603 teamMember.team_barrier();
4606#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4608 KOKKOS_INLINE_FUNCTION
4609 void join(value_type dst,
const value_type src)
const {
4610 for(
int n = 0; n < value_count; ++n) {
4615 KOKKOS_INLINE_FUNCTION
void init (value_type dst)
const {
4616 for(
int n = 0; n < value_count; ++n) {
4638template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4639 typename mj_part_t,
typename mj_node_t>
4640void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4641mj_create_new_partitions(
4642 mj_part_t num_parts,
4643 mj_part_t current_concurrent_work_part,
4644 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
4645 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
4646 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
4647 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj)
4650 auto local_thread_part_weight_work = this->thread_part_weight_work;
4651 auto local_point_counts = this->thread_point_counts;
4652 auto local_distribute_points_on_cut_lines =
4653 this->distribute_points_on_cut_lines;
4654 auto local_thread_cut_line_weight_to_put_left =
4655 this->thread_cut_line_weight_to_put_left;
4656 auto local_sEpsilon = this->sEpsilon;
4657 auto local_coordinate_permutations = this->coordinate_permutations;
4658 auto local_mj_weights = this->mj_weights;
4659 auto local_assigned_part_ids = this->assigned_part_ids;
4660 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
4662 mj_part_t num_cuts = num_parts - 1;
4664 Kokkos::parallel_for(
4665 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4666 KOKKOS_LAMBDA(
int dummy) {
4668 if(local_distribute_points_on_cut_lines) {
4669 for(
int i = 0; i < num_cuts; ++i) {
4670 mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
4671 if(left_weight > local_sEpsilon) {
4673 mj_scalar_t thread_ii_weight_on_cut =
4674 local_thread_part_weight_work(i * 2 + 1) -
4675 local_thread_part_weight_work(i * 2);
4677 if(thread_ii_weight_on_cut < left_weight) {
4679 local_thread_cut_line_weight_to_put_left(i) =
4680 thread_ii_weight_on_cut;
4684 local_thread_cut_line_weight_to_put_left(i) = left_weight;
4686 left_weight -= thread_ii_weight_on_cut;
4689 local_thread_cut_line_weight_to_put_left(i) = 0;
4695 for(mj_part_t i = num_cuts - 1; i > 0 ; --i) {
4696 if(std::abs(current_concurrent_cut_coordinate(i) -
4697 current_concurrent_cut_coordinate(i -1)) < local_sEpsilon) {
4698 local_thread_cut_line_weight_to_put_left(i) -=
4699 local_thread_cut_line_weight_to_put_left(i - 1);
4701 local_thread_cut_line_weight_to_put_left(i) =
4702 static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
4703 least_signifiance) * significance_mul) /
4704 static_cast<mj_scalar_t
>(significance_mul);
4708 for(mj_part_t i = 0; i < num_parts; ++i) {
4709 local_point_counts(i) = 0;
4713 mj_lno_t coordinate_begin_index =
4714 current_concurrent_work_part == 0 ? 0 :
4715 host_part_xadj(current_concurrent_work_part - 1);
4716 mj_lno_t coordinate_end_index =
4717 host_part_xadj(current_concurrent_work_part);
4719 mj_lno_t total_on_cut;
4720 Kokkos::parallel_reduce(
"Get total_on_cut",
4721 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (
4722 coordinate_begin_index, coordinate_end_index),
4723 KOKKOS_LAMBDA(
int ii, mj_lno_t & val) {
4724 mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4725 mj_part_t coordinate_assigned_place =
4726 local_assigned_part_ids(coordinate_index);
4727 if(coordinate_assigned_place % 2 == 1) {
4732 Kokkos::View<mj_lno_t *, device_t> track_on_cuts;
4733 if(total_on_cut > 0) {
4734 track_on_cuts = Kokkos::View<mj_lno_t *, device_t>(
4745 typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;
4748 int use_num_teams = mj_num_teams ? mj_num_teams : 60;
4750 auto policy_ReduceFunctor = policy_t(use_num_teams, Kokkos::AUTO);
4751 typedef int array_t;
4755#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4756 Kokkos::View<array_t*, Kokkos::HostSpace> reduce_array(
"reduce_array", num_parts);
4759 ReduceArrayFunctor<policy_t, mj_scalar_t, mj_part_t, mj_lno_t,
4760 typename mj_node_t::device_type, array_t>teamFunctor(
4761 current_concurrent_work_part,
4763 coordinate_permutations,
4764 mj_current_dim_coords,
4768#
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4773#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4774 Kokkos::parallel_for(policy_ReduceFunctor, teamFunctor);
4776 Kokkos::parallel_reduce(policy_ReduceFunctor, teamFunctor, reduce_array);
4780#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4781 for(mj_part_t part = 0; part < num_parts; ++part) {
4782 local_point_counts(part) = reduce_array[part];
4788 if(track_on_cuts.size() > 0) {
4789 auto track_on_cuts_sort = Kokkos::subview(track_on_cuts,
4790 std::pair<mj_lno_t, mj_lno_t>(0, track_on_cuts.size() - 1));
4791 Kokkos::sort(track_on_cuts_sort);
4794 bool uniform_weights0 = this->mj_uniform_weights(0);
4795 Kokkos::parallel_for(
4796 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4797 KOKKOS_LAMBDA (
int dummy) {
4799 for(
int j = 0; j < total_on_cut; ++j) {
4800 int ii = track_on_cuts(j);
4801 mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4802 mj_scalar_t coordinate_weight = uniform_weights0 ? 1 :
4803 local_mj_weights(coordinate_index,0);
4804 mj_part_t coordinate_assigned_place =
4805 local_assigned_part_ids(coordinate_index);
4806 mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
4808 if(local_distribute_points_on_cut_lines &&
4809 local_thread_cut_line_weight_to_put_left(
4810 coordinate_assigned_part) > local_sEpsilon) {
4814 local_thread_cut_line_weight_to_put_left(
4815 coordinate_assigned_part) -= coordinate_weight;
4820 if(local_thread_cut_line_weight_to_put_left(
4821 coordinate_assigned_part) < 0 && coordinate_assigned_part <
4823 std::abs(current_concurrent_cut_coordinate(
4824 coordinate_assigned_part+1) -
4825 current_concurrent_cut_coordinate(
4826 coordinate_assigned_part)) < local_sEpsilon)
4828 local_thread_cut_line_weight_to_put_left(
4829 coordinate_assigned_part + 1) +=
4830 local_thread_cut_line_weight_to_put_left(
4831 coordinate_assigned_part);
4833 ++local_point_counts(coordinate_assigned_part);
4834 local_assigned_part_ids(coordinate_index) =
4835 coordinate_assigned_part;
4840 ++coordinate_assigned_part;
4843 while(local_distribute_points_on_cut_lines &&
4844 coordinate_assigned_part < num_cuts)
4847 if(std::abs(current_concurrent_cut_coordinate(
4848 coordinate_assigned_part) -
4849 current_concurrent_cut_coordinate(
4850 coordinate_assigned_part - 1)) < local_sEpsilon)
4853 if(local_thread_cut_line_weight_to_put_left(
4854 coordinate_assigned_part) > local_sEpsilon &&
4855 local_thread_cut_line_weight_to_put_left(
4856 coordinate_assigned_part) >=
4857 std::abs(local_thread_cut_line_weight_to_put_left(
4858 coordinate_assigned_part) - coordinate_weight))
4860 local_thread_cut_line_weight_to_put_left(
4861 coordinate_assigned_part) -= coordinate_weight;
4865 if(local_thread_cut_line_weight_to_put_left(
4866 coordinate_assigned_part) < 0 &&
4867 coordinate_assigned_part < num_cuts - 1 &&
4868 std::abs(current_concurrent_cut_coordinate(
4869 coordinate_assigned_part+1) -
4870 current_concurrent_cut_coordinate(
4871 coordinate_assigned_part)) < local_sEpsilon)
4873 local_thread_cut_line_weight_to_put_left(
4874 coordinate_assigned_part + 1) +=
4875 local_thread_cut_line_weight_to_put_left(
4876 coordinate_assigned_part);
4884 ++coordinate_assigned_part;
4886 local_point_counts(coordinate_assigned_part) += 1;
4887 local_assigned_part_ids(coordinate_index) = coordinate_assigned_part;
4891 for(
int j = 0; j < num_parts; ++j) {
4892 out_part_xadj(j) = local_point_counts(j);
4893 local_point_counts(j) = 0;
4896 out_part_xadj(j) += out_part_xadj(j - 1);
4897 local_point_counts(j) += out_part_xadj(j - 1);
4905#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4910 Kokkos::parallel_for(
4911 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
4912 coordinate_begin_index, coordinate_end_index),
4913 KOKKOS_LAMBDA (mj_lno_t ii) {
4914 mj_lno_t i = local_coordinate_permutations(ii);
4915 mj_part_t p = local_assigned_part_ids(i);
4916 mj_lno_t idx = Kokkos::atomic_fetch_add(&local_point_counts(p), 1);
4917 local_new_coordinate_permutations(coordinate_begin_index + idx) = i;
4922#ifdef KOKKOS_ENABLE_OPENMP
4924 const int num_threads = 1;
4926 const int num_threads = 1;
4929 const int num_teams = 1;
4932 Kokkos::View<mj_lno_t*, device_t>
4933 point_counter(
"insert indices", num_teams * num_threads * num_parts);
4937 Kokkos::TeamPolicy<typename mj_node_t::execution_space>
4938 block_policy(num_teams, num_threads);
4939 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
4940 member_type member_type;
4941 mj_lno_t range = coordinate_end_index - coordinate_begin_index;
4942 mj_lno_t block_size = range / num_teams + 1;
4943 Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(member_type team_member) {
4944 int team = team_member.league_rank();
4945 int team_offset = team * num_threads * num_parts;
4946 mj_lno_t begin = coordinate_begin_index + team * block_size;
4947 mj_lno_t end = begin + block_size;
4948 if(end > coordinate_end_index) {
4949 end = coordinate_end_index;
4952 Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
4954 int thread = team_member.team_rank();
4955 mj_lno_t i = local_coordinate_permutations(ii);
4956 mj_part_t p = local_assigned_part_ids(i);
4957 int index = team_offset + thread * num_parts + p;
4958 ++point_counter(index);
4966 Kokkos::parallel_for(
4967 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4968 KOKKOS_LAMBDA (
int dummy) {
4969 int num_sets = point_counter.size() / num_parts;
4970 for(
int set = num_sets - 1; set >= 1; set -=1) {
4971 int base = set * num_parts;
4972 for(
int part = 0; part < num_parts; ++part) {
4973 point_counter(base + part) = point_counter(base + part - num_parts);
4977 for(
int part = 0; part < num_parts; ++part) {
4978 point_counter(part) = 0;
4981 for(
int set = 1; set < num_sets; ++set) {
4982 int base = set * num_parts;
4983 for(
int part = 0; part < num_parts; ++part) {
4984 point_counter(base + part) += point_counter(base + part - num_parts);
4990 Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(member_type team_member) {
4991 int team = team_member.league_rank();
4992 int team_offset = team * num_threads * num_parts;
4993 mj_lno_t begin = coordinate_begin_index + team * block_size;
4994 mj_lno_t end = begin + block_size;
4995 if(end > coordinate_end_index) {
4996 end = coordinate_end_index;
4998 Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
5000 int thread = team_member.team_rank();
5001 mj_lno_t i = local_coordinate_permutations(ii);
5002 mj_part_t p = local_assigned_part_ids(i);
5003 int index = team_offset + thread * num_parts + p;
5004 int set_counter = (point_counter(index)++) + local_point_counts(p);
5005 local_new_coordinate_permutations(coordinate_begin_index + set_counter) = i;
5054template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5055 typename mj_part_t,
typename mj_node_t>
5056void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
5057 mj_node_t>::mj_get_new_cut_coordinates(
5058 mj_part_t current_concurrent_num_parts,
5060 const mj_part_t &num_cuts,
5061 const double &used_imbalance_tolerance,
5062 Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
5063 Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
5064 Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
5065 Kokkos::View<bool *, device_t> & current_cut_line_determined,
5066 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
5067 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
5068 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
5069 Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
5070 Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
5071 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
5072 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
5073 Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
5074 Kokkos::View<mj_scalar_t *, device_t> &
5075 current_part_cut_line_weight_to_put_left,
5076 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count)
5078 Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
5080 auto local_device_incomplete_cut_count = device_incomplete_cut_count;
5081 auto local_sEpsilon = sEpsilon;
5082 auto local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
5083 auto local_global_rectilinear_cut_weight = global_rectilinear_cut_weight;
5084 auto local_process_rectilinear_cut_weight = process_rectilinear_cut_weight;
5085 auto local_global_min_max_coord_total_weight =
5086 global_min_max_coord_total_weight;
5088 const auto _sEpsilon = this->sEpsilon;
5096 Kokkos::TeamPolicy<typename mj_node_t::execution_space>
5097 policy_one_team(1, Kokkos::AUTO());
5098 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
5099 member_type member_type;
5100 Kokkos::parallel_for(policy_one_team, KOKKOS_LAMBDA(member_type team_member) {
5102 mj_scalar_t min_coordinate =
5103 local_global_min_max_coord_total_weight(kk);
5104 mj_scalar_t max_coordinate =
5105 local_global_min_max_coord_total_weight(
5106 kk + current_concurrent_num_parts);
5107 mj_scalar_t global_total_weight =
5108 local_global_min_max_coord_total_weight(
5109 kk + current_concurrent_num_parts * 2);
5111 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, num_cuts),
5116 current_global_left_closest_points(i) > local_sEpsilon) {
5117 current_global_left_closest_points(i) =
5118 current_cut_coordinates(i);
5120 if(current_global_right_closest_points(i) -
5121 max_coordinate > local_sEpsilon) {
5122 current_global_right_closest_points(i) =
5123 current_cut_coordinates(i);
5126 team_member.team_barrier();
5128 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, num_cuts),
5130 using algMJ_t = AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
5133 mj_scalar_t seen_weight_in_part = 0;
5135 mj_scalar_t expected_weight_in_part = 0;
5137 double imbalance_on_left = 0, imbalance_on_right = 0;
5138 if(local_distribute_points_on_cut_lines) {
5140 local_global_rectilinear_cut_weight(i) = 0;
5141 local_process_rectilinear_cut_weight(i) = 0;
5143 bool bContinue =
false;
5146 if(current_cut_line_determined(i)) {
5147 new_current_cut_coordinates(i) =
5148 current_cut_coordinates(i);
5153 seen_weight_in_part = current_global_part_weights(i * 2);
5156 expected_weight_in_part = current_part_target_weights(i);
5159 imbalance_on_left = algMJ_t::calculate_imbalance(seen_weight_in_part,
5160 expected_weight_in_part);
5163 imbalance_on_right = algMJ_t::calculate_imbalance(global_total_weight -
5164 seen_weight_in_part, global_total_weight - expected_weight_in_part);
5165 bool is_left_imbalance_valid = std::abs(imbalance_on_left) -
5166 used_imbalance_tolerance < local_sEpsilon ;
5167 bool is_right_imbalance_valid = std::abs(imbalance_on_right) -
5168 used_imbalance_tolerance < local_sEpsilon;
5170 if(is_left_imbalance_valid && is_right_imbalance_valid) {
5171 current_cut_line_determined(i) =
true;
5172 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5173 new_current_cut_coordinates(i) = current_cut_coordinates(i);
5175 else if(imbalance_on_left < 0) {
5177 if(local_distribute_points_on_cut_lines) {
5182 if(current_global_part_weights(i * 2 + 1) ==
5183 expected_weight_in_part) {
5185 current_cut_line_determined(i) =
true;
5186 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5189 new_current_cut_coordinates(i) =
5190 current_cut_coordinates(i);
5192 current_part_cut_line_weight_to_put_left(i) =
5193 current_local_part_weights(i * 2 + 1) -
5194 current_local_part_weights(i * 2);
5197 else if(current_global_part_weights(i * 2 + 1) >
5198 expected_weight_in_part) {
5201 current_cut_line_determined(i) =
true;
5202 Kokkos::atomic_add(&view_rectilinear_cut_count(0), 1);
5206 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5207 new_current_cut_coordinates(i) =
5208 current_cut_coordinates(i);
5209 local_process_rectilinear_cut_weight[i] =
5210 current_local_part_weights(i * 2 + 1) -
5211 current_local_part_weights(i * 2);
5220 current_cut_lower_bounds(i) =
5221 current_global_right_closest_points(i);
5224 current_cut_lower_bound_weights(i) = seen_weight_in_part;
5229 for(mj_part_t ii = i + 1; ii < num_cuts ; ++ii) {
5230 mj_scalar_t p_weight = current_global_part_weights(ii * 2);
5231 mj_scalar_t line_weight =
5232 current_global_part_weights(ii * 2 + 1);
5233 if(p_weight >= expected_weight_in_part) {
5238 if(p_weight == expected_weight_in_part) {
5239 current_cut_upper_bounds(i) =
5240 current_cut_coordinates(ii);
5241 current_cut_upper_weights(i) = p_weight;
5242 current_cut_lower_bounds(i) =
5243 current_cut_coordinates(ii);
5244 current_cut_lower_bound_weights(i) = p_weight;
5245 }
else if(p_weight < current_cut_upper_weights(i)) {
5248 current_cut_upper_bounds(i) =
5249 current_global_left_closest_points(ii);
5250 current_cut_upper_weights(i) = p_weight;
5256 if(line_weight >= expected_weight_in_part) {
5260 current_cut_upper_bounds(i) =
5261 current_cut_coordinates(ii);
5262 current_cut_upper_weights(i) = line_weight;
5263 current_cut_lower_bounds(i) =
5264 current_cut_coordinates(ii);
5265 current_cut_lower_bound_weights(i) = p_weight;
5270 if(p_weight <= expected_weight_in_part && p_weight >=
5271 current_cut_lower_bound_weights(i)) {
5272 current_cut_lower_bounds(i) =
5273 current_global_right_closest_points(ii);
5274 current_cut_lower_bound_weights(i) = p_weight;
5278 mj_scalar_t new_cut_position = 0;
5279 algMJ_t::mj_calculate_new_cut_position(
5280 current_cut_upper_bounds(i),
5281 current_cut_lower_bounds(i),
5282 current_cut_upper_weights(i),
5283 current_cut_lower_bound_weights(i),
5284 expected_weight_in_part, new_cut_position,
5289 if(std::abs(current_cut_coordinates(i) -
5290 new_cut_position) < local_sEpsilon) {
5291 current_cut_line_determined(i) =
true;
5292 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5295 new_current_cut_coordinates(i) =
5296 current_cut_coordinates(i);
5298 new_current_cut_coordinates(i) = new_cut_position;
5304 current_cut_upper_bounds(i) =
5305 current_global_left_closest_points(i);
5306 current_cut_upper_weights(i) =
5307 seen_weight_in_part;
5310 for(
int ii = i - 1; ii >= 0; --ii) {
5311 mj_scalar_t p_weight =
5312 current_global_part_weights(ii * 2);
5313 mj_scalar_t line_weight =
5314 current_global_part_weights(ii * 2 + 1);
5315 if(p_weight <= expected_weight_in_part) {
5316 if(p_weight == expected_weight_in_part) {
5319 current_cut_upper_bounds(i) =
5320 current_cut_coordinates(ii);
5321 current_cut_upper_weights(i) = p_weight;
5322 current_cut_lower_bounds(i) =
5323 current_cut_coordinates(ii);
5324 current_cut_lower_bound_weights(i) = p_weight;
5326 else if(p_weight > current_cut_lower_bound_weights(i)) {
5329 current_cut_lower_bounds(i) =
5330 current_global_right_closest_points(ii);
5331 current_cut_lower_bound_weights(i) = p_weight;
5337 if(line_weight > expected_weight_in_part) {
5338 current_cut_upper_bounds(i) =
5339 current_global_right_closest_points(ii);
5340 current_cut_upper_weights(i) = line_weight;
5350 if(p_weight >= expected_weight_in_part &&
5351 (p_weight < current_cut_upper_weights(i) ||
5352 (p_weight == current_cut_upper_weights(i) &&
5353 current_cut_upper_bounds(i) >
5354 current_global_left_closest_points(ii)))) {
5355 current_cut_upper_bounds(i) =
5356 current_global_left_closest_points(ii);
5357 current_cut_upper_weights(i) = p_weight;
5360 mj_scalar_t new_cut_position = 0;
5361 algMJ_t::mj_calculate_new_cut_position(
5362 current_cut_upper_bounds(i),
5363 current_cut_lower_bounds(i),
5364 current_cut_upper_weights(i),
5365 current_cut_lower_bound_weights(i),
5366 expected_weight_in_part,
5371 if(std::abs(current_cut_coordinates(i) -
5372 new_cut_position) < local_sEpsilon) {
5373 current_cut_line_determined(i) =
true;
5374 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5376 new_current_cut_coordinates(i) =
5377 current_cut_coordinates(i);
5379 new_current_cut_coordinates(i) =
5386 team_member.team_barrier();
5390 mj_part_t rectilinear_cut_count;
5391 Kokkos::parallel_reduce(
"Read bDoingWork",
5392 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
5393 KOKKOS_LAMBDA(
int dummy,
int & set_single) {
5394 set_single = view_rectilinear_cut_count(0);
5395 }, rectilinear_cut_count);
5397 if(rectilinear_cut_count > 0) {
5398 auto host_local_process_rectilinear_cut_weight =
5399 Kokkos::create_mirror_view(Kokkos::HostSpace(),
5400 local_process_rectilinear_cut_weight);
5401 auto host_local_global_rectilinear_cut_weight =
5402 Kokkos::create_mirror_view(Kokkos::HostSpace(),
5403 local_global_rectilinear_cut_weight);
5404 Kokkos::deep_copy(host_local_process_rectilinear_cut_weight,
5405 local_process_rectilinear_cut_weight);
5406 Kokkos::deep_copy(host_local_global_rectilinear_cut_weight,
5407 local_global_rectilinear_cut_weight);
5408 Teuchos::scan<int,mj_scalar_t>(
5409 *comm, Teuchos::REDUCE_SUM,
5411 host_local_process_rectilinear_cut_weight.data(),
5412 host_local_global_rectilinear_cut_weight.data());
5413 Kokkos::deep_copy(local_process_rectilinear_cut_weight,
5414 host_local_process_rectilinear_cut_weight);
5415 Kokkos::deep_copy(local_global_rectilinear_cut_weight,
5416 host_local_global_rectilinear_cut_weight);
5418 Kokkos::parallel_for(
"finish up mj_get_new_cut_coordinates",
5419 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
5420 KOKKOS_LAMBDA(
int dummy) {
5421 for(mj_part_t i = 0; i < num_cuts; ++i) {
5423 if(local_global_rectilinear_cut_weight(i) > 0) {
5425 mj_scalar_t expected_part_weight = current_part_target_weights(i);
5427 mj_scalar_t necessary_weight_on_line_for_left =
5428 expected_part_weight - current_global_part_weights(i * 2);
5431 mj_scalar_t my_weight_on_line =
5432 local_process_rectilinear_cut_weight(i);
5436 mj_scalar_t weight_on_line_upto_process_inclusive =
5437 local_global_rectilinear_cut_weight(i);
5441 mj_scalar_t space_to_put_left =
5442 necessary_weight_on_line_for_left -
5443 weight_on_line_upto_process_inclusive;
5446 mj_scalar_t space_left_to_me =
5447 space_to_put_left + my_weight_on_line;
5460 if(space_left_to_me < 0) {
5463 current_part_cut_line_weight_to_put_left(i) = 0;
5465 else if(space_left_to_me >= my_weight_on_line) {
5469 current_part_cut_line_weight_to_put_left(i) =
5476 current_part_cut_line_weight_to_put_left(i) =
5483 view_rectilinear_cut_count(0) = 0;
5487 Kokkos::deep_copy(this->incomplete_cut_count, device_incomplete_cut_count);
5499template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5500 typename mj_part_t,
typename mj_node_t>
5501void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5502 get_processor_num_points_in_parts(
5503 mj_part_t num_procs,
5504 mj_part_t num_parts,
5505 mj_gno_t *&num_points_in_all_processor_parts)
5508 size_t allocation_size = num_parts * (num_procs + 1);
5515 mj_gno_t *num_local_points_in_each_part_to_reduce_sum =
5516 new mj_gno_t[allocation_size];
5520 mj_gno_t *my_local_points_to_reduce_sum =
5521 num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
5525 mj_gno_t *my_local_point_counts_in_each_part =
5526 num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
5529 memset(num_local_points_in_each_part_to_reduce_sum, 0,
5530 sizeof(mj_gno_t)*allocation_size);
5532 auto local_new_part_xadj = this->new_part_xadj;
5533 Kokkos::View<mj_gno_t *, typename mj_node_t::device_type> points_per_part(
5534 Kokkos::ViewAllocateWithoutInitializing(
"points per part"), num_parts);
5535 Kokkos::parallel_for(
"get vals on device",
5536 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_gno_t>
5537 (0, num_parts), KOKKOS_LAMBDA(mj_gno_t i) {
5538 points_per_part(i) =
5539 local_new_part_xadj(i) - ((i == 0) ? 0 : local_new_part_xadj(i-1));
5541 auto host_points_per_part = Kokkos::create_mirror_view(points_per_part);
5542 Kokkos::deep_copy(host_points_per_part, points_per_part);
5543 for(
int i = 0; i < num_parts; ++i) {
5544 my_local_points_to_reduce_sum[i] = host_points_per_part(i);
5549 memcpy (my_local_point_counts_in_each_part, my_local_points_to_reduce_sum,
5550 sizeof(mj_gno_t) * (num_parts) );
5557 reduceAll<int, mj_gno_t>(
5559 Teuchos::REDUCE_SUM,
5561 num_local_points_in_each_part_to_reduce_sum,
5562 num_points_in_all_processor_parts);
5566 delete [] num_local_points_in_each_part_to_reduce_sum;
5584template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5585 typename mj_part_t,
typename mj_node_t>
5586bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5587 mj_check_to_migrate(
5588 size_t migration_reduce_all_population,
5589 mj_lno_t num_coords_for_last_dim_part,
5590 mj_part_t num_procs,
5591 mj_part_t num_parts,
5592 mj_gno_t *num_points_in_all_processor_parts)
5595 if(migration_reduce_all_population > future_reduceall_cutoff) {
5600 if(num_coords_for_last_dim_part < min_work_last_dim) {
5605 if(this->check_migrate_avoid_migration_option == 0) {
5606 double global_imbalance = 0;
5608 size_t global_shift = num_procs * num_parts;
5610 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5611 for(mj_part_t i = 0; i < num_parts; ++i) {
5612 double ideal_num = num_points_in_all_processor_parts[global_shift + i]
5613 / double(num_procs);
5615 global_imbalance += std::abs(ideal_num -
5616 num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
5619 global_imbalance /= num_parts;
5620 global_imbalance /= num_procs;
5622 if(global_imbalance <= this->minimum_migration_imbalance) {
5648template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5649 typename mj_part_t,
typename mj_node_t>
5650void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5651 assign_send_destinations(
5652 mj_part_t num_parts,
5653 mj_part_t *part_assignment_proc_begin_indices,
5654 mj_part_t *processor_chains_in_parts,
5655 mj_lno_t *send_count_to_each_proc,
5656 int *coordinate_destinations) {
5658 auto host_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
5659 deep_copy(host_new_part_xadj, this->new_part_xadj);
5661 auto host_new_coordinate_permutations =
5662 Kokkos::create_mirror_view(this->new_coordinate_permutations);
5663 deep_copy(host_new_coordinate_permutations, this->new_coordinate_permutations);
5665 for(mj_part_t p = 0; p < num_parts; ++p) {
5666 mj_lno_t part_begin = 0;
5667 if(p > 0) part_begin = host_new_part_xadj(p - 1);
5668 mj_lno_t part_end = host_new_part_xadj(p);
5670 mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
5672 mj_lno_t num_total_send = 0;
5673 for(mj_lno_t j=part_begin; j < part_end; j++) {
5674 mj_lno_t local_ind = host_new_coordinate_permutations(j);
5675 while (num_total_send >= send_count_to_each_proc[proc_to_sent]) {
5679 part_assignment_proc_begin_indices[p] =
5680 processor_chains_in_parts[proc_to_sent];
5682 processor_chains_in_parts[proc_to_sent] = -1;
5684 proc_to_sent = part_assignment_proc_begin_indices[p];
5687 coordinate_destinations[local_ind] = proc_to_sent;
5713template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5714 typename mj_part_t,
typename mj_node_t>
5715void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5716 mj_assign_proc_to_parts(
5717 mj_gno_t * num_points_in_all_processor_parts,
5718 mj_part_t num_parts,
5719 mj_part_t num_procs,
5720 mj_lno_t *send_count_to_each_proc,
5721 std::vector<mj_part_t> &processor_ranks_for_subcomm,
5722 std::vector<mj_part_t> *next_future_num_parts_in_parts,
5723 mj_part_t &out_part_index,
5724 mj_part_t &output_part_numbering_begin_index,
5725 int * coordinate_destinations) {
5726 mj_gno_t *global_num_points_in_parts =
5727 num_points_in_all_processor_parts + num_procs * num_parts;
5728 mj_part_t *num_procs_assigned_to_each_part =
new mj_part_t[num_parts];
5731 bool did_i_find_my_group =
false;
5733 mj_part_t num_free_procs = num_procs;
5734 mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
5736 double max_imbalance_difference = 0;
5737 mj_part_t max_differing_part = 0;
5740 for(mj_part_t i = 0; i < num_parts; i++) {
5743 double scalar_required_proc = num_procs *
5744 (double (global_num_points_in_parts[i]) /
5745 double (this->num_global_coords));
5748 mj_part_t required_proc =
5749 static_cast<mj_part_t
> (0.5 + scalar_required_proc);
5750 if(required_proc == 0) required_proc = 1;
5756 required_proc < minimum_num_procs_required_for_rest_of_parts) {
5757 required_proc = num_free_procs -
5758 (minimum_num_procs_required_for_rest_of_parts);
5762 num_free_procs -= required_proc;
5766 --minimum_num_procs_required_for_rest_of_parts;
5769 num_procs_assigned_to_each_part[i] = required_proc;
5774 double imbalance_wrt_ideal =
5775 (scalar_required_proc - required_proc) / required_proc;
5776 if(imbalance_wrt_ideal > max_imbalance_difference) {
5777 max_imbalance_difference = imbalance_wrt_ideal;
5778 max_differing_part = i;
5784 if(num_free_procs > 0) {
5785 num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
5792 mj_part_t *part_assignment_proc_begin_indices =
new mj_part_t[num_parts];
5796 mj_part_t *processor_chains_in_parts =
new mj_part_t [num_procs];
5797 mj_part_t *processor_part_assignments =
new mj_part_t[num_procs];
5806 for(
int i = 0; i < num_procs; ++i ) {
5807 processor_part_assignments[i] = -1;
5808 processor_chains_in_parts[i] = -1;
5810 for(
int i = 0; i < num_parts; ++i ) {
5811 part_assignment_proc_begin_indices[i] = -1;
5817 uSignedSortItem<mj_part_t, mj_gno_t, char> *
5818 sort_item_num_part_points_in_procs =
5819 new uSignedSortItem<mj_part_t, mj_gno_t, char>[num_procs];
5821 for(mj_part_t i = 0; i < num_parts; ++i) {
5826 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5827 sort_item_num_part_points_in_procs[ii].id = ii;
5830 if(processor_part_assignments[ii] == -1) {
5831 sort_item_num_part_points_in_procs[ii].val =
5832 num_points_in_all_processor_parts[ii * num_parts + i];
5834 sort_item_num_part_points_in_procs[ii].signbit = 1;
5847 sort_item_num_part_points_in_procs[ii].val =
5848 num_points_in_all_processor_parts[ii * num_parts + i];
5849 sort_item_num_part_points_in_procs[ii].signbit = 0;
5854 uqSignsort<mj_part_t, mj_gno_t,char>
5855 (num_procs, sort_item_num_part_points_in_procs);
5867 mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
5868 mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
5869 mj_gno_t ideal_num_points_in_a_proc = Teuchos::as<mj_gno_t>(
5870 ceil(total_num_points_in_part /
double (required_proc_count)));
5873 mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
5874 mj_part_t next_proc_to_send_id =
5875 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
5876 mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc -
5877 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
5881 for(mj_part_t ii = num_procs - 1;
5882 ii >= num_procs - required_proc_count; --ii) {
5883 mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].id;
5885 processor_part_assignments[proc_id] = i;
5888 bool did_change_sign =
false;
5890 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5893 if(sort_item_num_part_points_in_procs[ii].signbit == 0) {
5894 did_change_sign =
true;
5895 sort_item_num_part_points_in_procs[ii].signbit = 1;
5902 if(did_change_sign) {
5905 uqSignsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count,
5906 sort_item_num_part_points_in_procs);
5921 if(!did_i_find_my_group) {
5922 for(mj_part_t ii = num_procs - 1; ii >=
5923 num_procs - required_proc_count; --ii) {
5925 mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id;
5928 processor_ranks_for_subcomm.push_back(proc_id_to_assign);
5930 if(proc_id_to_assign == this->myRank) {
5932 did_i_find_my_group =
true;
5935 part_assignment_proc_begin_indices[i] = this->myRank;
5936 processor_chains_in_parts[this->myRank] = -1;
5940 send_count_to_each_proc[this->myRank] =
5941 sort_item_num_part_points_in_procs[ii].val;
5945 for(mj_part_t in = 0; in < i; ++in) {
5946 output_part_numbering_begin_index +=
5947 (*next_future_num_parts_in_parts)[in];
5955 if(!did_i_find_my_group) {
5956 processor_ranks_for_subcomm.clear();
5964 for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii) {
5965 mj_part_t nonassigned_proc_id =
5966 sort_item_num_part_points_in_procs[ii].id;
5967 mj_lno_t num_points_to_sent =
5968 sort_item_num_part_points_in_procs[ii].val;
5974 if(num_points_to_sent < 0) {
5975 cout <<
"Migration - processor assignments - for part:" << i
5976 <<
"from proc:" << nonassigned_proc_id <<
" num_points_to_sent:"
5977 << num_points_to_sent << std::endl;
5982 switch (migration_type) {
5986 while (num_points_to_sent > 0) {
5988 if(num_points_to_sent <= space_left_in_sent_proc) {
5990 space_left_in_sent_proc -= num_points_to_sent;
5992 if(this->myRank == nonassigned_proc_id) {
5994 send_count_to_each_proc[next_proc_to_send_id] =
5999 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6000 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6001 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6003 num_points_to_sent = 0;
6007 if(space_left_in_sent_proc > 0) {
6008 num_points_to_sent -= space_left_in_sent_proc;
6011 if(this->myRank == nonassigned_proc_id) {
6013 send_count_to_each_proc[next_proc_to_send_id] =
6014 space_left_in_sent_proc;
6015 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6016 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6017 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6021 ++next_proc_to_send_index;
6024 if(next_part_to_send_index < nprocs - required_proc_count ) {
6025 cout <<
"Migration - processor assignments - for part:"
6027 <<
" next_part_to_send :" << next_part_to_send_index
6028 <<
" nprocs:" << nprocs
6029 <<
" required_proc_count:" << required_proc_count
6030 <<
" Error: next_part_to_send_index <" <<
6031 <<
" nprocs - required_proc_count" << std::endl;
6036 next_proc_to_send_id =
6037 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
6039 space_left_in_sent_proc = ideal_num_points_in_a_proc -
6040 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
6051 if(this->myRank == nonassigned_proc_id) {
6053 send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
6057 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6058 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6059 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6061 num_points_to_sent = 0;
6062 ++next_proc_to_send_index;
6066 if(next_proc_to_send_index == num_procs) {
6067 next_proc_to_send_index = num_procs - required_proc_count;
6070 next_proc_to_send_id =
6071 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
6073 space_left_in_sent_proc = ideal_num_points_in_a_proc -
6074 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
6087 this->assign_send_destinations(
6089 part_assignment_proc_begin_indices,
6090 processor_chains_in_parts,
6091 send_count_to_each_proc,
6092 coordinate_destinations);
6093 delete [] part_assignment_proc_begin_indices;
6094 delete [] processor_chains_in_parts;
6095 delete [] processor_part_assignments;
6096 delete [] sort_item_num_part_points_in_procs;
6097 delete [] num_procs_assigned_to_each_part;
6115template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6116 typename mj_part_t,
typename mj_node_t>
6117void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6118 assign_send_destinations2(
6119 mj_part_t num_parts,
6120 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
6121 int *coordinate_destinations,
6122 mj_part_t &output_part_numbering_begin_index,
6123 std::vector<mj_part_t> *next_future_num_parts_in_parts)
6125 mj_part_t part_shift_amount = output_part_numbering_begin_index;
6126 mj_part_t previous_processor = -1;
6128 auto local_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
6129 Kokkos::deep_copy(local_new_part_xadj, this->new_part_xadj);
6131 auto local_new_coordinate_permutations =
6132 Kokkos::create_mirror_view(this->new_coordinate_permutations);
6133 Kokkos::deep_copy(local_new_coordinate_permutations,
6134 this->new_coordinate_permutations);
6136 for(mj_part_t i = 0; i < num_parts; ++i) {
6137 mj_part_t p = sort_item_part_to_proc_assignment[i].id;
6140 mj_lno_t part_begin_index = 0;
6143 part_begin_index = local_new_part_xadj(p - 1);
6146 mj_lno_t part_end_index = local_new_part_xadj(p);
6148 mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].val;
6149 if(this->myRank == assigned_proc && previous_processor != assigned_proc) {
6150 output_part_numbering_begin_index = part_shift_amount;
6152 previous_processor = assigned_proc;
6153 part_shift_amount += (*next_future_num_parts_in_parts)[p];
6155 for(mj_lno_t j= part_begin_index; j < part_end_index; j++) {
6156 mj_lno_t localInd = local_new_coordinate_permutations(j);
6157 coordinate_destinations[localInd] = assigned_proc;
6183template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6184 typename mj_part_t,
typename mj_node_t>
6185void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6186 mj_assign_parts_to_procs(
6187 mj_gno_t * num_points_in_all_processor_parts,
6188 mj_part_t num_parts,
6189 mj_part_t num_procs,
6190 mj_lno_t *send_count_to_each_proc,
6191 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6192 mj_part_t &out_num_part,
6193 std::vector<mj_part_t> &out_part_indices,
6194 mj_part_t &output_part_numbering_begin_index,
6195 int *coordinate_destinations) {
6198 mj_gno_t *global_num_points_in_parts =
6199 num_points_in_all_processor_parts + num_procs * num_parts;
6200 out_part_indices.clear();
6204 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment =
6205 new uSortItem<mj_part_t, mj_part_t>[num_parts];
6206 uSortItem<mj_part_t, mj_gno_t> * sort_item_num_points_of_proc_in_part_i =
6207 new uSortItem<mj_part_t, mj_gno_t>[num_procs];
6211 mj_lno_t work_each =
6212 mj_lno_t (this->num_global_coords / (
double (num_procs)) + 0.5f);
6216 mj_lno_t *space_in_each_processor =
new mj_lno_t[num_procs];
6219 for(mj_part_t i = 0; i < num_procs; ++i) {
6220 space_in_each_processor[i] = work_each;
6227 mj_part_t *num_parts_proc_assigned =
new mj_part_t[num_procs];
6228 memset(num_parts_proc_assigned, 0,
sizeof(mj_part_t) * num_procs);
6229 int empty_proc_count = num_procs;
6233 uSortItem<mj_part_t, mj_gno_t> * sort_item_point_counts_in_parts =
6234 new uSortItem<mj_part_t, mj_gno_t>[num_parts];
6239 for(mj_part_t i = 0; i < num_parts; ++i) {
6240 sort_item_point_counts_in_parts[i].id = i;
6241 sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i];
6245 uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);
6250 for(mj_part_t j = 0; j < num_parts; ++j) {
6252 mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id;
6255 mj_gno_t load = global_num_points_in_parts[i];
6258 mj_part_t assigned_proc = -1;
6261 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
6262 sort_item_num_points_of_proc_in_part_i[ii].id = ii;
6267 if(empty_proc_count < num_parts - j ||
6268 num_parts_proc_assigned[ii] == 0) {
6270 sort_item_num_points_of_proc_in_part_i[ii].val =
6271 num_points_in_all_processor_parts[ii * num_parts + i];
6274 sort_item_num_points_of_proc_in_part_i[ii].val = -1;
6278 uqsort<mj_part_t, mj_gno_t>(num_procs,
6279 sort_item_num_points_of_proc_in_part_i);
6282 for(mj_part_t iii = num_procs - 1; iii >= 0; --iii) {
6283 mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
6284 if(assigned_proc == -1 ||
6285 (space_in_each_processor[ii] > space_in_each_processor[assigned_proc])) {
6288 else if(space_in_each_processor[ii] == space_in_each_processor[assigned_proc]) {
6289 if(ii < assigned_proc) {
6305 if(num_parts_proc_assigned[assigned_proc]++ == 0) {
6309 space_in_each_processor[assigned_proc] -= load;
6311 sort_item_part_to_proc_assignment[j].id = i;
6314 sort_item_part_to_proc_assignment[j].val = assigned_proc;
6317 if(assigned_proc == this->myRank) {
6319 out_part_indices.push_back(i);
6325 send_count_to_each_proc[assigned_proc] +=
6326 num_points_in_all_processor_parts[this->myRank * num_parts + i];
6329 delete [] num_parts_proc_assigned;
6330 delete [] sort_item_num_points_of_proc_in_part_i;
6331 delete [] sort_item_point_counts_in_parts;
6332 delete [] space_in_each_processor;
6335 uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);
6338 this->assign_send_destinations2(
6340 sort_item_part_to_proc_assignment,
6341 coordinate_destinations,
6342 output_part_numbering_begin_index,
6343 next_future_num_parts_in_parts);
6345 delete [] sort_item_part_to_proc_assignment;
6372template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6373 typename mj_part_t,
typename mj_node_t>
6374void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6375 mj_migration_part_proc_assignment(
6376 mj_gno_t * num_points_in_all_processor_parts,
6377 mj_part_t num_parts,
6378 mj_part_t num_procs,
6379 mj_lno_t *send_count_to_each_proc,
6380 std::vector<mj_part_t> &processor_ranks_for_subcomm,
6381 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6382 mj_part_t &out_num_part,
6383 std::vector<mj_part_t> &out_part_indices,
6384 mj_part_t &output_part_numbering_begin_index,
6385 int *coordinate_destinations)
6387 processor_ranks_for_subcomm.clear();
6389 if(num_procs > num_parts) {
6394 mj_part_t out_part_index = 0;
6396 this->mj_assign_proc_to_parts(
6397 num_points_in_all_processor_parts,
6400 send_count_to_each_proc,
6401 processor_ranks_for_subcomm,
6402 next_future_num_parts_in_parts,
6404 output_part_numbering_begin_index,
6405 coordinate_destinations
6409 out_part_indices.clear();
6410 out_part_indices.push_back(out_part_index);
6417 processor_ranks_for_subcomm.push_back(this->myRank);
6422 this->mj_assign_parts_to_procs(
6423 num_points_in_all_processor_parts,
6426 send_count_to_each_proc,
6427 next_future_num_parts_in_parts,
6430 output_part_numbering_begin_index,
6431 coordinate_destinations);
6448template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6449 typename mj_part_t,
typename mj_node_t>
6450void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6452 mj_part_t num_procs,
6453 mj_lno_t &num_new_local_points,
6454 std::string iteration,
6455 int *coordinate_destinations,
6456 mj_part_t num_parts)
6459#ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
6460 if(
sizeof(mj_lno_t) <=
sizeof(
int)) {
6464 ZOLTAN_COMM_OBJ *plan = NULL;
6465 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->comm));
6466 int num_incoming_gnos = 0;
6467 int message_tag = 7859;
6469 this->mj_env->timerStart(MACRO_TIMERS,
6470 mj_timer_base_string +
"Migration Z1PlanCreating-" + iteration);
6471 int ierr = Zoltan_Comm_Create(
6473 int(this->num_local_coords),
6474 coordinate_destinations,
6477 &num_incoming_gnos);
6480 this->mj_env->timerStop(MACRO_TIMERS,
6481 mj_timer_base_string +
"Migration Z1PlanCreating-" + iteration);
6483 this->mj_env->timerStart(MACRO_TIMERS,
6484 mj_timer_base_string +
"Migration Z1Migration-" + iteration);
6492 auto host_current_mj_gnos = Kokkos::create_mirror_view(
6493 Kokkos::HostSpace(), this->current_mj_gnos);
6494 Kokkos::deep_copy(host_current_mj_gnos, this->current_mj_gnos);
6495 Kokkos::View<mj_gno_t*, device_t> dst_gnos(
6496 Kokkos::ViewAllocateWithoutInitializing(
"dst_gnos"), num_incoming_gnos);
6497 auto host_dst_gnos = Kokkos::create_mirror_view(
6498 Kokkos::HostSpace(), dst_gnos);
6500 ierr = Zoltan_Comm_Do(
6503 (
char *) host_current_mj_gnos.data(),
6505 (
char *) host_dst_gnos.data());
6507 Kokkos::deep_copy(dst_gnos, host_dst_gnos);
6508 this->current_mj_gnos = dst_gnos;
6514 auto host_src_coordinates = Kokkos::create_mirror_view(
6515 Kokkos::HostSpace(), this->mj_coordinates);
6516 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6517 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6518 dst_coordinates(Kokkos::ViewAllocateWithoutInitializing(
"mj_coordinates"),
6519 num_incoming_gnos, this->coord_dim);
6520 auto host_dst_coordinates = Kokkos::create_mirror_view(
6521 Kokkos::HostSpace(), dst_coordinates);
6522 for(
int i = 0; i < this->coord_dim; ++i) {
6523 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sub_host_src_coordinates
6524 = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6525 Kokkos::View<mj_scalar_t *, Kokkos::HostSpace> sub_host_dst_coordinates
6526 = Kokkos::subview(host_dst_coordinates, Kokkos::ALL, i);
6529 ierr = Zoltan_Comm_Do(
6532 (
char *) sub_host_src_coordinates.data(),
6533 sizeof(mj_scalar_t),
6534 (
char *) sub_host_dst_coordinates.data());
6537 deep_copy(dst_coordinates, host_dst_coordinates);
6538 this->mj_coordinates = dst_coordinates;
6543 auto host_src_weights = Kokkos::create_mirror_view(
6544 Kokkos::HostSpace(), this->mj_weights);
6545 Kokkos::deep_copy(host_src_weights, this->mj_weights);
6546 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6547 Kokkos::ViewAllocateWithoutInitializing(
"mj_weights"),
6548 num_incoming_gnos, this->num_weights_per_coord);
6549 auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
6550 for(
int i = 0; i < this->num_weights_per_coord; ++i) {
6551 auto sub_host_src_weights
6552 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6553 auto sub_host_dst_weights
6554 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6555 ArrayRCP<mj_scalar_t> sent_weight(this->num_local_coords);
6557 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6558 sent_weight[n] = sub_host_src_weights(n);
6560 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
6562 ierr = Zoltan_Comm_Do(
6565 (
char *) sent_weight.getRawPtr(),
6566 sizeof(mj_scalar_t),
6567 (
char *) received_weight.getRawPtr());
6570 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6571 sub_host_dst_weights(n) = received_weight[n];
6574 deep_copy(dst_weights, host_dst_weights);
6575 this->mj_weights = dst_weights;
6581 Kokkos::View<int *, Kokkos::HostSpace> dst_owners_of_coordinate(
6582 Kokkos::ViewAllocateWithoutInitializing(
"owner_of_coordinate"),
6585 ierr = Zoltan_Comm_Do(
6588 (
char *) owner_of_coordinate.data(),
6590 (
char *) dst_owners_of_coordinate.data());
6592 this->owner_of_coordinate = dst_owners_of_coordinate;
6599 auto host_src_assigned_part_ids = Kokkos::create_mirror_view(
6600 Kokkos::HostSpace(), this->assigned_part_ids);
6601 Kokkos::deep_copy(host_src_assigned_part_ids, this->assigned_part_ids);
6602 Kokkos::View<int *, device_t> dst_assigned_part_ids(
6603 Kokkos::ViewAllocateWithoutInitializing(
"assigned_part_ids"),
6605 auto host_dst_assigned_part_ids = Kokkos::create_mirror_view(
6606 Kokkos::HostSpace(), dst_assigned_part_ids);
6607 mj_part_t *new_parts =
new mj_part_t[num_incoming_gnos];
6608 if(num_procs < num_parts) {
6610 ierr = Zoltan_Comm_Do(
6613 (
char *) host_src_assigned_part_ids.data(),
6615 (
char *) host_dst_assigned_part_ids.data());
6617 Kokkos::deep_copy(dst_assigned_part_ids, host_dst_assigned_part_ids);
6621 this->assigned_part_ids = dst_assigned_part_ids;
6624 ierr = Zoltan_Comm_Destroy(&plan);
6626 num_new_local_points = num_incoming_gnos;
6627 this->mj_env->timerStop(MACRO_TIMERS,
6628 mj_timer_base_string +
"Migration Z1Migration-" + iteration);
6633 this->mj_env->timerStart(MACRO_TIMERS, mj_timer_base_string +
6634 "Migration DistributorPlanCreating-" + iteration);
6636 Tpetra::Distributor distributor(this->comm);
6637 ArrayView<const mj_part_t> destinations( coordinate_destinations,
6638 this->num_local_coords);
6639 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
6640 this->mj_env->timerStop(MACRO_TIMERS, mj_timer_base_string +
6641 "Migration DistributorPlanCreating-" + iteration);
6642 this->mj_env->timerStart(MACRO_TIMERS, mj_timer_base_string +
6643 "Migration DistributorMigration-" + iteration);
6651 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
6652 Kokkos::ViewAllocateWithoutInitializing(
"received_gnos"),
6655 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
6656 Kokkos::ViewAllocateWithoutInitializing(
"sent_gnos"),
6657 this->current_mj_gnos.extent(0));
6658 Kokkos::deep_copy(sent_gnos, this->current_mj_gnos);
6660 distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
6662 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
6663 Kokkos::ViewAllocateWithoutInitializing(
"gids"), num_incoming_gnos);
6665 Kokkos::deep_copy(this->current_mj_gnos, received_gnos);
6670 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6671 dst_coordinates(
"mj_coordinates", num_incoming_gnos, this->coord_dim);
6673 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>
6674 host_src_coordinates(
6675 Kokkos::ViewAllocateWithoutInitializing(
"host_coords"),
6676 this->mj_coordinates.extent(0), this->mj_coordinates.extent(1));
6677 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6679 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_coord(
6680 Kokkos::ViewAllocateWithoutInitializing(
"received_coord"),
6683 for(
int i = 0; i < this->coord_dim; ++i) {
6687 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_coord
6688 = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6690 distributor.doPostsAndWaits(sent_coord, 1, received_coord);
6692 Kokkos::deep_copy(Kokkos::subview(dst_coordinates, Kokkos::ALL, i),
6698 this->mj_coordinates = dst_coordinates;
6701 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6702 "mj_weights", num_incoming_gnos, this->num_weights_per_coord);
6703 auto host_dst_weights = Kokkos::create_mirror_view(Kokkos::HostSpace(),
6706 auto host_src_weights = Kokkos::create_mirror_view_and_copy(
6707 Kokkos::HostSpace(), this->mj_weights);
6710 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_weight(
6711 Kokkos::ViewAllocateWithoutInitializing(
"send_weight_buffer"),
6712 this->num_local_coords);
6714 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_weight(
6715 Kokkos::ViewAllocateWithoutInitializing(
"received_weight_buffer"),
6718 for(
int i = 0; i < this->num_weights_per_coord; ++i) {
6720 auto sub_host_src_weights
6721 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6723 auto sub_host_dst_weights
6724 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6731 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6732 sent_weight[n] = sub_host_src_weights(n);
6735 distributor.doPostsAndWaits(sent_weight, 1, received_weight);
6738 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6739 sub_host_dst_weights(n) = received_weight[n];
6742 Kokkos::deep_copy(dst_weights, host_dst_weights);
6743 this->mj_weights = dst_weights;
6748 Kokkos::View<int *, Kokkos::HostSpace> received_owners(
6749 Kokkos::ViewAllocateWithoutInitializing(
"owner_of_coordinate"),
6752 distributor.doPostsAndWaits(owner_of_coordinate, 1, received_owners);
6754 this->owner_of_coordinate = received_owners;
6760 if(num_procs < num_parts) {
6761 Kokkos::View<mj_part_t*, Kokkos::HostSpace> sent_partids(
6762 Kokkos::ViewAllocateWithoutInitializing(
"host_parts"),
6763 this->assigned_part_ids.extent(0));
6764 Kokkos::deep_copy(sent_partids, assigned_part_ids);
6766 Kokkos::View<mj_part_t*, Kokkos::HostSpace> received_partids(
6767 Kokkos::ViewAllocateWithoutInitializing(
"received_partids"),
6770 distributor.doPostsAndWaits(sent_partids, 1, received_partids);
6772 this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6773 (
"assigned_part_ids", num_incoming_gnos);
6774 Kokkos::deep_copy(this->assigned_part_ids, received_partids);
6777 this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6778 (
"assigned_part_ids", num_incoming_gnos);
6780 this->mj_env->timerStop(MACRO_TIMERS,
"" + mj_timer_base_string +
6781 "Migration DistributorMigration-" + iteration);
6783 num_new_local_points = num_incoming_gnos;
6792template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6793 typename mj_part_t,
typename mj_node_t>
6794void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6795 create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm)
6797 mj_part_t group_size = processor_ranks_for_subcomm.size();
6798 mj_part_t *ids =
new mj_part_t[group_size];
6799 for(mj_part_t i = 0; i < group_size; ++i) {
6800 ids[i] = processor_ranks_for_subcomm[i];
6802 ArrayView<const mj_part_t> idView(ids, group_size);
6803 this->comm = this->comm->createSubcommunicator(idView);
6812template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6813 typename mj_part_t,
typename mj_node_t>
6814void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6815 fill_permutation_array(
6816 mj_part_t output_num_parts,
6817 mj_part_t num_parts)
6820 if(output_num_parts == 1) {
6821 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
6822 Kokkos::parallel_for(
6823 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
6824 (0, this->num_local_coords),
6825 KOKKOS_LAMBDA(mj_lno_t i) {
6826 local_new_coordinate_permutations(i) = i;
6828 auto local_new_part_xadj = this->new_part_xadj;
6829 auto local_num_local_coords = this->num_local_coords;
6830 Kokkos::parallel_for(
6831 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
6832 KOKKOS_LAMBDA(
int dummy) {
6833 local_new_part_xadj(0) = local_num_local_coords;
6837 auto local_num_local_coords = this->num_local_coords;
6838 auto local_assigned_part_ids = this->assigned_part_ids;
6839 auto local_new_part_xadj = this->new_part_xadj;
6840 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
6843 Kokkos::View<mj_part_t*, device_t> part_shifts(
"part_shifts", num_parts);
6848 Kokkos::View<mj_lno_t*, device_t> num_points_in_parts(
6849 "num_points_in_parts", num_parts);
6851 Kokkos::parallel_for(
6852 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
6853 KOKKOS_LAMBDA(
int dummy) {
6855 for(mj_lno_t i = 0; i < local_num_local_coords; ++i) {
6856 mj_part_t ii = local_assigned_part_ids(i);
6857 ++num_points_in_parts(ii);
6862 mj_lno_t prev_index = 0;
6863 for(mj_part_t i = 0; i < num_parts; ++i) {
6864 if(num_points_in_parts(i) > 0) {
6865 local_new_part_xadj(p) = prev_index + num_points_in_parts(i);
6866 prev_index += num_points_in_parts(i);
6867 part_shifts(i) = p++;
6872 mj_part_t assigned_num_parts = p - 1;
6873 for(;p < num_parts; ++p) {
6874 local_new_part_xadj(p) =
6875 local_new_part_xadj(assigned_num_parts);
6877 for(mj_part_t i = 0; i < output_num_parts; ++i) {
6878 num_points_in_parts(i) = local_new_part_xadj(i);
6884 for(mj_lno_t i = local_num_local_coords - 1; i >= 0; --i) {
6886 part_shifts[mj_part_t(local_assigned_part_ids(i))];
6887 local_new_coordinate_permutations(--num_points_in_parts[part]) = i;
6917template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6918 typename mj_part_t,
typename mj_node_t>
6919bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6920 mj_perform_migration(
6921 mj_part_t input_num_parts,
6922 mj_part_t &output_num_parts,
6923 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6924 mj_part_t &output_part_begin_index,
6925 size_t migration_reduce_all_population,
6926 mj_lno_t num_coords_for_last_dim_part,
6927 std::string iteration,
6928 RCP<mj_partBoxVector_t> &input_part_boxes,
6929 RCP<mj_partBoxVector_t> &output_part_boxes)
6931 mj_part_t num_procs = this->comm->getSize();
6932 this->myRank = this->comm->getRank();
6937 mj_gno_t *num_points_in_all_processor_parts =
6938 new mj_gno_t[input_num_parts * (num_procs + 1)];
6941 this->get_processor_num_points_in_parts(
6944 num_points_in_all_processor_parts);
6947 if(!this->mj_check_to_migrate(
6948 migration_reduce_all_population,
6949 num_coords_for_last_dim_part,
6952 num_points_in_all_processor_parts)) {
6953 delete [] num_points_in_all_processor_parts;
6957 mj_lno_t *send_count_to_each_proc = NULL;
6958 int *coordinate_destinations =
new int[this->num_local_coords];
6959 send_count_to_each_proc =
new mj_lno_t[num_procs];
6961 for(
int i = 0; i < num_procs; ++i) {
6962 send_count_to_each_proc[i] = 0;
6965 std::vector<mj_part_t> processor_ranks_for_subcomm;
6966 std::vector<mj_part_t> out_part_indices;
6969 this->mj_migration_part_proc_assignment(
6970 num_points_in_all_processor_parts,
6973 send_count_to_each_proc,
6974 processor_ranks_for_subcomm,
6975 next_future_num_parts_in_parts,
6978 output_part_begin_index,
6979 coordinate_destinations);
6981 delete [] send_count_to_each_proc;
6982 std::vector <mj_part_t> tmpv;
6984 std::sort (out_part_indices.begin(), out_part_indices.end());
6985 mj_part_t outP = out_part_indices.size();
6986 mj_gno_t new_global_num_points = 0;
6987 mj_gno_t *global_num_points_in_parts =
6988 num_points_in_all_processor_parts + num_procs * input_num_parts;
6990 if(this->mj_keep_part_boxes) {
6991 input_part_boxes->clear();
6996 for(mj_part_t i = 0; i < outP; ++i) {
6997 mj_part_t ind = out_part_indices[i];
6998 new_global_num_points += global_num_points_in_parts[ind];
6999 tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
7000 if(this->mj_keep_part_boxes) {
7001 input_part_boxes->push_back((*output_part_boxes)[ind]);
7006 if(this->mj_keep_part_boxes) {
7007 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7008 input_part_boxes = output_part_boxes;
7009 output_part_boxes = tmpPartBoxes;
7011 next_future_num_parts_in_parts->clear();
7012 for(mj_part_t i = 0; i < outP; ++i) {
7013 mj_part_t p = tmpv[i];
7014 next_future_num_parts_in_parts->push_back(p);
7017 delete [] num_points_in_all_processor_parts;
7019 mj_lno_t num_new_local_points = 0;
7021 this->mj_migrate_coords(
7023 num_new_local_points,
7025 coordinate_destinations,
7028 delete [] coordinate_destinations;
7029 if(this->num_local_coords != num_new_local_points) {
7030 this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7031 (Kokkos::ViewAllocateWithoutInitializing(
"new_coordinate_permutations"),
7032 num_new_local_points);
7033 this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7034 (Kokkos::ViewAllocateWithoutInitializing(
"coordinate_permutations"),
7035 num_new_local_points);
7037 this->num_local_coords = num_new_local_points;
7038 this->num_global_coords = new_global_num_points;
7041 this->create_sub_communicator(processor_ranks_for_subcomm);
7043 processor_ranks_for_subcomm.clear();
7046 this->fill_permutation_array(output_num_parts, input_num_parts);
7069template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7070 typename mj_part_t,
typename mj_node_t>
7071void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7072 create_consistent_chunks(
7073 mj_part_t num_parts,
7074 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
7075 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
7076 mj_lno_t coordinate_begin,
7077 mj_lno_t coordinate_end,
7078 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
7079 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
7081 bool longest_dim_part,
7082 uSignedSortItem<int, mj_scalar_t, char> * p_coord_dimension_range_sorted)
7092 mj_part_t no_cuts = num_parts - 1;
7096 if(this->distribute_points_on_cut_lines) {
7097 auto local_thread_cut_line_weight_to_put_left =
7098 this->thread_cut_line_weight_to_put_left;
7099 auto local_thread_part_weight_work =
7100 this->thread_part_weight_work;
7101 auto local_sEpsilon = this->sEpsilon;
7103 Kokkos::parallel_for(
7104 Kokkos::RangePolicy<
typename mj_node_t::execution_space,
7105 mj_part_t> (0, no_cuts), KOKKOS_LAMBDA (mj_part_t i) {
7107 mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
7108 if(left_weight > local_sEpsilon) {
7110 mj_scalar_t thread_ii_weight_on_cut =
7111 local_thread_part_weight_work(i * 2 + 1) -
7112 local_thread_part_weight_work(i * 2);
7113 if(thread_ii_weight_on_cut < left_weight) {
7114 local_thread_cut_line_weight_to_put_left(i) =
7115 thread_ii_weight_on_cut;
7118 local_thread_cut_line_weight_to_put_left(i) = left_weight;
7122 local_thread_cut_line_weight_to_put_left(i) = 0;
7127 auto local_least_signifiance = least_signifiance;
7128 auto local_significance_mul = significance_mul;
7129 Kokkos::parallel_for(
7130 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
7131 (0, 1), KOKKOS_LAMBDA (
int dummy) {
7135 for(mj_part_t i = no_cuts - 1; i > 0 ; --i) {
7136 mj_scalar_t cut1 = current_concurrent_cut_coordinate(i-1);
7137 mj_scalar_t cut2 = current_concurrent_cut_coordinate(i);
7138 mj_scalar_t delta = cut2 - cut1;
7139 mj_scalar_t abs_delta = (delta > 0) ? delta : -delta;
7140 if(abs_delta < local_sEpsilon) {
7141 local_thread_cut_line_weight_to_put_left(i) -=
7142 local_thread_cut_line_weight_to_put_left(i - 1);
7144 local_thread_cut_line_weight_to_put_left(i) =
7145 static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
7146 local_least_signifiance) * local_significance_mul) /
7147 static_cast<mj_scalar_t
>(local_significance_mul);
7153 auto local_thread_point_counts = this->thread_point_counts;
7154 Kokkos::parallel_for(
7155 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
7156 (0, num_parts), KOKKOS_LAMBDA (mj_part_t i) {
7157 local_thread_point_counts(i) = 0;
7168 mj_part_t *cut_map =
new mj_part_t[no_cuts];
7170 typedef uMultiSortItem<mj_lno_t, int, mj_scalar_t> multiSItem;
7171 typedef std::vector< multiSItem > multiSVector;
7172 typedef std::vector<multiSVector> multiS2Vector;
7175 std::vector<mj_scalar_t *>allocated_memory;
7178 multiS2Vector sort_vector_points_on_cut;
7181 mj_part_t different_cut_count = 1;
7187 multiSVector tmpMultiSVector;
7188 sort_vector_points_on_cut.push_back(tmpMultiSVector);
7190 auto local_current_concurrent_cut_coordinate =
7191 current_concurrent_cut_coordinate;
7192 auto host_current_concurrent_cut_coordinate =
7193 Kokkos::create_mirror_view(local_current_concurrent_cut_coordinate);
7194 Kokkos::deep_copy(host_current_concurrent_cut_coordinate,
7195 local_current_concurrent_cut_coordinate);
7197 for(mj_part_t i = 1; i < no_cuts ; ++i) {
7200 if(std::abs(host_current_concurrent_cut_coordinate(i) -
7201 host_current_concurrent_cut_coordinate(i-1)) < this->sEpsilon) {
7202 cut_map[i] = cut_map[i-1];
7205 cut_map[i] = different_cut_count++;
7206 multiSVector tmp2MultiSVector;
7207 sort_vector_points_on_cut.push_back(tmp2MultiSVector);
7210 Kokkos::deep_copy(current_concurrent_cut_coordinate,
7211 host_current_concurrent_cut_coordinate);
7214 auto host_coordinate_permutations =
7215 Kokkos::create_mirror_view(coordinate_permutations);
7216 Kokkos::deep_copy(host_coordinate_permutations, coordinate_permutations);
7218 auto host_assigned_part_ids = Kokkos::create_mirror_view(assigned_part_ids);
7219 Kokkos::deep_copy(host_assigned_part_ids, assigned_part_ids);
7221 auto host_mj_coordinates = Kokkos::create_mirror_view(mj_coordinates);
7222 Kokkos::deep_copy(host_mj_coordinates, mj_coordinates);
7224 auto host_thread_point_counts = Kokkos::create_mirror_view(thread_point_counts);
7225 Kokkos::deep_copy(host_thread_point_counts, thread_point_counts);
7227 auto local_coord_dim = this->coord_dim;
7229 for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7230 mj_lno_t i = host_coordinate_permutations(ii);
7231 mj_part_t pp = host_assigned_part_ids(i);
7232 mj_part_t p = pp / 2;
7235 mj_scalar_t *vals =
new mj_scalar_t[local_coord_dim -1];
7236 allocated_memory.push_back(vals);
7241 if(longest_dim_part) {
7243 for(
int dim = local_coord_dim - 2; dim >= 0; --dim) {
7246 int next_largest_coord_dim = p_coord_dimension_range_sorted[dim].id;
7251 host_mj_coordinates(i,next_largest_coord_dim);
7255 for(
int dim = coordInd + 1; dim < local_coord_dim; ++dim) {
7256 vals[val_ind++] = host_mj_coordinates(i,dim);
7258 for(
int dim = 0; dim < coordInd; ++dim) {
7259 vals[val_ind++] = host_mj_coordinates(i,dim);
7263 multiSItem tempSortItem(i, local_coord_dim -1, vals);
7265 mj_part_t cmap = cut_map[p];
7266 sort_vector_points_on_cut[cmap].push_back(tempSortItem);
7270 ++host_thread_point_counts(p);
7271 host_assigned_part_ids(i) = p;
7276 for(mj_part_t i = 0; i < different_cut_count; ++i) {
7277 std::sort (sort_vector_points_on_cut[i].begin(),
7278 sort_vector_points_on_cut[i].end());
7281 mj_part_t previous_cut_map = cut_map[0];
7283 auto host_thread_cut_line_weight_to_put_left =
7284 Kokkos::create_mirror_view(thread_cut_line_weight_to_put_left);
7285 Kokkos::deep_copy(host_thread_cut_line_weight_to_put_left,
7286 thread_cut_line_weight_to_put_left);
7288 auto host_mj_weights = Kokkos::create_mirror_view(mj_weights);
7289 Kokkos::deep_copy(host_mj_weights, mj_weights);
7299 mj_scalar_t weight_stolen_from_previous_part = 0;
7300 for(mj_part_t p = 0; p < no_cuts; ++p) {
7301 mj_part_t mapped_cut = cut_map[p];
7305 if(previous_cut_map != mapped_cut) {
7306 mj_lno_t sort_vector_end = (mj_lno_t)
7307 sort_vector_points_on_cut[previous_cut_map].size() - 1;
7308 for(; sort_vector_end >= 0; --sort_vector_end) {
7310 sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7311 mj_lno_t i = t.index;
7312 ++host_thread_point_counts(p);
7313 host_assigned_part_ids(i) = p;
7315 sort_vector_points_on_cut[previous_cut_map].clear();
7319 mj_lno_t sort_vector_end = (mj_lno_t)
7320 sort_vector_points_on_cut[mapped_cut].size() - 1;
7326 for(; sort_vector_end >= 0; --sort_vector_end) {
7329 multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
7331 mj_lno_t i = t.index;
7332 mj_scalar_t w = this->mj_uniform_weights(0) ? 1 :
7333 this->mj_weights(i,0);
7335 if(host_thread_cut_line_weight_to_put_left(p) +
7336 weight_stolen_from_previous_part> this->sEpsilon &&
7337 host_thread_cut_line_weight_to_put_left(p) +
7338 weight_stolen_from_previous_part -
7339 std::abs(host_thread_cut_line_weight_to_put_left(p) +
7340 weight_stolen_from_previous_part - w)> this->sEpsilon)
7342 host_thread_cut_line_weight_to_put_left(p) -= w;
7344 sort_vector_points_on_cut[mapped_cut].pop_back();
7346 ++host_thread_point_counts(p);
7347 host_assigned_part_ids(i) = p;
7351 if(p < no_cuts - 1 &&
7352 host_thread_cut_line_weight_to_put_left(p) < this->sEpsilon) {
7353 if(mapped_cut == cut_map[p + 1] ) {
7356 if(previous_cut_map != mapped_cut) {
7357 weight_stolen_from_previous_part =
7358 host_thread_cut_line_weight_to_put_left(p);
7363 weight_stolen_from_previous_part +=
7364 host_thread_cut_line_weight_to_put_left(p);
7368 weight_stolen_from_previous_part =
7369 -host_thread_cut_line_weight_to_put_left(p);
7378 if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]) {
7379 if(previous_cut_map != mapped_cut) {
7380 weight_stolen_from_previous_part =
7381 host_thread_cut_line_weight_to_put_left(p);
7384 weight_stolen_from_previous_part +=
7385 host_thread_cut_line_weight_to_put_left(p);
7389 weight_stolen_from_previous_part =
7390 -host_thread_cut_line_weight_to_put_left(p);
7396 previous_cut_map = mapped_cut;
7401 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[
7402 previous_cut_map].size() - 1;
7408 for(; sort_vector_end >= 0; --sort_vector_end) {
7410 multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7413 mj_lno_t i = t.index;
7414 ++host_thread_point_counts(no_cuts);
7415 host_assigned_part_ids(i) = no_cuts;
7418 sort_vector_points_on_cut[previous_cut_map].clear();
7422 mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
7423 for(mj_lno_t i = 0; i < vSize; ++i) {
7424 delete [] allocated_memory[i];
7427 auto local_out_part_xadj = out_part_xadj;
7428 auto host_out_part_xadj = Kokkos::create_mirror_view(local_out_part_xadj);
7429 Kokkos::deep_copy(host_out_part_xadj, out_part_xadj);
7432 for(mj_part_t j = 0; j < num_parts; ++j) {
7433 host_out_part_xadj(j) = host_thread_point_counts(j);
7434 host_thread_point_counts(j) = 0;
7438 for(mj_part_t j = 1; j < num_parts; ++j) {
7439 host_out_part_xadj(j) += host_out_part_xadj(j - 1);
7444 for(mj_part_t j = 1; j < num_parts; ++j) {
7445 host_thread_point_counts(j) += host_out_part_xadj(j - 1);
7448 auto host_new_coordinate_permutations =
7449 Kokkos::create_mirror_view(new_coordinate_permutations);
7450 Kokkos::deep_copy(host_new_coordinate_permutations,
7451 new_coordinate_permutations);
7455 for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7456 mj_lno_t i = host_coordinate_permutations(ii);
7457 mj_part_t p = host_assigned_part_ids(i);
7458 host_new_coordinate_permutations(coordinate_begin +
7459 host_thread_point_counts(p)++) = i;
7462 Kokkos::deep_copy(thread_point_counts, host_thread_point_counts);
7463 Kokkos::deep_copy(new_coordinate_permutations,
7464 host_new_coordinate_permutations);
7465 Kokkos::deep_copy(local_out_part_xadj, host_out_part_xadj);
7477template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7478 typename mj_part_t,
typename mj_node_t>
7479void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7481 mj_part_t current_num_parts,
7482 mj_part_t output_part_begin_index,
7483 RCP<mj_partBoxVector_t> &output_part_boxes,
7484 bool is_data_ever_migrated)
7486 this->mj_env->timerStart(MACRO_TIMERS,
7487 mj_timer_base_string +
"Part_Assignment");
7489 auto local_part_xadj = part_xadj;
7490 auto local_mj_keep_part_boxes = mj_keep_part_boxes;
7491 auto local_coordinate_permutations = coordinate_permutations;
7492 auto local_assigned_part_ids = assigned_part_ids;
7494 if(local_mj_keep_part_boxes) {
7495 for(
int i = 0; i < current_num_parts; ++i) {
7496 (*output_part_boxes)[i].setpId(i + output_part_begin_index);
7500 Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy(
7501 current_num_parts, Kokkos::AUTO());
7502 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
7503 member_type member_type;
7504 Kokkos::parallel_for(policy, KOKKOS_LAMBDA(member_type team_member) {
7505 int i = team_member.league_rank();
7506 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, (i != 0) ?
7507 local_part_xadj(i-1) : 0, local_part_xadj(i)),
7509 mj_lno_t k = local_coordinate_permutations(ii);
7510 local_assigned_part_ids(k) = i + output_part_begin_index;
7514 if(is_data_ever_migrated) {
7515#ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
7516 if(
sizeof(mj_lno_t) <=
sizeof(int)) {
7523 ZOLTAN_COMM_OBJ *plan = NULL;
7524 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->mj_problemComm));
7527 int message_tag = 7856;
7529 this->mj_env->timerStart(MACRO_TIMERS,
7530 mj_timer_base_string +
"Final Z1PlanCreating");
7533 int ierr = Zoltan_Comm_Create( &plan,
int(this->num_local_coords),
7534 this->owner_of_coordinate.data(), mpi_comm, message_tag, &incoming);
7537 this->mj_env->timerStop(MACRO_TIMERS,
7538 mj_timer_base_string +
"Final Z1PlanCreating" );
7540 this->mj_env->timerStart(MACRO_TIMERS,
7541 mj_timer_base_string +
"Final Z1PlanComm");
7548 auto host_current_mj_gnos = Kokkos::create_mirror_view(
7549 Kokkos::HostSpace(), this->current_mj_gnos);
7550 deep_copy(host_current_mj_gnos, this->current_mj_gnos);
7551 Kokkos::View<mj_gno_t*, device_t> dst_gnos(
7552 Kokkos::ViewAllocateWithoutInitializing(
"dst_gnos"), incoming);
7553 auto host_dst_gnos = Kokkos::create_mirror_view(
7554 Kokkos::HostSpace(), dst_gnos);
7556 ierr = Zoltan_Comm_Do( plan, message_tag,
7557 (
char *) host_current_mj_gnos.data(),
7558 sizeof(mj_gno_t), (
char *) host_dst_gnos.data());
7560 Kokkos::deep_copy(dst_gnos, host_dst_gnos);
7561 this->current_mj_gnos = dst_gnos;
7564 auto host_src_part_ids = Kokkos::create_mirror_view(
7565 Kokkos::HostSpace(), this->assigned_part_ids);
7566 deep_copy(host_src_part_ids, this->assigned_part_ids);
7567 Kokkos::View<mj_part_t*, device_t> dst_part_ids(
7568 Kokkos::ViewAllocateWithoutInitializing(
"dst_part_ids"), incoming);
7569 auto host_dst_part_ids = Kokkos::create_mirror_view(
7570 Kokkos::HostSpace(), dst_part_ids);
7572 ierr = Zoltan_Comm_Do( plan, message_tag,
7573 (
char *) host_src_part_ids.data(),
7574 sizeof(mj_part_t), (
char *) host_dst_part_ids.data());
7576 Kokkos::deep_copy(dst_part_ids, host_dst_part_ids);
7577 this->assigned_part_ids = dst_part_ids;
7579 ierr = Zoltan_Comm_Destroy(&plan);
7582 this->num_local_coords = incoming;
7584 this->mj_env->timerStop(MACRO_TIMERS,
7585 mj_timer_base_string +
"Final Z1PlanComm");
7591 this->mj_env->timerStart(MACRO_TIMERS,
7592 mj_timer_base_string +
"Final DistributorPlanCreating");
7593 Tpetra::Distributor distributor(this->mj_problemComm);
7594 ArrayView<const mj_part_t> owners_of_coords(
7595 this->owner_of_coordinate.data(), this->num_local_coords);
7596 mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
7597 this->mj_env->timerStop(MACRO_TIMERS,
7598 mj_timer_base_string +
"Final DistributorPlanCreating" );
7600 this->mj_env->timerStart(MACRO_TIMERS,
7601 mj_timer_base_string +
"Final DistributorPlanComm");
7607 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
7608 Kokkos::ViewAllocateWithoutInitializing(
"sent_gnos"),
7609 this->current_mj_gnos.extent(0));
7610 Kokkos::deep_copy(sent_gnos, this->current_mj_gnos);
7612 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
7613 Kokkos::ViewAllocateWithoutInitializing(
"received_gnos"),
7616 distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
7618 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
7619 Kokkos::ViewAllocateWithoutInitializing(
"current_mj_gnos"), incoming);
7621 Kokkos::deep_copy(this->current_mj_gnos, received_gnos);
7624 Kokkos::View<mj_part_t *, Kokkos::HostSpace> sent_partids(
7625 Kokkos::ViewAllocateWithoutInitializing(
"sent_partids"),
7626 this->assigned_part_ids.extent(0));
7627 Kokkos::deep_copy(sent_partids, this->assigned_part_ids);
7629 Kokkos::View<mj_part_t *, Kokkos::HostSpace> received_partids(
7630 Kokkos::ViewAllocateWithoutInitializing(
"received_partids"),
7633 distributor.doPostsAndWaits(sent_partids, 1, received_partids);
7635 this->assigned_part_ids =
7636 Kokkos::View<mj_part_t*, device_t>(
7637 Kokkos::ViewAllocateWithoutInitializing(
"assigned_part_ids"),
7640 Kokkos::deep_copy(this->assigned_part_ids, received_partids);
7641 this->num_local_coords = incoming;
7643 this->mj_env->timerStop(MACRO_TIMERS,
7644 mj_timer_base_string +
"Final DistributorPlanComm");
7648 this->mj_env->timerStop(MACRO_TIMERS,
7649 mj_timer_base_string +
"Part_Assignment");
7651 this->mj_env->timerStart(MACRO_TIMERS,
7652 mj_timer_base_string +
"Solution_Part_Assignment");
7657 if(this->mj_keep_part_boxes) {
7658 this->kept_boxes = compute_global_box_boundaries(output_part_boxes);
7661 this->mj_env->timerStop(MACRO_TIMERS,
7662 mj_timer_base_string +
"Solution_Part_Assignment");
7677template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7678 typename mj_part_t,
typename mj_node_t>
7681 bool distribute_points_on_cut_lines_,
7682 int max_concurrent_part_calculation_,
7683 int check_migrate_avoid_migration_option_,
7684 double minimum_migration_imbalance_,
7685 int migration_type_)
7687 this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
7688 this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
7689 this->check_migrate_avoid_migration_option =
7690 check_migrate_avoid_migration_option_;
7691 this->minimum_migration_imbalance = minimum_migration_imbalance_;
7692 this->migration_type = migration_type_;
7722template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7723 typename mj_part_t,
typename mj_node_t>
7726 const RCP<const Environment> &env,
7727 RCP<
const Comm<int> > &problemComm,
7728 double imbalance_tolerance_,
7730 size_t num_global_parts_,
7731 Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array_,
7732 int recursion_depth_,
7734 mj_lno_t num_local_coords_,
7735 mj_gno_t num_global_coords_,
7736 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
7738 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
7739 int num_weights_per_coord_,
7740 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights_,
7741 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
7742 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts_,
7743 Kokkos::View<mj_part_t *, device_t> & result_assigned_part_ids_,
7744 Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos_)
7749 this->mj_timer_base_string =
"MJ(" + std::to_string(execute_counter) +
") - ";
7752 this->mj_problemComm = problemComm;
7753 this->myActualRank = this->myRank = this->mj_problemComm->getRank();
7754 this->mj_env->timerStart(MACRO_TIMERS,
7755 mj_timer_base_string +
"Total");
7756 this->mj_env->debug(3,
"In MultiJagged Jagged");
7757 this->imbalance_tolerance = imbalance_tolerance_;
7758 this->mj_num_teams = num_teams_;
7759 this->num_global_parts = num_global_parts_;
7760 this->part_no_array = part_no_array_;
7761 this->recursion_depth = recursion_depth_;
7762 this->coord_dim = coord_dim_;
7763 this->num_local_coords = num_local_coords_;
7764 this->num_global_coords = num_global_coords_;
7765 this->mj_coordinates = mj_coordinates_;
7766 this->initial_mj_gnos = initial_mj_gnos_;
7767 this->num_weights_per_coord = num_weights_per_coord_;
7768 this->mj_uniform_weights = mj_uniform_weights_;
7769 this->mj_weights = mj_weights_;
7770 this->mj_uniform_parts = mj_uniform_parts_;
7774 this->set_part_specifications();
7776 this->mj_env->timerStart(MACRO_TIMERS,
7777 mj_timer_base_string +
"Allocate Views");
7778 this->allocate_set_work_memory();
7779 this->mj_env->timerStop(MACRO_TIMERS,
7780 mj_timer_base_string +
"Allocate Views");
7784 this->comm = this->mj_problemComm->duplicate();
7787 if(comm->getRank() == 0) {
7788 std::cout <<
"size of gno:" <<
sizeof(mj_gno_t) << std::endl;
7789 std::cout <<
"size of lno:" <<
sizeof(mj_lno_t) << std::endl;
7790 std::cout <<
"size of mj_scalar_t:" <<
sizeof(mj_scalar_t) << std::endl;
7795 mj_part_t current_num_parts = 1;
7796 Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
7797 this->all_cut_coordinates;
7798 this->mj_env->timerStart(MACRO_TIMERS,
7799 mj_timer_base_string +
"Problem_Partitioning");
7800 mj_part_t output_part_begin_index = 0;
7801 mj_part_t future_num_parts = this->total_num_part;
7802 bool is_data_ever_migrated =
false;
7804 std::vector<mj_part_t> *future_num_part_in_parts =
7805 new std::vector<mj_part_t> ();
7806 std::vector<mj_part_t> *next_future_num_parts_in_parts =
7807 new std::vector<mj_part_t> ();
7809 next_future_num_parts_in_parts->push_back(this->num_global_parts);
7811 RCP<mj_partBoxVector_t> input_part_boxes;
7812 RCP<mj_partBoxVector_t> output_part_boxes;
7814 if(this->mj_keep_part_boxes) {
7815 input_part_boxes = RCP<mj_partBoxVector_t>(
new mj_partBoxVector_t(),
true);
7816 output_part_boxes = RCP<mj_partBoxVector_t>(
new mj_partBoxVector_t(),
true);
7817 compute_global_box();
7818 this->init_part_boxes(output_part_boxes);
7821 auto local_part_xadj = this->part_xadj;
7825 Kokkos::View<mj_part_t*, device_t>
7826 view_rectilinear_cut_count(
"view_rectilinear_cut_count", 1);
7827 Kokkos::View<size_t*, device_t>
7828 view_total_reduction_size(
"view_total_reduction_size", 1);
7830 for(
int i = 0; i < this->recursion_depth; ++i) {
7833 std::string istring = std::to_string(i);
7839 std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
7840 future_num_part_in_parts = next_future_num_parts_in_parts;
7841 next_future_num_parts_in_parts = tmpPartVect;
7845 next_future_num_parts_in_parts->clear();
7846 if(this->mj_keep_part_boxes) {
7847 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7848 input_part_boxes = output_part_boxes;
7849 output_part_boxes = tmpPartBoxes;
7850 output_part_boxes->clear();
7854 mj_part_t output_part_count_in_dimension =
7855 this->update_part_num_arrays(
7856 future_num_part_in_parts,
7857 next_future_num_parts_in_parts,
7862 output_part_boxes, 1);
7867 if(output_part_count_in_dimension == current_num_parts) {
7869 tmpPartVect= future_num_part_in_parts;
7870 future_num_part_in_parts = next_future_num_parts_in_parts;
7871 next_future_num_parts_in_parts = tmpPartVect;
7873 if(this->mj_keep_part_boxes) {
7874 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7875 input_part_boxes = output_part_boxes;
7876 output_part_boxes = tmpPartBoxes;
7882 int coordInd = i % this->coord_dim;
7884 Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
7885 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
7887 this->mj_env->timerStart(MACRO_TIMERS,
7888 mj_timer_base_string +
"Problem_Partitioning_" + istring);
7892 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
7893 "new part xadj", output_part_count_in_dimension);
7896 mj_part_t output_part_index = 0;
7900 mj_part_t output_coordinate_end_index = 0;
7902 mj_part_t current_work_part = 0;
7903 mj_part_t current_concurrent_num_parts =
7904 std::min(current_num_parts - current_work_part,
7905 this->max_concurrent_part_calculation);
7907 mj_part_t obtained_part_index = 0;
7909 auto host_process_local_min_max_coord_total_weight =
7910 Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
7911 auto host_global_min_max_coord_total_weight =
7912 Kokkos::create_mirror_view(global_min_max_coord_total_weight);
7915 for(; current_work_part < current_num_parts;
7916 current_work_part += current_concurrent_num_parts) {
7918 current_concurrent_num_parts =
7919 std::min(current_num_parts - current_work_part,
7920 this->max_concurrent_part_calculation);
7923 auto local_device_num_partitioning_in_current_dim =
7924 device_num_partitioning_in_current_dim;
7925 Kokkos::parallel_reduce(
"Read bDoingWork",
7926 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
7927 KOKKOS_LAMBDA(
int dummy,
int & set_single) {
7929 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
7930 if(local_device_num_partitioning_in_current_dim(
7931 current_work_part + kk) != 1) {
7937 bool bDoingWork = (bDoingWork_int != 0) ?
true :
false;
7939 this->mj_get_local_min_max_coord_totW(
7941 current_concurrent_num_parts,
7942 mj_current_dim_coords);
7947 this->mj_get_global_min_max_coord_totW(
7948 current_concurrent_num_parts,
7949 this->process_local_min_max_coord_total_weight,
7950 this->global_min_max_coord_total_weight);
7954 mj_part_t total_incomplete_cut_count = 0;
7959 mj_part_t concurrent_part_cut_shift = 0;
7960 mj_part_t concurrent_part_part_shift = 0;
7962 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
7964 Kokkos::deep_copy(host_global_min_max_coord_total_weight,
7965 global_min_max_coord_total_weight);
7967 mj_scalar_t min_coordinate =
7968 host_global_min_max_coord_total_weight(kk);
7969 mj_scalar_t max_coordinate =
7970 host_global_min_max_coord_total_weight(
7971 kk + current_concurrent_num_parts);
7973 mj_scalar_t global_total_weight =
7974 host_global_min_max_coord_total_weight(
7975 kk + 2 * current_concurrent_num_parts);
7977 mj_part_t concurrent_current_part_index = current_work_part + kk;
7979 mj_part_t partition_count = host_num_partitioning_in_current_dim(
7980 concurrent_current_part_index);
7982 Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
7983 Kokkos::subview(current_cut_coordinates,
7984 std::pair<mj_lno_t, mj_lno_t>(
7985 concurrent_part_cut_shift, current_cut_coordinates.size()));
7986 Kokkos::View<mj_scalar_t *, device_t>
7987 current_target_part_weights =
7988 Kokkos::subview(target_part_weights,
7989 std::pair<mj_lno_t, mj_lno_t>(
7990 concurrent_part_part_shift, target_part_weights.size()));
7993 concurrent_part_cut_shift += partition_count - 1;
7995 concurrent_part_part_shift += partition_count;
7999 if(partition_count > 1 && min_coordinate <= max_coordinate) {
8003 total_incomplete_cut_count += partition_count - 1;
8005 this->incomplete_cut_count(kk) = partition_count - 1;
8008 this->mj_get_initial_cut_coords_target_weights(
8011 partition_count - 1,
8012 global_total_weight,
8014 current_target_part_weights,
8015 future_num_part_in_parts,
8016 next_future_num_parts_in_parts,
8017 concurrent_current_part_index,
8018 obtained_part_index);
8020 mj_lno_t coordinate_end_index =
8021 host_part_xadj(concurrent_current_part_index);
8022 mj_lno_t coordinate_begin_index =
8023 concurrent_current_part_index==0 ? 0 :
8024 host_part_xadj(concurrent_current_part_index - 1);
8026 this->set_initial_coordinate_parts(
8029 coordinate_begin_index, coordinate_end_index,
8030 this->coordinate_permutations,
8031 mj_current_dim_coords,
8032 this->assigned_part_ids,
8038 this->incomplete_cut_count(kk) = 0;
8041 obtained_part_index += partition_count;
8046 double used_imbalance = 0;
8048 this->mj_env->timerStart(MACRO_TIMERS,
8049 mj_timer_base_string +
"Problem_Partitioning Get Part Weights");
8052 mj_current_dim_coords,
8055 current_concurrent_num_parts,
8056 current_cut_coordinates,
8057 total_incomplete_cut_count,
8058 view_rectilinear_cut_count,
8059 view_total_reduction_size);
8061 this->mj_env->timerStop(MACRO_TIMERS,
8062 mj_timer_base_string +
"Problem_Partitioning Get Part Weights");
8067 mj_part_t output_array_shift = 0;
8068 mj_part_t cut_shift = 0;
8069 size_t tlr_shift = 0;
8070 size_t partweight_array_shift = 0;
8071 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
8073 mj_part_t current_concurrent_work_part = current_work_part + kk;
8075 mj_part_t num_parts = host_num_partitioning_in_current_dim(
8076 current_concurrent_work_part);
8079 int coordinateA_bigger_than_coordinateB =
8080 host_global_min_max_coord_total_weight(kk) >
8081 host_global_min_max_coord_total_weight(
8082 kk + current_concurrent_num_parts);
8084 if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
8087 auto local_new_part_xadj = this->new_part_xadj;
8088 Kokkos::parallel_for(
8089 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8090 (0, num_parts), KOKKOS_LAMBDA (mj_part_t jj) {
8091 local_new_part_xadj(
8092 output_part_index + output_array_shift + jj) = 0;
8095 cut_shift += num_parts - 1;
8096 tlr_shift += (4 *(num_parts - 1) + 1);
8097 output_array_shift += num_parts;
8098 partweight_array_shift += (2 * (num_parts - 1) + 1);
8102 Kokkos::View<mj_scalar_t *, device_t>
8103 current_concurrent_cut_coordinate =
8104 Kokkos::subview(current_cut_coordinates,
8105 std::pair<mj_lno_t, mj_lno_t>(
8107 current_cut_coordinates.size()));
8108 Kokkos::View<mj_scalar_t *, device_t>
8109 used_local_cut_line_weight_to_left =
8110 Kokkos::subview(process_cut_line_weight_to_put_left,
8111 std::pair<mj_lno_t, mj_lno_t>(
8113 process_cut_line_weight_to_put_left.size()));
8115 this->thread_part_weight_work =
8117 this->thread_part_weights,
8118 std::pair<mj_lno_t, mj_lno_t>(
8119 partweight_array_shift,
8120 this->thread_part_weights.extent(0)));
8123 if(this->mj_keep_part_boxes) {
8125 for(mj_part_t j = 0; j < num_parts - 1; ++j) {
8126 mj_scalar_t temp_get_val;
8127 Kokkos::parallel_reduce(
"Read single",
8128 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8129 KOKKOS_LAMBDA(
int dummy, mj_scalar_t & set_single) {
8130 set_single = current_concurrent_cut_coordinate(j);
8132 (*output_part_boxes)
8133 [output_array_shift + output_part_index + j].
8134 updateMinMax(temp_get_val, 1 , coordInd);
8135 (*output_part_boxes)
8136 [output_array_shift + output_part_index + j + 1].
8137 updateMinMax(temp_get_val, 0 , coordInd);
8142 Kokkos::View<mj_lno_t*, device_t> sub_new_part_xadj =
8143 Kokkos::subview(this->new_part_xadj,
8144 std::pair<mj_lno_t, mj_lno_t>(
8145 output_part_index + output_array_shift,
8146 this->new_part_xadj.size()));
8148 this->mj_create_new_partitions(
8150 current_concurrent_work_part,
8151 mj_current_dim_coords,
8152 current_concurrent_cut_coordinate,
8153 used_local_cut_line_weight_to_left,
8158 mj_lno_t coordinate_end = host_part_xadj(
8159 current_concurrent_work_part);
8160 mj_lno_t coordinate_begin =
8161 current_concurrent_work_part==0 ? 0 : host_part_xadj(
8162 current_concurrent_work_part - 1);
8166 mj_lno_t part_size = coordinate_end - coordinate_begin;
8170 auto local_new_part_xadj = this->new_part_xadj;
8171 Kokkos::parallel_for(
8172 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
8173 (0, 1), KOKKOS_LAMBDA (
int dummy) {
8174 local_new_part_xadj(
8175 output_part_index + output_array_shift) = part_size;
8178 auto subview_new_coordinate_permutations =
8179 Kokkos::subview(this->new_coordinate_permutations,
8180 std::pair<mj_lno_t, mj_lno_t>(
8182 coordinate_begin + part_size));
8183 auto subview_coordinate_permutations =
8184 Kokkos::subview(this->coordinate_permutations,
8185 std::pair<mj_lno_t, mj_lno_t>(
8187 coordinate_begin + part_size));
8188 Kokkos::deep_copy(subview_new_coordinate_permutations,
8189 subview_coordinate_permutations);
8191 cut_shift += num_parts - 1;
8192 output_array_shift += num_parts;
8193 partweight_array_shift += (2 * (num_parts - 1) + 1);
8202 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
8203 mj_part_t num_parts =
8204 host_num_partitioning_in_current_dim(current_work_part + kk);
8208 auto local_new_part_xadj = this->new_part_xadj;
8209 Kokkos::parallel_for(
8210 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8211 (0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
8212 local_new_part_xadj(output_part_index+ii) +=
8213 output_coordinate_end_index;
8218 Kokkos::parallel_reduce(
"Read single",
8219 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8220 KOKKOS_LAMBDA(
int dummy, mj_part_t & set_single) {
8222 local_new_part_xadj(output_part_index + num_parts - 1);
8224 output_coordinate_end_index = temp_get;
8226 output_part_index += num_parts;
8232 int current_world_size = this->comm->getSize();
8233 long migration_reduce_all_population =
8234 this->total_dim_num_reduce_all * current_world_size;
8235 bool is_migrated_in_current_dimension =
false;
8240 if(future_num_parts > 1 &&
8241 this->check_migrate_avoid_migration_option >= 0 &&
8242 current_world_size > 1) {
8243 this->mj_env->timerStart(MACRO_TIMERS,
8244 mj_timer_base_string +
"Problem_Migration-" + istring);
8245 mj_part_t num_parts = output_part_count_in_dimension;
8247 if(this->mj_perform_migration(
8250 next_future_num_parts_in_parts,
8251 output_part_begin_index,
8252 migration_reduce_all_population,
8253 this->num_global_coords / (future_num_parts * current_num_parts),
8255 input_part_boxes, output_part_boxes) )
8257 is_migrated_in_current_dimension =
true;
8258 is_data_ever_migrated =
true;
8259 this->mj_env->timerStop(MACRO_TIMERS,
8260 mj_timer_base_string +
"Problem_Migration-" + istring);
8263 this->total_dim_num_reduce_all /= num_parts;
8266 is_migrated_in_current_dimension =
false;
8267 this->mj_env->timerStop(MACRO_TIMERS,
8268 mj_timer_base_string +
"Problem_Migration-" + istring);
8273 Kokkos::View<mj_lno_t*, device_t> tmp =
8274 this->coordinate_permutations;
8275 this->coordinate_permutations =
8276 this->new_coordinate_permutations;
8278 this->new_coordinate_permutations = tmp;
8279 if(!is_migrated_in_current_dimension) {
8280 this->total_dim_num_reduce_all -= current_num_parts;
8281 current_num_parts = output_part_count_in_dimension;
8285 this->part_xadj = this->new_part_xadj;
8286 local_part_xadj = this->new_part_xadj;
8287 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
8288 Kokkos::deep_copy(host_part_xadj, part_xadj);
8290 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
"empty", 0);
8291 this->mj_env->timerStop(MACRO_TIMERS,
8292 mj_timer_base_string +
"Problem_Partitioning_" + istring);
8297 delete future_num_part_in_parts;
8298 delete next_future_num_parts_in_parts;
8299 this->mj_env->timerStop(MACRO_TIMERS,
8300 mj_timer_base_string +
"Problem_Partitioning");
8306 this->set_final_parts(
8308 output_part_begin_index,
8310 is_data_ever_migrated);
8312 result_assigned_part_ids_ = this->assigned_part_ids;
8313 result_mj_gnos_ = this->current_mj_gnos;
8314 this->mj_env->timerStop(MACRO_TIMERS,
8315 mj_timer_base_string +
"Total");
8316 this->mj_env->debug(3,
"Out of MultiJagged");
8319template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
8320 typename mj_part_t,
typename mj_node_t>
8321RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8326 if(this->mj_keep_part_boxes) {
8327 return this->kept_boxes;
8330 throw std::logic_error(
"Error: part boxes are not stored.");
8334template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
8335 typename mj_part_t,
typename mj_node_t>
8336RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8342 mj_part_t ntasks = this->num_global_parts;
8343 int dim = (*localPartBoxes)[0].getDim();
8344 coord_t *localPartBoundaries =
new coord_t[ntasks * 2 *dim];
8346 memset(localPartBoundaries, 0,
sizeof(coord_t) * ntasks * 2 *dim);
8348 coord_t *globalPartBoundaries =
new coord_t[ntasks * 2 *dim];
8349 memset(globalPartBoundaries, 0,
sizeof(coord_t) * ntasks * 2 *dim);
8351 coord_t *localPartMins = localPartBoundaries;
8352 coord_t *localPartMaxs = localPartBoundaries + ntasks * dim;
8354 coord_t *globalPartMins = globalPartBoundaries;
8355 coord_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
8357 mj_part_t boxCount = localPartBoxes->size();
8358 for(mj_part_t i = 0; i < boxCount; ++i) {
8359 mj_part_t pId = (*localPartBoxes)[i].getpId();
8363 coord_t *lmins = (*localPartBoxes)[i].getlmins();
8364 coord_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
8366 for(
int j = 0; j < dim; ++j) {
8367 localPartMins[dim * pId + j] = lmins[j];
8368 localPartMaxs[dim * pId + j] = lmaxs[j];
8381 reduceAll<int, coord_t>(*mj_problemComm, reductionOp,
8382 ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries);
8384 RCP<mj_partBoxVector_t> pB(
new mj_partBoxVector_t(),
true);
8385 for(mj_part_t i = 0; i < ntasks; ++i) {
8387 globalPartMins + dim * i,
8388 globalPartMaxs + dim * i);
8401 delete []localPartBoundaries;
8402 delete []globalPartBoundaries;
8409template <
typename Adapter>
8415#ifndef DOXYGEN_SHOULD_SKIP_THIS
8419 typedef typename Adapter::scalar_t adapter_scalar_t;
8422 typedef float default_mj_scalar_t;
8428 (std::is_same<adapter_scalar_t, float>::value ||
8429 std::is_same<adapter_scalar_t, double>::value),
8430 adapter_scalar_t, default_mj_scalar_t>::type mj_scalar_t;
8432 typedef typename Adapter::gno_t mj_gno_t;
8433 typedef typename Adapter::lno_t mj_lno_t;
8434 typedef typename Adapter::part_t mj_part_t;
8435 typedef typename Adapter::node_t mj_node_t;
8437 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
8438 typedef typename mj_node_t::device_type device_t;
8441 AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t> mj_partitioner;
8443 RCP<const Environment> mj_env;
8444 RCP<const Comm<int> > mj_problemComm;
8445 RCP<const typename Adapter::base_adapter_t> mj_adapter;
8448 double imbalance_tolerance;
8452 size_t num_global_parts;
8455 Kokkos::View<mj_part_t*, Kokkos::HostSpace> part_no_array;
8458 int recursion_depth;
8461 mj_lno_t num_local_coords;
8462 mj_gno_t num_global_coords;
8465 Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
8469 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8472 int num_weights_per_coord;
8475 Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_weights;
8478 Kokkos::View<mj_scalar_t**, device_t> mj_weights;
8481 Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_parts;
8491 mj_part_t num_first_level_parts;
8495 Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
8499 bool distribute_points_on_cut_lines;
8502 mj_part_t max_concurrent_part_calculation;
8505 int check_migrate_avoid_migration_option;
8513 double minimum_migration_imbalance;
8514 bool mj_keep_part_boxes;
8518 int mj_premigration_option;
8519 int min_coord_per_rank_for_premigration;
8522 ArrayRCP<mj_part_t> comXAdj_;
8525 ArrayRCP<mj_part_t> comAdj_;
8528 const RCP<PartitioningSolution<Adapter> >&solution);
8530 void set_input_parameters(
const Teuchos::ParameterList &p);
8532 RCP<mj_partBoxVector_t> getGlobalBoxBoundaries()
const;
8534 bool mj_premigrate_to_subset(
8536 int migration_selection_option,
8537 RCP<const Environment> mj_env_,
8538 RCP<
const Comm<int> > mj_problemComm_,
8540 mj_lno_t num_local_coords_,
8541 mj_gno_t num_global_coords_,
size_t num_global_parts_,
8542 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8544 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8546 int num_weights_per_coord_,
8547 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8549 RCP<
const Comm<int> > &result_problemComm_,
8550 mj_lno_t & result_num_local_coords_,
8551 Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8553 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8554 result_mj_coordinates_,
8555 Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8556 int * &result_actual_owner_rank_);
8561 RCP<
const Comm<int> > &problemComm,
8562 const RCP<const typename Adapter::base_adapter_t> &adapter) :
8565 mj_problemComm(problemComm),
8566 mj_adapter(adapter),
8567 imbalance_tolerance(0),
8569 num_global_parts(1),
8572 num_local_coords(0),
8573 num_global_coords(0),
8574 num_weights_per_coord(0),
8575 num_first_level_parts(1),
8576 distribute_points_on_cut_lines(true),
8577 max_concurrent_part_calculation(1),
8578 check_migrate_avoid_migration_option(0),
8580 minimum_migration_imbalance(0.30),
8581 mj_keep_part_boxes(false),
8582 mj_run_as_rcb(false),
8583 mj_premigration_option(0),
8584 min_coord_per_rank_for_premigration(32000),
8598 const bool bUnsorted =
true;
8599 RCP<Zoltan2::IntegerRangeListValidator<int>> mj_parts_Validator =
8601 pl.set(
"mj_parts",
"0",
"list of parts for multiJagged partitioning "
8602 "algorithm. As many as the dimension count.", mj_parts_Validator);
8604 pl.set(
"mj_concurrent_part_count", 1,
"The number of parts whose cut "
8605 "coordinates will be calculated concurently.",
8608 pl.set(
"mj_minimum_migration_imbalance", 1.1,
8609 "mj_minimum_migration_imbalance, the minimum imbalance of the "
8610 "processors to avoid migration",
8613 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_option_validator =
8614 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 2) );
8615 pl.set(
"mj_migration_option", 1,
"Migration option, 0 for decision "
8616 "depending on the imbalance, 1 for forcing migration, 2 for "
8617 "avoiding migration", mj_migration_option_validator);
8619 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_type_validator =
8620 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 1) );
8621 pl.set(
"mj_migration_type", 0,
8622 "Migration type, 0 for migration to minimize the imbalance "
8623 "1 for migration to minimize messages exchanged the migration.",
8624 mj_migration_option_validator);
8627 pl.set(
"mj_keep_part_boxes",
false,
"Keep the part boundaries of the "
8631 pl.set(
"mj_enable_rcb",
false,
"Use MJ as RCB.",
8634 pl.set(
"mj_recursion_depth", -1,
"Recursion depth for MJ: Must be "
8637 RCP<Teuchos::EnhancedNumberValidator<int>>
8638 mj_num_teams_validator =
8639 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(
8640 0, Teuchos::EnhancedNumberTraits<int>::max()) );
8641 pl.set(
"mj_num_teams", 0,
8642 "How many teams for the main kernel loop"
8643 , mj_num_teams_validator);
8645 RCP<Teuchos::EnhancedNumberValidator<int>>
8646 mj_premigration_option_validator =
8647 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 1024) );
8649 pl.set(
"mj_premigration_option", 0,
8650 "Whether to do premigration or not. 0 for no migration "
8651 "x > 0 for migration to consecutive processors, "
8652 "the subset will be 0,x,2x,3x,...subset ranks."
8653 , mj_premigration_option_validator);
8655 pl.set(
"mj_premigration_coordinate_count", 32000,
"How many coordinate to "
8656 "assign each rank in multijagged after premigration"
8665 void partition(
const RCP<PartitioningSolution<Adapter> > &solution);
8669 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
8673 mj_part_t
pointAssign(
int dim, adapter_scalar_t *point)
const;
8675 void boxAssign(
int dim, adapter_scalar_t *lower, adapter_scalar_t *upper,
8676 size_t &nPartsFound, mj_part_t **partsFound)
const;
8681 const PartitioningSolution<Adapter> *solution,
8682 ArrayRCP<mj_part_t> &comXAdj,
8683 ArrayRCP<mj_part_t> &comAdj);
8686 const RCP<PartitioningSolution<Adapter> >&solution);
8689 std::string timer_base_string;
8697 template<
class dst_t,
class src_t>
8698 typename std::enable_if<std::is_same<
typename dst_t::value_type,
8699 typename src_t::value_type>::value>::type
8700 assign_if_same(dst_t & dst,
const src_t & src) {
8703 template<
class dst_t,
class src_t>
8704 typename std::enable_if<!std::is_same<
typename dst_t::value_type,
8705 typename src_t::value_type>::value>::type
8706 assign_if_same(dst_t & dst,
const src_t & src) {
8711template <
typename Adapter>
8712bool Zoltan2_AlgMJ<Adapter>::mj_premigrate_to_subset(
8714 int migration_selection_option,
8715 RCP<const Environment> mj_env_,
8716 RCP<
const Comm<int> > mj_problemComm_,
8718 mj_lno_t num_local_coords_,
8719 mj_gno_t num_global_coords_,
size_t num_global_parts_,
8720 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8722 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
8723 int num_weights_per_coord_,
8724 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8726 RCP<
const Comm<int> > & result_problemComm_,
8727 mj_lno_t &result_num_local_coords_,
8728 Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8730 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8731 result_mj_coordinates_,
8732 Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8733 int * &result_actual_owner_rank_)
8735 mj_env_->timerStart(MACRO_TIMERS,
8736 timer_base_string +
"PreMigration DistributorPlanCreating");
8738 int myRank = mj_problemComm_->getRank();
8739 int worldSize = mj_problemComm_->getSize();
8741 mj_part_t groupsize = worldSize / used_num_ranks;
8743 std::vector<mj_part_t> group_begins(used_num_ranks + 1, 0);
8745 mj_part_t i_am_sending_to = 0;
8746 bool am_i_a_receiver =
false;
8748 for(
int i = 0; i < used_num_ranks; ++i) {
8749 group_begins[i+ 1] = group_begins[i] + groupsize;
8750 if(worldSize % used_num_ranks > i) group_begins[i+ 1] += 1;
8751 if(i == used_num_ranks) group_begins[i+ 1] = worldSize;
8752 if(myRank >= group_begins[i] && myRank < group_begins[i + 1]) {
8753 i_am_sending_to = group_begins[i];
8755 if(myRank == group_begins[i]) {
8756 am_i_a_receiver =
true;
8760 ArrayView<const mj_part_t> idView(&(group_begins[0]), used_num_ranks );
8761 result_problemComm_ = mj_problemComm_->createSubcommunicator(idView);
8763 Tpetra::Distributor distributor(mj_problemComm_);
8765 std::vector<mj_part_t>
8766 coordinate_destinations(num_local_coords_, i_am_sending_to);
8768 ArrayView<const mj_part_t>
8769 destinations(&(coordinate_destinations[0]), num_local_coords_);
8770 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
8771 result_num_local_coords_ = num_incoming_gnos;
8772 mj_env_->timerStop(MACRO_TIMERS,
8773 timer_base_string +
"PreMigration DistributorPlanCreating");
8775 mj_env_->timerStart(MACRO_TIMERS,
8776 timer_base_string +
"PreMigration DistributorMigration");
8784 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
8785 Kokkos::ViewAllocateWithoutInitializing(
"sent_gnos"),
8786 initial_mj_gnos_.size());
8787 Kokkos::deep_copy(sent_gnos, initial_mj_gnos_);
8789 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos (
8790 Kokkos::ViewAllocateWithoutInitializing(
"received_gnos"),
8793 distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
8795 result_initial_mj_gnos_ = Kokkos::View<mj_gno_t*, device_t>(
8796 Kokkos::ViewAllocateWithoutInitializing(
"result_initial_mj_gnos_"),
8798 Kokkos::deep_copy(result_initial_mj_gnos_, received_gnos);
8804 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>
8805 host_src_coordinates(
8806 Kokkos::ViewAllocateWithoutInitializing(
"mj_coordinates"),
8807 this->mj_coordinates.extent(0), this->mj_coordinates.extent(1));
8809 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
8811 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> dst_coordinates(
8812 Kokkos::ViewAllocateWithoutInitializing(
"mj_coordinates"),
8813 num_incoming_gnos, this->coord_dim);
8815 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_coord(
8816 Kokkos::ViewAllocateWithoutInitializing(
"received_coord"),
8819 for(
int i = 0; i < this->coord_dim; ++i) {
8821 auto sent_coord = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
8823 distributor.doPostsAndWaits(sent_coord, 1, received_coord);
8825 Kokkos::deep_copy(Kokkos::subview(dst_coordinates, Kokkos::ALL, i),
8829 result_mj_coordinates_ = dst_coordinates;
8833 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
8834 Kokkos::ViewAllocateWithoutInitializing(
"mj_weights"),
8835 num_incoming_gnos, this->num_weights_per_coord);
8836 auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
8838 auto host_src_weights = Kokkos::create_mirror_view_and_copy(
8839 Kokkos::HostSpace(), this->mj_weights);
8842 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_weight(
8843 Kokkos::ViewAllocateWithoutInitializing(
"send_weight_buffer"),
8844 this->num_local_coords);
8846 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_weight(
8847 Kokkos::ViewAllocateWithoutInitializing(
"received_weight_buffer"),
8850 for(
int i = 0; i < this->num_weights_per_coord; ++i) {
8852 auto sub_host_src_weights
8853 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
8854 auto sub_host_dst_weights
8855 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
8863 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
8864 sent_weight[n] = sub_host_src_weights(n);
8867 distributor.doPostsAndWaits(sent_weight, 1, received_weight);
8870 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
8871 sub_host_dst_weights(n) = received_weight[n];
8874 Kokkos::deep_copy(dst_weights, host_dst_weights);
8875 result_mj_weights_ = dst_weights;
8879 Kokkos::View<int*, Kokkos::HostSpace> sent_owners(
8880 Kokkos::ViewAllocateWithoutInitializing(
"sent_owners"),
8882 Kokkos::deep_copy(sent_owners, myRank);
8884 Kokkos::View<int*, Kokkos::HostSpace> received_owners(
8885 Kokkos::ViewAllocateWithoutInitializing(
"received_owners"),
8888 distributor.doPostsAndWaits(sent_owners, 1, received_owners);
8890 result_actual_owner_rank_ =
new int[num_incoming_gnos];
8892 result_actual_owner_rank_,
8893 received_owners.data(),
8894 num_incoming_gnos *
sizeof(
int));
8897 mj_env_->timerStop(MACRO_TIMERS,
8898 timer_base_string +
"PreMigration DistributorMigration");
8899 return am_i_a_receiver;
8909template <
typename Adapter>
8911 const RCP<PartitioningSolution<Adapter> > &solution)
8918 int execute_counter =
8920 timer_base_string =
"partition(" + std::to_string(execute_counter) +
") - ";
8922 this->mj_env->timerStart(MACRO_TIMERS, timer_base_string +
"all");
8924 this->mj_env->timerStart(MACRO_TIMERS, timer_base_string +
"setup");
8926 this->set_up_partitioning_data(solution);
8928 this->set_input_parameters(this->mj_env->getParameters());
8929 if(this->mj_keep_part_boxes) {
8930 this->mj_partitioner.set_to_keep_part_boxes();
8933 this->mj_partitioner.set_partitioning_parameters(
8934 this->distribute_points_on_cut_lines,
8935 this->max_concurrent_part_calculation,
8936 this->check_migrate_avoid_migration_option,
8937 this->minimum_migration_imbalance, this->migration_type);
8939 RCP<const Comm<int> > result_problemComm = this->mj_problemComm;
8940 mj_lno_t result_num_local_coords = this->num_local_coords;
8941 Kokkos::View<mj_gno_t*, device_t> result_initial_mj_gnos;
8943 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8944 result_mj_coordinates = this->mj_coordinates;
8945 Kokkos::View<mj_scalar_t**, device_t> result_mj_weights =
8947 int *result_actual_owner_rank = NULL;
8949 Kokkos::View<const mj_gno_t*, device_t> result_initial_mj_gnos_ =
8950 this->initial_mj_gnos;
8968 int current_world_size = this->mj_problemComm->getSize();
8969 mj_lno_t threshold_num_local_coords =
8970 this->min_coord_per_rank_for_premigration;
8971 bool is_pre_migrated =
false;
8972 bool am_i_in_subset =
true;
8978 if(mj_premigration_option > 0 &&
8979 size_t (current_world_size) > this->num_global_parts &&
8980 this->num_global_coords < mj_gno_t (
8981 current_world_size * threshold_num_local_coords))
8983 if(this->mj_keep_part_boxes) {
8984 throw std::logic_error(
"Multijagged: mj_keep_part_boxes and "
8985 "mj_premigration_option are not supported together yet.");
8988 is_pre_migrated =
true;
8989 int migration_selection_option = mj_premigration_option;
8990 if(migration_selection_option * this->num_global_parts >
8991 (
size_t) (current_world_size)) {
8992 migration_selection_option =
8993 current_world_size / this->num_global_parts;
8996 int used_num_ranks = int (this->num_global_coords /
8997 float (threshold_num_local_coords) + 0.5);
8999 if(used_num_ranks == 0) {
9003 am_i_in_subset = this->mj_premigrate_to_subset(
9005 migration_selection_option,
9007 this->mj_problemComm,
9009 this->num_local_coords,
9010 this->num_global_coords,
9011 this->num_global_parts,
9012 this->initial_mj_gnos,
9013 this->mj_coordinates,
9014 this->num_weights_per_coord,
9018 result_num_local_coords,
9019 result_initial_mj_gnos,
9020 result_mj_coordinates,
9022 result_actual_owner_rank);
9024 result_initial_mj_gnos_ = result_initial_mj_gnos;
9027 Kokkos::View<mj_part_t *, device_t> result_assigned_part_ids;
9028 Kokkos::View<mj_gno_t*, device_t> result_mj_gnos;
9030 this->mj_env->timerStop(MACRO_TIMERS, timer_base_string +
"setup");
9032 if(am_i_in_subset) {
9033 this->mj_partitioner.multi_jagged_part(
9036 this->imbalance_tolerance,
9038 this->num_global_parts,
9039 this->part_no_array,
9040 this->recursion_depth,
9042 result_num_local_coords,
9043 this->num_global_coords,
9044 result_initial_mj_gnos_,
9045 result_mj_coordinates,
9046 this->num_weights_per_coord,
9047 this->mj_uniform_weights,
9049 this->mj_uniform_parts,
9050 result_assigned_part_ids,
9055 this->mj_env->timerStart(MACRO_TIMERS, timer_base_string +
"cleanup");
9058 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
9059 localGidToLid.reserve(result_num_local_coords);
9060 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> host_result_initial_mj_gnos(
9061 Kokkos::ViewAllocateWithoutInitializing(
"host_result_initial_mj_gnos"),
9062 result_initial_mj_gnos_.size());
9063 Kokkos::deep_copy(host_result_initial_mj_gnos, result_initial_mj_gnos_);
9064 for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9065 localGidToLid[host_result_initial_mj_gnos(i)] = i;
9068 ArrayRCP<mj_part_t> partId = arcp(
new mj_part_t[result_num_local_coords],
9069 0, result_num_local_coords,
true);
9070 auto host_result_assigned_part_ids =
9071 Kokkos::create_mirror_view(result_assigned_part_ids);
9072 Kokkos::deep_copy(host_result_assigned_part_ids, result_assigned_part_ids);
9073 auto host_result_mj_gnos = Kokkos::create_mirror_view(result_mj_gnos);
9074 Kokkos::deep_copy(host_result_mj_gnos, result_mj_gnos);
9075 for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9076 mj_lno_t origLID = localGidToLid[host_result_mj_gnos(i)];
9077 partId[origLID] = host_result_assigned_part_ids(i);
9082 if(is_pre_migrated) {
9083 this->mj_env->timerStart(MACRO_TIMERS, timer_base_string +
9084 "PostMigration DistributorPlanCreating");
9085 Tpetra::Distributor distributor(this->mj_problemComm);
9087 ArrayView<const mj_part_t> actual_owner_destinations(
9088 result_actual_owner_rank , result_num_local_coords);
9090 mj_lno_t num_incoming_gnos = distributor.createFromSends(
9091 actual_owner_destinations);
9093 if(num_incoming_gnos != this->num_local_coords) {
9094 throw std::logic_error(
"Zoltan2 - Multijagged Post Migration - "
9095 "num incoming is not equal to num local coords");
9098 mj_env->timerStop(MACRO_TIMERS, timer_base_string +
9099 "PostMigration DistributorPlanCreating");
9100 mj_env->timerStart(MACRO_TIMERS, timer_base_string +
9101 "PostMigration DistributorMigration");
9103 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
9104 Kokkos::ViewAllocateWithoutInitializing(
"received_gnos"),
9106 Kokkos::View<mj_part_t*, Kokkos::HostSpace> received_partids(
9107 Kokkos::ViewAllocateWithoutInitializing(
"received_partids"),
9110 distributor.doPostsAndWaits(host_result_initial_mj_gnos, 1,
9113 Kokkos::View<mj_part_t*, Kokkos::HostSpace> sent_partnos;
9114 if (partId.size() > 0) {
9115 sent_partnos = Kokkos::View<mj_part_t*, Kokkos::HostSpace>(
9116 partId.getRawPtr(), partId.size());
9118 distributor.doPostsAndWaits(sent_partnos, 1, received_partids);
9121 partId = arcp(
new mj_part_t[this->num_local_coords],
9122 0, this->num_local_coords,
true);
9125 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid2;
9126 localGidToLid2.reserve(this->num_local_coords);
9127 auto host_initial_mj_gnos =
9128 Kokkos::create_mirror_view(this->initial_mj_gnos);
9129 Kokkos::deep_copy(host_initial_mj_gnos,
9130 this->initial_mj_gnos);
9131 for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9132 localGidToLid2[host_initial_mj_gnos(i)] = i;
9135 for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9136 mj_lno_t origLID = localGidToLid2[received_gnos[i]];
9137 partId[origLID] = received_partids[i];
9142 delete [] result_actual_owner_rank;
9144 mj_env->timerStop(MACRO_TIMERS,
9145 timer_base_string +
"PostMigration DistributorMigration");
9147 solution->setParts(partId);
9148 this->mj_env->timerStop(MACRO_TIMERS, timer_base_string +
"cleanup");
9151 this->mj_env->timerStop(MACRO_TIMERS, timer_base_string +
"all");
9154 this->mj_coordinates = Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>();
9159template <
typename Adapter>
9161 const RCP<PartitioningSolution<Adapter> > &solution
9165 CoordinateModel<Adapter> mj_coords(mj_adapter, mj_env, mj_problemComm, flags);
9167 this->coord_dim = mj_coords.getCoordinateDim();
9168 this->num_weights_per_coord = mj_coords.getNumWeightsPerCoordinate();
9169 this->num_local_coords = mj_coords.getLocalNumCoordinates();
9170 this->num_global_coords = mj_coords.getGlobalNumCoordinates();
9172 int criteria_dim = (this->num_weights_per_coord ?
9173 this->num_weights_per_coord : 1);
9177 this->num_global_parts = solution->getTargetGlobalNumberOfParts();
9180 this->mj_uniform_parts = Kokkos::View<bool *, Kokkos::HostSpace>(
9181 "uniform parts", criteria_dim);
9182 this->mj_uniform_weights = Kokkos::View<bool *, Kokkos::HostSpace>(
9183 "uniform weights", criteria_dim);
9185 Kokkos::View<const mj_gno_t *, device_t> gnos;
9186 Kokkos::View<adapter_scalar_t **, Kokkos::LayoutLeft, device_t> xyz_adapter;
9188 Kokkos::View<adapter_scalar_t **, device_t> wgts_adapter;
9189 mj_coords.getCoordinatesKokkos(gnos, xyz_adapter, wgts_adapter);
9191 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> xyz;
9192 Kokkos::View<mj_scalar_t **, device_t> wgts;
9196 if(std::is_same<mj_scalar_t, adapter_scalar_t>()) {
9199 assign_if_same(xyz, xyz_adapter);
9200 assign_if_same(wgts, wgts_adapter);
9205 xyz = Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
9206 (Kokkos::ViewAllocateWithoutInitializing(
9207 "xyz"), xyz_adapter.extent(0), xyz_adapter.extent(1));
9208 wgts = Kokkos::View<mj_scalar_t **, device_t>(
9209 Kokkos::ViewAllocateWithoutInitializing(
"wgts"),
9210 wgts_adapter.extent(0), wgts_adapter.extent(1));
9212 typedef typename Kokkos::View<mj_scalar_t **, device_t>::size_type view_size_t;
9213 Kokkos::parallel_for(
9214 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9215 (0, xyz_adapter.extent(0)), KOKKOS_LAMBDA (
int i) {
9216 for(view_size_t n = 0; n < xyz_adapter.extent(1); ++n) {
9217 xyz(i, n) =
static_cast<mj_scalar_t
>(xyz_adapter(i, n));
9220 Kokkos::parallel_for(
9221 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9222 (0, wgts.extent(0)), KOKKOS_LAMBDA (
int i) {
9223 for(view_size_t n = 0; n < wgts.extent(1); ++n) {
9224 wgts(i, n) =
static_cast<mj_scalar_t
>(wgts_adapter(i, n));
9230 this->initial_mj_gnos = gnos;
9232 this->mj_coordinates = xyz;
9235 if(this->num_weights_per_coord == 0) {
9236 this->mj_uniform_weights(0) =
true;
9237 Kokkos::resize(this->mj_weights, 0, 0);
9240 this->mj_weights = wgts;
9241 for(
int wdim = 0; wdim < this->num_weights_per_coord; ++wdim) {
9242 this->mj_uniform_weights(wdim) =
false;
9246 for(
int wdim = 0; wdim < criteria_dim; ++wdim) {
9247 if(solution->criteriaHasUniformPartSizes(wdim)) {
9248 this->mj_uniform_parts(wdim) =
true;
9251 printf(
"Error: MJ does not support non uniform target part weights\n");
9260template <
typename Adapter>
9262 const Teuchos::ParameterList &pl)
9264 const Teuchos::ParameterEntry *pe = pl.getEntryPtr(
"imbalance_tolerance");
9267 tol = pe->getValue(&tol);
9268 this->imbalance_tolerance = tol - 1.0;
9272 if(this->imbalance_tolerance <= 0) {
9273 this->imbalance_tolerance= 10e-4;
9277 Kokkos::resize(this->part_no_array, 0);
9280 this->recursion_depth = 0;
9282 if(pl.getPtr<
int>(
"mj_num_teams")) {
9283 this->num_teams = pl.get<
int>(
"mj_num_teams");
9286 if(pl.getPtr<Array <mj_part_t> >(
"mj_parts")) {
9287 auto mj_parts = pl.get<Array <mj_part_t> >(
"mj_parts");
9288 int mj_parts_size =
static_cast<int>(mj_parts.size());
9291 this->part_no_array = Kokkos::View<mj_part_t*, Kokkos::HostSpace>(
9292 "part_no_array", mj_parts_size);
9293 for(
int i = 0; i < mj_parts_size; ++i) {
9294 this->part_no_array(i) = mj_parts.getRawPtr()[i];
9297 this->recursion_depth = mj_parts_size - 1;
9298 this->mj_env->debug(2,
"mj_parts provided by user");
9302 this->distribute_points_on_cut_lines =
true;
9303 this->max_concurrent_part_calculation = 1;
9305 this->mj_run_as_rcb =
false;
9306 this->mj_premigration_option = 0;
9307 this->min_coord_per_rank_for_premigration = 32000;
9309 int mj_user_recursion_depth = -1;
9310 this->mj_keep_part_boxes =
false;
9311 this->check_migrate_avoid_migration_option = 0;
9312 this->migration_type = 0;
9313 this->minimum_migration_imbalance = 0.35;
9315 pe = pl.getEntryPtr(
"mj_minimum_migration_imbalance");
9318 imb = pe->getValue(&imb);
9319 this->minimum_migration_imbalance = imb - 1.0;
9322 pe = pl.getEntryPtr(
"mj_migration_option");
9324 this->check_migrate_avoid_migration_option =
9325 pe->getValue(&this->check_migrate_avoid_migration_option);
9327 this->check_migrate_avoid_migration_option = 0;
9329 if(this->check_migrate_avoid_migration_option > 1) {
9330 this->check_migrate_avoid_migration_option = -1;
9334 pe = pl.getEntryPtr(
"mj_migration_type");
9336 this->migration_type = pe->getValue(&this->migration_type);
9338 this->migration_type = 0;
9344 pe = pl.getEntryPtr(
"mj_concurrent_part_count");
9346 this->max_concurrent_part_calculation =
9347 pe->getValue(&this->max_concurrent_part_calculation);
9349 this->max_concurrent_part_calculation = 1;
9352 pe = pl.getEntryPtr(
"mj_keep_part_boxes");
9354 this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
9356 this->mj_keep_part_boxes =
false;
9367 if(this->mj_keep_part_boxes ==
false) {
9368 pe = pl.getEntryPtr(
"mapping_type");
9370 int mapping_type = -1;
9371 mapping_type = pe->getValue(&mapping_type);
9372 if(mapping_type == 0) {
9373 mj_keep_part_boxes =
true;
9379 pe = pl.getEntryPtr(
"mj_enable_rcb");
9381 this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
9383 this->mj_run_as_rcb =
false;
9386 pe = pl.getEntryPtr(
"mj_premigration_option");
9388 mj_premigration_option = pe->getValue(&mj_premigration_option);
9390 mj_premigration_option = 0;
9393 pe = pl.getEntryPtr(
"mj_premigration_coordinate_count");
9395 min_coord_per_rank_for_premigration = pe->getValue(&mj_premigration_option);
9397 min_coord_per_rank_for_premigration = 32000;
9400 pe = pl.getEntryPtr(
"mj_recursion_depth");
9402 mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
9404 mj_user_recursion_depth = -1;
9408 pe = pl.getEntryPtr(
"rectilinear");
9410 val = pe->getValue(&val);
9413 this->distribute_points_on_cut_lines =
false;
9415 this->distribute_points_on_cut_lines =
true;
9418 if(this->mj_run_as_rcb) {
9419 mj_user_recursion_depth =
9420 (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
9422 if(this->recursion_depth < 1) {
9423 if(mj_user_recursion_depth > 0) {
9424 this->recursion_depth = mj_user_recursion_depth;
9427 this->recursion_depth = this->coord_dim;
9433template <
typename Adapter>
9436 adapter_scalar_t *lower,
9437 adapter_scalar_t *upper,
9438 size_t &nPartsFound,
9439 typename Adapter::part_t **partsFound)
const
9448 if(this->mj_keep_part_boxes) {
9451 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9453 size_t nBoxes = (*partBoxes).size();
9455 throw std::logic_error(
"no part boxes exist");
9459 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9461 if(globalBox->boxesOverlap(dim, lower, upper)) {
9463 std::vector<typename Adapter::part_t> partlist;
9466 for(
size_t i = 0; i < nBoxes; i++) {
9468 if((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
9470 partlist.push_back((*partBoxes)[i].getpId());
9492 *partsFound =
new mj_part_t[nPartsFound];
9493 for(
size_t i = 0; i < nPartsFound; i++)
9494 (*partsFound)[i] = partlist[i];
9506 throw std::logic_error(
"need to use keep_cuts parameter for boxAssign");
9511template <
typename Adapter>
9514 adapter_scalar_t *point)
const
9520 if(this->mj_keep_part_boxes) {
9521 typename Adapter::part_t foundPart = -1;
9524 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9526 size_t nBoxes = (*partBoxes).size();
9528 throw std::logic_error(
"no part boxes exist");
9532 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9534 if(globalBox->pointInBox(dim, point)) {
9538 for(i = 0; i < nBoxes; i++) {
9540 if((*partBoxes)[i].pointInBox(dim, point)) {
9541 foundPart = (*partBoxes)[i].getpId();
9555 std::ostringstream oss;
9557 for(
int j = 0; j < dim; j++) oss << point[j] <<
" ";
9558 oss <<
") not found in domain";
9559 throw std::logic_error(oss.str());
9569 size_t closestBox = 0;
9570 coord_t minDistance = std::numeric_limits<coord_t>::max();
9571 coord_t *centroid =
new coord_t[dim];
9572 for(
size_t i = 0; i < nBoxes; i++) {
9573 (*partBoxes)[i].computeCentroid(centroid);
9576 for(
int j = 0; j < dim; j++) {
9577 diff = centroid[j] - point[j];
9580 if(sum < minDistance) {
9585 foundPart = (*partBoxes)[closestBox].getpId();
9592 throw std::logic_error(
"need to use keep_cuts parameter for pointAssign");
9596template <
typename Adapter>
9598 const PartitioningSolution<Adapter> *solution,
9602 if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL) {
9603 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
9604 mj_part_t ntasks = (*pBoxes).size();
9605 int dim = (*pBoxes)[0].getDim();
9606 GridHash grid(pBoxes, ntasks, dim);
9607 grid.getAdjArrays(comXAdj_, comAdj_);
9613template <
typename Adapter>
9614RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
9617 return this->mj_partitioner.get_kept_boxes();
Defines the CoordinateModel classes.
#define Z2_FORWARD_EXCEPTIONS
Forward an exception back through call stack.
#define Z2_ASSERT_VALUE(actual, expected)
Throw an error when actual value is not equal to expected value.
#define Z2_THROW_OUTSIDE_ERROR(env)
Throw an error returned from outside the Zoltan2 library.
Define IntegerRangeList validator.
Contains Teuchos redcution operators for the Multi-jagged algorthm.
Defines Parameter related enumerators, declares functions.
A gathering of useful namespace methods.
Zoltan2_BoxBoundaries is a reduction operation to all reduce the all box boundaries.
void reduce(const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
Implement Teuchos::ValueTypeReductionOp interface.
Zoltan2_BoxBoundaries()
Default Constructor.
Zoltan2_BoxBoundaries(Ordinal s_)
Constructor.
Multi Jagged coordinate partitioning algorithm.
void set_partitioning_parameters(bool distribute_points_on_cut_lines_, int max_concurrent_part_calculation_, int check_migrate_avoid_migration_option_, double minimum_migration_imbalance_, int migration_type_=0)
Multi Jagged coordinate partitioning algorithm.
RCP< mj_partBoxVector_t > compute_global_box_boundaries(RCP< mj_partBoxVector_t > &localPartBoxes) const
DOCWORK: Documentation.
void sequential_task_partitioning(const RCP< const Environment > &env, mj_lno_t num_total_coords, mj_lno_t num_selected_coords, size_t num_target_part, int coord_dim, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates_, Kokkos::View< mj_lno_t *, device_t > &initial_selected_coords_output_permutation, mj_lno_t *output_xadj, int recursion_depth_, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, bool partition_along_longest_dim, int num_ranks_per_node, bool divide_to_prime_first_, mj_part_t num_first_level_parts_=1, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &first_level_distribution_=Kokkos::View< mj_part_t *, Kokkos::HostSpace >())
Special function for partitioning for task mapping. Runs sequential, and performs deterministic parti...
void multi_jagged_part(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, double imbalance_tolerance, int num_teams, size_t num_global_parts, Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, int recursion_depth, int coord_dim, mj_lno_t num_local_coords, mj_gno_t num_global_coords, Kokkos::View< const mj_gno_t *, device_t > &initial_mj_gnos, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates, int num_weights_per_coord, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_weights, Kokkos::View< mj_scalar_t **, device_t > &mj_weights, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_parts, Kokkos::View< mj_part_t *, device_t > &result_assigned_part_ids, Kokkos::View< mj_gno_t *, device_t > &result_mj_gnos)
Multi Jagged coordinate partitioning algorithm.
RCP< mj_partBoxVector_t > get_kept_boxes() const
DOCWORK: Documentation.
AlgMJ()
Multi Jagged coordinate partitioning algorithm default constructor.
RCP< mj_partBox_t > get_global_box() const
DOCWORK: Documentation.
void set_to_keep_part_boxes()
Function call, if the part boxes are intended to be kept.
Algorithm defines the base class for all algorithms.
static RCP< Teuchos::BoolParameterEntryValidator > getBoolValidator()
Exists to make setting up validators less cluttered.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyDoubleValidator()
Exists to make setting up validators less cluttered.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyIntValidator()
Exists to make setting up validators less cluttered.
GridHash Class, Hashing Class for part boxes.
A ParameterList validator for integer range lists.
Multi Jagged coordinate partitioning algorithm.
void set_up_partitioning_data(const RCP< PartitioningSolution< Adapter > > &solution)
Zoltan2_AlgMJ(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, const RCP< const typename Adapter::base_adapter_t > &adapter)
void partition(const RCP< PartitioningSolution< Adapter > > &solution)
Multi Jagged coordinate partitioning algorithm.
mj_part_t pointAssign(int dim, adapter_scalar_t *point) const
void boxAssign(int dim, adapter_scalar_t *lower, adapter_scalar_t *upper, size_t &nPartsFound, mj_part_t **partsFound) const
static void getValidParameters(ParameterList &pl)
Set up validators specific to this algorithm.
void getCommunicationGraph(const PartitioningSolution< Adapter > *solution, ArrayRCP< mj_part_t > &comXAdj, ArrayRCP< mj_part_t > &comAdj)
returns communication graph resulting from MJ partitioning.
mj_partBoxVector_t & getPartBoxesView() const
for partitioning methods, return bounding boxes of the
coordinateModelPartBox Class, represents the boundaries of the box which is a result of a geometric p...
Class for sorting items with multiple values. First sorting with respect to val[0],...
void set(IT index_, CT count_, WT *vals_)
bool operator<(const uMultiSortItem< IT, CT, WT > &other) const
uMultiSortItem(IT index_, CT count_, WT *vals_)
Created by mbenlioglu on Aug 31, 2020.
Tpetra::global_size_t global_size_t
std::bitset< NUM_MODEL_FLAGS > modelFlag_t
void uqsort(IT n, uSortItem< IT, WT > *arr)
Quick sort function. Sorts the arr of uSortItems, with respect to increasing vals....
void uqSignsort(IT n, uSignedSortItem< IT, WT, SIGN > *arr)
Quick sort function. Sorts the arr of uSignedSortItems, with respect to increasing vals.
SparseMatrixAdapter_t::part_t part_t
KOKKOS_INLINE_FUNCTION value_type & reference() const
KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const
int value_count_rightleft
Zoltan2_MJArrayType< scalar_t > value_type
KOKKOS_INLINE_FUNCTION void join(volatile value_type &dst, const volatile value_type &src) const
KOKKOS_INLINE_FUNCTION ArrayCombinationReducer(scalar_t mj_max_scalar, value_type &val, int mj_value_count_rightleft, int mj_value_count_weights)
ArrayCombinationReducer reducer
KOKKOS_INLINE_FUNCTION void init(value_type &dst) const
Zoltan2_MJArrayType< scalar_t > value_type
KOKKOS_INLINE_FUNCTION void init(value_type &dst) const
KOKKOS_INLINE_FUNCTION ArrayReducer(value_type &val, int mj_value_count)
KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const
KOKKOS_INLINE_FUNCTION value_type & reference() const
KOKKOS_INLINE_FUNCTION void init(value_type dst) const
Kokkos::View< part_t *, device_t > parts
Kokkos::View< scalar_t * > scalar_view_t
policy_t::member_type member_type
Kokkos::View< index_t *, device_t > part_xadj
ReduceArrayFunctor(part_t mj_concurrent_current_part, part_t mj_weight_array_size, Kokkos::View< index_t *, device_t > &mj_permutations, Kokkos::View< scalar_t *, device_t > &mj_coordinates, Kokkos::View< part_t *, device_t > &mj_parts, Kokkos::View< index_t *, device_t > &mj_part_xadj, Kokkos::View< index_t *, device_t > &mj_track_on_cuts)
Kokkos::View< index_t *, device_t > track_on_cuts
Kokkos::View< scalar_t *, device_t > coordinates
part_t concurrent_current_part
size_t team_shmem_size(int team_size) const
Kokkos::View< index_t *, device_t > permutations
KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
Kokkos::View< scalar_t *, device_t > cut_coordinates
KOKKOS_INLINE_FUNCTION void init(value_type dst) const
part_t concurrent_current_part
Kokkos::View< scalar_t **, device_t > weights
ReduceWeightsFunctor(int mj_loop_count, array_t mj_max_scalar, part_t mj_concurrent_current_part, part_t mj_num_cuts, part_t mj_current_work_part, part_t mj_current_concurrent_num_parts, part_t mj_left_right_array_size, part_t mj_weight_array_size, Kokkos::View< index_t *, device_t > &mj_permutations, Kokkos::View< scalar_t *, device_t > &mj_coordinates, Kokkos::View< scalar_t **, device_t > &mj_weights, Kokkos::View< part_t *, device_t > &mj_parts, Kokkos::View< scalar_t *, device_t > &mj_cut_coordinates, Kokkos::View< index_t *, device_t > &mj_part_xadj, bool mj_uniform_weights0, scalar_t mj_sEpsilon)
KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
part_t current_concurrent_num_parts
Kokkos::View< scalar_t *, device_t > coordinates
Kokkos::View< part_t *, device_t > parts
size_t team_shmem_size(int team_size) const
Kokkos::View< index_t *, device_t > part_xadj
Kokkos::View< index_t *, device_t > permutations
KOKKOS_INLINE_FUNCTION void operator()(const member_type &teamMember, value_type teamSum) const
Kokkos::View< scalar_t * > scalar_view_t
policy_t::member_type member_type
int value_count_rightleft
static int get_counter_Zoltan2_AlgMJ()
static int get_counter_AlgMJ()
Zoltan2_MJArrayType< scalar_t > & operator=(const volatile Zoltan2_MJArrayType< scalar_t > &zmj)
KOKKOS_INLINE_FUNCTION Zoltan2_MJArrayType()
KOKKOS_INLINE_FUNCTION Zoltan2_MJArrayType(scalar_t *pSetPtr)
bool operator<=(const uSignedSortItem< IT, WT, SIGN > &rhs)
bool operator<(const uSignedSortItem< IT, WT, SIGN > &rhs) const
Sort items for quick sort function.