void compute_imbalance(const int global_box[][2], const int local_box[][2], float& largest_imbalance, float& std_dev, YAML_Doc& doc, bool record_in_doc) { int numprocs = 1, myproc = 0; #ifdef HAVE_MPI MPI_Comm_size(MPI_COMM_WORLD, &numprocs); MPI_Comm_rank(MPI_COMM_WORLD, &myproc); #endif GlobalOrdinal local_nrows = get_num_ids<GlobalOrdinal>(local_box); GlobalOrdinal min_nrows = 0, max_nrows = 0, global_nrows = 0; int min_proc = myproc, max_proc = myproc; get_global_min_max(local_nrows, global_nrows, min_nrows, min_proc, max_nrows, max_proc); float avg_nrows = global_nrows; avg_nrows /= numprocs; //largest_imbalance will be the difference between the min (or max) //rows-per-processor and avg_nrows, represented as a percentage: largest_imbalance = percentage_difference<float>(min_nrows, avg_nrows); float tmp = percentage_difference<float>(max_nrows, avg_nrows); if (tmp > largest_imbalance) largest_imbalance = tmp; std_dev = compute_std_dev_as_percentage<float>(local_nrows, avg_nrows); if (myproc == 0 && record_in_doc) { doc.add("Rows-per-proc Load Imbalance",""); doc.get("Rows-per-proc Load Imbalance")->add("Largest (from avg, %)",largest_imbalance); doc.get("Rows-per-proc Load Imbalance")->add("Std Dev (%)",std_dev); } }
//Deliberately introduce rows-per-processor load imbalance of at least
//'imbalance' percent (presumably for benchmarking imbalance effects —
//confirm against callers).  Iteratively grows the box on the processor
//that currently has the most rows and shrinks the box on the processor
//with the fewest, re-measuring after each step, until the target
//imbalance is reached or no further legal move exists.
//Collective when built with MPI: every rank must call (uses
//MPI_Bcast / MPI_Allreduce inside the loop).
void add_imbalance(const int global_box[][2],
                   int local_box[][2],
                   float imbalance,
                   YAML_Doc& doc)
{
  int numprocs = 1, myproc = 0;
#ifdef HAVE_MPI
  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
#endif

  //a single processor can't be imbalanced relative to itself:
  if (numprocs == 1) {
    return;
  }

  float cur_imbalance = 0, cur_std_dev = 0;
  compute_imbalance<GlobalOrdinal>(global_box, local_box,
                                   cur_imbalance, cur_std_dev, doc, false);

  while (cur_imbalance < imbalance) {
    //find which processors currently hold the fewest and the most rows:
    GlobalOrdinal local_nrows = get_num_ids<GlobalOrdinal>(local_box);
    GlobalOrdinal min_nrows = 0, max_nrows = 0, global_nrows = 0;
    int min_proc = myproc, max_proc = myproc;
    get_global_min_max(local_nrows, global_nrows, min_nrows, min_proc,
                       max_nrows, max_proc);

    //each pair is (axis, end): which axis of the box to change and whether
    //to move its LOWER or UPPER bound; axis == NONE means "no legal move".
    std::pair<int,int> grow(NONE,UPPER);
    int grow_axis_val = -1;
    std::pair<int,int> shrink(NONE,UPPER);
    int shrink_axis_val = -1;

    //the processor with the most rows decides how to grow, the processor
    //with the fewest rows decides how to shrink:
    if (myproc == max_proc) {
      grow = decide_how_to_grow(global_box, local_box);
      if (grow.first != NONE) {
        grow_axis_val = local_box[grow.first][grow.second];
      }
    }
    if (myproc == min_proc) {
      shrink = decide_how_to_shrink(global_box, local_box);
      if (shrink.first != NONE) {
        shrink_axis_val = local_box[shrink.first][shrink.second];
      }
    }

    //pack each decision together with the deciding processor's box extents
    //so it can be broadcast to everyone (every box that borders the moving
    //plane must shift that bound in lock-step — see the apply step below):
    int grow_info[8] = {grow.first, grow.second,
                        local_box[X][0], local_box[X][1],
                        local_box[Y][0], local_box[Y][1],
                        local_box[Z][0], local_box[Z][1]};
    int shrink_info[8] = {shrink.first, shrink.second,
                          local_box[X][0], local_box[X][1],
                          local_box[Y][0], local_box[Y][1],
                          local_box[Z][0], local_box[Z][1]};
#ifdef HAVE_MPI
    MPI_Bcast(&grow_info[0], 8, MPI_INT, max_proc, MPI_COMM_WORLD);
    MPI_Bcast(&shrink_info[0], 8, MPI_INT, min_proc, MPI_COMM_WORLD);
#endif

    int grow_axis = grow_info[0];
    int grow_end = grow_info[1];
    int shrink_axis = shrink_info[0];
    int shrink_end = shrink_info[1];

    //growing moves an UPPER bound by +1 or a LOWER bound by -1;
    //shrinking is the mirror image:
    int grow_incr = 1;
    if (grow_end == LOWER) grow_incr = -1;
    int shrink_incr = -1;
    if (shrink_end == LOWER) shrink_incr = 1;

    //the coordinate value of the plane that will move, taken from the
    //broadcast extents (grow_info[2..7] holds X/Y/Z lower,upper pairs):
    if (grow_axis != NONE) grow_axis_val = grow_info[2+grow_axis*2+grow_end];
    if (shrink_axis != NONE) shrink_axis_val = shrink_info[2+shrink_axis*2+shrink_end];

    //nobody can move anything: give up.
    if (grow_axis == NONE && shrink_axis == NONE) break;

    //a processor vetoes a move that would leave its own box thinner than
    //2 cells along the affected axis:
    bool grow_status = grow_axis==NONE ? false : true;
    if (grow_axis != NONE) {
      if ((grow_incr == 1 && local_box[grow_axis][0] == grow_axis_val) ||
          (grow_incr == -1 && local_box[grow_axis][1] == grow_axis_val)) {
        if (local_box[grow_axis][1] - local_box[grow_axis][0] < 2) {
          grow_status = false;
        }
      }
    }
    bool shrink_status = shrink_axis==NONE ? false : true;
    if (shrink_axis != NONE) {
      if ((shrink_incr == 1 && local_box[shrink_axis][0] == shrink_axis_val) ||
          (shrink_incr == -1 && local_box[shrink_axis][1] == shrink_axis_val)) {
        if (local_box[shrink_axis][1] - local_box[shrink_axis][0] < 2) {
          shrink_status = false;
        }
      }
    }

#ifdef HAVE_MPI
    //combine the vetoes: a nonzero sum means some processor objected,
    //so that move is cancelled everywhere this iteration:
    int statusints[2] = { grow_status ? 0 : 1, shrink_status ? 0 : 1 };
    int globalstatus[2] = { 0, 0 };
    MPI_Allreduce(&statusints, &globalstatus, 2, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    grow_status = globalstatus[0]>0 ? false : true;
    shrink_status = globalstatus[1]>0 ? false : true;
#endif

    //both moves vetoed: no progress is possible.
    if (grow_status == false && shrink_status == false) break;

    //apply: every processor whose box has a bound lying on the moving
    //plane shifts that bound by the agreed increment:
    if (grow_status && grow_axis != NONE) {
      if (local_box[grow_axis][0] == grow_axis_val) {
        local_box[grow_axis][0] += grow_incr;
      }
      if (local_box[grow_axis][1] == grow_axis_val) {
        local_box[grow_axis][1] += grow_incr;
      }
    }
    if (shrink_status && shrink_axis != NONE) {
      if (local_box[shrink_axis][0] == shrink_axis_val) {
        local_box[shrink_axis][0] += shrink_incr;
      }
      if (local_box[shrink_axis][1] == shrink_axis_val) {
        local_box[shrink_axis][1] += shrink_incr;
      }
    }

    //re-measure and decide whether another iteration is needed:
    compute_imbalance<GlobalOrdinal>(global_box, local_box,
                                     cur_imbalance, cur_std_dev, doc, false);
  }
}
size_t compute_matrix_stats(const MatrixType& A, int myproc, int numprocs, YAML_Doc& ydoc) { typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal; typedef typename MatrixType::LocalOrdinalType LocalOrdinal; typedef typename MatrixType::ScalarType Scalar; GlobalOrdinal min_nrows = 0, max_nrows = 0, global_nrows = 0; int min_proc = 0, max_proc = 0; GlobalOrdinal local_nrows = A.rows.size(); get_global_min_max(local_nrows, global_nrows, min_nrows, min_proc, max_nrows, max_proc); //Gather stats on global, min/max matrix num-nonzeros: double local_nnz = A.num_nonzeros(); double dglobal_nnz = 0, dmin_nnz = 0, dmax_nnz = 0; get_global_min_max(local_nnz, dglobal_nnz, dmin_nnz, min_proc, dmax_nnz, max_proc); double avg_nrows = global_nrows; avg_nrows /= numprocs; double avg_nnz = dglobal_nnz; avg_nnz /= numprocs; double mem_overhead_MB = parallel_memory_overhead_MB(A); size_t global_nnz = static_cast<size_t>(std::ceil(dglobal_nnz)); size_t min_nnz = static_cast<size_t>(std::ceil(dmin_nnz)); size_t max_nnz = static_cast<size_t>(std::ceil(dmax_nnz)); size_t global_num_rows = global_nrows; if (myproc == 0) { ydoc.add("Matrix attributes",""); ydoc.get("Matrix attributes")->add("Global Nrows",global_num_rows); ydoc.get("Matrix attributes")->add("Global NNZ",global_nnz); //compute how much memory the matrix occupies: //num-bytes = sizeof(GlobalOrdinal)*global_nrows for A.rows // + sizeof(LocalOrdinal)*global_nrows for A.rows_offsets // + sizeof(GlobalOrdinal)*global_nnz for A.packed_cols // + sizeof(Scalar)*global_nnz for A.packed_coefs double invGB = 1.0/(1024*1024*1024); double memGB = invGB*global_nrows*sizeof(GlobalOrdinal); memGB += invGB*global_nrows*sizeof(LocalOrdinal); memGB += invGB*global_nnz*sizeof(GlobalOrdinal); memGB += invGB*global_nnz*sizeof(Scalar); ydoc.get("Matrix attributes")->add("Global Memory (GB)",memGB); ydoc.get("Matrix attributes")->add("Pll Memory Overhead (MB)",mem_overhead_MB); size_t min_num_rows = min_nrows; size_t max_num_rows = 
max_nrows; ydoc.get("Matrix attributes")->add("Rows per proc MIN",min_num_rows); ydoc.get("Matrix attributes")->add("Rows per proc MAX",max_num_rows); ydoc.get("Matrix attributes")->add("Rows per proc AVG",avg_nrows); ydoc.get("Matrix attributes")->add("NNZ per proc MIN",min_nnz); ydoc.get("Matrix attributes")->add("NNZ per proc MAX",max_nnz); ydoc.get("Matrix attributes")->add("NNZ per proc AVG",avg_nnz); } return global_nnz; }