void BSFC_create_bins(int num_local_objects, BSFC_VERTEX_PTR sfc_vert_ptr, int* amount_of_bits_used, int size_of_unsigned, float* global_actual_work_allocated, float *work_percent_array, float* total_weight_ptr, int* balanced_flag, unstructured_communication* verts_in_cut_info, int* number_of_cuts, int bins_per_proc, int myid, int numprocs) { int i, j, number_of_bins, ierr = 0; int array_location = 0; int comm_tag = 4190; int * proclist; int nreturn = 0; int off_proc_objects = 0; /*counter to keep track of how many objects will be off processor*/ float * binned_weight_array; int hashtable_length; int counter = 0; float *extra_float_array; float my_work_percent; int *bin_proc_array; float scanned_work_prev_allocated; /*scanned_work_prev_allocated is the amount of work allocated to higher ranked procs */ int amount_of_bits; BSFC_VERTEX_PTR send_vert_buffer; float* send_wgt_buffer; int current_proc; float* extra_float_array2 = NULL; int local_balanced_flag; int* number_of_cuts_in_bin; /*assume initially that each processor has the same amount of bins*/ number_of_bins = numprocs * bins_per_proc; i=0; while(number_of_bins > BSFC_pow(2,i)) i++; amount_of_bits = i; /* check to see that we have not used up all of the bits */ if(amount_of_bits > 8*size_of_unsigned * KEYLENGTH) amount_of_bits = 8*size_of_unsigned * KEYLENGTH; number_of_bins = BSFC_pow(2,i); *amount_of_bits_used = amount_of_bits; /*hash table */ float* tmp_float_array = new float[number_of_bins+1]; for(i=0;i<number_of_bins;i++) tmp_float_array[i] = 0; for(i=0;i<num_local_objects;i++) { sfc_vert_ptr[i].my_bin = BSFC_get_array_location(number_of_bins, amount_of_bits, 0, (sfc_vert_ptr+i)); tmp_float_array[sfc_vert_ptr[i].my_bin] += sfc_vert_ptr[i].lb_weight; } tmp_float_array[number_of_bins] = *total_weight_ptr; binned_weight_array = (float*) malloc(sizeof(float) * (number_of_bins+1)); i = MPI_Allreduce(tmp_float_array, binned_weight_array, number_of_bins+1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); delete []tmp_float_array; *total_weight_ptr = binned_weight_array[number_of_bins]; /* global weight array has been created, now perform the scan operation on it */ /* put in desired amount of work here, needs to be changed for varying workloads */ // currently we assume that each processor should get the same amount of work my_work_percent = 1.0/((float) numprocs); work_percent_array[0] = 2.0; for(i=1;i<numprocs;i++) work_percent_array[i] = ((float) (numprocs-i))*my_work_percent; ////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////// // the following is if the workload should be varied (e.g. for heterogeneous computers) ////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////// /* for(i=0;i<numprocs;i++) extra_float_array[i] = 0.0; extra_float_array[myid] = my_work_percent; // make sure that proc 0 gets all of the rest of the work percent if(myid == 0) extra_float_array[0] = 1.1; ierr = MPI_Allreduce(extra_float_array, work_percent_array, numprocs, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); for(i=numprocs-2;i>=0;i--) work_percent_array[i] += work_percent_array[i+1]; */ // done varying workload... /* each processor needs to know which bins get partitioned into which processor, bin_proc_array lists max bin that a processor should get */ for(i=0;i<numprocs;i++) global_actual_work_allocated[i] = 0; bin_proc_array = (int*) malloc(sizeof(int) * number_of_bins); current_proc = numprocs-1; i = number_of_bins-1; scanned_work_prev_allocated = 0; while(i >= 0 && current_proc > -1) { scanned_work_prev_allocated += binned_weight_array[i]; bin_proc_array[i] = current_proc; if(scanned_work_prev_allocated > work_percent_array[current_proc]* *total_weight_ptr) { global_actual_work_allocated[current_proc] = scanned_work_prev_allocated; bin_proc_array[i] = -current_proc; current_proc--; // the while statement is if there is more than 1 cut in a bin while(current_proc >= 0 && scanned_work_prev_allocated > work_percent_array[current_proc]* *total_weight_ptr) { global_actual_work_allocated[current_proc] = scanned_work_prev_allocated; current_proc--; } } i--; } /* make sure that the last bin does not have a cut in it */ if(bin_proc_array[0] > 0) bin_proc_array[0] = 0; global_actual_work_allocated[0] = *total_weight_ptr; /* specify which processor an object belongs to, we will know this because we know what bin an object belongs to and we know what processor a bin belongs to */ for(i=0;i<num_local_objects;i++) { if(bin_proc_array[sfc_vert_ptr[i].my_bin] >= 0) { sfc_vert_ptr[i].cut_bin_flag = BSFC_NO_CUT; sfc_vert_ptr[i].destination_proc = bin_proc_array[sfc_vert_ptr[i].my_bin]; } else { sfc_vert_ptr[i].cut_bin_flag = BSFC_CUT; sfc_vert_ptr[i].destination_proc = -bin_proc_array[sfc_vert_ptr[i].my_bin]; } } /* check to see if any cut-bin has too many objects in it and refine it. the problem with too many objects in a cut-bin is that the processor that gets assigned to that bin will get swamped with communication and might not have enough memory to hold all of the information. we detect the cut-bins with too many objects by how many cuts are in that bin. numprocs - 1 is the amount of cuts and if this is less than or equal to max_cuts_in_bin then there is no possibility that a coarse bin is overloaded */ /* if(numprocs - 1 > max_cuts_in_bin) ierr = Zoltan_BSFC_refine_overloaded_bins(zz, max_cuts_in_bin, 2*bins_per_proc, number_of_cuts_in_bin, wgt_dim, sfc_vert_ptr, num_local_objects, amount_of_bits, size_of_unsigned, work_percent_array, total_weight_array, global_actual_work_allocated);*/ //free(number_of_cuts_in_bin); local_balanced_flag = BSFC_find_imbalance(work_percent_array, global_actual_work_allocated[myid], *total_weight_ptr, myid, numprocs); ierr = MPI_Allreduce(&local_balanced_flag, balanced_flag, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); free(bin_proc_array); free(binned_weight_array); // *balanced_flag = BSFC_BALANCED; /* if the current partitioning is acceptable, the algorithm is finished */ if(*balanced_flag == BSFC_BALANCED) { //printf("first level is good enough on proc %d \n", myid); return; } //printf("############# need to refine some bins.... ##############\n"); /* if the size of an unsigned integer is different on different processors, need to 'shrink' down the key size of unsigned integers on all processors to have the same amount of bits but only need to do this for sfc objects that are in a cut bin (currently not supported) */ /* move the sfc objects that belong to any bin that contains a cut to the proper processor */ verts_in_cut_info->used_flag = 1; verts_in_cut_info->send_procs_ptr = new int[numprocs]; verts_in_cut_info->recv_procs_ptr = new int[numprocs]; // int* send_procs = verts_in_cut_info->send_procs_ptr; // int* recv_procs = verts_in_cut_info->recv_procs_ptr; for(i=0;i<numprocs;i++) verts_in_cut_info->send_procs_ptr[i] = 0; for(i=0;i<num_local_objects;i++) if(sfc_vert_ptr[i].cut_bin_flag == BSFC_CUT) { // this vert is in a cut... verts_in_cut_info->send_procs_ptr[sfc_vert_ptr[i].destination_proc] += 1; } i=MPI_Alltoall(verts_in_cut_info->send_procs_ptr, 1, MPI_INT, verts_in_cut_info->recv_procs_ptr, 1, MPI_INT, MPI_COMM_WORLD); //recalculate send_procs because it probably got changed for(i=0;i<numprocs;i++) verts_in_cut_info->send_procs_ptr[i] = 0; for(i=0;i<num_local_objects;i++) if(sfc_vert_ptr[i].cut_bin_flag == BSFC_CUT) // this vert is in a cut... verts_in_cut_info->send_procs_ptr[sfc_vert_ptr[i].destination_proc] += 1; MPI_Request* send_request = new MPI_Request[numprocs]; MPI_Request* recv_request = new MPI_Request[numprocs]; int send_count = 0, recv_count = 0; for(i=0;i<numprocs;i++) send_count += verts_in_cut_info->send_procs_ptr[i]; for(i=0;i<numprocs;i++) recv_count += verts_in_cut_info->recv_procs_ptr[i]; verts_in_cut_info->send_count = send_count; verts_in_cut_info->recv_count = recv_count; sfc_vertex* send_sfc_vert = new sfc_vertex[send_count]; //temp storage for objects that get sent out... verts_in_cut_info->recv_sfc_vert = new sfc_vertex[recv_count]; /* for(i=0;i<numprocs;i++) printf("proc %d is sending %d objects to %d and receiving %d objects \n",myid, verts_in_cut_info->send_procs_ptr[i], i, verts_in_cut_info->recv_procs_ptr[i]); */ // fill up the send array... int* proc_counter = new int[numprocs]; proc_counter[0] = 0; for(i=1;i<numprocs;i++) proc_counter[i] = proc_counter[i-1] + verts_in_cut_info->send_procs_ptr[i-1]; recv_count = 0; for(i=0;i<myid;i++) recv_count += verts_in_cut_info->recv_procs_ptr[i]; //printf("proc %d has recv_count of %d \n", myid, recv_count); for(i=0;i<num_local_objects;i++) if(sfc_vert_ptr[i].cut_bin_flag == BSFC_CUT) { if(sfc_vert_ptr[i].destination_proc != myid) { send_sfc_vert[proc_counter[sfc_vert_ptr[i].destination_proc]] = sfc_vert_ptr[i]; proc_counter[sfc_vert_ptr[i].destination_proc] += 1; } else { // if i need to send to myself... verts_in_cut_info->recv_sfc_vert[recv_count] = sfc_vert_ptr[i]; recv_count++; } } //done filling up the send array int tag = 21503; proc_counter[0] = 0; for(i=1;i<numprocs;i++) proc_counter[i] = proc_counter[i-1] + verts_in_cut_info->send_procs_ptr[i-1]; recv_count = 0; for(i=0;i<numprocs;i++) { if(i!= myid) { // send out necessary info here... if(verts_in_cut_info->send_procs_ptr[i] != 0) { j = MPI_Isend((send_sfc_vert+proc_counter[i]), verts_in_cut_info->send_procs_ptr[i], LB_VERT_TYPE, i, tag, MPI_COMM_WORLD, (send_request+i)); } // receive necessary info here... if(verts_in_cut_info->recv_procs_ptr[i] != 0) { j = MPI_Irecv(&(verts_in_cut_info->recv_sfc_vert[recv_count]), verts_in_cut_info->recv_procs_ptr[i], LB_VERT_TYPE, i, tag, MPI_COMM_WORLD, (recv_request+i)); } } recv_count += verts_in_cut_info->recv_procs_ptr[i]; } delete []proc_counter; // wait until the info is sent and received... for(i=0;i<numprocs;i++) if(i!= myid) { if(verts_in_cut_info->send_procs_ptr[i] != 0) { MPI_Status status; j = MPI_Wait((send_request+i), &status); } if(verts_in_cut_info->recv_procs_ptr[i] != 0) { MPI_Status status; j = MPI_Wait((recv_request+i), &status); } } delete []send_request; delete []recv_request; delete []send_sfc_vert; //*balanced_flag = BSFC_BALANCED; return; }
void BSFC_refine_partition(int* local_balanced_flag, int *amount_of_used_bits, int num_vert_in_cut, BSFC_VERTEX_PTR vert_in_cut_ptr, float* work_percent_array, float total_weight, float* global_actual_work_allocated, int number_of_cuts, int* ll_bins_head, float* work_prev_allocated, int subbins_per_bin, int* local_balanced_flag_array, int myid, int numprocs) { //printf("proc %d is refining the partition level\n",myid); int i = 0, j = 0, k, current_proc; int amount_of_bits; float* binned_weight_array; int* bin_proc_array; int* ll_prev_bins; int ll_counter, ll_location, *ll_bins_head_copy; /* amount of sub-bins in a bin, probably want this as a passed in parameter */ int number_of_bins = subbins_per_bin; /* check to see that all of the bits of the sfc key have not already been used */ if(*amount_of_used_bits >= sizeof(unsigned) * KEYLENGTH * 8) { //printf("No more refinement is possible in the repartitioning on proc %d.\n", myid); *local_balanced_flag = BSFC_BALANCED; return; } /* assume initially that all the partitions on this processor are balanced. we will check later on whether any are not balanced */ *local_balanced_flag = BSFC_BALANCED; /* if there are a lot of cuts in a bin, we want the amount of bins to be greater than the amount of cuts */ if(number_of_cuts >= number_of_bins) number_of_bins = number_of_cuts + 1; /*increase sub-bins so that there is a power of 2 */ i = 0; while (number_of_bins > BSFC_pow(2, i)) i++; amount_of_bits = i; if(amount_of_bits + *amount_of_used_bits > 8 * sizeof(unsigned) * KEYLENGTH) amount_of_bits = 8 * sizeof(unsigned) * KEYLENGTH - *amount_of_used_bits; number_of_bins = BSFC_pow(2, i); ll_prev_bins = (int*) malloc(sizeof(int) * (number_of_cuts + 1)); ll_bins_head_copy = (int*) malloc(sizeof(int) * (number_of_cuts + 1)); for(i = 0; i <= number_of_cuts; i++) ll_bins_head_copy[i] = -1; /* loop over all bins that have a cut in them using linklist to find objects in the cut bins */ for(ll_counter = 0; ll_counter <= number_of_cuts; ll_counter++) if((local_balanced_flag_array[ll_counter] == BSFC_NOT_BALANCED) && ll_bins_head[ll_counter] != -1) { /* calculate new bin numbers for objects that are in a cut bin */ ll_location = ll_bins_head[ll_counter]; while (ll_location != -1) { vert_in_cut_ptr[ll_location].my_bin = BSFC_get_array_location(number_of_bins, amount_of_bits, *amount_of_used_bits, (vert_in_cut_ptr + ll_location)); ll_location = vert_in_cut_ptr[ll_location].next_sfc_vert_index; } binned_weight_array = (float*) malloc(number_of_bins * sizeof(float)); for(i = 0; i < number_of_bins; i++) binned_weight_array[i] = 0; /* fill up the weight array */ ll_location = ll_bins_head[ll_counter]; while (ll_location != -1) { binned_weight_array[vert_in_cut_ptr[ll_location].my_bin] += vert_in_cut_ptr[ll_location].lb_weight; ll_location = vert_in_cut_ptr[ll_location].next_sfc_vert_index; } bin_proc_array = (int*) malloc(sizeof(int) * number_of_bins); i = number_of_bins - 1; current_proc = vert_in_cut_ptr[ll_bins_head[ll_counter]].destination_proc; float scanned_work_prev_allocated = work_prev_allocated[current_proc]; while (i >= 0 && current_proc > -1) { scanned_work_prev_allocated += binned_weight_array[i]; bin_proc_array[i] = current_proc; if(scanned_work_prev_allocated > work_percent_array[current_proc] * total_weight) { global_actual_work_allocated[current_proc] = scanned_work_prev_allocated; bin_proc_array[i] = -current_proc; current_proc--; // the while statement is if there is more than 1 cut in a bin while (current_proc >= 0 && scanned_work_prev_allocated > work_percent_array[current_proc] * total_weight) { global_actual_work_allocated[current_proc] = scanned_work_prev_allocated; current_proc--; } } i--; } /* specify which processor an object belongs to, we will know this because we know what bin an object belongs to and we know what processor a bin belongs to */ ll_location = ll_bins_head[ll_counter]; while (ll_location != -1) { if(bin_proc_array[vert_in_cut_ptr[ll_location].my_bin] >= 0) { vert_in_cut_ptr[ll_location].cut_bin_flag = BSFC_NO_CUT; vert_in_cut_ptr[ll_location].destination_proc = bin_proc_array[vert_in_cut_ptr[ll_location].my_bin]; ll_location = vert_in_cut_ptr[ll_location].next_sfc_vert_index; } else { /* if this object is in a bin with a cut... */ vert_in_cut_ptr[ll_location].cut_bin_flag = BSFC_CUT; vert_in_cut_ptr[ll_location].destination_proc = -bin_proc_array[vert_in_cut_ptr[ll_location].my_bin]; if(ll_bins_head_copy[number_of_cuts - myid + //array continued on the next line vert_in_cut_ptr[ll_location].destination_proc] != -1) { int ll_next = vert_in_cut_ptr[ll_location].next_sfc_vert_index; vert_in_cut_ptr[ll_location].next_sfc_vert_index = ll_bins_head_copy[number_of_cuts - myid + vert_in_cut_ptr[ll_location].destination_proc]; ll_bins_head_copy[number_of_cuts - myid + vert_in_cut_ptr[ll_location].destination_proc] = ll_location; ll_location = ll_next; } else { int ll_next = vert_in_cut_ptr[ll_location].next_sfc_vert_index; vert_in_cut_ptr[ll_location].next_sfc_vert_index = -1; ll_bins_head_copy[number_of_cuts - myid + vert_in_cut_ptr[ll_location].destination_proc] = ll_location; /* calculate work_prev_allocated for this new partition */ work_prev_allocated[vert_in_cut_ptr[ll_location].destination_proc] = work_prev_allocated[ll_counter - number_of_cuts + myid]; for(i = vert_in_cut_ptr[ll_location].my_bin + 1; i < number_of_bins; i++) work_prev_allocated[vert_in_cut_ptr[ll_location].destination_proc] += binned_weight_array[i]; ll_location = ll_next; } } } free(binned_weight_array); free(bin_proc_array); } for(i = 0; i <= number_of_cuts; i++) ll_bins_head[i] = ll_bins_head_copy[i]; free(ll_prev_bins); free(ll_bins_head_copy); *amount_of_used_bits += amount_of_bits; /* check which partitions that are not balanced */ for(i = 0; i <= number_of_cuts; i++) if(ll_bins_head[i] != -1 || local_balanced_flag_array[i] != BSFC_BALANCED) local_balanced_flag_array[i] = BSFC_find_imbalance( work_percent_array, global_actual_work_allocated[(myid + i - number_of_cuts)], total_weight, myid + i - number_of_cuts, numprocs); /* check if any of the partitions are not balanced */ *local_balanced_flag = BSFC_BALANCED; i = 0; while (*local_balanced_flag == BSFC_BALANCED && i <= number_of_cuts) { *local_balanced_flag = local_balanced_flag_array[i]; i++; } /* check the partitions to see if any more improvement can be made on them */ if(*local_balanced_flag == BSFC_NOT_BALANCED) { for(i = 0; i <= number_of_cuts; i++) if(ll_bins_head[i] != -1) { /* check if there is only 1 object in this bin. if there is, no further bin refinement will improve load-balance */ if(vert_in_cut_ptr[ll_bins_head[i]].next_sfc_vert_index == -1) { ll_bins_head[i] = -1; local_balanced_flag_array[i] = BSFC_BALANCED; //printf("Bin refinement cannot improve load balance on proc %d\n", myid); } } /* check again if any of the partitions are not balanced */ *local_balanced_flag = BSFC_BALANCED; j = 0; while (*local_balanced_flag == BSFC_BALANCED && j <= number_of_cuts) { *local_balanced_flag = local_balanced_flag_array[j]; j++; } } return; }