Beispiel #1
0
void BSFC_create_bins(int num_local_objects,
		      BSFC_VERTEX_PTR sfc_vert_ptr, 
		      int* amount_of_bits_used, int size_of_unsigned,
		      float* global_actual_work_allocated, 
		      float *work_percent_array, float* total_weight_ptr,
		      int* balanced_flag, unstructured_communication* verts_in_cut_info,
		      int* number_of_cuts,  
		      int bins_per_proc,
		      int myid, int numprocs)
{
  int i, j, number_of_bins, ierr = 0;
  int array_location = 0;
  int comm_tag = 4190; 
  int * proclist;
  int nreturn = 0;
  int off_proc_objects = 0;  /*counter to keep track of how 
			       many objects will be off processor*/
  float * binned_weight_array;
  int hashtable_length;
  int counter = 0;
  float *extra_float_array;
  float my_work_percent;
  int *bin_proc_array;
  float scanned_work_prev_allocated; /*scanned_work_prev_allocated is the 
					amount of work allocated to higher
					ranked procs */
  int amount_of_bits;
  BSFC_VERTEX_PTR send_vert_buffer;
  float* send_wgt_buffer;
  int current_proc;
  float* extra_float_array2 = NULL;
  int local_balanced_flag;
  int* number_of_cuts_in_bin;

  /*assume initially that each processor has the same amount of bins*/
  number_of_bins = numprocs * bins_per_proc;
  i=0;
  while(number_of_bins > BSFC_pow(2,i))
    i++;
  amount_of_bits = i;
  /* check to see that we have not used up all of the bits */
  if(amount_of_bits > 8*size_of_unsigned * KEYLENGTH)
    amount_of_bits = 8*size_of_unsigned * KEYLENGTH;
  number_of_bins = BSFC_pow(2,i);
  *amount_of_bits_used = amount_of_bits;

  /*hash table */
  float* tmp_float_array = new float[number_of_bins+1];
  for(i=0;i<number_of_bins;i++)
    tmp_float_array[i] = 0;
  for(i=0;i<num_local_objects;i++) {
    sfc_vert_ptr[i].my_bin = 
      BSFC_get_array_location(number_of_bins, amount_of_bits, 0, (sfc_vert_ptr+i));
    tmp_float_array[sfc_vert_ptr[i].my_bin] += sfc_vert_ptr[i].lb_weight;
  }
  tmp_float_array[number_of_bins] = *total_weight_ptr;
  binned_weight_array = (float*) malloc(sizeof(float) * (number_of_bins+1));
  i = MPI_Allreduce(tmp_float_array, binned_weight_array, 
		    number_of_bins+1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);
  delete []tmp_float_array;
  *total_weight_ptr = binned_weight_array[number_of_bins];
  /* global weight array has been created,
     now perform the scan operation on it */
  
  
  /* put in desired amount of work here, needs to
     be changed for varying workloads */
   
  // currently we assume that each processor should get the same amount of work
  my_work_percent = 1.0/((float) numprocs);
  work_percent_array[0] = 2.0;
  for(i=1;i<numprocs;i++)
    work_percent_array[i] = ((float) (numprocs-i))*my_work_percent;

  //////////////////////////////////////////////////////////////////////////////////////
  //////////////////////////////////////////////////////////////////////////////////////
  // the following is if the workload should be varied (e.g. for heterogeneous computers)
  //////////////////////////////////////////////////////////////////////////////////////
  //////////////////////////////////////////////////////////////////////////////////////
  /*  for(i=0;i<numprocs;i++)
    extra_float_array[i] = 0.0;
  extra_float_array[myid] = my_work_percent;
  // make sure that proc 0 gets all of the rest of the work percent 
  if(myid == 0)
    extra_float_array[0] = 1.1;    
  
  ierr = MPI_Allreduce(extra_float_array, work_percent_array, 
		       numprocs, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);
  
  for(i=numprocs-2;i>=0;i--)
    work_percent_array[i] += work_percent_array[i+1];
  */  // done varying workload...


  /* each processor needs to know which bins get partitioned
     into which processor, bin_proc_array lists max bin that
     a processor should get */
  for(i=0;i<numprocs;i++)
    global_actual_work_allocated[i] = 0;
  bin_proc_array = (int*) malloc(sizeof(int) * number_of_bins);
  current_proc = numprocs-1;
  i = number_of_bins-1;
  scanned_work_prev_allocated = 0;
  while(i >= 0 && current_proc > -1) {
    scanned_work_prev_allocated += binned_weight_array[i];
    bin_proc_array[i] = current_proc;
    if(scanned_work_prev_allocated > work_percent_array[current_proc]* *total_weight_ptr) {
      global_actual_work_allocated[current_proc] = scanned_work_prev_allocated;            
      bin_proc_array[i] = -current_proc;
      current_proc--;
      // the while statement is if there is more than 1 cut in a bin
      while(current_proc >= 0 &&
	    scanned_work_prev_allocated > work_percent_array[current_proc]* *total_weight_ptr) {
	global_actual_work_allocated[current_proc] = scanned_work_prev_allocated; 
	current_proc--;
      }
    }
    i--;
  }

  /* make sure that the last bin does not have a cut in it */
  if(bin_proc_array[0] > 0)
    bin_proc_array[0] = 0;
  global_actual_work_allocated[0] = *total_weight_ptr;


  /* specify which processor an object belongs to,
     we will know this because we know what bin an object 
     belongs to and we know what processor a bin belongs to */
  for(i=0;i<num_local_objects;i++) {
    if(bin_proc_array[sfc_vert_ptr[i].my_bin] >= 0) {
      sfc_vert_ptr[i].cut_bin_flag = BSFC_NO_CUT;
      sfc_vert_ptr[i].destination_proc = bin_proc_array[sfc_vert_ptr[i].my_bin];
    }
    else {
      sfc_vert_ptr[i].cut_bin_flag = BSFC_CUT;
      sfc_vert_ptr[i].destination_proc = -bin_proc_array[sfc_vert_ptr[i].my_bin];
    }
  }

  /* check to see if any cut-bin has too many objects in it and refine it. 
     the problem with too many objects in a cut-bin is that the processor 
     that gets assigned to that bin will get swamped with communication
     and might not have enough memory to hold all of the information.  we
     detect the cut-bins with too many objects by how many cuts are in that 
     bin.  numprocs - 1 is the amount of cuts and if this is less than
     or equal to max_cuts_in_bin then there is no possibility that a
     coarse bin is overloaded */
/*  if(numprocs - 1 > max_cuts_in_bin)
    ierr = Zoltan_BSFC_refine_overloaded_bins(zz, max_cuts_in_bin, 2*bins_per_proc, 
				      number_of_cuts_in_bin, wgt_dim,
				      sfc_vert_ptr, num_local_objects,
				      amount_of_bits, size_of_unsigned,
				      work_percent_array, total_weight_array, 
				      global_actual_work_allocated);*/
  
  //free(number_of_cuts_in_bin);    
  local_balanced_flag = 
    BSFC_find_imbalance(work_percent_array, global_actual_work_allocated[myid],
	 		*total_weight_ptr, myid, numprocs);

  ierr = MPI_Allreduce(&local_balanced_flag, balanced_flag, 1,
		       MPI_INT, MPI_MAX, MPI_COMM_WORLD);

  free(bin_proc_array);
  free(binned_weight_array);
  
 // *balanced_flag = BSFC_BALANCED;

  /* if the current partitioning is acceptable, the algorithm is finished */
  if(*balanced_flag == BSFC_BALANCED) {
    //printf("first level is good enough on proc %d \n", myid);
    return;
  }

  
  //printf("#############    need to refine some bins....   ##############\n");

  /* if the size of an unsigned integer is different on different processors,
     need to 'shrink' down the key size of unsigned integers on all processors
     to have the same amount of bits but only need to do this for sfc objects
     that are in a cut bin (currently not supported) */


  /* move the sfc objects that belong to any bin that contains a cut
     to the proper processor */
  verts_in_cut_info->used_flag = 1;
  verts_in_cut_info->send_procs_ptr = new int[numprocs];
  verts_in_cut_info->recv_procs_ptr = new int[numprocs];
  //  int* send_procs = verts_in_cut_info->send_procs_ptr;
  //  int* recv_procs = verts_in_cut_info->recv_procs_ptr;
  
  
  for(i=0;i<numprocs;i++)
    verts_in_cut_info->send_procs_ptr[i] = 0;
  for(i=0;i<num_local_objects;i++)
    if(sfc_vert_ptr[i].cut_bin_flag == BSFC_CUT) { // this vert is in a cut... 
      verts_in_cut_info->send_procs_ptr[sfc_vert_ptr[i].destination_proc] += 1;
    }

  i=MPI_Alltoall(verts_in_cut_info->send_procs_ptr, 1, MPI_INT, 
		 verts_in_cut_info->recv_procs_ptr,
		 1, MPI_INT, MPI_COMM_WORLD);

  //recalculate send_procs because it probably got changed
  for(i=0;i<numprocs;i++)
    verts_in_cut_info->send_procs_ptr[i] = 0;
  for(i=0;i<num_local_objects;i++) 
    if(sfc_vert_ptr[i].cut_bin_flag == BSFC_CUT)  // this vert is in a cut...
      verts_in_cut_info->send_procs_ptr[sfc_vert_ptr[i].destination_proc] += 1;

  MPI_Request* send_request = new MPI_Request[numprocs];
  MPI_Request* recv_request = new MPI_Request[numprocs];
  int send_count = 0, recv_count = 0;
  
  for(i=0;i<numprocs;i++)
    send_count += verts_in_cut_info->send_procs_ptr[i];
  for(i=0;i<numprocs;i++)
    recv_count += verts_in_cut_info->recv_procs_ptr[i];
  verts_in_cut_info->send_count = send_count;
  verts_in_cut_info->recv_count = recv_count;

  sfc_vertex* send_sfc_vert = new sfc_vertex[send_count];  //temp storage for objects that get sent out...
  verts_in_cut_info->recv_sfc_vert = new sfc_vertex[recv_count];
  
/*  for(i=0;i<numprocs;i++)
    printf("proc %d is sending %d objects to %d and receiving %d objects \n",myid,
	   verts_in_cut_info->send_procs_ptr[i], i, verts_in_cut_info->recv_procs_ptr[i]); */


  // fill up the send array...
  int* proc_counter = new int[numprocs];
  proc_counter[0] = 0;
  for(i=1;i<numprocs;i++)
    proc_counter[i] = proc_counter[i-1] + verts_in_cut_info->send_procs_ptr[i-1];
  
  recv_count = 0;
  for(i=0;i<myid;i++)
    recv_count += verts_in_cut_info->recv_procs_ptr[i];
  //printf("proc %d has recv_count of %d \n", myid, recv_count);
  for(i=0;i<num_local_objects;i++)
    if(sfc_vert_ptr[i].cut_bin_flag == BSFC_CUT) {
      if(sfc_vert_ptr[i].destination_proc != myid) {
	send_sfc_vert[proc_counter[sfc_vert_ptr[i].destination_proc]] = sfc_vert_ptr[i];
	proc_counter[sfc_vert_ptr[i].destination_proc] += 1;
      }
      else { // if i need to send to myself... 
	verts_in_cut_info->recv_sfc_vert[recv_count] = sfc_vert_ptr[i];
	recv_count++; 
      }
    }
  //done filling up the send array
  int tag = 21503;
  proc_counter[0] = 0;
  for(i=1;i<numprocs;i++)
    proc_counter[i] = proc_counter[i-1] + verts_in_cut_info->send_procs_ptr[i-1];

  recv_count = 0;
  for(i=0;i<numprocs;i++) {
    if(i!= myid) {
      // send out necessary info here...
      if(verts_in_cut_info->send_procs_ptr[i] != 0) {
	j = MPI_Isend((send_sfc_vert+proc_counter[i]), 
		      verts_in_cut_info->send_procs_ptr[i], LB_VERT_TYPE, 
		      i, tag, MPI_COMM_WORLD, (send_request+i));
      }
      // receive necessary info here...
      if(verts_in_cut_info->recv_procs_ptr[i] != 0) {
	j = MPI_Irecv(&(verts_in_cut_info->recv_sfc_vert[recv_count]), 
		      verts_in_cut_info->recv_procs_ptr[i], LB_VERT_TYPE, 
		      i, tag, MPI_COMM_WORLD, (recv_request+i));
      }
    }
    recv_count += verts_in_cut_info->recv_procs_ptr[i];
  }
  delete []proc_counter;
  
  // wait until the info is sent and received...
  for(i=0;i<numprocs;i++)
    if(i!= myid)
      {
	if(verts_in_cut_info->send_procs_ptr[i] != 0) {
	  MPI_Status status;
	  j = MPI_Wait((send_request+i), &status);
	}
	if(verts_in_cut_info->recv_procs_ptr[i] != 0) {
	  MPI_Status status;
	  j = MPI_Wait((recv_request+i), &status);
	}
      }
  
  delete []send_request;
  delete []recv_request;
  delete []send_sfc_vert;

  //*balanced_flag = BSFC_BALANCED;
  return;
}
void BSFC_refine_partition(int* local_balanced_flag, int *amount_of_used_bits, int num_vert_in_cut,
                           BSFC_VERTEX_PTR vert_in_cut_ptr, float* work_percent_array, float total_weight,
                           float* global_actual_work_allocated, int number_of_cuts, int* ll_bins_head,
                           float* work_prev_allocated, int subbins_per_bin, int* local_balanced_flag_array, int myid,
                           int numprocs)

{
    //printf("proc %d is refining the partition level\n",myid);
    int i = 0, j = 0, k, current_proc;
    int amount_of_bits;
    float* binned_weight_array;
    int* bin_proc_array;
    int* ll_prev_bins;
    int ll_counter, ll_location, *ll_bins_head_copy;
    
    /* amount of sub-bins in a bin, probably want this as a passed in parameter */
    int number_of_bins = subbins_per_bin;
    
    /* check to see that all of the bits of the sfc key 
     have not already been used */
    if(*amount_of_used_bits >= sizeof(unsigned) * KEYLENGTH * 8)
    {
        //printf("No more refinement is possible in the repartitioning on proc %d.\n", myid);
        *local_balanced_flag = BSFC_BALANCED;
        return;
    }
    
    /*  assume initially that all the partitions on this processor are balanced.
     we will check later on whether any are not balanced */
    *local_balanced_flag = BSFC_BALANCED;
    
    /* if there are a lot of cuts in a bin, we want the amount of bins
     to be greater than the amount of cuts */
    if(number_of_cuts >= number_of_bins)
        number_of_bins = number_of_cuts + 1;
    
    /*increase sub-bins so that there is a power of 2 */
    i = 0;
    while (number_of_bins > BSFC_pow(2, i))
        i++;
    amount_of_bits = i;
    if(amount_of_bits + *amount_of_used_bits > 8 * sizeof(unsigned) * KEYLENGTH)
        amount_of_bits = 8 * sizeof(unsigned) * KEYLENGTH - *amount_of_used_bits;
    number_of_bins = BSFC_pow(2, i);
    
    ll_prev_bins = (int*) malloc(sizeof(int) * (number_of_cuts + 1));
    
    ll_bins_head_copy = (int*) malloc(sizeof(int) * (number_of_cuts + 1));
    
    for(i = 0; i <= number_of_cuts; i++)
        ll_bins_head_copy[i] = -1;
    
    /* loop over all bins that have a cut in them using linklist 
     to find objects in the cut bins */
    for(ll_counter = 0; ll_counter <= number_of_cuts; ll_counter++)
        if((local_balanced_flag_array[ll_counter] == BSFC_NOT_BALANCED) && ll_bins_head[ll_counter] != -1)
        {
            
            /* calculate new bin numbers for objects that are in a cut bin */
            ll_location = ll_bins_head[ll_counter];
            while (ll_location != -1)
            {
                vert_in_cut_ptr[ll_location].my_bin = BSFC_get_array_location(number_of_bins, amount_of_bits,
                                                                              *amount_of_used_bits,
                                                                              (vert_in_cut_ptr + ll_location));
                ll_location = vert_in_cut_ptr[ll_location].next_sfc_vert_index;
            }
            
            binned_weight_array = (float*) malloc(number_of_bins * sizeof(float));
            for(i = 0; i < number_of_bins; i++)
                binned_weight_array[i] = 0;
            
            /* fill up the weight array */
            ll_location = ll_bins_head[ll_counter];
            while (ll_location != -1)
            {
                binned_weight_array[vert_in_cut_ptr[ll_location].my_bin] += vert_in_cut_ptr[ll_location].lb_weight;
                ll_location = vert_in_cut_ptr[ll_location].next_sfc_vert_index;
            }
            
            bin_proc_array = (int*) malloc(sizeof(int) * number_of_bins);
            i = number_of_bins - 1;
            
            current_proc = vert_in_cut_ptr[ll_bins_head[ll_counter]].destination_proc;
            float scanned_work_prev_allocated = work_prev_allocated[current_proc];
            while (i >= 0 && current_proc > -1)
            {
                scanned_work_prev_allocated += binned_weight_array[i];
                bin_proc_array[i] = current_proc;
                if(scanned_work_prev_allocated > work_percent_array[current_proc] * total_weight)
                {
                    global_actual_work_allocated[current_proc] = scanned_work_prev_allocated;
                    bin_proc_array[i] = -current_proc;
                    current_proc--;
                    // the while statement is if there is more than 1 cut in a bin
                    while (current_proc >= 0 && scanned_work_prev_allocated
                            > work_percent_array[current_proc] * total_weight)
                    {
                        global_actual_work_allocated[current_proc] = scanned_work_prev_allocated;
                        current_proc--;
                    }
                }
                i--;
            }
            
            /* specify which processor an object belongs to,
             we will know this because we know what bin an object 
             belongs to and we know what processor a bin belongs to */

            ll_location = ll_bins_head[ll_counter];
            while (ll_location != -1)
            {
                if(bin_proc_array[vert_in_cut_ptr[ll_location].my_bin] >= 0)
                {
                    vert_in_cut_ptr[ll_location].cut_bin_flag = BSFC_NO_CUT;
                    vert_in_cut_ptr[ll_location].destination_proc = bin_proc_array[vert_in_cut_ptr[ll_location].my_bin];
                    ll_location = vert_in_cut_ptr[ll_location].next_sfc_vert_index;
                }
                else
                { /* if this object is in a bin with a cut... */
                    vert_in_cut_ptr[ll_location].cut_bin_flag = BSFC_CUT;
                    vert_in_cut_ptr[ll_location].destination_proc =
                            -bin_proc_array[vert_in_cut_ptr[ll_location].my_bin];
                    if(ll_bins_head_copy[number_of_cuts - myid + //array continued on the next line
                            vert_in_cut_ptr[ll_location].destination_proc]
                       != -1)
                    {
                        int ll_next = vert_in_cut_ptr[ll_location].next_sfc_vert_index;
                        vert_in_cut_ptr[ll_location].next_sfc_vert_index = ll_bins_head_copy[number_of_cuts - myid
                                + vert_in_cut_ptr[ll_location].destination_proc];
                        ll_bins_head_copy[number_of_cuts - myid + vert_in_cut_ptr[ll_location].destination_proc] =
                                ll_location;
                        ll_location = ll_next;
                    }
                    else
                    {
                        int ll_next = vert_in_cut_ptr[ll_location].next_sfc_vert_index;
                        vert_in_cut_ptr[ll_location].next_sfc_vert_index = -1;
                        ll_bins_head_copy[number_of_cuts - myid + vert_in_cut_ptr[ll_location].destination_proc] =
                                ll_location;
                        /* calculate work_prev_allocated for this new partition */
                        work_prev_allocated[vert_in_cut_ptr[ll_location].destination_proc] =
                                work_prev_allocated[ll_counter - number_of_cuts + myid];
                        for(i = vert_in_cut_ptr[ll_location].my_bin + 1; i < number_of_bins; i++)
                            work_prev_allocated[vert_in_cut_ptr[ll_location].destination_proc] +=
                                    binned_weight_array[i];
                        ll_location = ll_next;
                    }
                }
            }
            free(binned_weight_array);
            free(bin_proc_array);
        }
    
    for(i = 0; i <= number_of_cuts; i++)
        ll_bins_head[i] = ll_bins_head_copy[i];
    
    free(ll_prev_bins);
    free(ll_bins_head_copy);
    
    *amount_of_used_bits += amount_of_bits;
    
    /* check which partitions that are not balanced */
    for(i = 0; i <= number_of_cuts; i++)
        if(ll_bins_head[i] != -1 || local_balanced_flag_array[i] != BSFC_BALANCED)
            local_balanced_flag_array[i] = BSFC_find_imbalance(
                    work_percent_array, global_actual_work_allocated[(myid + i - number_of_cuts)], total_weight,
                    myid + i - number_of_cuts, numprocs);
    
    /* check if any of the partitions are not balanced */
    *local_balanced_flag = BSFC_BALANCED;
    i = 0;
    while (*local_balanced_flag == BSFC_BALANCED && i <= number_of_cuts)
    {
        *local_balanced_flag = local_balanced_flag_array[i];
        i++;
    }
    
    /* check the partitions to see if any more improvement can be made on them */
    if(*local_balanced_flag == BSFC_NOT_BALANCED)
    {
        for(i = 0; i <= number_of_cuts; i++)
            if(ll_bins_head[i] != -1)
            {
                /* check if there is only 1 object in this bin. if there is, 
                 no further bin refinement will improve load-balance */
                if(vert_in_cut_ptr[ll_bins_head[i]].next_sfc_vert_index == -1)
                {
                    ll_bins_head[i] = -1;
                    local_balanced_flag_array[i] = BSFC_BALANCED;
                    //printf("Bin refinement cannot improve load balance on proc %d\n", myid);
                }
            }
        /* check again if any of the partitions are not balanced */
        *local_balanced_flag = BSFC_BALANCED;
        j = 0;
        while (*local_balanced_flag == BSFC_BALANCED && j <= number_of_cuts)
        {
            *local_balanced_flag = local_balanced_flag_array[j];
            j++;
        }
    }
    
    return;
}