Example #1
0
static int serial_fm2 (ZZ *zz,
    HGraph *hg,
    int p,
    float *part_sizes,
    Partition part,
    PHGPartParams *hgp,
    float bal_tol)
{
int    i, j, vertex, edge, *pins[2], *locked = 0, *locked_list = 0, round = 0;
double total_weight, part_weight[2], max_weight[2];
double cutsize_beforepass, best_cutsize, *gain = 0;
HEAP   heap[2];
int    steplimit;
char   *yo="serial_fm2";
int    part_dim = (hg->VtxWeightDim ? hg->VtxWeightDim : 1);
#ifdef HANDLE_ISOLATED_VERTICES    
 int    isocnt=0;
#endif
#ifdef _DEBUG
 double tw0, imbal, cutsize;
#endif

double error, best_error;
int    best_imbalance, imbalance;

  if (p != 2) {
     ZOLTAN_PRINT_ERROR(zz->Proc, yo, "p!=2 not allowed for local_fm2.");
     return ZOLTAN_FATAL;
     }

  if (hg->nEdge == 0)
     return ZOLTAN_OK;

  /* Calculate the weights in each partition and total, then maxima */
  part_weight[0] = 0.0;
  part_weight[1] = 0.0;
  if (hg->vwgt)  {
     for (i = 0; i < hg->nVtx; i++)
        part_weight[part[i]] += hg->vwgt[i*hg->VtxWeightDim];
     total_weight = part_weight[0] + part_weight[1];
  }
  else  {
     total_weight = (double)(hg->nVtx);
     for (i = 0; i < hg->nVtx; i++)
        part_weight[part[i]] += 1.0;
  }
  max_weight[0] = total_weight * bal_tol * part_sizes[0];
  max_weight[1] = total_weight * bal_tol * part_sizes[part_dim];

#ifdef _DEBUG
  tw0 = total_weight * part_sizes[0];
#endif
  
  if (!(pins[0]     = (int*)   ZOLTAN_CALLOC(2*hg->nEdge, sizeof(int)))
   || !(locked      = (int*)   ZOLTAN_CALLOC(hg->nVtx,    sizeof(int)))
   || !(locked_list = (int*)   ZOLTAN_CALLOC(hg->nVtx,    sizeof(int)))
   || !(gain        = (double*)ZOLTAN_CALLOC(hg->nVtx,    sizeof(double))) ) {
     Zoltan_Multifree(__FILE__,__LINE__, 4, &pins[0], &locked, &locked_list,
          &gain);
     ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Insufficient memory.");
     return ZOLTAN_MEMERR;
  }
  pins[1] = &(pins[0][hg->nEdge]);

  /* Initial calculation of the pins distribution and gain values */
  for (i = 0; i < hg->nEdge; i++)
     for (j = hg->hindex[i]; j < hg->hindex[i+1]; j++)
        (pins[part[hg->hvertex[j]]][i])++;
  for (i = 0; i < hg->nVtx; i++)
     for (j = hg->vindex[i]; j < hg->vindex[i+1]; j++) {
        edge = hg->vedge[j];
        if (pins[part[i]][edge] == 1)
           gain[i] += (hg->ewgt ? hg->ewgt[edge] : 1.0);
        else if (pins[1-part[i]][edge] == 0)
           gain[i] -= (hg->ewgt ? hg->ewgt[edge] : 1.0);
     }

  /* Initialize the heaps and fill them with the gain values */
  Zoltan_Heap_Init(zz, &heap[0], hg->nVtx);
  Zoltan_Heap_Init(zz, &heap[1], hg->nVtx);  
  for (i = 0; i < hg->nVtx; i++)
      if (!hgp->UseFixedVtx || hg->fixed_part[i]<0) {
#ifdef HANDLE_ISOLATED_VERTICES          
          if (hg->vindex[i+1]==hg->vindex[i]) { /* isolated vertex */
              part_weight[part[i]] -= (hg->vwgt ? hg->vwgt[i*hg->VtxWeightDim] 
                                                : 1.0);
              part[i] = -(part[i]+1); /* remove those vertices from that part*/
              ++isocnt;
          } else
#endif
              Zoltan_Heap_Input(&heap[part[i]], i, gain[i]);
      }
#ifdef _DEBUG
      else {
          int pp = (hg->fixed_part[i] < hg->bisec_split) ? 0 : 1;
          if (part[i]!=pp) 
              errexit("%s: beginning of pass for hg->info=%d vertex %d is fixed at %d bisec_split is %d but its part is %d\n", uMe(hg->comm), hg->info, i, hg->fixed_part[i], hg->bisec_split, part[i]);
          
              
      }
#endif
  Zoltan_Heap_Make(&heap[0]);
  Zoltan_Heap_Make(&heap[1]);

  /* Initialize given partition as best partition */
  best_cutsize = cutsize_beforepass = Zoltan_PHG_Compute_NetCut(hg->comm, hg, part);
  best_error = MAX (part_weight[0]-max_weight[0], part_weight[1]-max_weight[1]);
  best_imbalance = (part_weight[0]>max_weight[0])||(part_weight[1]>max_weight[1]);
  do {
    int step = 0, no_better_steps = 0, number_locked = 0, best_locked = 0;
    int sour, dest;
    double cur_cutsize=best_cutsize;

    round++;
    cutsize_beforepass = best_cutsize;
    if (hgp->output_level > PHG_DEBUG_LIST)
      printf("ROUND %d:\nSTEP VERTEX  PARTS MAX_WGT CHANGE CUTSIZE\n",round);

    steplimit = (hgp->fm_max_neg_move < 0) ? hg->nVtx : hgp->fm_max_neg_move;
    /* steplimit = hg->nVtx/4;  Robsys previous choice */

    while (step < hg->nVtx && no_better_steps < steplimit) {
        step++;
        no_better_steps++;

        if (Zoltan_Heap_Empty(&heap[0]))
           sour = 1;
        else if (Zoltan_Heap_Empty(&heap[1]))
           sour = 0;
        else if (part_weight[0] > max_weight[0])
           sour = 0;
        else if (part_weight[1] > max_weight[1])
           sour = 1;
        else if (Zoltan_Heap_Max_Value(&heap[0])
              >  Zoltan_Heap_Max_Value(&heap[1]))
           sour = 0;
        else
           sour = 1;
        dest = 1-sour;
        vertex = Zoltan_Heap_Extract_Max(&heap[sour]);
        if (vertex<0)
            break;

        locked[vertex] = part[vertex] + 1;
        locked_list[number_locked++] = vertex;
        cur_cutsize -= gain[vertex];        
        
        Zoltan_HG_move_vertex (hg, vertex, sour, dest, part, pins, gain, heap);

#ifdef _DEBUG
        imbal = (tw0==0.0) ? 0.0 : (part_weight[0]-tw0)/tw0;
        uprintf(hg->comm, "%4d: SEQ moving %4d from %d to %d cut=%6.0lf bal=%.3lf\n", step, vertex, sour, dest, cur_cutsize, imbal);
        /* Just for debugging */
        cutsize = Zoltan_PHG_Compute_NetCut(hg->comm, hg, part);
        if (cur_cutsize!=cutsize) {
            errexit("%s: SEQ after move cutsize=%.2lf Verify: total=%.2lf\n", uMe(hg->comm), cur_cutsize,
                    cutsize);
        }
#endif
        
        part_weight[sour] -= (hg->vwgt ? hg->vwgt[vertex*hg->VtxWeightDim] 
                                       : 1.0);
        part_weight[dest] += (hg->vwgt ? hg->vwgt[vertex*hg->VtxWeightDim] 
                                       : 1.0);

        error = MAX (part_weight[0]-max_weight[0],part_weight[1]-max_weight[1]);
        imbalance = (part_weight[0]>max_weight[0])||(part_weight[1]>max_weight[1]);

        if ( ( best_imbalance && (error < best_error))
          || (!imbalance && (cur_cutsize < best_cutsize)))  {
            best_error   = error;
            best_imbalance = imbalance;
            best_locked  = number_locked;
            best_cutsize = cur_cutsize;
            no_better_steps = 0;
        }
        if (hgp->output_level > PHG_DEBUG_LIST+1)
           printf ("%4d %6d %2d->%2d %7.2f %f %f\n", step, vertex, sour, dest,
            error, cur_cutsize - cutsize_beforepass, cur_cutsize);
    }

#ifdef _DEBUG
    uprintf(hg->comm, "SEQ Best CUT=%6.0lf at move %d\n", best_cutsize, best_locked);
#endif
    
    /* rollback */
     while (number_locked != best_locked) {
        vertex = locked_list[--number_locked];
        sour = part[vertex];
        dest = locked[vertex] - 1;

        Zoltan_HG_move_vertex (hg, vertex, sour, dest, part, pins, gain, heap);

        part_weight[sour] -= (hg->vwgt ? hg->vwgt[vertex*hg->VtxWeightDim] 
                                       : 1.0);
        part_weight[dest] += (hg->vwgt ? hg->vwgt[vertex*hg->VtxWeightDim] 
                                       : 1.0);
        Zoltan_Heap_Input(&heap[dest], vertex, gain[vertex]);
        locked[vertex] = 0;
     }

     /* only update data structures if we're going to do another pass */
     if ((best_cutsize < cutsize_beforepass) &&  (round < hgp->fm_loop_limit)) {         
         while (number_locked) {
             vertex = locked_list[--number_locked];
             locked[vertex] = 0;
             Zoltan_Heap_Input(&heap[part[vertex]], vertex, gain[vertex]);
         }
         
         Zoltan_Heap_Make(&(heap[0]));
         Zoltan_Heap_Make(&(heap[1]));
     }
  } while ((best_cutsize < cutsize_beforepass) &&  (round < hgp->fm_loop_limit));

#ifdef HANDLE_ISOLATED_VERTICES
  if (isocnt) {
#ifdef _DEBUG      
      double isoimbalbefore, isoimbal;
#endif
      double targetw0;
      
      targetw0 = total_weight * part_sizes[0];
#ifdef _DEBUG      
      isoimbalbefore = (targetw0==0) ? 0.0 : (part_weight[0] - targetw0)/ targetw0;
#endif
      for (i=0; i < hg->nVtx; ++i)
          if (!hgp->UseFixedVtx || hg->fixed_part[i]<0) {
              if (hg->vindex[i+1]==hg->vindex[i])  { /* go over isolated vertices */
                  int npno = (part_weight[0] <  targetw0) ? 0 : 1;
                  part_weight[npno] += (hg->vwgt ? hg->vwgt[i*hg->VtxWeightDim] 
                                                 : 1.0);                
                  part[i] = npno;
              }
          }
#ifdef _DEBUG      
      isoimbal = (targetw0==0) ? 0.0 : (part_weight[0] - targetw0)/ targetw0;
      uprintf(hg->comm, "SEQ %d isolated vertices, balance before: %.3lf  after: %.3lf\n", isocnt, isoimbalbefore, isoimbal);
#endif
  }
#endif  
  
  /* gain_check (hg, gain, part, pins); */
  Zoltan_Multifree(__FILE__,__LINE__, 4, &pins[0], &locked, &locked_list, &gain);
  Zoltan_Heap_Free(&heap[0]);
  Zoltan_Heap_Free(&heap[1]);

  return ZOLTAN_OK;
}
Example #2
0
static int refine_fm2 (ZZ *zz,
                       HGraph *hg,
                       int p,
                       float *part_sizes,
                       Partition part,
                       PHGPartParams *hgp,
                       float bal_tol
    )
{
    int    i, j, ierr=ZOLTAN_OK, *pins[2]={NULL,NULL}, *lpins[2]={NULL,NULL};
    int    *moves=NULL, *mark=NULL, *adj=NULL, passcnt=0;
    float  *gain=NULL, *lgain=NULL;
    int    best_cutsizeat, cont, successivefails=0;
    double total_weight, weights[2], total_lweight, lweights[2], lwadjust[2],
        max_weight[2], lmax_weight[2], avail[2], gavail[2];
    int availcnt[2], gavailcnt[2];
    double targetw0, ltargetw0, minvw=DBL_MAX;
    double cutsize, best_cutsize, 
        best_limbal, imbal, limbal;
    HEAP   heap[2];
    char   *yo="refine_fm2";
    int    part_dim = (hg->VtxWeightDim ? hg->VtxWeightDim : 1);
#ifdef HANDLE_ISOLATED_VERTICES    
    int    isocnt=hg->nVtx; /* only root uses isocnt, isolated vertices
                               are kept at the end of moves array */
    int    *deg=NULL, *ldeg=NULL;
#if 0
    double best_imbal;
#endif
#endif
    PHGComm *hgc=hg->comm;
    int rootRank;
    
    struct phg_timer_indices *timer = Zoltan_PHG_LB_Data_timers(zz);
    int do_timing = (hgp->use_timers > 2);
    int detail_timing = (hgp->use_timers > 3);

    ZOLTAN_TRACE_ENTER(zz, yo);

    if (p != 2) {
        ZOLTAN_PRINT_ERROR(zz->Proc, yo, "p!=2 not allowed for refine_fm2.");
        ZOLTAN_TRACE_EXIT(zz, yo);
        return ZOLTAN_FATAL;
    }

    /* return only if globally there is no edge or vertex */
    if (!hg->dist_y[hgc->nProc_y] || hg->dist_x[hgc->nProc_x] == 0) {
        ZOLTAN_TRACE_EXIT(zz, yo);
        return ZOLTAN_OK;
    }


#ifdef USE_SERIAL_REFINEMENT_ON_ONE_PROC
    if (hgc->nProc==1){ /* only one proc? use serial code */
        ZOLTAN_TRACE_EXIT(zz, yo);
        return serial_fm2 (zz, hg, p, part_sizes, part, hgp, bal_tol);
    }
#endif

    if (do_timing) { 
        if (timer->rfrefine < 0) 
            timer->rfrefine = Zoltan_Timer_Init(zz->ZTime, 1, "Ref_P_Total");
        ZOLTAN_TIMER_START(zz->ZTime, timer->rfrefine, hgc->Communicator);
    }
    if (detail_timing) {
        if (timer->rfpins < 0) 
            timer->rfpins = Zoltan_Timer_Init(zz->ZTime, 0, "Ref_P_Pins");
        if (timer->rfiso < 0) 
            timer->rfiso = Zoltan_Timer_Init(zz->ZTime, 0, "Ref_P_IsolatedVert");
        if (timer->rfgain < 0) 
            timer->rfgain = Zoltan_Timer_Init(zz->ZTime, 0, "Ref_P_Gain");
        if (timer->rfheap < 0) 
            timer->rfheap = Zoltan_Timer_Init(zz->ZTime, 0, "Ref_P_Heap");
        if (timer->rfpass < 0) 
            timer->rfpass = Zoltan_Timer_Init(zz->ZTime, 0, "Ref_P_Pass");
        if (timer->rfroll < 0) 
            timer->rfroll = Zoltan_Timer_Init(zz->ZTime, 0, "Ref_P_Roll");
        if (timer->rfnonroot < 0) 
            timer->rfnonroot = Zoltan_Timer_Init(zz->ZTime, 0, "Ref_P_NonRoot");
    }
    
    
    /* find the index of the proc in column group with 
       the most #nonzeros; it will be our root
       proc for computing moves since it has better 
       knowedge about global hypergraph.
       We ignore returned #pins (i) in root */
    Zoltan_PHG_Find_Root(hg->nPins, hgc->myProc_y, hgc->col_comm, 
                         &i, &rootRank);
    
    /* Calculate the weights in each partition and total, then maxima */
    weights[0] = weights[1] = 0.0;
    lweights[0] = lweights[1] = 0.0;
    if (hg->vwgt) 
        for (i = 0; i < hg->nVtx; i++) {
            lweights[part[i]] += hg->vwgt[i*hg->VtxWeightDim];
            minvw = (minvw > hg->vwgt[i*hg->VtxWeightDim]) 
                  ? hg->vwgt[i*hg->VtxWeightDim] 
                  : minvw;
        }
    else {
        minvw = 1.0;
        for (i = 0; i < hg->nVtx; i++)
            lweights[part[i]] += 1.0;
    }

    MPI_Allreduce(lweights, weights, 2, MPI_DOUBLE, MPI_SUM, hgc->row_comm);
    total_weight = weights[0] + weights[1];
    targetw0 = total_weight * part_sizes[0]; /* global target weight for part 0 */

    max_weight[0] = total_weight * bal_tol * part_sizes[0];
    max_weight[1] = total_weight * bal_tol * part_sizes[part_dim]; /* should be (1 - part_sizes[0]) */


    if (weights[0]==0.0) {
        ltargetw0 = targetw0 / hgc->nProc_x;
        lmax_weight[0] = max_weight[0] / hgc->nProc_x;
    } else {
        lmax_weight[0] = (weights[0]==0.0) ? 0.0 : lweights[0] +
            (max_weight[0] - weights[0]) * ( lweights[0] / weights[0] );
        ltargetw0 = targetw0 * ( lweights[0] / weights[0] ); /* local target weight */
    }
    if (weights[1]==0.0)
        lmax_weight[1] = max_weight[1] / hgc->nProc_x;
    else
        lmax_weight[1] = (weights[1]==0.0) ? 0.0 : lweights[1] +
            (max_weight[1] - weights[1]) * ( lweights[1] / weights[1] );

    total_lweight = lweights[0]+lweights[1];
    
    avail[0] = MAX(0.0, lmax_weight[0]-total_lweight);
    avail[1] = MAX(0.0, lmax_weight[1]-total_lweight);
    availcnt[0] = (avail[0] == 0) ? 1 : 0;
    availcnt[1] = (avail[1] == 0) ? 1 : 0; 
    MPI_Allreduce(avail, gavail, 2, MPI_DOUBLE, MPI_SUM, hgc->row_comm);
    MPI_Allreduce(availcnt, gavailcnt, 2, MPI_INT, MPI_SUM, hgc->row_comm);

#ifdef _DEBUG
    if (gavailcnt[0] || gavailcnt[1])
        uprintf(hgc, "before adjustment, LMW[%.1lf, %.1lf]\n", lmax_weight[0], lmax_weight[1]);
#endif

    if (gavailcnt[0]) 
        lmax_weight[0] += gavail[0] / (double) gavailcnt[0];
    
    if (gavailcnt[1]) 
        lmax_weight[1] += gavail[1] / (double) gavailcnt[1];
    
    /* Our strategy is to stay close to the current local weight balance.
       We do not need the same local balance on each proc, as long as
       we achieve approximate global balance.                            */

#ifdef _DEBUG
    imbal = (targetw0==0.0) ? 0.0 : fabs(weights[0]-targetw0)/targetw0;
    limbal = (ltargetw0==0.0) ? 0.0 : fabs(lweights[0]-ltargetw0)/ltargetw0;
    uprintf(hgc, "H(%d, %d, %d), FM2: W[%.1lf, %.1lf] MW:[%.1lf, %.1lf] I=%.3lf  LW[%.1lf, %.1lf] LMW[%.1lf, %.1lf] LI=%.3lf\n", hg->nVtx, hg->nEdge, hg->nPins, weights[0], weights[1], max_weight[0], max_weight[1], imbal, lweights[0], lweights[1], lmax_weight[0], lmax_weight[1], limbal);
#endif

    
    if ((hg->nEdge && (!(pins[0]    = (int*) ZOLTAN_MALLOC(2 * hg->nEdge * sizeof(int)))
                      || !(lpins[0] = (int*) ZOLTAN_CALLOC(2 * hg->nEdge, sizeof(int))))) ||
        (hg->nVtx && (!(moves   = (int*)   ZOLTAN_MALLOC(hg->nVtx * sizeof(int)))
                     || !(lgain = (float*) ZOLTAN_MALLOC(hg->nVtx * sizeof(float))))))
        MEMORY_ERROR;

    if (hg->nEdge) {
        pins[1] = &(pins[0][hg->nEdge]);
        lpins[1] = &(lpins[0][hg->nEdge]);
    }

    if (hgc->myProc_y==rootRank) { /* only root needs mark, adj, gain and heaps*/
        Zoltan_Heap_Init(zz, &heap[0], hg->nVtx);
        Zoltan_Heap_Init(zz, &heap[1], hg->nVtx);  
        if (hg->nVtx &&
            (!(mark     = (int*)   ZOLTAN_CALLOC(hg->nVtx, sizeof(int)))
             || !(adj   = (int*)   ZOLTAN_MALLOC(hg->nVtx * sizeof(int)))   
             || !(gain  = (float*) ZOLTAN_MALLOC(hg->nVtx * sizeof(float)))))
            MEMORY_ERROR;
    }

    /* Initial calculation of the local pin distribution (sigma in UVC's papers)  */
    if (detail_timing)         
        ZOLTAN_TIMER_START(zz->ZTime, timer->rfpins, hgc->Communicator);                        
    for (i = 0; i < hg->nEdge; ++i)
        for (j = hg->hindex[i]; j < hg->hindex[i+1]; ++j){
            ++(lpins[part[hg->hvertex[j]]][i]);
        }
    if (detail_timing)         
        ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfpins, hgc->Communicator);                    
    

#ifdef HANDLE_ISOLATED_VERTICES        
    /* first compute vertex degree to find any isolated vertices
       we use lgain and gain, as ldeg, deg.*/
    if (hg->nVtx) {
        if (detail_timing)         
            ZOLTAN_TIMER_START(zz->ZTime, timer->rfiso, hgc->Communicator);        
        ldeg = (int *) lgain;
        deg = (int *) gain; /* null for non-root but that is fine */
        for (i = 0; i < hg->nVtx; ++i)
            ldeg[i] = hg->vindex[i+1] - hg->vindex[i];
        MPI_Reduce(ldeg, deg, hg->nVtx, MPI_INT, MPI_SUM, rootRank,
                   hg->comm->col_comm);

        if (hgc->myProc_y==rootRank) { /* root marks isolated vertices */
            for (i=0; i<hg->nVtx; ++i)
                if (!hgp->UseFixedVtx || hg->fixed_part[i]<0) {
                    if (!deg[i]) {
                        moves[--isocnt] = i;
                        part[i] = -(part[i]+1); /* remove those vertices from that part*/
                    }
                }
        }   
        if (detail_timing)         
            ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfiso, hgc->Communicator);        

    }
#endif
    
    do {
        int v=1, movecnt=0, neggaincnt=0, from, to;
        int maxneggain = (hgp->fm_max_neg_move < 0) ? hg->nVtx : hgp->fm_max_neg_move;
        int notfeasible=(weights[0]>max_weight[0]) || (weights[1]>max_weight[1]);
    
        /* now compute global pin distribution */
        if (hg->nEdge) {
            if (detail_timing)         
                ZOLTAN_TIMER_START(zz->ZTime, timer->rfpins, hgc->Communicator);                    
            MPI_Allreduce(lpins[0], pins[0], 2*hg->nEdge, MPI_INT, MPI_SUM, 
                          hgc->row_comm);
            if (detail_timing)         
                ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfpins, hgc->Communicator);                    
        }

        /* now we can compute actual cut */
        best_cutsizeat=0;
        cutsize = 0.0;
        for (i=0; i < hg->nEdge; ++i) {
            if (pins[0][i] && pins[1][i])
                cutsize += (hg->ewgt ? hg->ewgt[i] : 1.0);
        }
        MPI_Allreduce(&cutsize, &best_cutsize, 1, MPI_DOUBLE, MPI_SUM, hgc->col_comm);
        cutsize = best_cutsize;

        imbal = (targetw0==0.0) ? 0.0 : fabs(weights[0]-targetw0)/targetw0;        
        best_limbal = limbal = (ltargetw0==0.0) ? 0.0
            : fabs(lweights[0]-ltargetw0)/ltargetw0;

        /* UVCUVC: it looks like instead of moving always from overloaded
           part, alternating the 'from' part gives better results.
           Hence if the imbal is not really bad (2x worse) we use that approach  */
        if (imbal > BADBALANCE*(bal_tol-1.0) ) /* decide which way the moves will be in this pass */
            from = (weights[0] < targetw0) ? 1 : 0;
        else 
            from = passcnt % 2; 
        /* we want to be sure that everybody!!! picks the same source */
        MPI_Bcast(&from, 1, MPI_INT, 0, hgc->Communicator); 

        to = 1-from;
        
#ifdef _DEBUG
        /* Just for debugging */
        best_cutsize = Zoltan_PHG_Compute_NetCut(hgc, hg, part);
        if (best_cutsize!=cutsize) {
            errexit("%s: Initial cutsize=%.2lf Verify: total=%.2lf\n", uMe(hgc), cutsize,
                    best_cutsize);
        }
        if (hgc->myProc_y==rootRank)
            for (i = 0; i< hg->nVtx; ++i)
                if (mark[i])
                    errexit("mark[%d]=%d", i, mark[i]);
        /* debuggging code ends here */
#endif

        /* compute only the gains of the vertices from 'from' part */
        if (detail_timing)         
            ZOLTAN_TIMER_START(zz->ZTime, timer->rfgain, hgc->Communicator);                    
        
        for (i = 0; i < hg->nVtx; ++i) {
            lgain[i] = 0.0;
            if ((part[i]==from) && (!hgp->UseFixedVtx || hg->fixed_part[i]<0))
                for (j = hg->vindex[i]; j < hg->vindex[i+1]; j++) {
                    int edge = hg->vedge[j];
                    if ((pins[0][edge]+pins[1][edge])>1) { /* if they have at least 2 pins :) */
                        if (pins[part[i]][edge] == 1)
                            lgain[i] += (hg->ewgt ? hg->ewgt[edge] : 1.0);
                        else if (pins[1-part[i]][edge] == 0)
                            lgain[i] -= (hg->ewgt ? hg->ewgt[edge] : 1.0);
                    }
                }
        }
        /* now sum up all gains on only root proc */
        if (hg->nVtx)
            MPI_Reduce(lgain, gain, hg->nVtx, MPI_FLOAT, MPI_SUM, rootRank, 
                       hgc->col_comm);
        if (detail_timing)         
            ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfgain, hgc->Communicator);                    
        

        if (hgp->output_level >= PHG_DEBUG_ALL) {
            imbal = (targetw0==0.0) ? 0.0 : fabs(weights[0]-targetw0)/targetw0;
            printf("%s FM Pass %d (%d->%d) Cut=%.2f W[%5.0f, %5.0f] I= %.2f LW[%5.0f, %5.0f] LI= %.2f\n", uMe(hgc), passcnt, from, to, cutsize, weights[0], weights[1], imbal, lweights[0], lweights[1], limbal);
        }

        if (hgc->myProc_y==rootRank) {
            /* those are the lucky ones; each proc in column-group
               could have compute the same moves concurrently; but for this
               version we'll do it in the root procs and broadcast */

#ifdef HANDLE_ISOLATED_VERTICES
            if (detail_timing)         
                ZOLTAN_TIMER_START(zz->ZTime, timer->rfiso, hgc->Communicator);                    
            lwadjust[0] = lwadjust[1] = 0.0;
            for (i=isocnt; i < hg->nVtx; ++i) { /* go over isolated vertices */
                int   u=moves[i], pno=-part[u]-1;
                float w=(hg->vwgt ? hg->vwgt[u*hg->VtxWeightDim] : 1.0);

                if (pno<0 || pno>1)
                    errexit("heeeey pno=%d", pno);
                /* let's remove it from its part */
                lwadjust[pno] -= w;                
            }
            lweights[0] += lwadjust[0];
            lweights[1] += lwadjust[1];
            if (detail_timing)         
                ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfiso, hgc->Communicator);                    
#endif

            if (detail_timing)         
                ZOLTAN_TIMER_START(zz->ZTime, timer->rfheap, hgc->Communicator);                    
            
            /* Initialize the heaps and fill them with the gain values */
            Zoltan_Heap_Clear(&heap[from]);  
            for (i = 0; i < hg->nVtx; ++i)
                if ((part[i]==from) && (!hgp->UseFixedVtx || hg->fixed_part[i]<0))
                    Zoltan_Heap_Input(&heap[from], i, gain[i]);
            Zoltan_Heap_Make(&heap[from]);
            if (detail_timing) {
                ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfheap, hgc->Communicator);
                ZOLTAN_TIMER_START(zz->ZTime, timer->rfpass, hgc->Communicator);
            }

            while ((neggaincnt < maxneggain) && ((lweights[to]+minvw) <= lmax_weight[to]) ) {
                if (Zoltan_Heap_Empty(&heap[from])) { /* too bad it is empty */
                    v = -1;
                    break;
                }
                
                v = Zoltan_Heap_Extract_Max(&heap[from]);    
                
#ifdef _DEBUG
                if (from != part[v])
                    errexit("hooop from=%d part[%d]=%d", from, v, part[v]);
#endif

                /* Mark vertex we picked from the heap so it is "locked". 
                   For the current strategy, moving only one direction 
                   at a time, the mark information is not critical.
                   Note that the mark array is also used in the move/update 
                   routine so don't remove it! */
                ++mark[v];
                if (lweights[to]+((hg->vwgt)?hg->vwgt[v*hg->VtxWeightDim]:1.0) > lmax_weight[to]) {
#ifdef _DEBUG2                    
                    printf("%s %4d: %6d (g: %5.1lf), p:%2d [%4.0lf, %4.0lf] NF\n", uMe(hgc), movecnt, v, gain[v], from, weights[0], weights[1]);
#endif
                    /* Negative value in moves array means we have examined 
                       the vertex but couldn't move it. Note offset by one,
                       otherwise zero would be ambiguous. */
                    moves[movecnt++] = -(v+1);
                    continue;
                } 

                    
                moves[movecnt] = v;
                ++neggaincnt;
                cutsize -= gain[v];

                fm2_move_vertex_oneway(v, hg, part, gain, heap, pins, lpins, weights, lweights, mark, adj);
                imbal = (targetw0==0.0) ? 0.0
                    : fabs(weights[0]-targetw0)/targetw0;
                limbal = (ltargetw0==0.0) ? 0.0
                    : fabs(lweights[0]-ltargetw0)/ltargetw0;

                if (notfeasible || (cutsize<best_cutsize) ||
                                   (cutsize==best_cutsize && limbal < best_limbal)) {
#ifdef _DEBUG2                    
                    printf("%s %4d: %6d (g: %5.1lf), p:%2d W[%4.0lf, %4.0lf] I:%.2lf LW[%4.0lf, %4.0lf] LI:%.2lf C:%.1lf<-- Best\n", uMe(hgc), movecnt, v, gain[v], from, weights[0], weights[1], imbal, lweights[0], lweights[1], limbal, cutsize); /* after move gain is -oldgain */
#endif
                    notfeasible = weights[from]>max_weight[from];
                    best_cutsize = cutsize;
                    best_cutsizeat = movecnt+1;
                    best_limbal = limbal;
                    neggaincnt = 0;
                }
#ifdef _DEBUG2                
                else
                    printf("%s %4d: %6d (g: %5.1lf), p:%2d [%4.0lf, %4.0lf] %.1lf\n", uMe(hgc), movecnt, v, gain[v], from, weights[0], weights[1], cutsize);
#endif
                ++movecnt;
            }

            
            if (detail_timing) {
                ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfpass, hgc->Communicator);
                ZOLTAN_TIMER_START(zz->ZTime, timer->rfroll, hgc->Communicator);
            }

#ifdef _DEBUG
	    if (v<0)
                uprintf(hgc, "EOLB @ %d there was no vertex to select: v=%d\n", movecnt, v);
	    else if (neggaincnt >= maxneggain) 
                uprintf(hgc, "EOLB @ %d max neg move reached neggaincnt(%d) >= maxneggain\n", movecnt, neggaincnt, maxneggain);
	    else 
                uprintf(hgc, "EOLB @ %d balance constraint LW[%.1lf, %.1lf] and MAXW[%.1lf, %.1lf]\n", movecnt, lweights[0], lweights[1], lmax_weight[0], lmax_weight[1]);
#endif
            
            /* roll back the moves without any improvement */
            for (i=movecnt-1; i>=best_cutsizeat; --i) {
                int vv = moves[i];
                if (vv<0)
                    vv = -vv-1;
                else /* we don't need to roll pins, or weights etc; rolling local ones suffices */
                    fm2_move_vertex_oneway_nonroot(vv, hg, part, lpins, lweights);
                mark[vv] = 0;
            }
            for (i=0; i<best_cutsizeat; ++i){
                int vv = (moves[i] < 0 ) ? -moves[i] - 1 : moves[i];
                mark[vv] = 0;
            }
            if (detail_timing) 
                ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfroll, hgc->Communicator);            
        }

        if (detail_timing) 
            ZOLTAN_TIMER_START(zz->ZTime, timer->rfnonroot, hgc->Communicator);            
        
        /* now root bcast moves to column procs */
        MPI_Bcast(&best_cutsizeat, 1, MPI_INT, rootRank, hgc->col_comm);
        MPI_Bcast(moves, best_cutsizeat, MPI_INT, rootRank, hgc->col_comm);
        if (hgc->myProc_y!=rootRank) { /* now non-root does move simulation */
            for (i=0; i<best_cutsizeat; ++i) {
                int vv = moves[i];
                if (vv>=0)
                    fm2_move_vertex_oneway_nonroot(vv, hg, part, lpins, lweights);
            }
        }
        if (detail_timing) 
            ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfnonroot, hgc->Communicator);            

        
#ifdef _DEBUG
        for (i = 0; i < hg->nEdge; ++i) {
            int lp[2];

            lp[0] = lp[1] = 0;
            for (j = hg->hindex[i]; j < hg->hindex[i+1]; ++j)
                ++(lp[part[hg->hvertex[j]]]);
            if ((lp[0] != lpins[0][i]) || (lp[1] != lpins[1][i]))
                errexit("for net %d -- lp=[%d, %d] lpins[%d, %d]", i, lp[0], lp[1], lpins[0][i], lpins[1][i]);
        }
#endif


#ifdef HANDLE_ISOLATED_VERTICES
        if (detail_timing)         
            ZOLTAN_TIMER_START(zz->ZTime, timer->rfiso, hgc->Communicator);        
        
#if 0
        MPI_Allreduce(lweights, weights, 2, MPI_DOUBLE, MPI_SUM, hgc->row_comm);        
        best_imbal = (targetw0==0.0) ? 0.0 : fabs(weights[0]-targetw0)/targetw0;
        if (hgc->myProc_y==rootRank)             
            uprintf(hgc, "BEFORE ISOLATED VERTEX HANDLING WE *THINK* GLOBAL IMBALANCE is %.3lf\n", best_imbal);
#endif
        
        if (hgc->myProc_y==rootRank) {
            best_limbal = (ltargetw0==0.0) ? 0.0
                : fabs(lweights[0]-ltargetw0)/ltargetw0;
            
            for (i=isocnt; i < hg->nVtx; ++i) { /* go over isolated vertices */
                int u = moves[i], npno;
                float w=(hg->vwgt ? hg->vwgt[u*hg->VtxWeightDim] : 1.0);

                npno = (lweights[0] < ltargetw0) ? 0 : 1;
                lweights[npno] += w;
                lwadjust[npno] += w;
                part[u] = -(npno+1); /* move to npno (might be same as pno;
                                        so it may not be a real move */
            }
            limbal = (ltargetw0==0.0) ? 0.0
                : fabs(lweights[0]-ltargetw0)/ltargetw0;
#if 0           
            uprintf(hgc, "before binpacking of %d isolated vertices balance was: %.3lf now: %.3lf\n", hg->nVtx-isocnt, best_limbal, limbal);
#endif
        }

        MPI_Bcast(lwadjust, 2, MPI_DOUBLE, rootRank, hgc->col_comm);
        if (hgc->myProc_y!=rootRank) {
            lweights[0] += lwadjust[0];
            lweights[1] += lwadjust[1];
        }
        if (detail_timing)         
            ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfiso, hgc->Communicator);                
#endif        
        
        MPI_Allreduce(lweights, weights, 2, MPI_DOUBLE, MPI_SUM, hgc->row_comm);
#if 0       
        best_imbal = (targetw0==0.0) ? 0.0 : fabs(weights[0]-targetw0)/targetw0;
        if (hgc->myProc_y==rootRank)             
            uprintf(hgc, "NEW GLOBAL IMBALANCE is %.3lf\n", best_imbal);
#endif
        
        if (weights[0]==0.0) 
            ltargetw0 = lmax_weight[0] = 0.0;
        else {
            lmax_weight[0] = lweights[0] +
                (max_weight[0] - weights[0]) * ( lweights[0] / weights[0] );
            ltargetw0 = targetw0 * ( lweights[0] / weights[0] ); /* local target weight */
        }
        lmax_weight[1] = (weights[1]==0.0) ? 0.0 : lweights[1] +
            (max_weight[1] - weights[1]) * ( lweights[1] / weights[1] );
        
        cont = 0;
        MPI_Allreduce(&best_cutsizeat, &cont, 1, MPI_INT, MPI_LOR, hgc->row_comm);

        /* since we're only moving in one direction; make sure two successive
           pass didn't produce any improvement before terminating */
        if (!cont)
            ++successivefails; 
        else
            successivefails = 0; 
#ifdef _DEBUG
        /* Just for debugging */
        best_cutsize = Zoltan_PHG_Compute_NetCut(hgc, hg, part);
        imbal = (targetw0 == 0.0) ? 0.0 : fabs(weights[0]-targetw0)/targetw0;
        printf("%s End of Pass %d Comp.Cut=%.2lf RealCut=%.2lf W[%5.0lf, %5.0lf] Imbal=%.2lf\n", uMe(hgc), passcnt, cutsize, best_cutsize, weights[0], weights[1], imbal);
        /* debuggging code ends here */
#endif
    } while (successivefails<2 &&  (++passcnt < hgp->fm_loop_limit));


#ifdef HANDLE_ISOLATED_VERTICES
    if (detail_timing)         
        ZOLTAN_TIMER_START(zz->ZTime, timer->rfiso, hgc->Communicator);            
    /* now root sneds the final part no's of isolated vertices; if any */
    MPI_Bcast(&isocnt, 1, MPI_INT, rootRank, hgc->col_comm);
    if (isocnt<hg->nVtx) {
        deg = (int *) lgain; /* we'll use for part no's of isolated vertices */
        if (hgc->myProc_y==rootRank) 
            for (i=isocnt; i < hg->nVtx; ++i) { /* go over isolated vertices */
                int u = moves[i];
                deg[i] = part[u] = -part[u]-1; 
            }
            
        MPI_Bcast(&moves[isocnt], hg->nVtx-isocnt, MPI_INT, rootRank, hgc->col_comm);
        MPI_Bcast(&deg[isocnt], hg->nVtx-isocnt, MPI_INT, rootRank, hgc->col_comm);
        if (hgc->myProc_y!=rootRank) 
            for (i=isocnt; i < hg->nVtx; ++i)  /* go over isolated vertices */
                part[moves[i]] = deg[i];
    }
    if (detail_timing)         
        ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfiso, hgc->Communicator);            
#endif
 End:    

    if (hgc->myProc_y==rootRank) { /* only root needs mark, adj, gain and heaps*/        
        Zoltan_Multifree(__FILE__,__LINE__, 3, &mark, &adj, &gain);
        Zoltan_Heap_Free(&heap[0]);
        Zoltan_Heap_Free(&heap[1]);        
    }
    
    Zoltan_Multifree(__FILE__, __LINE__, 4, &pins[0], &lpins[0], &moves, &lgain);

    if (do_timing) 
        ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfrefine, hgc->Communicator);
    
    
    ZOLTAN_TRACE_EXIT(zz, yo);
    return ierr;
}
Example #3
0
int Zoltan_PHG(
ZZ *zz,                    /* The Zoltan structure  */
float *part_sizes,         /* Input:  Array of size zz->Num_Global_Parts 
                                containing the percentage of work assigned 
                                to each partition. */
int *num_imp,              /* not computed */
ZOLTAN_ID_PTR *imp_gids,   /* not computed */
ZOLTAN_ID_PTR *imp_lids,   /* not computed */
int **imp_procs,           /* not computed */
int **imp_to_part,         /* not computed */
int *num_exp,              /* number of objects to be exported */
ZOLTAN_ID_PTR *exp_gids,   /* global ids of objects to be exported */
ZOLTAN_ID_PTR *exp_lids,   /* local  ids of objects to be exported */
int **exp_procs,           /* list of processors to export to */
int **exp_to_part )         /* list of partitions to which exported objs
                                are assigned. */
{
  char *yo = "Zoltan_PHG";
  ZHG *zoltan_hg = NULL;
  PHGPartParams hgp;               /* Hypergraph parameters. */
  HGraph *hg = NULL;               /* Hypergraph itself */
  Partition parts = NULL;          /* Partition assignments in 
                                      2D distribution. */
  int err = ZOLTAN_OK, p=0;
  struct phg_timer_indices *timer = NULL; 
  int do_timing = 0;

  ZOLTAN_TRACE_ENTER(zz, yo);

  /* Initialization of return arguments. */
  *num_imp   = *num_exp   = -1;
  *imp_gids  = *exp_gids  = NULL;
  *imp_lids  = *exp_lids  = NULL;
  *imp_procs = *exp_procs = NULL;
  
  /* Initialize HG parameters. */
  err = Zoltan_PHG_Initialize_Params(zz, part_sizes, &hgp);
  if (err != ZOLTAN_OK)
    goto End;

  if (hgp.use_timers) {
    if (!zz->LB.Data_Structure)  {
      zz->LB.Data_Structure = (struct phg_timer_indices *) 
                               ZOLTAN_MALLOC(sizeof(struct phg_timer_indices));
      initialize_timer_indices((struct phg_timer_indices *)zz->LB.Data_Structure);
    }
    timer = zz->LB.Data_Structure;
    if (timer->all < 0) 
      timer->all = Zoltan_Timer_Init(zz->ZTime, 1, "Zoltan_PHG");
  }

  if (hgp.use_timers > 1) {
    do_timing = 1;
    if (timer->build < 0) 
      timer->build = Zoltan_Timer_Init(zz->ZTime, 1, "Build");
    if (timer->setupvmap < 0) 
      timer->setupvmap = Zoltan_Timer_Init(zz->ZTime, 0, "Vmaps");
  }

  if (hgp.use_timers) 
    ZOLTAN_TIMER_START(zz->ZTime, timer->all, zz->Communicator);
    
  if (do_timing)
    ZOLTAN_TIMER_START(zz->ZTime, timer->build, zz->Communicator);
    
  /* build initial Zoltan hypergraph from callback functions. */

  err = Zoltan_PHG_Build_Hypergraph (zz, &zoltan_hg, &parts, &hgp);
  if (err != ZOLTAN_OK && err != ZOLTAN_WARN) {
    ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Error building hypergraph.");
    goto End;
  }

  if (zoltan_hg->GnObj == 0){
    /* degenerate case - no objects to partition */
    hgp.final_output = 0;
    goto End;
  }

  hg = &zoltan_hg->HG;
  p = zz->LB.Num_Global_Parts;  
  zoltan_hg->HG.redl = MAX(hgp.redl, p);     /* redl needs to be dynamic */

  /* RTHRTH -- redl may need to be scaled by number of procs */
  /* EBEB -- at least make sure redl > #procs */

  if (hgp.UseFixedVtx)
      hg->bisec_split = 1; /* this will be used only #parts=2
                              otherwise rdivide will set to appropriate
                              value */

  if (hgp.UsePrefPart || hgp.UseFixedVtx) { /* allocate memory for pref_part */
    if (hg->nVtx &&                       
        !(hg->pref_part = (int*) ZOLTAN_MALLOC (sizeof(int) * hg->nVtx))) {
        ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Error building hypergraph.");
        goto End;
    }
  }
  if (hgp.UsePrefPart) /* copy input parts as pref_part;
                          UVCUVC: TODO: this code (and alloc of pref_part)
                          should go to Build_Hypergraph later */
      memcpy(hg->pref_part, parts, sizeof(int) * hg->nVtx);
  
  if (hgp.UseFixedVtx) {
      int i;
      for (i=0; i<hg->nVtx; ++i)
          if (hg->fixed_part[i]>=0)
              hg->pref_part[i] = hg->fixed_part[i];
          else if (!hgp.UsePrefPart)
              hg->pref_part[i] = -1;      
  }
  hgp.UsePrefPart |= hgp.UseFixedVtx;

  if (do_timing)
    ZOLTAN_TIMER_STOP(zz->ZTime, timer->build, zz->Communicator);


/*
  UVCUVC DEBUG PRINT
  uprintf(hg->comm, "Zoltan_PHG kway=%d #parts=%d\n", hgp.kway, zz->LB.Num_Global_Parts);
*/

  if (!strcasecmp(hgp.hgraph_pkg, "PARKWAY")){
    if (do_timing) {
      if (timer->parkway < 0)
        timer->parkway = Zoltan_Timer_Init(zz->ZTime, 0, "PHG_ParKway");
      ZOLTAN_TIMER_START(zz->ZTime, timer->parkway, zz->Communicator);
    }
    err = Zoltan_PHG_ParKway(zz, hg, p,
                             parts, &hgp);
    if (err != ZOLTAN_OK) 
        goto End;
    if (do_timing)
      ZOLTAN_TIMER_STOP(zz->ZTime, timer->parkway, zz->Communicator);
  } else if (!strcasecmp(hgp.hgraph_pkg, "PATOH")){
    if (hgp.use_timers > 1) {
      if (timer->patoh < 0)
        timer->patoh = Zoltan_Timer_Init(zz->ZTime, 0, "HG_PaToH");
      ZOLTAN_TIMER_START(zz->ZTime, timer->patoh, zz->Communicator);
    }
    err = Zoltan_PHG_PaToH(zz, hg, p,
                           parts, &hgp);
    if (err != ZOLTAN_OK) 
      goto End;
    if (hgp.use_timers > 1)
      ZOLTAN_TIMER_STOP(zz->ZTime, timer->patoh, zz->Communicator);
  }      
  else { /* it must be PHG  */
    /* UVC: if it is bisection anyways; no need to create vmap etc; 
       rdivide is going to call Zoltan_PHG_Partition anyways... */
    if (hgp.globalcomm.Communicator != MPI_COMM_NULL) {
      /* This processor is part of the 2D data distribution; it should
         participate in partitioning. */

        
      if (hgp.kway || zz->LB.Num_Global_Parts == 2) {
        /* call main V cycle routine */
        err = Zoltan_PHG_Partition(zz, hg, p,
                                   hgp.part_sizes, parts, &hgp);
        if (err != ZOLTAN_OK) {
          ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Error partitioning hypergraph.");
          goto End;
        }
      }
      else {
        int i;
          
        if (do_timing) 
          ZOLTAN_TIMER_START(zz->ZTime, timer->setupvmap, zz->Communicator);
        /* vmap associates original vertices to sub hypergraphs */
        if (hg->nVtx && 
            !(hg->vmap = (int*) ZOLTAN_MALLOC(hg->nVtx*sizeof (int))))  {
          err = ZOLTAN_MEMERR;
          ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Memory error.");
          goto End;
        }
        for (i = 0; i < hg->nVtx; ++i)
          hg->vmap[i] = i;
  
        if (do_timing) 
          ZOLTAN_TIMER_STOP(zz->ZTime, timer->setupvmap, zz->Communicator);
  
          
        /* partition hypergraph */
        err = Zoltan_PHG_rdivide (0, p-1, parts, zz, hg, &hgp, 0);
  
        if (hgp.output_level >= PHG_DEBUG_LIST)     
          uprintf(hg->comm, "FINAL %3d |V|=%6d |E|=%6d #pins=%6d %s/%s/%s/%s p=%d "
                  "bal=%.2f cutl=%.2f\n", 
                  hg->info, hg->nVtx, hg->nEdge, hg->nPins,
                  hgp.convert_str, hgp.redm_str, 
                  hgp.coarsepartition_str, hgp.refinement_str, p,
                  Zoltan_PHG_Compute_Balance(zz, hg, hgp.part_sizes, 0,
                                             p, parts),
                  Zoltan_PHG_Compute_ConCut(hg->comm, hg, parts, p, &err));
            
        if (err != ZOLTAN_OK)  {
          ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Error partitioning hypergraph.");
          goto End;
        }
        ZOLTAN_FREE (&hg->vmap);
      }
#ifdef CHECK_LEFTALONE_VERTICES          
      findAndSaveLeftAloneVertices(zz, hg, p, parts, &hgp);
#endif      
    }
  }

  if (!strcasecmp(hgp.hgraph_method, "REPARTITION")) {
    Zoltan_PHG_Remove_Repart_Data(zz, zoltan_hg, hg, &hgp);
  }


/* UVC DEBUG PRINT
  if (!strcasecmp(hgp->hgraph_method, "REFINE")){
      uprintf(hg->comm, 
              "UVC ATTHEEND |V|=%6d |E|=%6d #pins=%6d p=%d bal=%.2f cutl=%.2f\n",
              hg->nVtx, hg->nEdge, hg->nPins, p,
              Zoltan_PHG_Compute_Balance(zz, hg, part_sizes, p, parts),
              Zoltan_PHG_Compute_ConCut(hg->comm, hg, parts, p, &err));
      detailed_balance_info(zz, hg, part_sizes, p, parts);
  }
*/

  
  /* Initialize these timers here so their output is near end of printout */
  if (do_timing)
    if (timer->retlist < 0) 
      timer->retlist = Zoltan_Timer_Init(zz->ZTime, 1, "Return_Lists");

  if (hgp.use_timers)
    if (timer->finaloutput < 0) 
      timer->finaloutput = Zoltan_Timer_Init(zz->ZTime, 1, "Final_Output");

  if (do_timing) 
    ZOLTAN_TIMER_START(zz->ZTime, timer->retlist, zz->Communicator);

  /* Build Zoltan's Output_Parts, mapped from 2D distribution 
     to input distribution. */

  Zoltan_PHG_Output_Parts(zz, zoltan_hg, parts);

  /* Build Zoltan's return arguments. */
  Zoltan_PHG_Return_Lists(zz, zoltan_hg, num_exp, exp_gids,
   exp_lids, exp_procs, exp_to_part);
    
  if (do_timing)
    ZOLTAN_TIMER_STOP(zz->ZTime, timer->retlist, zz->Communicator);

End:
  if (err == ZOLTAN_MEMERR)
    ZOLTAN_PRINT_ERROR (zz->Proc, yo, "Memory error.")
  else if (err != ZOLTAN_OK)
    ZOLTAN_PRINT_ERROR (zz->Proc, yo, "Error partitioning hypergraph.")
    
  /* KDDKDD The following code prints a final quality result even when
   * KDDKDD phg_output_level is zero.  It is useful for our tests and
   * KDDKDD data collection. */
  if ((err == ZOLTAN_OK) && hgp.final_output) {
    static int nRuns=0;
    static double balsum = 0.0, cutlsum = 0.0, cutnsum = 0.0, movesum = 0.0, repartsum = 0.0;
    static double balmax = 0.0, cutlmax = 0.0, cutnmax = 0.0, movemax = 0.0, repartmax = 0.0;
    static double balmin = 1e100, cutlmin = 1e100, cutnmin = 1e100, movemin = 1e100, repartmin = 1e100;
    double bal = 0.; 
    double cutl = 0.; /* Connnectivity cuts:  sum_over_edges((npart-1)*ewgt) */
    double cutn = 0.; /* Net cuts:  sum_over_edges((nparts>1)*ewgt) */

    double rlocal[2];  /* local cut stats for removed edges */
    double rglobal[2]; /* global cut stats for removed edges */
    int gnremove, i;
    double move=0.0, gmove;  /* local and global migration costs */
    double repart=0.0;   /* total repartitioning cost: comcost x multiplier + migration_cost */

    
    if (hgp.use_timers) {
      /* Do not include final output time in partitioning time */
      ZOLTAN_TIMER_STOP(zz->ZTime, timer->all, zz->Communicator);
      ZOLTAN_TIMER_START(zz->ZTime, timer->finaloutput, zz->Communicator);
    }

    if (hgp.globalcomm.Communicator != MPI_COMM_NULL) {
      /* Processor participated in partitioning */
      bal = Zoltan_PHG_Compute_Balance(zz, hg, hgp.part_sizes, 0,
                                       zz->LB.Num_Global_Parts, parts);
      cutl= Zoltan_PHG_Compute_ConCut(hg->comm, hg, parts,
                                      zz->LB.Num_Global_Parts, &err);
      cutn = Zoltan_PHG_Compute_NetCut(hg->comm, hg, parts,
                                       zz->LB.Num_Global_Parts);
      for (i = 0; i < zoltan_hg->nObj; ++i) {
        /* uprintf(hg->comm, " obj[%d] = %d  in=%d out=%d\n", i, zoltan_hg->AppObjSizes[i], zoltan_hg->Input_Parts[i], zoltan_hg->Output_Parts[i]); */
	if (zoltan_hg->Input_Parts[i] != zoltan_hg->Output_Parts[i])
            move += (double) ((zoltan_hg->AppObjSizes) ? zoltan_hg->AppObjSizes[i] : 1.0);

      }
    }

    if (!err) {
     
      /* Add in cut contributions from removed edges */
      MPI_Allreduce(&(zoltan_hg->nRemove), &gnremove, 1, MPI_INT, MPI_SUM,
                    zz->Communicator);
      if (gnremove) {
        err = Zoltan_PHG_Removed_Cuts(zz, zoltan_hg, rlocal);
        MPI_Allreduce(rlocal, rglobal, 2, MPI_DOUBLE,MPI_SUM,zz->Communicator);
        
        cutl += rglobal[0];
        cutn += rglobal[1];
      }

      MPI_Allreduce(&move, &gmove, 1, MPI_DOUBLE, MPI_SUM, zz->Communicator);

      repart = cutl*hgp.RepartMultiplier + gmove;
      repartsum += repart;
      if (repart > repartmax) repartmax = repart;
      if (repart < repartmin) repartmin = repart;
      movesum += gmove;
      if (gmove > movemax) movemax = gmove;
      if (gmove < movemin) movemin = gmove;
      cutlsum += cutl;
      if (cutl > cutlmax) cutlmax = cutl;
      if (cutl < cutlmin) cutlmin = cutl;
      cutnsum += cutn;
      if (cutn > cutnmax) cutnmax = cutn;
      if (cutn < cutnmin) cutnmin = cutn;
      balsum += bal;
      if (bal > balmax) balmax = bal;
      if (bal < balmin) balmin = bal;
      nRuns++;
   
      if (zz->Proc == 0) {
        uprintf(hg->comm, 
                "STATS Runs %d  bal  CURRENT %f  MAX %f  MIN %f  AVG %f\n", 
                nRuns, bal, balmax, balmin, balsum/nRuns);
        uprintf(hg->comm, 
                "STATS Runs %d  cutl CURRENT %f  MAX %f  MIN %f  AVG %f\n", 
                nRuns, cutl, cutlmax, cutlmin, cutlsum/nRuns);
        uprintf(hg->comm, 
                "STATS Runs %d  cutn CURRENT %f  MAX %f  MIN %f  AVG %f\n", 
                nRuns, cutn, cutnmax, cutnmin, cutnsum/nRuns);
	uprintf(hg->comm,
		"STATS Runs %d  %s CURRENT %f  MAX %f  MIN %f  AVG %f\n",
		nRuns, (zoltan_hg->showMoveVol) ? "moveVol" : "moveCnt", gmove, movemax, movemin, movesum/nRuns);
        if (zoltan_hg->showMoveVol) 
            uprintf(hg->comm,
		"STATS Runs %d  repart CURRENT %f  MAX %f  MIN %f  AVG %f\n",
		nRuns, repart, repartmax, repartmin, repartsum/nRuns);        
      }
    }

    if (hgp.use_timers) {
      ZOLTAN_TIMER_STOP(zz->ZTime, timer->finaloutput, zz->Communicator);
      ZOLTAN_TIMER_START(zz->ZTime, timer->all, zz->Communicator);
    }
  }
  /* KDDKDD  End of printing section. */
  
  ZOLTAN_FREE(&parts);
  if (zoltan_hg != NULL) {
    Zoltan_PHG_Free_Hypergraph_Data(zoltan_hg);
    ZOLTAN_FREE (&zoltan_hg);
  }

  if (hgp.use_timers) {
    ZOLTAN_TIMER_STOP(zz->ZTime, timer->all, zz->Communicator);
    if (hgp.globalcomm.Communicator != MPI_COMM_NULL)
      Zoltan_Timer_PrintAll(zz->ZTime, 0, hgp.globalcomm.Communicator, stdout);
  }

  if (hgp.globalcomm.row_comm != MPI_COMM_NULL)
    MPI_Comm_free(&(hgp.globalcomm.row_comm));
  if (hgp.globalcomm.col_comm != MPI_COMM_NULL)
    MPI_Comm_free(&(hgp.globalcomm.col_comm));
  if (hgp.globalcomm.Communicator != MPI_COMM_NULL)
    MPI_Comm_free(&(hgp.globalcomm.Communicator));

  /* Free part_sizes if created new due to ADD_OBJ_WEIGHT */
  if (hgp.part_sizes != part_sizes)
    ZOLTAN_FREE(&hgp.part_sizes);

  ZOLTAN_TRACE_EXIT(zz, yo);
  return err;
}